Merge branch 'podcast'

2020-06-21 13:00:49 +02:00 · 2020-06-21 13:00:49 +02:00 · 88ff77b968
commit 88ff77b968
parent 9e3c2c5194 e54ce8f90f
6 changed files with 327 additions and 31 deletions
--- a/doc/source/audiogenerators.rst
+++ b/doc/source/audiogenerators.rst
@ -33,6 +33,48 @@ mostrecent
 It is similar to randomdir, but instead of picking randomly, picks the most
 recent file (according to the ctime).

+podcast
+------------
+
+This is probably the most powerful generator that comes included with
+``larigira``. To use this generator, you would need to have a valid podcast
+URL. Beware: here the word *podcast* refers to its very specific meaning of
+an XML-based format which resembles an RSS feed but has more media-specific
+entities. See `this specification
+<https://github.com/simplepie/simplepie-ng/wiki/Spec:-iTunes-Podcast-RSS>`_ for
+more technical details.
+
+So, if you have a valid podcast URL, larigira can look at it, extract audios,
+and play the most recent one. Here are some typical use cases for this:
+
+ * You want to play reruns based on what you host on your radio's website.
+ * You want to play some audio from some other radio (or other kind of podcast
+   source).
+
+The podcast form has many many options, but I promise you that 90% of the cases
+are easily solved using ONLY the first option: enter the URL of the podcast
+and... it works!
+
+So, what are all the other options for? Well, to cover some other use cases.
+
+For example, let's say that at night you want to play a *random* show (not the
+last one, which is the default) that happened on your radio. Then you can
+change the "sort by" to be "random". Easy, right?
+
+Another typical use case is selecting an audio that has a duration which "fits"
+with the schedule of your radio: not too long and not too short. You can do
+that with the "min len" and "max len" fields. For example, setting a `min_len`
+of `30min` and `max_len` of `1h15m` you can avoid picking flash news (too
+short) and very long shows.
+
+You can do many other things with its options, but I leave those to your
+imagination. Let's just clarify the workflow:
+
+ * the podcast URL is fetched and audio information is retrieved
+ * filter: audios are filtered by min/max length
+ * sort: audios are sorted according to `sort_by` and `reverse`
+ * select: the n-th episode is fetched, according to `start` field
+
+
 script
 --------

--- a/larigira/audioform_podcast.py
+++ b/larigira/audioform_podcast.py
@ -0,0 +1,63 @@
+from flask_wtf import Form
+from wtforms import (BooleanField, IntegerField, SelectField, StringField,
+                     SubmitField, validators)
+from wtforms.fields.html5 import URLField
+
+
+class AudioForm(Form):
+    """Web form describing a ``podcast`` audio generator.
+
+    The fields map one-to-one onto the keys of the podcast audiospec:
+    ``nick``, ``url``, ``min_len``, ``max_len``, ``sort_by``, ``start`` and
+    ``reverse``.
+    """
+
+    nick = StringField(
+        "Audio nick",
+        validators=[validators.required()],
+        description="A simple name to recognize this audio",
+    )
+    url = URLField(
+        "URL",
+        validators=[validators.required()],
+        description="URL of the podcast; it must be valid xml",
+    )
+
+    # TODO: group by filters/sort/select
+    # Filters: free-text durations; an empty value disables the filter.
+    min_len = StringField(
+        "Accetta solo audio lunghi almeno:",
+        description="Leaving this empty will disable this filter",
+    )
+    max_len = StringField(
+        "Accetta solo audio lunghi al massimo:",
+        description="Leaving this empty will disable this filter",
+    )
+    # Sort: criterion used to order the episodes before selecting one.
+    sort_by = SelectField(
+        "Sort episodes",
+        choices=[
+            ("none", "Don't sort"),
+            ("random", "Random"),
+            ("duration", "Duration"),
+            ("date", "date"),
+        ],
+    )
+    # Select: index of the first episode to pick after filtering/sorting.
+    start = IntegerField(
+        "Play from episode number",
+        description="Episodes count from 0; 0 is a sane default",
+    )
+    reverse = BooleanField("Reverse sort (descending)")
+    submit = SubmitField("Submit")
+
+    def populate_from_audiospec(self, audiospec):
+        """Pre-fill the form fields from an existing audiospec mapping.
+
+        Keys missing from ``audiospec`` leave the matching field untouched;
+        ``start`` always gets a value and falls back to 0.
+        """
+        for key in ("nick", "url", "sort_by", "reverse", "min_len", "max_len"):
+            if key in audiospec:
+                getattr(self, key).data = audiospec[key]
+        # ``start`` may be absent or stored as None/"" (the field has no
+        # default, so an empty submission yields None): treat all of these
+        # as "first episode" instead of failing on int(None).
+        self.start.data = int(audiospec.get("start") or 0)
+
+
+def audio_receive(form):
+    """Turn a submitted podcast form into an audiospec dictionary.
+
+    The result always carries ``"kind": "podcast"`` followed by the ``data``
+    of every podcast-related field of ``form``.
+    """
+    field_names = (
+        "nick",
+        "url",
+        "sort_by",
+        "reverse",
+        "min_len",
+        "max_len",
+        "start",
+    )
+    spec = {"kind": "podcast"}
+    spec.update((name, getattr(form, name).data) for name in field_names)
+    return spec
--- a/larigira/audiogen_http.py
+++ b/larigira/audiogen_http.py
@ -1,31 +1,4 @@
-import os
-import logging
-import posixpath
-from tempfile import mkstemp
-import urllib.request
-from urllib.parse import urlparse
-
-log = logging.getLogger(__name__)
-
-
-def put(url, destdir=None, copy=False):
-    if url.split(":")[0] not in ("http", "https"):
-        log.warning("Not a valid URL: %s", url)
-        return None
-    ext = url.split(".")[-1]
-    if ext.lower() not in ("mp3", "ogg", "oga", "wma", "m4a"):
-        log.warning('Invalid format (%s) for "%s"', ext, url)
-        return None
-    if not copy:
-        return url
-    fname = posixpath.basename(urlparse(url).path)
-    # sanitize
-    fname = "".join(c for c in fname if c.isalnum() or c in list("._-")).rstrip()
-    tmp = mkstemp(suffix="." + ext, prefix="http-%s-" % fname, dir=destdir)
-    os.close(tmp[0])
-    log.info("downloading %s -> %s", url, tmp[1])
-    fname, headers = urllib.request.urlretrieve(url, tmp[1])
-    return "file://%s" % os.path.realpath(tmp[1])
+from larigira.fsutils import download_http


 def generate(spec):
@ -35,10 +8,10 @@ def generate(spec):
    Recognized argument is  "paths" (list of static paths)
    """
    if "urls" not in spec:
-        raise ValueError("Malformed audiospec: missing 'paths'")
+        raise ValueError("Malformed audiospec: missing 'urls'")

    for url in spec["urls"]:
-        ret = put(url, copy=True)
+        ret = download_http(url, copy=True, prefix="http")
        if ret is None:
            continue
        yield ret
--- a/larigira/audiogen_podcast.py
+++ b/larigira/audiogen_podcast.py
@ -0,0 +1,184 @@
+import datetime
+import logging
+import os
+import random
+import sys
+from subprocess import CalledProcessError, check_output
+
+import requests
+
+from larigira.fsutils import download_http
+from lxml import html
+from pytimeparse.timeparse import timeparse
+
+
+def delta_humanreadable(tdelta):
+    """Render a timedelta compactly as ``<days>d<hours>h`` or ``<hours>h``.
+
+    ``None`` is rendered as the empty string; minutes and seconds are
+    dropped.
+    """
+    if tdelta is None:
+        return ""
+    # A timedelta is stored normalized, so ``seconds`` is already the
+    # remainder left once whole days are taken out.
+    hours = tdelta.seconds // 3600
+    if not tdelta.days:
+        return "{}h".format(hours)
+    return "{}d{}h".format(tdelta.days, hours)
+
+
+def get_duration(url):
+    """Return the duration of the media at ``url``, in whole seconds.
+
+    The value is read from the ``duration=`` line printed by
+    ``ffprobe -show_entries format=duration``.
+
+    Raises:
+        ValueError: if ffprobe cannot be run, exits with an error, or
+            prints no usable ``duration=`` line.
+    """
+    command = [
+        "ffprobe",
+        "-v",
+        "error",
+        "-show_entries",
+        "format=duration",
+        "-i",
+        url,
+    ]
+    try:
+        output = check_output(command)
+    except (CalledProcessError, OSError) as exc:
+        # OSError covers a missing or non-executable ffprobe binary.
+        raise ValueError("error probing `%s`" % url) from exc
+    for line in output.split(b"\n"):
+        if line.startswith(b"duration="):
+            value = line.split(b"=", 1)[1]
+            return int(float(value))
+    # An explicit error instead of a StopIteration escaping from next():
+    # inside a generator that would surface as an opaque RuntimeError.
+    raise ValueError("no duration found probing `%s`" % url)
+
+
+class Audio(object):
+    """A single podcast episode: its URL, duration in seconds and date."""
+
+    def __init__(self, url, duration=None, date=None):
+        """Store the episode data, probing the duration when not given."""
+        self.url = url
+        # Only probe the file when the caller supplied no duration.
+        self.duration = (
+            get_duration(url.encode("utf-8")) if duration is None else duration
+        )
+        self.date = date
+        # Far-future sentinel: the episode stays valid until year 9999.
+        self.end_date = datetime.datetime(
+            9999, 12, 31, tzinfo=datetime.timezone.utc
+        )
+
+    def __str__(self):
+        return self.url
+
+    def __repr__(self):
+        age_text = delta_humanreadable(self.age)
+        return "<Audio {} ({} {})>".format(self.url, self.duration, age_text)
+
+    @property
+    def urls(self):
+        """The episode URL wrapped in a one-element list."""
+        return [self.url]
+
+    @property
+    def age(self):
+        """Time elapsed since ``date``, or None when the date is unknown."""
+        if self.date is None:
+            return None
+        return datetime.datetime.now(datetime.timezone.utc) - self.date
+
+    @property
+    def valid(self):
+        """True as long as ``end_date`` has not been reached yet."""
+        return datetime.datetime.now(datetime.timezone.utc) <= self.end_date
+
+
+def get_tree(feed_url):
+    """Fetch and parse a podcast feed from the web or from a local file.
+
+    Args:
+        feed_url: an ``http:``/``https:`` URL, or a local filesystem path.
+
+    Returns:
+        The lxml object for the feed: an element for remote feeds, an
+        element tree for local files. Both offer ``xpath``.
+
+    Raises:
+        ValueError: if ``feed_url`` is a local path that does not exist.
+    """
+    if feed_url.startswith(("http:", "https:")):
+        return html.fromstring(requests.get(feed_url).content)
+    if not os.path.exists(feed_url):
+        raise ValueError("file not found: {}".format(feed_url))
+    # Binary mode lets lxml detect the encoding itself, and the with-block
+    # closes the handle that was previously leaked.
+    with open(feed_url, "rb") as feed_file:
+        return html.parse(feed_file)
+
+
+def get_item_date(el):
+    """Return the publication date of a feed item, or None.
+
+    The text of the item's ``pubdate`` child is tried against an ISO-like
+    format and an RFC 822-like format, both carrying a UTC offset. None is
+    returned when the child is missing, empty, or matches neither format.
+    """
+    el_date = el.find("pubdate")
+    if el_date is None or el_date.text is None:
+        return None
+    # strptime rejects surrounding whitespace, which is harmless in feeds.
+    text = el_date.text.strip()
+    for time_format in ("%Y-%m-%dT%H:%M:%S%z", "%a, %d %b %Y %H:%M:%S %z"):
+        try:
+            return datetime.datetime.strptime(text, time_format)
+        except ValueError:
+            # Only "does not match this format" is expected here; anything
+            # else (e.g. KeyboardInterrupt) must not be swallowed.
+            continue
+    return None
+
+
+def get_audio_from_item(item):
+    """Build an ``Audio`` from a feed item element.
+
+    The URL comes from the ``url`` attribute of the item's ``enclosure``
+    child. When the item has a ``duration`` child in ``[[HH:]MM:]SS`` form,
+    it is converted to seconds and passed on. Otherwise, or when that text
+    cannot be parsed or adds up to zero, ``Audio`` is left to work out the
+    duration itself.
+
+    Raises:
+        ValueError: if the item has no enclosure URL.
+    """
+    encl = item.find("enclosure")
+    url = encl.get("url") if encl is not None else None
+    if not url:
+        raise ValueError("feed item has no enclosure url")
+    audio_args = {}
+    duration_text = item.findtext("duration")
+    if duration_text:
+        duration_parts = duration_text.strip().split(":")
+        try:
+            total_seconds = sum(
+                int(num) * (60 ** i)
+                for i, num in enumerate(reversed(duration_parts))
+            )
+        except ValueError:
+            # Unparsable duration (empty part, decimals, free text): do not
+            # abort the whole feed, fall back to probing instead.
+            total_seconds = 0
+        if total_seconds:
+            audio_args["duration"] = total_seconds
+    return Audio(url, **audio_args)
+
+
+def get_urls(tree):
+    """Yield an ``Audio`` for every usable ``item`` found in the feed.
+
+    Items without an enclosure URL carry no audio and are skipped, so a
+    single such item does not abort the whole feed. The date of each audio
+    is filled in from the item when not already set.
+    """
+    for it in tree.xpath("//item"):
+        encl = it.find("enclosure")
+        if encl is None or not encl.get("url"):
+            logging.getLogger(__name__).debug(
+                "skipping feed item without enclosure url"
+            )
+            continue
+        audio = get_audio_from_item(it)
+        if audio.date is None:
+            audio.date = get_item_date(it)
+        yield audio
+
+
+def parse_duration(arg):
+    """Convert a duration string to a number of seconds.
+
+    A string made only of decimal digits is taken as seconds; anything else
+    is handed to ``timeparse``.
+
+    Raises:
+        ValueError: if ``timeparse`` cannot make sense of ``arg``.
+    """
+    if arg.isdecimal():
+        return int(arg)
+    seconds = timeparse(arg)
+    if seconds is None:
+        raise ValueError("%r is not a valid duration" % arg)
+    return seconds
+
+
+def _int_or_default(value, default):
+    """Return ``int(value)``, or ``default`` when value is None or empty."""
+    if value is None or value == "":
+        return default
+    return int(value)
+
+
+def _age_sort_key(audio):
+    """Sort key by age that places audios without a date last."""
+    age = audio.age
+    if age is None:
+        return (True, datetime.timedelta(0))
+    return (False, age)
+
+
+def generate(spec):
+    """Return the audio URIs selected from a podcast feed.
+
+    Workflow:
+      * the feed at ``spec["url"]`` is fetched and its audios are listed;
+      * filter: audios outside ``min_len``/``max_len`` are dropped;
+      * sort: audios are ordered by ``sort_by`` (``random``, ``date``,
+        ``duration``; anything else keeps feed order), then optionally
+        reversed (``reverse``);
+      * select: ``howmany`` audios (default 1) are taken from ``start``
+        (default 0);
+      * each one goes through ``download_http`` (``copy`` defaults to True).
+
+    Audios rejected by ``download_http`` (it returns None) are left out.
+
+    Raises:
+        ValueError: if ``url`` is missing, or a length filter is invalid.
+    """
+    if "url" not in spec:
+        raise ValueError("Malformed audiospec: missing 'url'")
+    audios = list(get_urls(get_tree(spec["url"])))
+
+    # filter: parse each bound once instead of once per audio
+    if spec.get("min_len"):
+        min_len = parse_duration(spec["min_len"])
+        audios = [a for a in audios if a.duration >= min_len]
+    if spec.get("max_len"):
+        max_len = parse_duration(spec["max_len"])
+        audios = [a for a in audios if a.duration <= max_len]
+
+    # sort
+    sort_by = spec.get("sort_by", "none")
+    if sort_by == "random":
+        random.shuffle(audios)
+    elif sort_by == "date":
+        # Smallest age first; undated audios cannot be compared with dated
+        # ones, so the key puts them at the end.
+        audios.sort(key=_age_sort_key)
+    elif sort_by == "duration":
+        audios.sort(key=lambda a: a.duration)
+
+    if spec.get("reverse", False):
+        audios.reverse()
+
+    # slice: a stored None/"" (empty form field) means "use the default"
+    audios = audios[_int_or_default(spec.get("start"), 0) :]
+    audios = audios[: _int_or_default(spec.get("howmany"), 1)]
+
+    # copy local
+    copy = spec.get("copy", True)
+    local_audios = (
+        download_http(a.url, copy=copy, prefix="podcast") for a in audios
+    )
+    return [uri for uri in local_audios if uri is not None]
+
+
+# TODO: testing
+# TODO: lxml should maybe be optional?
+# TODO: ui
+
+
+if __name__ == "__main__":
+    # Manual smoke check, not a real test: list the audios of the feed
+    # given as first command-line argument.
+    logging.basicConfig(level=logging.DEBUG)
+    feed_tree = get_tree(sys.argv[1])
+    for audio in get_urls(feed_tree):
+        print(" -", repr(audio))
--- a/larigira/fsutils.py
+++ b/larigira/fsutils.py
@ -1,6 +1,13 @@
-import os
 import fnmatch
+import logging
 import mimetypes
+import os
+import posixpath
+import urllib.request
+from tempfile import mkstemp
+from urllib.parse import urlparse
+
+log = logging.getLogger(__name__)


 def scan_dir(dirname, extension=None):
@ -37,3 +44,27 @@ def shortname(path):
    name = name.rsplit(".", 1)[0]  # no extension
    name = "".join(c for c in name if c.isalnum())  # no strange chars
    return name
+
+
+def download_http(url, destdir=None, copy=False, prefix="httpdl"):
+    """Validate an audio URL and optionally download it to a temp file.
+
+    Args:
+        url: ``http``/``https`` URL of an mp3/ogg/oga/wma/m4a file.
+        destdir: directory for the temporary file (``mkstemp`` default
+            when None).
+        copy: when false, the validated URL is returned without
+            downloading anything.
+        prefix: first part of the temporary file name.
+
+    Returns:
+        ``url`` itself when ``copy`` is false, a ``file://`` URI of the
+        downloaded copy when ``copy`` is true, or None when the URL is not
+        http(s) or does not have an accepted extension.
+    """
+    allowed_exts = ("mp3", "ogg", "oga", "wma", "m4a")
+    try:
+        parsed = urlparse(url)
+    except ValueError:
+        parsed = None
+    if parsed is None or parsed.scheme not in ("http", "https"):
+        log.warning("Not a valid URL: %s", url)
+        return None
+    # Take the extension from the path so that a query string or fragment
+    # (e.g. "a.mp3?token=...") does not hide it; fall back to the historic
+    # "text after the last dot" rule so formerly accepted URLs still pass.
+    ext = posixpath.splitext(parsed.path)[1][1:]
+    if ext.lower() not in allowed_exts:
+        ext = url.split(".")[-1]
+    if ext.lower() not in allowed_exts:
+        log.warning('Invalid format (%s) for "%s"', ext, url)
+        return None
+    if not copy:
+        return url
+    fname = posixpath.basename(parsed.path)
+    # sanitize
+    fname = "".join(c for c in fname if c.isalnum() or c in "._-")
+    fd, tmp_path = mkstemp(
+        suffix="." + ext, prefix="%s-%s-" % (prefix, fname), dir=destdir
+    )
+    os.close(fd)
+    log.info("downloading %s -> %s", url, tmp_path)
+    try:
+        urllib.request.urlretrieve(url, tmp_path)
+    except Exception:
+        # Do not leave an empty or partial temporary file behind.
+        try:
+            os.unlink(tmp_path)
+        except OSError:
+            pass
+        raise
+    return "file://%s" % os.path.realpath(tmp_path)
--- a/setup.py
+++ b/setup.py
@ -73,6 +73,7 @@ setup(
            "mpd = larigira.audiogen_mpdrandom:generate_by_artist",
            "static = larigira.audiogen_static:generate",
            "http = larigira.audiogen_http:generate",
+            "podcast = larigira.audiogen_podcast:generate",
            "randomdir = larigira.audiogen_randomdir:generate",
            "mostrecent = larigira.audiogen_mostrecent:generate",
            "script = larigira.audiogen_script:generate",
@ -95,6 +96,7 @@ setup(
        "larigira.audioform_create": [
            "static = larigira.audioform_static:StaticAudioForm",
            "http = larigira.audioform_http:AudioForm",
+            "podcast = larigira.audioform_podcast:AudioForm",
            "script = larigira.audioform_script:ScriptAudioForm",
            "randomdir = larigira.audioform_randomdir:Form",
            "mostrecent = larigira.audioform_mostrecent:AudioForm",
@ -102,6 +104,7 @@ setup(
        "larigira.audioform_receive": [
            "static = larigira.audioform_static:staticaudio_receive",
            "http = larigira.audioform_http:audio_receive",
+            "podcast = larigira.audioform_podcast:audio_receive",
            "script = larigira.audioform_script:scriptaudio_receive",
            "randomdir = larigira.audioform_randomdir:receive",
            "mostrecent = larigira.audioform_mostrecent:audio_receive",