Merge branch 'podcast'

2020-06-21 13:00:49 +02:00 · 2020-06-21 13:00:49 +02:00 · 88ff77b968
commit 88ff77b968
parent 9e3c2c5194 e54ce8f90f
6 changed files with 327 additions and 31 deletions
--- a/doc/source/audiogenerators.rst
+++ b/doc/source/audiogenerators.rst
@ -33,6 +33,48 @@ mostrecent
 It is similar to randomdir, but instead of picking randomly, picks the most
 recent file (according to the ctime).
 podcast
 ------------
 This is probably the most powerful generator that comes included with
 ``larigira``. To use this generator, you would need to have a valid podcast
 URL. Beware: here the word *podcast* refers to its very specific meaning of
 an XML-based format which resembles an RSS feed but has more media-specific
 entities. See `this specification
 <https://github.com/simplepie/simplepie-ng/wiki/Spec:-iTunes-Podcast-RSS>`_ for
 more technical details.
 So, if you have a valid podcast URL, larigira can look at it, extract audios,
 and play the most recent one. Here are some typical use cases for this:
 * You want to play reruns based on what you host on your radio's website.
 * You want to play some audio from some other radio (or other kind of podcast
   source)
 The podcast form has many many options, but I promise you that 90% of the cases
 are easily solved using ONLY the first option: enter the URL of the podcast
 and... it works!
 So, what are all the other options for? Well, to cover some other use cases.
 For example, let's say that at night you want to play a *random* show (not the
 last one, which is the default) that happened on your radio. Then you can
 change the "sort by" to be "random". Easy, right?
 Another typical use case is selecting an audio that has a duration which "fits"
 with the schedule of your radio: not too long and not too short. You can do
 that with the "min len" and "max len" fields. For example, setting a `min_len`
 of `30min` and `max_len` of `1h15m` you can avoid picking flash news (too
 short) and very long shows.
 You can do many other things with its options, but I leave those to your
 imagination. Let's just clarify the workflow:
 * the podcast URL is fetched and audio information is retrieved
 * filter: audios are filtered by min/max length
 * sort: audios are sorted according to `sort_by` and `reverse`
 * select: the n-th episode is fetched, according to `start` field
 script
 --------
--- a/larigira/audioform_podcast.py
+++ b/larigira/audioform_podcast.py
@ -0,0 +1,63 @@
 from flask_wtf import Form
 from wtforms import (BooleanField, IntegerField, SelectField, StringField,
                     SubmitField, validators)
 from wtforms.fields.html5 import URLField
 class AudioForm(Form):
    """Form used to create or edit a ``podcast`` audio source.

    The fields mirror the keys of the podcast audiospec: the feed URL,
    the optional length filters, the sort criterion and the index of the
    episode to start from.
    """

    # Human-friendly label for this audio source.
    nick = StringField(
        "Audio nick",
        description="A simple name to recognize this audio",
        validators=[validators.required()],
    )
    # Address of the podcast feed.
    url = URLField(
        "URL",
        description="URL of the podcast; it must be valid xml",
        validators=[validators.required()],
    )
    # TODO: group by filters/sort/select
    # Length filters; free-form text, an empty value disables the filter.
    min_len = StringField(
        "Accetta solo audio lunghi almeno:",
        description="Leaving this empty will disable this filter",
    )
    max_len = StringField(
        "Accetta solo audio lunghi al massimo:",
        description="Leaving this empty will disable this filter",
    )
    # Sort criterion applied to the episodes that pass the filters.
    sort_by = SelectField(
        "Sort episodes",
        choices=[
            ("none", "Don't sort"),
            ("random", "Random"),
            ("duration", "Duration"),
            ("date", "date"),
        ],
    )
    # Zero-based index of the first episode to pick after sorting.
    start = IntegerField(
        "Play from episode number",
        description="Episodes count from 0; 0 is a sane default",
    )
    reverse = BooleanField("Reverse sort (descending)")
    submit = SubmitField("Submit")

    def populate_from_audiospec(self, audiospec):
        """Load the values of an existing audiospec mapping into the form.

        Keys missing from ``audiospec`` leave the matching field untouched,
        except ``start`` which falls back to ``0``.
        """
        copied_fields = (
            "nick",
            "url",
            "sort_by",
            "reverse",
            "min_len",
            "max_len",
        )
        for field_name in copied_fields:
            if field_name not in audiospec:
                continue
            getattr(self, field_name).data = audiospec[field_name]
        # ``start`` is always set and coerced to an integer.
        self.start.data = int(audiospec.get("start", 0))
 def audio_receive(form):
    """Convert a submitted podcast form into an audiospec dictionary.

    The result always carries ``kind == "podcast"`` followed by the raw
    ``.data`` of every form field that is part of the audiospec.
    """
    field_names = (
        "nick",
        "url",
        "sort_by",
        "reverse",
        "min_len",
        "max_len",
        "start",
    )
    spec = {"kind": "podcast"}
    spec.update((name, getattr(form, name).data) for name in field_names)
    return spec
--- a/larigira/audiogen_http.py
+++ b/larigira/audiogen_http.py
@ -1,31 +1,4 @@
-import os
+from larigira.fsutils import download_http
 import logging
 import posixpath
 from tempfile import mkstemp
 import urllib.request
 from urllib.parse import urlparse
 log = logging.getLogger(__name__)
 def put(url, destdir=None, copy=False):
    """Validate an HTTP(S) audio URL and optionally download it.

    :param url: address of the audio file; only ``http`` and ``https``
        schemes and a fixed set of audio extensions are accepted
    :param destdir: directory for the temporary file (``None`` lets
        ``mkstemp`` pick its default)
    :param copy: when false the URL itself is returned untouched; when
        true the file is downloaded and a ``file://`` URI is returned
    :returns: the URL, a ``file://`` URI of the local copy, or ``None``
        when the URL is rejected
    """
    if url.split(":")[0] not in ("http", "https"):
        log.warning("Not a valid URL: %s", url)
        return None
    # Look at the path component only, so that a query string or fragment
    # (e.g. "episode.mp3?token=abc") does not hide the real extension.
    url_path = urlparse(url).path
    ext = posixpath.splitext(url_path)[1].lstrip(".")
    if ext.lower() not in ("mp3", "ogg", "oga", "wma", "m4a"):
        log.warning('Invalid format (%s) for "%s"', ext, url)
        return None
    if not copy:
        return url
    fname = posixpath.basename(url_path)
    # sanitize: keep only characters that are harmless in a file name
    fname = "".join(c for c in fname if c.isalnum() or c in "._-").rstrip()
    fd, tmp_path = mkstemp(
        suffix="." + ext, prefix="http-%s-" % fname, dir=destdir
    )
    os.close(fd)
    log.info("downloading %s -> %s", url, tmp_path)
    try:
        urllib.request.urlretrieve(url, tmp_path)
    except Exception:
        # Do not leave an empty or partial temporary file behind.
        os.unlink(tmp_path)
        raise
    return "file://%s" % os.path.realpath(tmp_path)
 def generate(spec):
@ -35,10 +8,10 @@ def generate(spec):
    Recognized argument is  "paths" (list of static paths)
    """
    if "urls" not in spec:
-        raise ValueError("Malformed audiospec: missing 'paths'")
+        raise ValueError("Malformed audiospec: missing 'urls'")
    for url in spec["urls"]:
-        ret = put(url, copy=True)
+        ret = download_http(url, copy=True, prefix="http")
        if ret is None:
            continue
        yield ret
--- a/larigira/audiogen_podcast.py
+++ b/larigira/audiogen_podcast.py
@ -0,0 +1,184 @@
 import datetime
 import logging
 import os
 import random
 import sys
 from subprocess import CalledProcessError, check_output
 import requests
 from larigira.fsutils import download_http
 from lxml import html
 from pytimeparse.timeparse import timeparse
 def delta_humanreadable(tdelta):
    """Render a ``timedelta`` as a short ``<days>d<hours>h`` string.

    ``None`` yields an empty string; the day part is omitted when it is
    zero. Minutes and seconds are truncated.
    """
    if tdelta is None:
        return ""
    whole_days = tdelta.days
    # ``seconds`` is already the remainder once whole days are taken out.
    whole_hours = tdelta.seconds // 3600
    if not whole_days:
        return "%dh" % whole_hours
    return "%dd%dh" % (whole_days, whole_hours)
 def get_duration(url):
    """Return the duration of an audio resource in whole seconds.

    The duration is obtained by running ``ffprobe`` on ``url``.

    :param url: location handed to ``ffprobe -i``
    :returns: duration in seconds, truncated to an integer
    :raises ValueError: if ``ffprobe`` exits with an error, prints no
        ``duration=`` line, or prints a value that is not a number
    """
    command = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-i",
        url,
    ]
    try:
        lineout = check_output(command).split(b"\n")
    except CalledProcessError as exc:
        raise ValueError("error probing `%s`" % url) from exc
    # Use a default: a bare next() would raise StopIteration, which turns
    # into a confusing RuntimeError when this runs inside a generator.
    duration = next(
        (line for line in lineout if line.startswith(b"duration=")), None
    )
    if duration is None:
        raise ValueError("no duration reported for `%s`" % url)
    value = duration.split(b"=", 1)[1]
    try:
        return int(float(value))
    except ValueError as exc:
        raise ValueError(
            "invalid duration %r for `%s`" % (value, url)
        ) from exc
 class Audio(object):
    """A single podcast episode: its URL, duration and publication date.

    :param url: address of the audio file
    :param duration: length in seconds; when ``None`` it is probed with
        ``get_duration``
    :param date: timezone-aware publication datetime, or ``None``
    """

    def __init__(self, url, duration=None, date=None):
        self.url = url
        if duration is None:
            duration = get_duration(url.encode("utf-8"))
        self.duration = duration
        self.date = date
        # Far-future default: the audio is considered valid until
        # ``end_date`` is set to something earlier.
        self.end_date = datetime.datetime(
            9999, 12, 31, tzinfo=datetime.timezone.utc
        )

    def __str__(self):
        return self.url

    def __repr__(self):
        return "<Audio {} ({} {})>".format(
            self.url, self.duration, delta_humanreadable(self.age)
        )

    @staticmethod
    def _now():
        """Return the current time as a timezone-aware UTC datetime."""
        # datetime.utcnow() is deprecated; now(tz) gives the same instant
        # already carrying the UTC tzinfo.
        return datetime.datetime.now(datetime.timezone.utc)

    @property
    def urls(self):
        """List containing the single URL of this audio."""
        return [self.url]

    @property
    def age(self):
        """Time elapsed since ``date``, or ``None`` when it is unknown."""
        if self.date is None:
            return None
        return self._now() - self.date

    @property
    def valid(self):
        """Whether ``end_date`` has not been reached yet."""
        return self.end_date >= self._now()
 def get_tree(feed_url):
    """Fetch and parse a podcast feed.

    :param feed_url: an ``http:``/``https:`` URL, or the path of a local
        file
    :returns: the parsed document, as returned by ``lxml.html``
    :raises ValueError: if ``feed_url`` is a local path that does not exist
    """
    if feed_url.startswith(("http:", "https:")):
        return html.fromstring(requests.get(feed_url).content)
    if not os.path.exists(feed_url):
        raise ValueError("file not found: {}".format(feed_url))
    # Close the file once parsed; the original left the handle open.
    with open(feed_url) as feed_file:
        return html.parse(feed_file)
 def get_item_date(el):
    """Extract the publication date of a feed item.

    :param el: element exposing ``find``; its ``pubdate`` child is read
    :returns: a timezone-aware ``datetime``, or ``None`` when the child is
        missing, empty, or in none of the supported formats
    """
    el_date = el.find("pubdate")
    if el_date is None:
        return None
    # ``text`` is None for an empty element; feeds also often pad the
    # value with whitespace or newlines.
    date_text = (el_date.text or "").strip()
    if not date_text:
        return None
    for time_format in ("%Y-%m-%dT%H:%M:%S%z", "%a, %d %b %Y %H:%M:%S %z"):
        try:
            return datetime.datetime.strptime(date_text, time_format)
        except (ValueError, TypeError):
            # Not this format: try the next one.
            continue
    return None
 def get_audio_from_item(item):
    """Build an ``Audio`` from a feed ``<item>`` element.

    The URL comes from the ``enclosure`` child. When a ``duration`` child
    holds a usable ``[[HH:]MM:]SS`` value it is passed on; otherwise
    ``Audio`` is left to determine the duration itself.

    :raises ValueError: if the item has no enclosure or the enclosure has
        no URL
    """
    encl = item.find("enclosure")
    if encl is None:
        raise ValueError("podcast item has no enclosure")
    url = encl.get("url")
    if not url:
        raise ValueError("podcast item enclosure has no url")
    audio_args = {}
    duration_text = item.findtext("duration")
    if duration_text:
        duration_parts = duration_text.strip().split(":")
        try:
            # Rightmost part is seconds, then minutes, then hours.
            total_seconds = sum(
                int(num) * (60 ** i)
                for i, num in enumerate(reversed(duration_parts))
            )
        except ValueError:
            # Empty or non-integer value (e.g. "12:34.5"): ignore it and
            # let Audio work the duration out.
            total_seconds = 0
        if total_seconds:
            audio_args["duration"] = total_seconds
    return Audio(url, **audio_args)
 def get_urls(tree):
    """Yield one ``Audio`` for every ``<item>`` found in the feed tree.

    An audio that comes back without a date gets the item's publication
    date attached before being yielded.
    """
    for entry in tree.xpath("//item"):
        episode = get_audio_from_item(entry)
        if episode.date is None:
            episode.date = get_item_date(entry)
        yield episode
 def parse_duration(arg):
    """Convert a duration string into a number of seconds.

    A string of decimal digits is read as seconds; anything else is handed
    to ``timeparse``.

    :raises ValueError: if ``timeparse`` cannot interpret the string
    """
    if arg.isdecimal():
        return int(arg)
    parsed = timeparse(arg)
    if parsed is None:
        raise ValueError("%r is not a valid duration" % arg)
    return parsed
 def generate(spec):
    """Pick episodes from a podcast feed according to ``spec``.

    Workflow: fetch the feed, filter by ``min_len``/``max_len``, sort by
    ``sort_by`` (optionally reversed), slice from ``start`` for ``howmany``
    episodes, then pass each URL to ``download_http``.

    :param spec: audiospec mapping; ``url`` is mandatory
    :returns: list of the values returned by ``download_http``, skipping
        the episodes it rejected
    :raises ValueError: if ``url`` is missing or a length filter cannot be
        parsed
    """
    if "url" not in spec:
        raise ValueError("Malformed audiospec: missing 'url'")
    audios = list(get_urls(get_tree(spec["url"])))

    # filter; each bound is parsed once, not once per episode
    if spec.get("min_len", False):
        min_len = parse_duration(spec["min_len"])
        audios = [a for a in audios if a.duration >= min_len]
    if spec.get("max_len", False):
        max_len = parse_duration(spec["max_len"])
        audios = [a for a in audios if a.duration <= max_len]

    # sort
    sort_by = spec.get("sort_by", "none")
    if sort_by == "random":
        random.shuffle(audios)
    elif sort_by == "date":
        # Most recent first (same as ascending age). Episodes without a
        # date cannot be compared, so they are kept at the end.
        dated = [a for a in audios if a.date is not None]
        undated = [a for a in audios if a.date is None]
        dated.sort(key=lambda x: x.date, reverse=True)
        audios = dated + undated
    elif sort_by == "duration":
        audios.sort(key=lambda x: x.duration)

    if spec.get("reverse", False):
        audios.reverse()

    # slice; a stored ``None`` start is treated as 0
    start = int(spec.get("start") or 0)
    howmany = int(spec.get("howmany", 1))
    audios = audios[start:][:howmany]

    # copy local
    copy = spec.get("copy", True)
    local_audios = []
    for audio in audios:
        local = download_http(audio.url, copy=copy, prefix="podcast")
        # download_http returns None for URLs it refuses; skip those, as
        # the http generator does.
        if local is not None:
            local_audios.append(local)
    return local_audios
 # TODO: testing
 # TODO: lxml should maybe be optional?
 # TODO: ui
 if __name__ == "__main__":
    # Manual smoke check, not a real test: list every episode found in
    # the feed given as first command-line argument.
    logging.basicConfig(level=logging.DEBUG)
    feed_tree = get_tree(sys.argv[1])
    for episode in get_urls(feed_tree):
        print(" -", repr(episode))
--- a/larigira/fsutils.py
+++ b/larigira/fsutils.py
@ -1,6 +1,13 @@
 import os
 import fnmatch
 import logging
 import mimetypes
 import os
 import posixpath
 import urllib.request
 from tempfile import mkstemp
 from urllib.parse import urlparse
 log = logging.getLogger(__name__)
 def scan_dir(dirname, extension=None):
@ -37,3 +44,27 @@ def shortname(path):
    name = name.rsplit(".", 1)[0]  # no extension
    name = "".join(c for c in name if c.isalnum())  # no strange chars
    return name
 def download_http(url, destdir=None, copy=False, prefix="httpdl"):
    """Validate an HTTP(S) audio URL and optionally download it.

    :param url: address of the audio file; only ``http`` and ``https``
        schemes and a fixed set of audio extensions are accepted
    :param destdir: directory for the temporary file (``None`` lets
        ``mkstemp`` pick its default)
    :param copy: when false the URL itself is returned untouched; when
        true the file is downloaded and a ``file://`` URI is returned
    :param prefix: leading part of the temporary file name
    :returns: the URL, a ``file://`` URI of the local copy, or ``None``
        when the URL is rejected
    """
    if url.split(":")[0] not in ("http", "https"):
        log.warning("Not a valid URL: %s", url)
        return None
    # Look at the path component only, so that a query string or fragment
    # (e.g. "episode.mp3?token=abc") does not hide the real extension.
    url_path = urlparse(url).path
    ext = posixpath.splitext(url_path)[1].lstrip(".")
    if ext.lower() not in ("mp3", "ogg", "oga", "wma", "m4a"):
        log.warning('Invalid format (%s) for "%s"', ext, url)
        return None
    if not copy:
        return url
    fname = posixpath.basename(url_path)
    # sanitize: keep only characters that are harmless in a file name
    fname = "".join(c for c in fname if c.isalnum() or c in "._-").rstrip()
    fd, tmp_path = mkstemp(
        suffix="." + ext, prefix="%s-%s-" % (prefix, fname), dir=destdir
    )
    os.close(fd)
    log.info("downloading %s -> %s", url, tmp_path)
    try:
        urllib.request.urlretrieve(url, tmp_path)
    except Exception:
        # Do not leave an empty or partial temporary file behind.
        os.unlink(tmp_path)
        raise
    return "file://%s" % os.path.realpath(tmp_path)
--- a/setup.py
+++ b/setup.py
@ -73,6 +73,7 @@ setup(
            "mpd = larigira.audiogen_mpdrandom:generate_by_artist",
            "static = larigira.audiogen_static:generate",
            "http = larigira.audiogen_http:generate",
            "podcast = larigira.audiogen_podcast:generate",
            "randomdir = larigira.audiogen_randomdir:generate",
            "mostrecent = larigira.audiogen_mostrecent:generate",
            "script = larigira.audiogen_script:generate",
@ -95,6 +96,7 @@ setup(
        "larigira.audioform_create": [
            "static = larigira.audioform_static:StaticAudioForm",
            "http = larigira.audioform_http:AudioForm",
            "podcast = larigira.audioform_podcast:AudioForm",
            "script = larigira.audioform_script:ScriptAudioForm",
            "randomdir = larigira.audioform_randomdir:Form",
            "mostrecent = larigira.audioform_mostrecent:AudioForm",
@ -102,6 +104,7 @@ setup(
        "larigira.audioform_receive": [
            "static = larigira.audioform_static:staticaudio_receive",
            "http = larigira.audioform_http:audio_receive",
            "podcast = larigira.audioform_podcast:audio_receive",
            "script = larigira.audioform_script:scriptaudio_receive",
            "randomdir = larigira.audioform_randomdir:receive",
            "mostrecent = larigira.audioform_mostrecent:audio_receive",