Merge branch 'podcast'

This commit is contained in:
boyska 2020-06-21 13:00:49 +02:00
commit 88ff77b968
6 changed files with 327 additions and 31 deletions

View file

@ -33,6 +33,48 @@ mostrecent
It is similar to randomdir, but instead of picking randomly, picks the most
recent file (according to the ctime).
podcast
------------
This is probably the most powerful generator that comes included with
``larigira``. To use this generator, you would need to have a valid podcast
URL. Beware, here the word *podcast* refers to its very specific meaning of
an XML-based format which resembles an RSS feed but has more media-specific
entities. See `this specification
<https://github.com/simplepie/simplepie-ng/wiki/Spec:-iTunes-Podcast-RSS>`_ for
more technical details.
So, if you have a valid podcast URL, larigira can look at it, extract audios,
and play the most recent one. Here are some typical use cases for this:
* You want to play reruns based on what you host on your radio's website.
* You want to play some audio from some other radio (or other kind of podcast
source)
The podcast form has many many options, but I promise you that 90% of the cases
are easily solved using ONLY the first option: enter the URL of the podcast
and... it works!
So, what are all the other options for? Well, to cover some other use cases.
For example, let's say that at night you want to play a *random* show (not the
last one, which is the default) that happened on your radio. Then you can
change the "sort by" to be "random". Easy, right?
Another typical use case is selecting an audio that has a duration which "fits"
with the schedule of your radio: not too long and not too short. You can do
that with the "min len" and "max len" fields. For example, setting a `min_len`
of `30min` and `max_len` of `1h15m` you can avoid picking flash news (too
short) and very long shows.
You can do many other things with its options, but I leave those to your
imagination. Let's just clarify the workflow:
* the podcast URL is fetched and audio information is retrieved
* filter: audios are filtered by min/max length
* sort: audios are sorted according to `sort_by` and `reverse`
* select: the n-th episode is fetched, according to the `start` field
script
--------

View file

@ -0,0 +1,63 @@
from flask_wtf import Form
from wtforms import (BooleanField, IntegerField, SelectField, StringField,
SubmitField, validators)
from wtforms.fields.html5 import URLField
class AudioForm(Form):
    """Web form used to create or edit a ``podcast`` audio source.

    The field names match the keys stored in the audiospec: ``nick``,
    ``url``, ``min_len``, ``max_len``, ``sort_by``, ``start`` and
    ``reverse``.
    """

    nick = StringField(
        "Audio nick",
        # DataRequired is the current name of the deprecated ``required``
        # alias; the validation performed is the same.
        validators=[validators.DataRequired()],
        description="A simple name to recognize this audio",
    )
    url = URLField(
        "URL",
        validators=[validators.DataRequired()],
        description="URL of the podcast; it must be valid xml",
    )

    # TODO: group by filters/sort/select
    min_len = StringField(
        "Accetta solo audio lunghi almeno:",
        description="Leaving this empty will disable this filter",
    )
    max_len = StringField(
        "Accetta solo audio lunghi al massimo:",
        description="Leaving this empty will disable this filter",
    )
    sort_by = SelectField(
        "Sort episodes",
        choices=[
            ("none", "Don't sort"),
            ("random", "Random"),
            ("duration", "Duration"),
            ("date", "date"),
        ],
    )
    start = IntegerField(
        "Play from episode number",
        # The description promises 0 as the default, so actually provide it.
        default=0,
        description="Episodes count from 0; 0 is a sane default",
    )
    reverse = BooleanField("Reverse sort (descending)")
    submit = SubmitField("Submit")

    def populate_from_audiospec(self, audiospec):
        """Pre-fill the form from an existing audiospec mapping.

        Keys missing from ``audiospec`` leave the matching field untouched.
        ``start`` is always set and falls back to 0 when it is missing or
        empty.
        """
        for key in ("nick", "url", "sort_by", "reverse", "min_len", "max_len"):
            if key in audiospec:
                getattr(self, key).data = audiospec[key]
        # A spec saved from an empty "start" input holds None, and int(None)
        # would raise TypeError; treat any empty value as episode 0.
        self.start.data = int(audiospec.get("start") or 0)
def audio_receive(form):
    """Turn a submitted podcast form into an audiospec dictionary.

    The result always carries ``"kind": "podcast"`` followed by the ``data``
    of each form field that belongs to the spec.
    """
    field_names = (
        "nick",
        "url",
        "sort_by",
        "reverse",
        "min_len",
        "max_len",
        "start",
    )
    spec = {"kind": "podcast"}
    spec.update((name, getattr(form, name).data) for name in field_names)
    return spec

View file

@ -1,31 +1,4 @@
import os
import logging
import posixpath
from tempfile import mkstemp
import urllib.request
from urllib.parse import urlparse
log = logging.getLogger(__name__)
def put(url, destdir=None, copy=False):
    """Validate an HTTP(S) audio URL and optionally download it.

    :param url: address of the audio file; only ``http``/``https`` URLs
        whose path ends in a known audio extension are accepted
    :param destdir: directory for the downloaded copy (``mkstemp`` default
        when ``None``)
    :param copy: when false the URL itself is returned and nothing is
        downloaded
    :returns: ``None`` if the URL is rejected, the unchanged URL if ``copy``
        is false, otherwise a ``file://`` URI pointing at the local copy
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        log.warning("Not a valid URL: %s", url)
        return None
    if parsed.scheme not in ("http", "https"):
        log.warning("Not a valid URL: %s", url)
        return None

    # Read the extension from the path component only, so that a query
    # string or fragment ("ep.mp3?token=x") does not hide it.
    ext = posixpath.splitext(parsed.path)[1].lstrip(".")
    if ext.lower() not in ("mp3", "ogg", "oga", "wma", "m4a"):
        log.warning('Invalid format (%s) for "%s"', ext, url)
        return None
    if not copy:
        return url

    fname = posixpath.basename(parsed.path)
    # sanitize: keep only characters that are safe in a file name
    fname = "".join(c for c in fname if c.isalnum() or c in "._-").rstrip()
    fd, local_path = mkstemp(
        suffix="." + ext, prefix="http-%s-" % fname, dir=destdir
    )
    # mkstemp hands back an open descriptor; urlretrieve reopens by path.
    os.close(fd)
    log.info("downloading %s -> %s", url, local_path)
    try:
        urllib.request.urlretrieve(url, local_path)
    except Exception:
        # Do not leave an empty or partial temp file behind on failure.
        os.unlink(local_path)
        raise
    return "file://%s" % os.path.realpath(local_path)
from larigira.fsutils import download_http
def generate(spec):
@ -35,10 +8,10 @@ def generate(spec):
Recognized argument is "paths" (list of static paths)
"""
if "urls" not in spec:
raise ValueError("Malformed audiospec: missing 'paths'")
raise ValueError("Malformed audiospec: missing 'urls'")
for url in spec["urls"]:
ret = put(url, copy=True)
ret = download_http(url, copy=True, prefix="http")
if ret is None:
continue
yield ret

View file

@ -0,0 +1,184 @@
import datetime
import logging
import os
import random
import sys
from subprocess import CalledProcessError, check_output
import requests
from larigira.fsutils import download_http
from lxml import html
from pytimeparse.timeparse import timeparse
def delta_humanreadable(tdelta):
    """Format a timedelta as ``<days>d<hours>h``, or ``<hours>h`` if under a day.

    ``None`` is rendered as an empty string.
    """
    if tdelta is None:
        return ""
    # timedelta stores whole days separately from the leftover seconds, so
    # the hours within the last partial day come straight from ``seconds``.
    leftover_hours = tdelta.seconds // 3600
    if not tdelta.days:
        return "{}h".format(leftover_hours)
    return "{}d{}h".format(tdelta.days, leftover_hours)
def get_duration(url):
    """Return the duration of an audio resource, truncated to whole seconds.

    Runs ``ffprobe`` on ``url`` and reads the ``duration=`` line it prints.

    :raises ValueError: if ``ffprobe`` exits with an error, prints no
        ``duration=`` line, or prints a value that is not a number
    """
    try:
        output = check_output(
            [
                "ffprobe",
                "-v",
                "error",
                "-show_entries",
                "format=duration",
                "-i",
                url,
            ]
        )
    except CalledProcessError as exc:
        raise ValueError("error probing `%s`" % url) from exc

    # next() gets a default: a bare StopIteration would escape from here and
    # be turned into RuntimeError when this runs inside a generator.
    duration = next(
        (
            line
            for line in output.split(b"\n")
            if line.startswith(b"duration=")
        ),
        None,
    )
    if duration is None:
        raise ValueError("no duration reported probing `%s`" % url)
    value = duration.split(b"=")[1]
    try:
        return int(float(value))
    except ValueError as exc:
        raise ValueError(
            "invalid duration %r probing `%s`" % (value, url)
        ) from exc
class Audio(object):
    """One playable episode: its URL, its duration and, if known, its date."""

    def __init__(self, url, duration=None, date=None):
        """Store the episode data, probing the duration when none is given."""
        self.url = url
        # Only contact the file when the caller could not supply a duration.
        self.duration = (
            get_duration(url.encode("utf-8")) if duration is None else duration
        )
        self.date = date
        self.end_date = datetime.datetime(
            9999, 12, 31, tzinfo=datetime.timezone.utc
        )

    def __str__(self):
        return self.url

    def __repr__(self):
        age_text = delta_humanreadable(self.age)
        return "<Audio {} ({} {})>".format(self.url, self.duration, age_text)

    @property
    def urls(self):
        """The episode URL wrapped in a one-element list."""
        return [self.url]

    @property
    def age(self):
        """Time elapsed since ``date`` (UTC-aware), or ``None`` if undated."""
        if self.date is None:
            return None
        return datetime.datetime.now(datetime.timezone.utc) - self.date

    @property
    def valid(self):
        """Whether ``end_date`` has not been passed yet."""
        return datetime.datetime.now(datetime.timezone.utc) <= self.end_date
def get_tree(feed_url, timeout=30):
    """Fetch and parse a podcast feed with lxml's HTML parser.

    :param feed_url: an ``http:``/``https:`` URL, or the path of a local file
    :param timeout: seconds to wait for the HTTP server before giving up;
        without it a stalled server would block the caller indefinitely
    :returns: the parsed lxml document
    :raises ValueError: if ``feed_url`` is a local path that does not exist
    """
    if feed_url.startswith(("http:", "https:")):
        response = requests.get(feed_url, timeout=timeout)
        return html.fromstring(response.content)

    if not os.path.exists(feed_url):
        raise ValueError("file not found: {}".format(feed_url))
    # Binary mode lets the parser detect the encoding itself; the context
    # manager closes the handle that used to be leaked.
    with open(feed_url, "rb") as feed:
        return html.parse(feed)
def get_item_date(el):
    """Return the publication date of a feed item, or ``None``.

    Looks for a ``pubdate`` child of ``el`` and tries an ISO-like format
    first, then the RFC 822 style format; both require a numeric UTC
    offset. ``None`` is returned when the tag is missing, empty, or in
    neither format.
    """
    el_date = el.find("pubdate")
    if el_date is None or el_date.text is None:
        return None
    # Feeds often wrap the value in whitespace, which strptime rejects.
    raw_date = el_date.text.strip()
    for time_format in ("%Y-%m-%dT%H:%M:%S%z", "%a, %d %b %Y %H:%M:%S %z"):
        try:
            return datetime.datetime.strptime(raw_date, time_format)
        except ValueError:
            # Not this format; try the next one. Catching only ValueError
            # keeps unrelated errors (and KeyboardInterrupt) visible.
            continue
    return None
def get_audio_from_item(item):
    """Build an ``Audio`` from a feed ``item`` element.

    The URL comes from the ``url`` attribute of the item's ``enclosure``.
    A ``duration`` tag written as colon-separated integers (``SS``,
    ``MM:SS`` or ``HH:MM:SS``) is converted to seconds and passed along;
    when it is absent, zero or unparseable, no duration is passed and
    ``Audio`` determines it on its own.

    :raises ValueError: if the item has no enclosure URL
    """
    encl = item.find("enclosure")
    url = encl.get("url") if encl is not None else None
    if not url:
        raise ValueError("feed item has no enclosure URL")

    audio_args = {}
    duration_text = item.findtext("duration")
    if duration_text is not None:
        duration_parts = duration_text.strip().split(":")
        try:
            # The rightmost part is seconds; each step left is worth 60x more.
            total_seconds = sum(
                int(num) * (60 ** i)
                for i, num in enumerate(reversed(duration_parts))
            )
        except ValueError:
            # Malformed tag (empty, fractional, text...): ignore it rather
            # than failing the whole item.
            total_seconds = 0
        if total_seconds:
            audio_args["duration"] = total_seconds
    return Audio(url, **audio_args)
def get_urls(tree):
    """Yield one ``Audio`` per feed item that carries an enclosure URL.

    Items without an ``enclosure`` (or whose enclosure has no ``url``) are
    skipped, so a single text-only entry does not abort the whole feed.
    Each yielded audio gets its date from the item when it has none yet.
    """
    for item in tree.xpath("//item"):
        encl = item.find("enclosure")
        if encl is None or not encl.get("url"):
            # Nothing playable in this entry.
            continue
        audio = get_audio_from_item(item)
        if audio.date is None:
            audio.date = get_item_date(item)
        yield audio
def parse_duration(arg):
    """Convert a duration string to a number of seconds.

    A string made only of decimal digits is taken as seconds; anything else
    is handed to ``timeparse``.

    :raises ValueError: if ``timeparse`` cannot interpret ``arg``
    """
    if arg.isdecimal():
        return int(arg)
    parsed = timeparse(arg)
    if parsed is None:
        raise ValueError("%r is not a valid duration" % arg)
    return parsed
def _age_sort_key(audio):
    """Sort key by age that places undated audios after all dated ones."""
    age = audio.age
    if age is None:
        # None cannot be compared with a timedelta; rank it last instead.
        return (True, datetime.timedelta())
    return (False, age)


def generate(spec):
    """Pick episodes from a podcast feed according to an audiospec.

    Recognized keys of ``spec``:

    * ``url`` (required): the feed to read
    * ``min_len`` / ``max_len``: duration bounds; empty disables the filter
    * ``sort_by``: ``"none"`` (default), ``"random"``, ``"date"``
      (youngest first) or ``"duration"`` (shortest first)
    * ``reverse``: reverse the order obtained so far
    * ``start``: index of the first episode to take (default 0)
    * ``howmany``: number of episodes to take (default 1)
    * ``copy``: passed to ``download_http`` (default ``True``)

    :returns: the list of values returned by ``download_http`` for the
        selected episodes, leaving out the ones it rejected
    :raises ValueError: if ``url`` is missing from ``spec``
    """
    if "url" not in spec:
        raise ValueError("Malformed audiospec: missing 'url'")
    audios = list(get_urls(get_tree(spec["url"])))

    # filter; each bound is parsed once instead of once per episode
    if spec.get("min_len", False):
        min_len = parse_duration(spec["min_len"])
        audios = [a for a in audios if a.duration >= min_len]
    if spec.get("max_len", False):
        max_len = parse_duration(spec["max_len"])
        audios = [a for a in audios if a.duration <= max_len]

    # sort
    sort_by = spec.get("sort_by", "none")
    if sort_by == "random":
        random.shuffle(audios)
    elif sort_by == "date":
        audios.sort(key=_age_sort_key)
    elif sort_by == "duration":
        audios.sort(key=lambda x: x.duration)

    if spec.get("reverse", False):
        audios.reverse()

    # slice; a form field left empty is stored as None (or ""), which int()
    # cannot convert, so fall back to the defaults in that case
    start = spec.get("start")
    start = 0 if start in (None, "") else int(start)
    howmany = spec.get("howmany")
    howmany = 1 if howmany in (None, "") else int(howmany)
    audios = audios[start:]
    audios = audios[:howmany]

    # copy local
    copy = spec.get("copy", True)
    local_audios = []
    for audio in audios:
        local = download_http(audio.url, copy=copy, prefix="podcast")
        # download_http returns None for URLs it refuses; do not hand those
        # to the caller.
        if local is not None:
            local_audios.append(local)
    return local_audios
# TODO: testing
# TODO: lxml should maybe be optional?
# TODO: ui
if __name__ == "__main__":
    # Manual smoke check, not a real test: list every episode found in the
    # feed given as first command-line argument.
    logging.basicConfig(level=logging.DEBUG)
    feed_tree = get_tree(sys.argv[1])
    for episode in get_urls(feed_tree):
        print(" -", repr(episode))

View file

@ -1,6 +1,13 @@
import os
import fnmatch
import logging
import mimetypes
import os
import posixpath
import urllib.request
from tempfile import mkstemp
from urllib.parse import urlparse
log = logging.getLogger(__name__)
def scan_dir(dirname, extension=None):
@ -37,3 +44,27 @@ def shortname(path):
name = name.rsplit(".", 1)[0] # no extension
name = "".join(c for c in name if c.isalnum()) # no strange chars
return name
def download_http(url, destdir=None, copy=False, prefix="httpdl"):
    """Validate an HTTP(S) audio URL and optionally download it.

    :param url: address of the audio file; only ``http``/``https`` URLs
        whose path ends in a known audio extension are accepted
    :param destdir: directory for the downloaded copy (``mkstemp`` default
        when ``None``)
    :param copy: when false the URL itself is returned and nothing is
        downloaded
    :param prefix: first part of the temporary file name
    :returns: ``None`` if the URL is rejected, the unchanged URL if ``copy``
        is false, otherwise a ``file://`` URI pointing at the local copy
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        log.warning("Not a valid URL: %s", url)
        return None
    if parsed.scheme not in ("http", "https"):
        log.warning("Not a valid URL: %s", url)
        return None

    # Read the extension from the path component only, so that a query
    # string or fragment ("ep.mp3?token=x") does not hide it.
    ext = posixpath.splitext(parsed.path)[1].lstrip(".")
    if ext.lower() not in ("mp3", "ogg", "oga", "wma", "m4a"):
        log.warning('Invalid format (%s) for "%s"', ext, url)
        return None
    if not copy:
        return url

    fname = posixpath.basename(parsed.path)
    # sanitize: keep only characters that are safe in a file name
    fname = "".join(c for c in fname if c.isalnum() or c in "._-").rstrip()
    fd, local_path = mkstemp(
        suffix="." + ext, prefix="%s-%s-" % (prefix, fname), dir=destdir
    )
    # mkstemp hands back an open descriptor; urlretrieve reopens by path.
    os.close(fd)
    log.info("downloading %s -> %s", url, local_path)
    try:
        urllib.request.urlretrieve(url, local_path)
    except Exception:
        # Do not leave an empty or partial temp file behind on failure.
        os.unlink(local_path)
        raise
    return "file://%s" % os.path.realpath(local_path)

View file

@ -73,6 +73,7 @@ setup(
"mpd = larigira.audiogen_mpdrandom:generate_by_artist",
"static = larigira.audiogen_static:generate",
"http = larigira.audiogen_http:generate",
"podcast = larigira.audiogen_podcast:generate",
"randomdir = larigira.audiogen_randomdir:generate",
"mostrecent = larigira.audiogen_mostrecent:generate",
"script = larigira.audiogen_script:generate",
@ -95,6 +96,7 @@ setup(
"larigira.audioform_create": [
"static = larigira.audioform_static:StaticAudioForm",
"http = larigira.audioform_http:AudioForm",
"podcast = larigira.audioform_podcast:AudioForm",
"script = larigira.audioform_script:ScriptAudioForm",
"randomdir = larigira.audioform_randomdir:Form",
"mostrecent = larigira.audioform_mostrecent:AudioForm",
@ -102,6 +104,7 @@ setup(
"larigira.audioform_receive": [
"static = larigira.audioform_static:staticaudio_receive",
"http = larigira.audioform_http:audio_receive",
"podcast = larigira.audioform_podcast:audio_receive",
"script = larigira.audioform_script:scriptaudio_receive",
"randomdir = larigira.audioform_randomdir:receive",
"mostrecent = larigira.audioform_mostrecent:audio_receive",