larigira-scripts/feed
2020-11-16 15:11:42 +01:00

573 lines
16 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Feed parser with many features
from a feed, it supports filtering, subslicing, random picking
Beside feeds, it supports picking files from directories
"""
import datetime
import logging
import os
import posixpath
import random
import re
import urllib.request
from argparse import ArgumentParser, ArgumentTypeError
from bisect import bisect
from collections import OrderedDict
from subprocess import CalledProcessError, check_output
from urllib.parse import unquote, urlparse
import requests
from lxml import html
from pytimeparse.timeparse import timeparse
def get_int(s):
    """Return the first run of decimal digits found in ``s`` as an ``int``.

    Raises ``IndexError`` when ``s`` contains no digit at all.
    """
    digit_runs = re.findall(r"\d+", s)
    return int(digit_runs[0])
def DurationType(arg):
    """argparse ``type=`` callable turning a duration string into seconds.

    A purely decimal string is read as a number of seconds; anything else
    is handed to ``timeparse``.

    Raises ``ArgumentTypeError`` when ``timeparse`` returns ``None``.
    """
    secs = int(arg) if arg.isdecimal() else timeparse(arg)
    if secs is None:
        raise ArgumentTypeError("%r is not a valid duration" % arg)
    return secs
def TimeDeltaType(arg):
    """argparse ``type=`` callable turning a time-range string into a timedelta.

    A purely decimal string is read as a number of seconds; anything else
    is handed to ``timeparse``.

    Raises ``ArgumentTypeError`` when ``timeparse`` returns ``None``.
    """
    secs = int(arg) if arg.isdecimal() else timeparse(arg)
    if secs is None:
        raise ArgumentTypeError("%r is not a valid time range" % arg)
    return datetime.timedelta(seconds=secs)
def weighted_choice(values, weights):
    """
    random.choice with weights

    weights must be non-negative integers, at least one of them greater
    than 0.
    Their meaning is "relative", that is [1,2,3] is the same as [2,4,6]

    Raises ValueError if ``values`` and ``weights`` differ in length, if a
    weight is negative, or if the weights sum to zero (this includes empty
    input).
    """
    # explicit checks instead of ``assert``: asserts disappear under ``-O``
    if len(values) != len(weights):
        raise ValueError("values and weights must have the same length")
    total = 0
    cum_weights = []
    for w in weights:
        if w < 0:
            raise ValueError("weights must not be negative")
        total += w
        cum_weights.append(total)
    if total <= 0:
        # without this, bisect would return len(values) and index out of range
        raise ValueError("at least one weight must be greater than 0")
    x = random.random() * total
    # right-bisect skips entries whose weight is 0; the clamp protects against
    # a float rounding of ``x`` up to ``total``
    i = min(bisect(cum_weights, x), len(values) - 1)
    return values[i]
def delta_humanreadable(tdelta):
    """Format a ``timedelta`` compactly as ``<days>d<hours>h`` or ``<hours>h``.

    ``None`` gives an empty string; minutes and seconds are dropped.
    """
    if tdelta is None:
        return ""
    whole_days = tdelta.days
    # a timedelta keeps its sub-day remainder in ``.seconds``
    whole_hours = tdelta.seconds // 3600
    if not whole_days:
        return "{}h".format(whole_hours)
    return "{}d{}h".format(whole_days, whole_hours)
def duration_humanreadable(seconds):
    """Format a number of seconds as ``<h>h<m>m<s>s``, or ``<m>m<s>s`` under one hour."""
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    if hours > 0:
        return "{}h{}m{}s".format(hours, minutes, secs)
    return "{}m{}s".format(minutes, secs)
class Audio:
    """A single audio resource, identified by its URL.

    Attributes:
        url: location of the audio.
        duration: length in seconds; probed with ``get_duration`` when the
            caller does not supply it.
        date: publication date, or ``None`` when unknown.
        end_date: moment after which the audio is no longer ``valid``;
            defaults to the end of year 9999 (UTC).
    """

    def __init__(self, url, duration=None, date=None):
        self.url = url
        # only probe the resource when no duration was handed in
        self.duration = (
            get_duration(url.encode("utf-8")) if duration is None else duration
        )
        self.date = date
        self.end_date = datetime.datetime(9999, 12, 31, tzinfo=datetime.timezone.utc)

    def __str__(self):
        return self.url

    def __repr__(self):
        return "<Audio {} ({} {})>".format(
            self.url,
            duration_humanreadable(self.duration),
            delta_humanreadable(self.age),
        )

    @property
    def urls(self):
        """The URL wrapped in a one-element list (same shape as ``AudioGroup.urls``)."""
        return [self.url]

    @property
    def age(self):
        """Time elapsed since ``date``, or ``None`` when ``date`` is unknown."""
        if self.date is None:
            return None
        return datetime.datetime.now(datetime.timezone.utc) - self.date

    @property
    def valid(self):
        """``True`` as long as ``end_date`` has not passed."""
        current = datetime.datetime.now(datetime.timezone.utc)
        return self.end_date >= current
class AudioGroup(list):
    """A bunch of audios that belong together (e.g. to the same article).

    The class derives from ``list`` but its members are stored in
    ``self.audios``; ``__len__``, ``__iter__`` and ``append`` are redirected
    there so that length and iteration describe the same content.

    Attributes:
        description: free text label of the group ("" when not given).
        audios: the member audios, in insertion order.
    """

    def __init__(self, description=None):
        super().__init__()
        self.description = description or ""
        self.audios = []

    def __len__(self):
        return len(self.audios)

    def __iter__(self):
        # keep iteration consistent with __len__ (the inherited list is empty)
        return iter(self.audios)

    def append(self, arg):
        self.audios.append(arg)

    def __str__(self):
        return "\n".join(str(a) for a in self.audios)

    def __repr__(self):
        return '<AudioGroup "{}" ({} {})\n{} >'.format(
            self.description,
            duration_humanreadable(self.duration),
            delta_humanreadable(self.age),
            "\n".join("   " + repr(a) for a in self.audios),
        )

    @property
    def duration(self):
        """Sum of the members' durations; members without duration are skipped."""
        return sum(a.duration for a in self.audios if a.duration is not None)

    @property
    def urls(self):
        """URLs of all members, in order."""
        return [a.url for a in self.audios]

    @property
    def date(self):
        """``date`` of the first member carrying such an attribute, else ``None``."""
        for a in self.audios:
            if hasattr(a, "date"):
                return a.date
        return None

    @property
    def age(self):
        """Time elapsed since ``date``, or ``None`` when ``date`` is unknown."""
        if self.date is None:
            return None
        return datetime.datetime.now(datetime.timezone.utc) - self.date

    @property
    def valid(self):
        """A group is valid when it has at least one member."""
        return len(self.audios) > 0
def get_tree(feed_url):
    """Load a feed and return it parsed with ``lxml.html``.

    ``feed_url`` is either an ``http:``/``https:`` URL, fetched with
    ``requests``, or the path of a local file.

    Raises ``ValueError`` when a local path does not exist.
    """
    if feed_url.startswith(("http:", "https:")):
        return html.fromstring(requests.get(feed_url).content)
    if not os.path.exists(feed_url):
        raise ValueError("file not found: {}".format(feed_url))
    # parsing is finished when html.parse returns, so the handle can be
    # released right away instead of being left to the garbage collector
    with open(feed_url) as feed_file:
        return html.parse(feed_file)
def get_audio_from_description(text):
    """Build an ``Audio`` out of the text of a feed item description.

    The first non-empty line is the (percent-encoded) URL. Every following
    ``key=value`` line is metadata; keys and values are stripped of
    surrounding whitespace. Keys that are acted upon:

    * ``durata``: the first integer in the value becomes the duration;
      without it ``Audio`` probes the URL itself.
    * ``txdate``: date in ``%Y-%m-%dT%H:%M:%S%z`` form; dropped with a
      warning when it cannot be parsed.
    * ``replica``: when the value ends with ``g``, its first integer is the
      number of days after ``txdate`` at which the audio expires.

    Raises ``ValueError`` when ``text`` has no non-empty line.
    """
    # non-empty lines
    lines = [line.strip() for line in (text or "").split("\n") if line.strip()]
    if not lines:
        raise ValueError("empty description")
    url = lines[0]
    metadata = {}
    # walk the lines *after the URL*; the raw text may start with blank lines
    for line in lines[1:]:
        key, sep, value = line.partition("=")
        if sep:
            # partition keeps any further "=" inside the value
            metadata[key.strip()] = value.strip()
    if "durata" in metadata:
        metadata["durata"] = get_int(metadata["durata"])
    if "txdate" in metadata:
        try:
            metadata["txdate"] = datetime.datetime.strptime(
                metadata["txdate"], "%Y-%m-%dT%H:%M:%S%z"
            )
        except ValueError:
            logging.warning("could not parse txdate %s", metadata["txdate"])
            del metadata["txdate"]
    a = Audio(
        unquote(url),
        duration=metadata.get("durata", None),
        date=metadata.get("txdate", None),
    )
    if "txdate" in metadata and "replica" in metadata:
        if metadata["replica"].endswith("g"):
            a.end_date = metadata["txdate"] + datetime.timedelta(
                days=get_int(metadata["replica"])
            )
    return a
# copied from larigira.fsutils
def scan_dir_audio(dirname, extensions=("mp3", "oga", "wav", "ogg")):
    """Walk ``dirname`` recursively and yield the path of every audio file.

    A file qualifies when the text after its last dot (the whole name if
    there is no dot), lowercased, is one of ``extensions``.
    """
    for folder, _subdirs, names in os.walk(dirname):
        for name in names:
            suffix = name.rpartition(".")[2].lower()
            if suffix in extensions:
                yield os.path.join(folder, name)
def get_audio_from_dir(dirpath):
    """Return one ``Audio`` per audio file found (recursively) in ``dirpath``.

    Each audio gets a ``file://`` URL built from the real path of the file
    and the file's modification time as its date.
    """
    return [
        Audio(
            "file://" + os.path.realpath(fpath),
            # convert the timestamp straight to UTC: a naive local time that is
            # merely relabelled as UTC is off by the local UTC offset
            date=datetime.datetime.fromtimestamp(
                os.path.getmtime(fpath), tz=datetime.timezone.utc
            ),
        )
        for fpath in scan_dir_audio(dirpath)
    ]
def get_item_date(el):
    """Return the publication date of a feed item, or ``None``.

    The date is read from the ``pubdate`` child of ``el`` and must follow
    the ``%Y-%m-%dT%H:%M:%S%z`` format. ``None`` is returned when the child
    is missing, or (after a warning) when its text is empty or unparsable.
    """
    # NOTE(review): lowercase tag name, presumably because the tree comes
    # from an HTML parser - confirm
    el_date = el.find("pubdate")
    if el_date is None:
        return None
    try:
        return datetime.datetime.strptime(el_date.text, "%Y-%m-%dT%H:%M:%S%z")
    except (TypeError, ValueError):
        # TypeError: element without text; ValueError: unexpected format.
        # One bad item should not abort the whole run.
        logging.warning("could not parse pubdate %s", el_date.text)
        return None
def get_urls(tree):
    """Yield an ``Audio`` for every ``item`` of ``tree`` that has a description.

    Items whose description cannot be turned into an ``Audio`` are skipped
    (an info message is logged). When the description carries no date, the
    date of the item itself is used.
    """
    for item in tree.xpath("//item"):
        title = item.find("title").text
        body = item.find("description")
        if body is None:
            continue
        try:
            audio = get_audio_from_description(body.text)
        except Exception:
            logging.info("error getting duration for `%s`", title)
            continue
        if audio.date is None:
            audio.date = get_item_date(item)
        yield audio
def get_grouped_urls(tree):
    """Collect the audios of ``tree`` into ``AudioGroup`` objects keyed by guid.

    Returns an ``OrderedDict`` mapping each (stripped) guid, in order of
    first appearance, to its group. A group is created for every guid, but
    only valid audios are added to it.
    """
    by_guid = OrderedDict()
    for entry in tree.xpath("//item"):
        key = entry.xpath("guid")[0].text.strip()
        if key not in by_guid:
            by_guid[key] = AudioGroup(key)
        description = entry.xpath("description")[0].text
        track = get_audio_from_description(description)
        # the item date always overrides whatever the description provided
        track.date = get_item_date(entry)
        if not track.valid:
            continue
        by_guid[key].append(track)
    return by_guid
def get_duration(url):
    """Return the duration of ``url`` in whole seconds, as told by ``ffprobe``.

    ``url`` is passed to ``ffprobe -i`` as is. The fractional part of the
    reported duration is truncated.

    Raises ``ValueError`` when ffprobe exits with an error, prints no
    ``duration=`` line, or prints a value that is not a number.
    """
    try:
        output = check_output(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-i", url]
        )
    except CalledProcessError as exc:
        raise ValueError("error probing `%s`" % url) from exc
    for line in output.split(b"\n"):
        if line.startswith(b"duration="):
            value = line.split(b"=")[1]
            return int(float(value))
    # an explicit error instead of the StopIteration a bare next() would leak
    raise ValueError("no duration reported for `%s`" % url)
HELP = """
Collect audio informations from multiple sources (XML feeds).
Audios are (in that order):
1. Collected from feeds; (grouped by article if --group is used)
2. Filtered; everything that does not match with requirements is excluded
3. Sorted; even randomly
4. Sliced; take HOWMANY elements, skipping START elements
5. (if --copy) Copied
Usage: """
def get_parser():
    """Create the ``ArgumentParser`` describing the command line of this script.

    Options are arranged in the groups "sources", "filters", "fill" and
    "general"; ``--start``, ``--howmany``, ``--slotsize`` and the positional
    ``URL`` arguments are attached to the parser itself.
    """

    def flag(target, name, text):
        # boolean switch: False unless given on the command line
        target.add_argument(name, default=False, action="store_true", help=text)

    parser = ArgumentParser(HELP)

    sources = parser.add_argument_group("sources", "How to deal with sources")
    sources.add_argument(
        "--source-weights",
        help='Select only one "source" based on this weights',
    )
    flag(sources, "--group", "Group audios that belong to the same article")

    filters = parser.add_argument_group(
        "filters", "Select only items that match these conditions"
    )
    filters.add_argument(
        "--min-len",
        type=DurationType,
        default=0,
        help="Exclude any audio that is shorter than MIN_LEN seconds",
    )
    filters.add_argument(
        "--max-len",
        type=DurationType,
        default=0,
        help="Exclude any audio that is longer than MAX_LEN seconds",
    )
    filters.add_argument(
        "--sort-by",
        type=str,
        default="no",
        choices=("random", "date", "duration"),
    )
    flag(filters, "--reverse", "Reverse list order")
    filters.add_argument(
        "--min-age",
        type=TimeDeltaType,
        default=datetime.timedelta(),
        help="Exclude audio more recent than MIN_AGE",
    )
    filters.add_argument(
        "--max-age",
        type=TimeDeltaType,
        default=datetime.timedelta(),
        help="Exclude audio older than MAX_AGE",
    )

    fill = parser.add_argument_group(
        "fill", "Fill a 'block' with as many contents as possible"
    )
    fill.add_argument(
        "--fill",
        type=DurationType,
        default=0,
        metavar="LEN",
        help="Fill a block of duration LEN",
    )
    flag(fill, "--fill-reverse", "Reverse list order after the fill algorithm")
    fill.add_argument(
        "--fill-interleave-dir",
        type=str,  # FIXME: does it even work?
        default=None,
        help="Between each item, put a random file from DIR",
    )

    parser.add_argument(
        "--start",
        type=int,
        default=0,
        help="0-indexed start number. By default, play from most recent",
    )
    parser.add_argument(
        "--howmany",
        type=int,
        default=1,
        help="If not specified, only 1 will be played",
    )
    parser.add_argument(
        "--slotsize",
        type=int,
        help="Seconds between each audio. Still unsupported",
    )

    general = parser.add_argument_group("general", "General options")
    flag(general, "--copy", "Copy files to $TMPDIR")
    flag(general, "--debug", "Debug messages")

    parser.add_argument("urls", metavar="URL", nargs="+")
    return parser
def downloader(url, dest):
    """Download ``url`` into the file ``dest``, resuming a partial download.

    When ``dest`` already exists, only the missing tail is requested through
    a ``Range`` header. A 416 answer (range not satisfiable) makes the
    function return without touching ``dest``.

    Raises ``requests.HTTPError`` for any other 4xx/5xx answer, so that an
    error page never ends up stored as audio.
    """
    headers = {}
    resuming = os.path.exists(dest)
    if resuming:
        headers["Range"] = "bytes=%d-" % os.stat(dest).st_size
    # the context manager releases the streamed connection in every case
    with requests.get(url, stream=True, headers=headers) as r:
        if r.status_code == 416:  # range not satisfiable
            return
        r.raise_for_status()
        # append only if the server really sent a partial body (206); a plain
        # 200 carries the whole file and appending it would duplicate data
        mode = "ab" if resuming and r.status_code == 206 else "wb"
        with open(dest, mode) as f:
            for chunk in r.iter_content(chunk_size=1 << 16):
                f.write(chunk)
def put(audio, copy=False):
    """Print the URLs of ``audio``, one per line.

    With ``copy`` set, every ``http``/``https`` URL is first downloaded into
    ``$TMPDIR`` (current directory when unset) under a sanitized file name,
    and the ``file://`` URL of the local copy is printed instead. Any other
    URL is printed unchanged.
    """
    if not copy:
        for url in audio.urls:
            print(url)
        return
    for url in audio.urls:
        scheme = url.split(":")[0]
        if scheme not in ("http", "https"):
            # FIXME: file:// urls are just copied
            print(url)
            continue
        destdir = os.environ.get("TMPDIR", ".")
        raw_name = posixpath.basename(urlparse(url).path)
        # sanitize
        safe_name = "".join(
            ch for ch in raw_name if ch.isalnum() or ch in "._-"
        ).rstrip()
        dest = os.path.join(destdir, safe_name)
        os.makedirs(destdir, exist_ok=True)
        downloader(url, dest)
        print("file://%s" % os.path.realpath(dest))
def retrieve(url, args):
    """
    returns the Audios found at url, or AudioGroups when args.group is set

    url may be a directory (every audio file in it; with args.group each
    file gets a group of its own, named after the file), or an http(s) URL
    / local file holding a feed. Anything else is logged as unsupported and
    gives an empty list.
    """
    if os.path.isdir(url):
        found = get_audio_from_dir(url)
        if not args.group:
            return found
        wrapped = []
        for audio in found:
            group = AudioGroup(os.path.basename(audio.url))
            group.append(audio)
            wrapped.append(group)
        return wrapped

    if url.startswith(("http:", "https:")) or os.path.isfile(url):
        tree = get_tree(url)
        if not args.group:
            return get_urls(tree)
        return get_grouped_urls(tree).values()

    logging.info("unsupported url `%s`", url)
    return []
def audio_passes_filters(audio, args):
    """Tell whether ``audio`` satisfies the filters requested in ``args``.

    An audio is rejected when it is not ``valid``, when its duration is
    above ``args.max_len`` or ``args.fill``, below ``args.min_len``, or when
    its age is outside ``args.min_age`` / ``args.max_age``. A filter whose
    value is zero is disabled.

    An audio of unknown age (``audio.age`` is ``None``) is rejected whenever
    an age filter is active, since it cannot be shown to satisfy it.
    """
    if not audio.valid:
        return False
    if args.max_len and audio.duration > args.max_len:
        return False
    if args.fill and audio.duration > args.fill:
        return False
    if args.min_len and audio.duration < args.min_len:
        return False
    wants_min_age = bool(args.min_age.total_seconds())
    wants_max_age = bool(args.max_age.total_seconds())
    if wants_min_age or wants_max_age:
        age = audio.age
        if age is None:
            # comparing None with a timedelta would raise TypeError
            return False
        if wants_min_age and age < args.min_age:
            return False
        if wants_max_age and age > args.max_age:
            return False
    return True
def main():
    """Entry point: collect, filter, sort, slice and finally output the audios.

    The steps follow the order of the command line help: audios are
    retrieved from every source (or from a single one picked by weight),
    filtered, sorted, sliced (or used to fill a block of ``--fill``
    seconds) and then printed / copied.
    """
    parser = get_parser()
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.WARNING)

    sources = args.urls
    if args.source_weights:
        weights = tuple(int(w) for w in args.source_weights.split(":"))
        if len(weights) != len(sources):
            parser.exit(
                status=2, message="Weight must be in the same number as sources\n"
            )
        sources = [weighted_choice(sources, weights)]

    audios = []
    for source in sources:
        audios.extend(
            item for item in retrieve(source, args) if audio_passes_filters(item, args)
        )

    # sort
    if args.sort_by == "random":
        random.shuffle(audios)
    elif args.sort_by in ("date", "duration"):
        sort_attr = "age" if args.sort_by == "date" else "duration"
        audios.sort(key=lambda item: getattr(item, sort_attr))
    if args.reverse:
        audios.reverse()

    # slice
    audios = audios[args.start :]
    if not args.fill:
        audios = audios[: args.howmany]
    elif audios:
        # the first audio is always taken; the others only while they fit
        first, *candidates = audios
        selected = [first]
        used = first.duration
        for candidate in candidates:
            cost = candidate.duration
            filler = None
            if args.fill_interleave_dir:
                # TODO: factorize "pick file"
                filler = Audio(
                    "file://"
                    + os.path.join(
                        args.fill_interleave_dir,
                        random.choice(os.listdir(args.fill_interleave_dir)),
                    )
                )
                cost += filler.duration
            if args.fill - used > cost:
                if filler is not None:
                    selected.append(filler)
                selected.append(candidate)
                used += cost
        audios = selected
        if args.fill_reverse:
            audios.reverse()

    if not audios:
        return

    def emit(item):
        # debug mode shows the repr, normal mode prints / copies the urls
        if args.debug:
            print(repr(item))
        else:
            put(item, args.copy)

    # the last one is handled apart from the others:
    # this is to support the --slotsize option
    *leading, last = audios
    for item in leading:
        emit(item)
        if args.slotsize is not None and item.duration < args.slotsize:
            print("## musica per {} secondi".format(args.slotsize - item.duration))
    # finally, the last one
    emit(last)
    # TODO: support slotsize when grouping
# Run the command line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()