larigira-scripts/feed

758 lines
22 KiB
Text
Raw Normal View History

#!/usr/bin/env python3
"""
Feed parser with many features
from a feed, it supports filtering, subslicing, random picking
Beside feeds, it supports picking files from directories
"""
import datetime
import logging
import os
2021-09-25 00:13:38 +02:00
import glob
2017-05-25 19:22:00 +02:00
import posixpath
2017-06-12 23:26:43 +02:00
import random
import re
import sys
import urllib.request
from argparse import ArgumentParser, ArgumentTypeError
from bisect import bisect
from collections import OrderedDict
from subprocess import CalledProcessError, check_output
from urllib.parse import unquote, urlparse
import shutil
import requests
from lxml import html
from pytimeparse.timeparse import timeparse
2024-05-10 13:33:34 +02:00
# Gate read by debug(): while this is False, debug() returns without printing.
DEBUG = False
class UnsupportedFeedtype(Exception):
    """Raised by get_urls_factory when no parser exists for the feed type."""
class DurationNotFound(Exception):
    """Raised by get_duration when ffprobe fails on the given URL."""
class EmptySequenceError(Exception):
    """Raised by weighted_choice when there are no values to choose from."""
class WeightZeroError(Exception):
    """Raised by weighted_choice when the weights sum to zero."""
def debug(*args, **kwargs):
    """Forward to print(), but only while the module-level DEBUG is true.

    Output goes to stderr unless the caller passes its own ``file``.
    """
    if DEBUG:
        kwargs.setdefault("file", sys.stderr)
        print(*args, **kwargs)
def get_int(s):
    """Return the first run of decimal digits found in ``s`` as an int.

    Raises IndexError when ``s`` contains no digit at all.
    """
    digit_runs = re.findall(r"\d+", s)
    first_run = digit_runs[0]
    return int(first_run)
def DurationType(arg):
    """argparse ``type=`` callable turning a duration string into seconds.

    A purely decimal string is taken as a number of seconds; anything else is
    handed to ``timeparse``. Raises ArgumentTypeError when ``timeparse``
    returns None.
    """
    secs = int(arg) if arg.isdecimal() else timeparse(arg)
    if secs is not None:
        return secs
    raise ArgumentTypeError("%r is not a valid duration" % arg)
def TimeDeltaType(arg):
    """argparse ``type=`` callable turning a time-range string into a timedelta.

    A purely decimal string is taken as a number of seconds; anything else is
    handed to ``timeparse``. Raises ArgumentTypeError when ``timeparse``
    returns None.
    """
    secs = int(arg) if arg.isdecimal() else timeparse(arg)
    if secs is not None:
        return datetime.timedelta(seconds=secs)
    raise ArgumentTypeError("%r is not a valid time range" % arg)
def weighted_choice(values, weights):
    """
    Pick one element of ``values`` at random, honouring ``weights``.

    ``values`` and ``weights`` must have the same length. Weights are
    relative: [1, 2, 3] behaves like [2, 4, 6].

    Raises EmptySequenceError when ``values`` is empty and WeightZeroError
    when the weights sum to zero.
    """
    assert len(values) == len(weights)
    if not values:
        # Cannot do weighted choice from an empty sequence
        raise EmptySequenceError()
    if sum(weights) == 0:
        # Cannot do weighted choice where weight=0
        raise WeightZeroError()

    # Running totals: entry k is the upper bound of the slice of
    # [0, total) that belongs to values[k].
    running = 0
    upper_bounds = []
    for weight in weights:
        running += weight
        upper_bounds.append(running)

    pick = random.random() * running
    return values[bisect(upper_bounds, pick)]
def delta_humanreadable(tdelta):
    """Format a timedelta as ``<days>d<hours>h``, or ``<hours>h`` when days is 0.

    Returns an empty string when ``tdelta`` is None.
    """
    if tdelta is None:
        return ""
    # timedelta keeps the sub-day part in .seconds, so the whole hours left
    # over after removing full days come straight from it.
    leftover_hours = tdelta.seconds // 3600
    if not tdelta.days:
        return f"{leftover_hours}h"
    return f"{tdelta.days}d{leftover_hours}h"
def duration_humanreadable(seconds):
    """Format a number of seconds as ``<h>h<m>m<s>s``; hours are omitted when 0."""
    whole_hours = seconds // 3600
    whole_minutes = (seconds - 3600 * whole_hours) // 60
    leftover = seconds % 60
    if whole_hours <= 0:
        return f"{whole_minutes}m{leftover}s"
    return f"{whole_hours}h{whole_minutes}m{leftover}s"
class Audio(object):
    """A single playable item identified by its URL.

    ``duration`` (seconds) may be given up front; otherwise it is looked up
    with get_duration() on first access and cached. ``date`` is the
    (timezone-aware) publication date, or None when unknown.
    """

    def __init__(self, url, duration=None, date=None):
        self.url = url
        # None means "not probed yet"; see the `duration` property.
        self._duration = duration
        self.date = date
        # Far-future default: the audio stays valid unless a caller sets an
        # earlier end_date.
        self.end_date = datetime.datetime(9999, 12, 31, tzinfo=datetime.timezone.utc)

    @classmethod
    def from_trusted(cls, url_or_path) -> 'Audio':
        """Build an Audio from a URL or from an absolute filesystem path.

        A value starting with ``/`` is turned into a ``file://`` URL; anything
        else is used as the URL unchanged.
        """
        if url_or_path.startswith('/'):
            return cls('file://' + url_or_path)
        return cls(url_or_path)

    def __str__(self):
        return self.url

    def __repr__(self):
        # NOTE: reads self.duration, which may call get_duration().
        return "<Audio {} ({} {})>".format(
            self.url,
            duration_humanreadable(self.duration),
            delta_humanreadable(self.age),
        )

    @property
    def duration(self):
        """Duration in seconds, resolved through get_duration() on first use."""
        if self._duration is None:
            self._duration = get_duration(self.url)
        return self._duration

    @property
    def urls(self):
        """One-element list with this audio's URL (same shape as AudioGroup.urls)."""
        return [self.url]

    @property
    def age(self):
        """Time elapsed since ``date`` as a timedelta, or None when date is unknown."""
        if self.date is None:
            return None
        # datetime.utcnow() is deprecated; now(timezone.utc) yields the same
        # aware timestamp without the naive intermediate.
        return datetime.datetime.now(datetime.timezone.utc) - self.date

    @property
    def valid(self):
        """True while ``end_date`` has not passed yet."""
        return self.end_date >= datetime.datetime.now(datetime.timezone.utc)
class AudioGroup(list):
    """A set of audios handled as one unit.

    Members live in ``self.audios``; the storage inherited from ``list`` is
    left empty, while ``len()`` and ``append()`` are redirected to
    ``self.audios``.
    """

    def __init__(self, description=None):
        super().__init__()
        self.description = description or ""
        self.audios = []

    def __len__(self):
        return len(self.audios)

    def append(self, arg):
        self.audios.append(arg)

    def __str__(self):
        return "\n".join(str(a) for a in self.audios)

    def __repr__(self):
        return '<AudioGroup "{}" ({} {})\n{} >'.format(
            self.description,
            duration_humanreadable(self.duration),
            delta_humanreadable(self.age),
            "\n".join("   " + repr(a) for a in self.audios),
        )

    @property
    def duration(self):
        """Sum of the members' durations; members whose duration is None are skipped."""
        return sum(a.duration for a in self.audios if a.duration is not None)

    @property
    def urls(self):
        """URLs of all members, in insertion order."""
        return [a.url for a in self.audios]

    @property
    def date(self):
        """``date`` of the first member that has such an attribute, else None."""
        for a in self.audios:
            if hasattr(a, "date"):
                return a.date
        return None

    @property
    def age(self):
        """Time elapsed since ``date`` as a timedelta, or None when date is None."""
        if self.date is None:
            return None
        # datetime.utcnow() is deprecated; now(timezone.utc) is the aware
        # equivalent.
        return datetime.datetime.now(datetime.timezone.utc) - self.date

    @property
    def valid(self):
        """A group is valid as soon as it has at least one member."""
        return len(self.audios) > 0
def get_tree(feed_url):
    """Fetch or open ``feed_url`` and parse it with lxml.html.

    ``http:``/``https:`` URLs are downloaded with requests; anything else is
    treated as a local path.

    Raises FileNotFoundError when a local path does not exist.
    """
    if feed_url.startswith(("http:", "https:")):
        return html.fromstring(requests.get(feed_url).content)
    if not os.path.exists(feed_url):
        raise FileNotFoundError(feed_url)
    # The parse happens inside the `with`, so the handle is closed as soon as
    # the tree has been built instead of being left open.
    with open(feed_url) as feed_file:
        return html.parse(feed_file)
def get_audio_from_description(text):
    """Build an Audio from the body of a custom-feed ``<description>``.

    The first non-empty line is the (URL-quoted) audio URL. Every following
    line of the form ``key=value`` is metadata; the recognised keys are:

    - ``durata``: duration, first integer found in the value
    - ``txdate``: date in ``%Y-%m-%dT%H:%M:%S%z`` format
    - ``replica``: when it ends with ``g``, number of days after ``txdate``
      at which the audio expires

    Metadata that cannot be parsed is logged and ignored.

    Raises ValueError when ``text`` has no non-empty line.
    """
    # non-empty lines
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    if not lines:
        raise ValueError("empty description")
    url = lines[0]

    metadata = {}
    # Start after the URL line itself (not after the first raw line, which may
    # be blank) so a URL containing "=" is never mistaken for metadata.
    for line in lines[1:]:
        key, sep, value = line.partition("=")
        if sep:
            # partition keeps everything after the first "=" in the value.
            metadata[key.strip()] = value.strip()

    if "durata" in metadata:
        try:
            metadata["durata"] = get_int(metadata["durata"])
        except (IndexError, ValueError) as exc:
            logging.info("Could not get duration: %s", exc)
            del metadata["durata"]

    if "txdate" in metadata:
        try:
            metadata["txdate"] = datetime.datetime.strptime(
                metadata["txdate"], "%Y-%m-%dT%H:%M:%S%z"
            )
        except ValueError:
            logging.warning("could not parse txdate %s", metadata["txdate"])
            del metadata["txdate"]

    a = Audio(
        unquote(url),
        duration=metadata.get("durata", None),
        date=metadata.get("txdate", None),
    )

    if "txdate" in metadata and metadata.get("replica", "").endswith("g"):
        try:
            replica_days = get_int(metadata["replica"])
        except IndexError:
            # "g" suffix without any digit: keep the default end_date.
            logging.warning("could not parse replica %s", metadata["replica"])
        else:
            a.end_date = metadata["txdate"] + datetime.timedelta(days=replica_days)
    return a
def is_audio_file(fpath, extensions=("mp3", "oga", "wav", "ogg")):
    """Tell whether ``fpath`` has one of the given file extensions.

    ``extensions`` are given without the leading dot and are matched
    case-insensitively. Only a real extension counts: a file called just
    ``mp3`` (no dot) or a dot-file such as ``.mp3`` is not considered audio.
    """
    ext = os.path.splitext(fpath)[1]
    if not ext:
        return False
    return ext[1:].lower() in extensions
# copied from larigira.fsutils
def scan_dir_audio(dirname):
    """Walk ``dirname`` recursively and yield the path of every audio file.

    A file counts as audio when is_audio_file() accepts its name.
    """
    for root, _subdirs, filenames in os.walk(dirname):
        yield from (
            os.path.join(root, fname) for fname in filenames if is_audio_file(fname)
        )
def get_audio_from_file(fpath):
    """Return a one-element list with the Audio for the local file ``fpath``.

    The audio date is the file's modification time.
    """
    # Convert the timestamp straight to UTC. Going through a naive local
    # datetime and then stamping it as UTC would shift the date by the local
    # UTC offset.
    mtime = datetime.datetime.fromtimestamp(
        os.path.getmtime(fpath), tz=datetime.timezone.utc
    )
    return [Audio("file://" + os.path.realpath(fpath), date=mtime)]
def get_audio_from_dir(dirpath):
    """Return a list with one Audio per audio file found under ``dirpath``.

    Each audio date is the file's modification time. Files whose modification
    time cannot be read or converted are skipped.
    """
    ret = []
    for fpath in scan_dir_audio(dirpath):
        try:
            # Convert the timestamp straight to UTC; a naive local datetime
            # stamped as UTC would be off by the local UTC offset.
            mtime = datetime.datetime.fromtimestamp(
                os.path.getmtime(fpath), tz=datetime.timezone.utc
            )
        except (OSError, OverflowError, ValueError):
            # e.g. the file vanished between the scan and the stat call
            continue
        ret.append(Audio("file://" + os.path.realpath(fpath), date=mtime))
    return ret
def get_item_date(el):
    """Parse the ``pubdate`` child of a feed item into a datetime.

    Two formats are tried in order: ``Wed, 15 Jan 2020 22:45:33 +0000`` and
    ``2020-01-15T22:45:33+0000``.

    Returns None when the element is missing, empty, or matches neither
    format.
    """
    el_date = el.find("pubdate")
    if el_date is None or not el_date.text:
        # An empty <pubdate/> has text None, which strptime rejects with
        # TypeError rather than ValueError.
        return None
    raw_date = el_date.text.strip()
    for fmt in ("%a, %d %b %Y %H:%M:%S %z", "%Y-%m-%dT%H:%M:%S%z"):
        try:
            return datetime.datetime.strptime(raw_date, fmt)
        except ValueError:
            continue
    return None
def get_urls_generic(tree, url_selector="description[text()]", metadata_in_body=True):
    """Yield one Audio per usable ``<item>`` of the feed ``tree``.

    With ``metadata_in_body`` (and a ``<description>`` present) the
    description text is parsed by get_audio_from_description(); items whose
    description cannot be parsed are logged and skipped. Otherwise the audio
    URL is the first match of the ``url_selector`` XPath, evaluated relative
    to the item; items with no match are logged and skipped.

    The item's pubdate is used as the audio date, unless the description
    already provided one.
    """
    for it in tree.xpath("//item"):
        # The title is only used in log messages, so a missing <title> must
        # not abort the whole feed.
        el_title = it.find("title")
        title = el_title.text if el_title is not None else None
        el_body = it.find("description")
        if metadata_in_body and el_body is not None:
            try:
                audio = get_audio_from_description(el_body.text)
            except Exception as exc:
                # Deliberately broad: one malformed item should not stop the
                # others from being read.
                logging.info("error getting duration for `%s`: %s", title, exc)
                continue
            if audio.date is None:
                audio.date = get_item_date(it)
            yield audio
        else:
            try:
                url = it.xpath(url_selector)[0]
            except IndexError:
                logging.warning("no audio found in %s", title)
            else:
                audio = Audio(url)
                audio.date = get_item_date(it)
                yield audio
def get_urls_from_podcast(tree):
    """Read a podcast feed: the audio URL is the ``url`` attribute of ``<enclosure>``."""
    podcast_options = {"url_selector": "enclosure/@url", "metadata_in_body": False}
    return get_urls_generic(tree, **podcast_options)
def get_urls_from_custom_feed(tree):
    """Read a custom feed: URL and metadata are taken from each item's description."""
    audios = get_urls_generic(tree, metadata_in_body=True)
    return audios
def get_urls_factory(url, args):
    """Return the feed-reading function matching ``args.feed_type``.

    ``url`` is accepted but not used. Raises UnsupportedFeedtype for any feed
    type other than ``customrss`` and ``podcast``.
    """
    feed_type = args.feed_type
    if feed_type == "podcast":
        return get_urls_from_podcast
    if feed_type == "customrss":
        return get_urls_from_custom_feed
    raise UnsupportedFeedtype(feed_type)
def get_grouped_urls(tree):
    """Group the audios of a custom feed by the ``<guid>`` of their item.

    Returns an OrderedDict mapping each guid (in order of first appearance)
    to an AudioGroup. Only audios that are still valid are added, so a group
    may end up empty.
    """
    groups = OrderedDict()
    for item in tree.xpath("//item"):
        guid = item.xpath("guid")[0].text.strip()
        if guid not in groups:
            groups[guid] = AudioGroup(guid)
        audio = get_audio_from_description(item.xpath("description")[0].text)
        # The item's pubdate wins when present; when it is missing, keep the
        # date parsed from the description instead of overwriting it with None.
        item_date = get_item_date(item)
        if item_date is not None:
            audio.date = item_date
        if audio.valid:
            groups[guid].append(audio)
    return groups
def get_duration(url):
    """Ask ffprobe for the duration of ``url`` and return it in whole seconds.

    Raises DurationNotFound when ffprobe exits with an error, prints no
    ``duration=`` line, or prints a value that is not a number.
    """
    try:
        output = check_output(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-i", url]
        )
    except CalledProcessError as exc:
        raise DurationNotFound(url) from exc

    for line in output.split(b"\n"):
        if not line.startswith(b"duration="):
            continue
        try:
            return int(float(line.split(b"=")[1]))
        except ValueError as exc:
            # e.g. "duration=N/A"
            raise DurationNotFound(url) from exc
    # No duration line at all: report it as DurationNotFound rather than
    # letting a bare StopIteration escape.
    raise DurationNotFound(url)
HELP = """
2018-02-13 21:49:30 +01:00
Collect audio informations from multiple sources (XML feeds).
Audios are (in that order):
1. Collected from feeds; (grouped by article if --group is used)
2. Filtered; everything that does not match with requirements is excluded
3. Sorted; even randomly
4. Sliced; take HOWMANY elements, skipping START elements
5. (if --copy) Copied
Usage: """
2018-02-13 21:49:30 +01:00
def get_parser():
    """Build the ArgumentParser describing every command-line option.

    Options are organised in the groups ``parsing``, ``sources``, ``filters``,
    ``fill``, ``intro`` and ``general``; ``--start``, ``--howmany``,
    ``--slotsize`` and the positional ``URL`` list are added to the parser
    itself.
    """

    def add_flag(container, name, help_text):
        # Boolean switch that is False unless given on the command line.
        container.add_argument(
            name, default=False, action="store_true", help=help_text
        )

    parser = ArgumentParser(HELP)

    feed_opts = parser.add_argument_group("parsing", "Feed parsing")
    feed_opts.add_argument(
        "--feed-type", type=str, choices=["customrss", "podcast"], default="customrss"
    )

    source_opts = parser.add_argument_group("sources", "How to deal with sources")
    source_opts.add_argument(
        "--source-weights", help='Select only one "source" based on this weights'
    )
    add_flag(source_opts, "--group", "Group audios that belong to the same article")
    add_flag(source_opts, "--glob", "Wildcards in filenames are interpreted")

    filter_opts = parser.add_argument_group(
        "filters", "Select only items that match these conditions"
    )
    filter_opts.add_argument(
        "--min-len",
        default=0,
        type=DurationType,
        help="Exclude any audio that is shorter than MIN_LEN seconds",
    )
    filter_opts.add_argument(
        "--max-len",
        default=0,
        type=DurationType,
        help="Exclude any audio that is longer than MAX_LEN seconds",
    )
    filter_opts.add_argument(
        "--sort-by", default="no", type=str, choices=("random", "date", "duration")
    )
    filter_opts.add_argument(
        "--random-seed",
        default=None,
        help="Initialize the random generator. For debug only",
    )
    add_flag(filter_opts, "--reverse", "Reverse list order")
    filter_opts.add_argument(
        "--min-age",
        default=datetime.timedelta(),
        type=TimeDeltaType,
        help="Exclude audio more recent than MIN_AGE",
    )
    filter_opts.add_argument(
        "--max-age",
        default=datetime.timedelta(),
        type=TimeDeltaType,
        help="Exclude audio older than MAX_AGE",
    )

    fill_opts = parser.add_argument_group(
        "fill", "Fill a 'block' with as many contents as possible"
    )
    fill_opts.add_argument(
        "--fill",
        default=0,
        type=DurationType,
        help="Fill a block of duration LEN",
        metavar="LEN",
    )
    add_flag(fill_opts, "--fill-reverse", "Reverse list order after the fill algorithm")
    fill_opts.add_argument(
        "--fill-interleave-dir",
        default=None,
        type=str,  # FIXME: does it even work?
        help="Between each item, put a random file from DIR",
    )

    intro_opts = parser.add_argument_group(
        "intro",
        "Add intro/outro to output, but only if at least one audio will be output",
    )
    for option in ("--intro", "--outro"):
        intro_opts.add_argument(option, default=None, type=str, metavar="PATH")

    parser.add_argument(
        "--start",
        default=0,
        type=int,
        help="0-indexed start number. By default, play from most recent",
    )
    parser.add_argument(
        "--howmany", default=1, type=int, help="If not specified, only 1 will be played"
    )
    parser.add_argument(
        "--slotsize", type=int, help="Seconds between each audio. Still unsupported"
    )

    general_opts = parser.add_argument_group("general", "General options")
    add_flag(general_opts, "--copy", "Copy files to $TMPDIR")
    add_flag(general_opts, "--debug", "Debug messages")

    parser.add_argument("urls", metavar="URL", nargs="+")
    return parser
def downloader(url, dest):
    """Download ``url`` into the file ``dest``, resuming a partial download.

    When ``dest`` already exists, only the missing tail is requested with a
    ``Range`` header:

    - 416 (range not satisfiable): nothing is written
    - 206 (partial content): the body is appended to ``dest``
    - any other success status: the server sent the whole file, so ``dest``
      is rewritten from scratch

    Raises requests.HTTPError for any other 4xx/5xx answer, so an error page
    never ends up inside ``dest``.
    """
    headers = {}
    if os.path.exists(dest):
        headers["Range"] = "bytes=%d-" % os.stat(dest).st_size
    # The context manager releases the streamed connection on every path.
    with requests.get(url, stream=True, headers=headers) as r:
        if r.status_code == 416:  # range not satisfiable
            return
        r.raise_for_status()
        # Append only if the server actually honoured the Range request;
        # appending a full 200 body would duplicate the existing bytes.
        mode = "ab" if r.status_code == 206 else "wb"
        with open(dest, mode) as f:
            for chunk in r.iter_content(chunk_size=1 << 16):
                f.write(chunk)
def put(audio, copy=False):
    """Print, one per line, the URLs of ``audio`` (an Audio or an AudioGroup).

    Without ``copy`` the URLs are printed unchanged. With ``copy`` each
    resource is first brought into ``$TMPDIR`` (current directory when unset)
    and the ``file://`` URL of the copy is printed instead:

    - ``http``/``https`` URLs are downloaded as ``feed-<sanitized basename>``
    - ``file:///`` URLs are copied keeping their basename
    - anything else is printed unchanged
    """
    if not copy:
        for url in audio.urls:
            print(url)
        return

    destdir = os.environ.get("TMPDIR", ".")
    os.makedirs(destdir, exist_ok=True)
    for url in audio.urls:
        scheme = url.split(":")[0]
        if scheme in ("http", "https"):
            remote_name = posixpath.basename(urlparse(url).path)
            # sanitize
            kept_chars = [c for c in remote_name if c.isalnum() or c in "._-"]
            fname = "".join(kept_chars).rstrip()
            dest = os.path.join(destdir, "feed-" + fname)
            downloader(url, dest)
        elif url.startswith("file:///"):
            src = url[len('file://'):]
            dest = os.path.join(destdir, os.path.basename(src))
            shutil.copy(src, dest)
        else:
            # what's that? let's just copy it
            print(url)
            continue
        print("file://%s" % os.path.realpath(dest))
def retrieve(url, args):
    """
    returns a list of Audios or a list of AudioGroups

    ``url`` may be a directory, a single audio file, a local feed file or an
    http(s) feed URL. With ``args.group`` the result is made of AudioGroups:
    one group per file for directories and files, one group per guid for
    feeds. Unsupported urls are logged and give an empty list.
    """
    grouped = args.group

    if os.path.isdir(url):
        dir_audios = get_audio_from_dir(url)
        if not grouped:
            return dir_audios
        agroups = []
        for audio in dir_audios:
            single = AudioGroup(os.path.basename(audio.url))
            single.append(audio)
            agroups.append(single)
        return agroups

    if os.path.isfile(url) and is_audio_file(url):
        file_audios = get_audio_from_file(url)
        if not grouped:
            return file_audios
        single = AudioGroup(url)
        single.append(file_audios[0])
        return [single]

    if url.startswith("http:") or url.startswith("https:") or os.path.isfile(url):
        if grouped:
            return get_grouped_urls(get_tree(url)).values()
        # Resolve the getter first, so an unsupported feed type is reported
        # before the feed is fetched.
        getter = get_urls_factory(url, args)
        return getter(get_tree(url))

    logging.info("unsupported url `%s`", url)
    return []
def audio_passes_filters(audio, args):
    """Tell whether ``audio`` (an Audio or an AudioGroup) satisfies the filters.

    Checked in order: validity, ``--max-len``, ``--fill`` (an item longer
    than the block can never fit), ``--min-len``, ``--min-age`` and
    ``--max-age``. A filter whose value is zero is disabled.

    An item whose age is unknown (None) cannot satisfy an age constraint, so
    it is excluded whenever ``--min-age`` or ``--max-age`` is active.
    """
    # AudioGroup has no end_date attribute; only log it when it exists.
    logging.debug(getattr(audio, "end_date", None))
    if not audio.valid:
        return False
    if args.max_len and audio.duration > args.max_len:
        return False
    if args.fill and audio.duration > args.fill:
        return False
    if args.min_len and audio.duration < args.min_len:
        return False

    check_min_age = bool(args.min_age.total_seconds())
    check_max_age = bool(args.max_age.total_seconds())
    if check_min_age or check_max_age:
        age = audio.age
        if age is None:
            # Comparing None with a timedelta would raise TypeError.
            return False
        if check_min_age and age < args.min_age:
            return False
        if check_max_age and age > args.max_age:
            return False
    return True
def expand_glob(sources: list, weights: list) -> tuple:
    '''
    Expand wildcard sources, splitting each weight evenly among its matches.

    Let's say that sources=["foo", "bar*"] and weights=[2, 3] and on the
    filesystem there are bar1 and bar2.
    Result: ["foo", "bar1", "bar2"], [2.0, 1.5, 1.5]

    Only sources containing "*" are expanded; http:// and https:// sources
    are never expanded. A pattern that matches nothing is dropped from the
    result together with its weight.
    '''
    new_sources = []
    new_weights = []
    for src, weight in zip(sources, weights):
        if "*" in src and not src.startswith(("http://", "https://")):
            expanded_source = glob.glob(src)
        else:
            expanded_source = [src]
        logging.debug("glob: %s -> %s", src, expanded_source)
        if not expanded_source:
            # Nothing matched: there is no one to share the weight with, and
            # dividing by zero matches would raise ZeroDivisionError.
            continue
        share = weight / len(expanded_source)
        new_sources += expanded_source
        new_weights += [share] * len(expanded_source)
    return new_sources, new_weights
def get_audio_by_source(args, parser):
    """Collect and filter the audios of every source, then pick one source.

    Returns a tuple ``(audio_by_source, sources)``: ``audio_by_source`` maps
    each source url to the list of its audios that pass the filters, and
    ``sources`` is a one-element list holding the source picked at random
    according to ``--source-weights`` (every source weighs 1 by default).
    Sources left with no audio get weight 0; when no source has any audio,
    ``sources`` is empty.

    Exits through ``parser.exit`` with status 2 when ``--source-weights`` is
    malformed or its length differs from the number of sources.
    """
    sources = args.urls
    if args.source_weights:
        try:
            weights = [int(w) for w in args.source_weights.split(":")]
        except ValueError:
            parser.exit(
                status=2,
                message="Weights must be integers separated by ':'\n",
            )
        if len(weights) != len(sources):
            parser.exit(
                status=2,
                message="Weight must be in the same number as sources\n",
            )
    else:
        weights = [1] * len(sources)

    if args.glob:
        sources, weights = expand_glob(sources, weights)

    audio_by_source = OrderedDict()
    for i, url in enumerate(sources):
        url_audios = list(retrieve(url, args))
        logging.debug("Found %d audios in %s", len(url_audios), url)
        url_audios = [au for au in url_audios if audio_passes_filters(au, args)]
        logging.debug("%d of those are passing filters", len(url_audios))
        audio_by_source[url] = url_audios
        if not url_audios:
            # An empty source must never be picked.
            weights[i] = 0

    if sum(weights) == 0:
        # Nothing to choose from. Keep the documented tuple shape so the
        # caller can still unpack the result.
        return audio_by_source, []
    sources = [weighted_choice(sources, weights)]
    return audio_by_source, sources
def add_intro_outro(audios: list, args) -> list:
    """Return ``audios`` wrapped between the optional intro and outro.

    An empty ``audios`` is returned untouched (no intro/outro is added when
    there is nothing to play). Otherwise a new list is returned and the input
    list is left unmodified.
    """
    if not audios:
        return audios
    head = [Audio.from_trusted(args.intro)] if args.intro else []
    tail = [Audio.from_trusted(args.outro)] if args.outro else []
    return head + list(audios) + tail
def main():
    """Command-line entry point.

    Parses the arguments, collects the audios of the chosen source, then
    sorts, slices and (with --fill) packs them into a block, adds the
    intro/outro and finally outputs every audio with put(). With --debug the
    repr of each audio is printed through debug() instead.
    """
    global DEBUG
    parser = get_parser()
    args = parser.parse_args()
    # debug() prints only while DEBUG is true, so it has to follow --debug;
    # otherwise the --debug branches below print nothing at all.
    DEBUG = args.debug
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.WARNING)
    if args.random_seed is not None:
        random.seed(args.random_seed)

    result = get_audio_by_source(args, parser)
    if result is None:
        # No source has any audio.
        return
    audio_by_source, sources = result
    audios = []
    for source_url in sources:
        audios += audio_by_source[source_url]
    logging.debug("Found %d audios", len(audios))

    # sort
    if args.sort_by == "random":
        random.shuffle(audios)
    elif args.sort_by == "date":
        audios.sort(key=lambda x: x.age)
    elif args.sort_by == "duration":
        audios.sort(key=lambda x: x.duration)
    if args.reverse:
        audios.reverse()

    # slice
    audios = audios[args.start :]
    if not args.fill:
        audios = audios[: args.howmany]

    if args.fill and audios:
        # Scan the interleave directory once, not once per candidate audio.
        interleave_pool = []
        if args.fill_interleave_dir:
            interleave_pool = list(scan_dir_audio(args.fill_interleave_dir))
        fill_audios = [audios.pop(0)]
        duration = fill_audios[0].duration
        for next_audio in audios:
            next_duration = next_audio.duration
            interleaving = None
            if args.fill_interleave_dir:
                interleaving = Audio("file://" + random.choice(interleave_pool))
                next_duration += interleaving.duration
            # Take the audio (and its interleaving) only if it still fits.
            if args.fill - duration > next_duration:
                if interleaving is not None:
                    fill_audios.append(interleaving)
                fill_audios.append(next_audio)
                duration += next_duration
        audios = fill_audios
        if args.fill_reverse:
            audios.reverse()

    if not audios:
        return

    audios = add_intro_outro(audios, args)

    # the for loop excludes the last one
    # this is to support the --slotsize option
    for audio in audios[:-1]:
        if args.debug:
            debug(repr(audio))
        else:
            put(audio, args.copy)
        if args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
                # TODO: take music from another directory
                print("## musica per {} secondi".format(args.slotsize - duration))

    # finally, the last one
    if args.debug:
        debug(repr(audios[-1]))
    else:
        put(audios[-1], args.copy)
    # TODO: support --slotsize when grouping
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()