12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769 |
- #!/usr/bin/env python3
- """
Feed parser with many features.
Given a feed, it supports filtering, subslicing and random picking.
Besides feeds, it supports picking files from directories.
- """
- import datetime
- import logging
- import os
- import glob
- import posixpath
- import random
- import re
- import sys
- import urllib.request
- from argparse import ArgumentParser, ArgumentTypeError
- from bisect import bisect
- from collections import OrderedDict
- from subprocess import CalledProcessError, check_output
- from urllib.parse import unquote, urlparse
- import shutil
- import requests
- from lxml import html
- from pytimeparse.timeparse import timeparse
# Module-wide verbosity switch: debug() prints only when this is true;
# main() flips it to True when --debug is given.
DEBUG = False
class UnsupportedFeedtype(Exception):
    """Raised by get_urls_factory() when the requested feed type is unknown."""
class DurationNotFound(Exception):
    """Raised by get_duration() when the duration of an audio cannot be probed."""
class EmptySequenceError(Exception):
    """Raised by weighted_choice() when there is nothing to choose from."""
class WeightZeroError(Exception):
    """Raised by weighted_choice() when all the weights add up to zero."""
def debug(*args, **kwargs):
    """Behave like the built-in ``print``, but only when ``DEBUG`` is enabled.

    Output goes to standard error unless the caller passes ``file=``.
    """
    if DEBUG:
        if "file" not in kwargs:
            kwargs["file"] = sys.stderr
        print(*args, **kwargs)
def get_int(s):
    """Return the first run of decimal digits found in *s* as an ``int``.

    Raises ``IndexError`` when *s* contains no digit at all.
    """
    digit_runs = re.findall(r"\d+", s)
    first_run = digit_runs[0]
    return int(first_run)
def DurationType(arg):
    """argparse ``type=`` converter turning a duration string into seconds.

    A purely decimal string is taken as a number of seconds; anything else
    is handed to ``timeparse``. Raises ``ArgumentTypeError`` when
    ``timeparse`` cannot understand the value.
    """
    if not arg.isdecimal():
        parsed = timeparse(arg)
        if parsed is None:
            raise ArgumentTypeError("%r is not a valid duration" % arg)
        return parsed
    return int(arg)
def TimeDeltaType(arg):
    """argparse ``type=`` converter turning a time-range string into a timedelta.

    A purely decimal string is taken as a number of seconds; anything else
    is handed to ``timeparse``. Raises ``ArgumentTypeError`` when
    ``timeparse`` cannot understand the value.
    """
    if not arg.isdecimal():
        parsed = timeparse(arg)
        if parsed is None:
            raise ArgumentTypeError("%r is not a valid time range" % arg)
        return datetime.timedelta(seconds=parsed)
    return datetime.timedelta(seconds=int(arg))
def weighted_choice(values, weights):
    """Pick one element of *values* at random, proportionally to *weights*.

    Weights are relative: ``[1, 2, 3]`` behaves exactly like ``[2, 4, 6]``.
    An element whose weight is zero is never picked.

    Raises ``EmptySequenceError`` if *values* is empty and
    ``WeightZeroError`` if the weights add up to zero.
    """
    assert len(values) == len(weights)
    if not values:
        # Cannot do weighted choice from an empty sequence
        raise EmptySequenceError()
    if sum(weights) == 0:
        # Cannot do weighted choice where weight=0
        raise WeightZeroError()
    # Running totals: element i owns the interval [bounds[i-1], bounds[i]).
    running = 0
    bounds = []
    for weight in weights:
        running += weight
        bounds.append(running)
    pick = random.random() * running
    return values[bisect(bounds, pick)]
def delta_humanreadable(tdelta):
    """Format a ``timedelta`` as ``<days>d<hours>h`` (or ``<hours>h``).

    ``None`` is rendered as an empty string. Minutes and seconds are
    dropped; the day part is omitted when it is zero.
    """
    if tdelta is None:
        return ""
    # timedelta keeps the sub-day part in .seconds (0 <= seconds < 86400)
    whole_hours = tdelta.seconds // 3600
    if tdelta.days:
        return f"{tdelta.days}d{whole_hours}h"
    return f"{whole_hours}h"
def duration_humanreadable(seconds):
    """Format a number of seconds as ``<h>h<m>m<s>s`` (or ``<m>m<s>s``).

    The hour part is only shown when it is greater than zero.
    """
    hours = seconds // 3600
    minutes = (seconds - hours * 3600) // 60
    leftover = seconds % 60
    if hours > 0:
        return f"{hours}h{minutes}m{leftover}s"
    return f"{minutes}m{leftover}s"
class Audio(object):
    """A single audio, identified by its URL.

    Attributes:
        url: location of the audio (``http(s)://`` or ``file://``).
        date: timezone-aware publication date, or ``None`` if unknown.
        end_date: timezone-aware instant after which the audio is no
            longer ``valid``; defaults to the end of year 9999.
    """

    def __init__(self, url, duration=None, date=None):
        self.url = url
        # None means "not known yet": it is probed lazily by ``duration``
        self._duration = duration
        self.date = date
        self.end_date = datetime.datetime(9999, 12, 31, tzinfo=datetime.timezone.utc)

    @classmethod
    def from_trusted(cls, url_or_path) -> 'Audio':
        """Build an Audio from a URL or from an absolute filesystem path.

        A value starting with ``/`` is turned into a ``file://`` URL;
        anything else is used as the URL unchanged.
        """
        if url_or_path.startswith('/'):
            return cls('file://' + url_or_path)
        return cls(url_or_path)

    @staticmethod
    def _now():
        """Return the current instant as a timezone-aware UTC datetime."""
        # datetime.utcnow() is deprecated since Python 3.12; this is the
        # documented aware replacement and yields the same instant.
        return datetime.datetime.now(datetime.timezone.utc)

    def __str__(self):
        return self.url

    def __repr__(self):
        return "<Audio {} ({} {})>".format(
            self.url,
            duration_humanreadable(self.duration),
            delta_humanreadable(self.age),
        )

    @property
    def duration(self):
        """Duration in seconds; probed with get_duration() on first access."""
        if self._duration is None:
            self._duration = get_duration(self.url)
        return self._duration

    @property
    def urls(self):
        """One-element list holding ``url`` (same shape as AudioGroup.urls)."""
        return [self.url]

    @property
    def age(self):
        """Time elapsed since ``date``, or ``None`` when the date is unknown."""
        if self.date is None:
            return None
        return self._now() - self.date

    @property
    def valid(self):
        """True while ``end_date`` has not passed yet."""
        return self.end_date >= self._now()
class AudioGroup(list):
    """An ordered collection of audios that belong together.

    The members live in ``self.audios``. The class derives from ``list``,
    so the sequence protocol (``len``, iteration, indexing, ``in``) is
    routed to ``self.audios`` to keep every view of the group consistent.
    """

    def __init__(self, description=None):
        super().__init__()
        self.description = description or ""
        self.audios = []

    def __len__(self):
        return len(self.audios)

    def __iter__(self):
        return iter(self.audios)

    def __getitem__(self, index):
        return self.audios[index]

    def __contains__(self, item):
        return item in self.audios

    def append(self, arg):
        self.audios.append(arg)

    def __str__(self):
        return "\n".join(str(a) for a in self.audios)

    def __repr__(self):
        return '<AudioGroup "{}" ({} {})\n{} >'.format(
            self.description,
            duration_humanreadable(self.duration),
            delta_humanreadable(self.age),
            "\n".join(" " + repr(a) for a in self.audios),
        )

    @property
    def duration(self):
        """Sum of the members' durations, ignoring the unknown (None) ones."""
        known = (a.duration for a in self.audios)
        return sum(d for d in known if d is not None)

    @property
    def urls(self):
        """URLs of the members, in insertion order."""
        return [a.url for a in self.audios]

    @property
    def date(self):
        """Date of the first member that has one, or ``None``."""
        for a in self.audios:
            member_date = getattr(a, "date", None)
            if member_date is not None:
                return member_date
        return None

    @property
    def age(self):
        """Time elapsed since ``date``, or ``None`` when no date is known."""
        if self.date is None:
            return None
        # aware replacement for the deprecated datetime.utcnow()
        now = datetime.datetime.now(datetime.timezone.utc)
        return now - self.date

    @property
    def valid(self):
        """A group is valid as soon as it holds at least one audio."""
        return len(self.audios) > 0
def get_tree(feed_url):
    """Fetch a feed and parse it with ``lxml.html``.

    *feed_url* is either an ``http(s)`` URL, downloaded with ``requests``,
    or the path of a local file.

    Raises ``FileNotFoundError`` (carrying the path) when a local feed
    does not exist.
    """
    if feed_url.startswith(("http:", "https:")):
        return html.fromstring(requests.get(feed_url).content)
    if not os.path.exists(feed_url):
        raise FileNotFoundError(feed_url)
    # Parsing reads the whole file, so the handle can be closed right after
    # instead of being leaked.
    with open(feed_url) as feed_file:
        return html.parse(feed_file)
def get_audio_from_description(text):
    """Build an Audio from the text of a feed item's description.

    The first non-empty line is the (percent-encoded) audio URL. Each
    following non-empty line of the form ``key=value`` is metadata; keys
    and values are stripped and split at the first ``=`` only.
    Recognised keys:

    * ``durata``: duration; the first integer found in the value is used.
    * ``txdate``: date in ``%Y-%m-%dT%H:%M:%S%z`` format.
    * ``replica``: when it ends with ``g`` and ``txdate`` is known, the
      audio expires that many days after ``txdate``.

    Unparsable metadata is logged and ignored.

    Raises ``IndexError`` when *text* has no non-empty line.
    """
    # non-empty lines
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    url = lines[0]

    # Metadata is read from the same cleaned lines the URL came from, so
    # leading blank lines or indentation cannot shift or hide entries.
    metadata = {}
    for line in lines[1:]:
        key, sep, value = line.partition("=")
        if sep:
            metadata[key.strip()] = value.strip()

    if "durata" in metadata:
        try:
            metadata["durata"] = get_int(metadata["durata"])
        except (IndexError, ValueError) as exc:
            logging.info("Could not get duration: %s", exc)
            del metadata["durata"]

    if "txdate" in metadata:
        try:
            metadata["txdate"] = datetime.datetime.strptime(
                metadata["txdate"], "%Y-%m-%dT%H:%M:%S%z"
            )
        except ValueError:
            logging.warning("could not parse txdate %s", metadata["txdate"])
            del metadata["txdate"]

    a = Audio(
        unquote(url),
        duration=metadata.get("durata", None),
        date=metadata.get("txdate", None),
    )

    if "txdate" in metadata and "replica" in metadata:
        if metadata["replica"].endswith("g"):
            try:
                days = get_int(metadata["replica"])
            except IndexError:
                # no number in front of the "g": keep the default end_date
                logging.warning("could not parse replica %s", metadata["replica"])
            else:
                a.end_date = metadata["txdate"] + datetime.timedelta(days=days)
    return a
def is_audio_file(fpath, extensions=("mp3", "oga", "wav", "ogg")):
    """Tell whether *fpath* looks like an audio file, judging by its extension.

    The text after the last ``.`` is compared case-insensitively with
    *extensions*. A name without any dot has no extension and is never
    considered audio (so a file literally called ``mp3`` is rejected).
    """
    _, dot, extension = fpath.rpartition(".")
    return bool(dot) and extension.lower() in extensions
# copied from larigira.fsutils
def scan_dir_audio(dirname):
    """Walk *dirname* recursively, yielding the path of every audio file.

    A file counts as audio when is_audio_file() accepts its name.
    """
    for root, _subdirs, filenames in os.walk(dirname):
        yield from (
            os.path.join(root, fname) for fname in filenames if is_audio_file(fname)
        )
def get_audio_from_file(fpath):
    """Return a one-element list with the Audio describing the file *fpath*.

    The URL is ``file://`` plus the resolved real path; the date is the
    file's modification time as a timezone-aware UTC datetime.
    """
    # Convert the timestamp straight to UTC. Building a naive local
    # datetime and then labelling it as UTC would shift the date (and the
    # resulting age) by the local UTC offset.
    mtime = datetime.datetime.fromtimestamp(
        os.path.getmtime(fpath), tz=datetime.timezone.utc
    )
    return [Audio("file://" + os.path.realpath(fpath), date=mtime)]
def get_audio_from_dir(dirpath):
    """Return an Audio for every audio file found under *dirpath*.

    Files are discovered with scan_dir_audio(). Each Audio gets a
    ``file://`` URL built from the resolved real path and, as date, the
    file's modification time as a timezone-aware UTC datetime. Files whose
    modification time cannot be read or converted are skipped.
    """
    ret = []
    for path in scan_dir_audio(dirpath):
        try:
            # Convert straight to UTC: labelling a naive local datetime as
            # UTC would shift the date by the local UTC offset.
            mtime = datetime.datetime.fromtimestamp(
                os.path.getmtime(path), tz=datetime.timezone.utc
            )
        except (ValueError, OverflowError, OSError) as exc:
            # e.g. a broken symlink, or a timestamp out of range
            logging.debug("skipping %s: %s", path, exc)
            continue
        ret.append(Audio("file://" + os.path.realpath(path), date=mtime))
    return ret
def get_item_date(el):
    """Return the publication date of a feed item, or ``None``.

    The text of the item's ``pubdate`` child is tried against the RFC-822
    style format (``Wed, 15 Jan 2020 22:45:33 +0000``) and then against
    ``%Y-%m-%dT%H:%M:%S%z``. ``None`` is returned when the child is
    missing, empty, or matches neither format.
    """
    el_date = el.find("pubdate")
    if el_date is None or el_date.text is None:
        return None
    # feeds often wrap the value in whitespace or newlines
    raw = el_date.text.strip()
    for fmt in ("%a, %d %b %Y %H:%M:%S %z", "%Y-%m-%dT%H:%M:%S%z"):
        try:
            return datetime.datetime.strptime(raw, fmt)
        except ValueError:
            continue
    return None
def get_urls_generic(tree, url_selector="description[text()]", metadata_in_body=True):
    """Yield an Audio for every usable ``item`` of a parsed feed.

    Args:
        tree: parsed feed; must support ``xpath``.
        url_selector: XPath, relative to the item, whose first result is
            the audio URL. Only used when the description is not parsed.
        metadata_in_body: when true and the item has a ``description``,
            the description text is parsed with
            get_audio_from_description().

    Items whose description cannot be parsed, or for which the selector
    finds nothing, are logged and skipped. The item's publication date is
    used when the description carries no date of its own.
    """
    for it in tree.xpath("//item"):
        # the title is only used in log messages: a missing one is not fatal
        title_el = it.find("title")
        title = title_el.text if title_el is not None else None
        el_body = it.find("description")

        if metadata_in_body and el_body is not None:
            try:
                audio = get_audio_from_description(el_body.text)
            except Exception as exc:
                # best effort: one broken item must not stop the whole feed
                logging.info("error getting duration for `%s`: %s", title, exc)
                continue
            if audio.date is None:
                audio.date = get_item_date(it)
            yield audio
            continue

        try:
            url = it.xpath(url_selector)[0]
        except IndexError:
            logging.warning("no audio found in %s", title)
            continue
        audio = Audio(url)
        audio.date = get_item_date(it)
        yield audio
def get_urls_from_podcast(tree):
    """Yield the audios of a podcast feed, taking URLs from ``enclosure/@url``."""
    podcast_audios = get_urls_generic(
        tree,
        url_selector="enclosure/@url",
        metadata_in_body=False,
    )
    return podcast_audios
def get_urls_from_custom_feed(tree):
    """Yield the audios of a custom feed, parsing each item's description."""
    custom_audios = get_urls_generic(tree, metadata_in_body=True)
    return custom_audios
def get_urls_factory(url, args):
    """Return the URL-extracting function matching ``args.feed_type``.

    *url* is accepted but not used. Raises ``UnsupportedFeedtype`` for a
    feed type that has no extractor.
    """
    known_getters = (
        ("customrss", get_urls_from_custom_feed),
        ("podcast", get_urls_from_podcast),
    )
    for feed_type, getter in known_getters:
        if args.feed_type == feed_type:
            return getter
    raise UnsupportedFeedtype(args.feed_type)
def get_grouped_urls(tree):
    """Group the audios of a parsed feed by the ``guid`` of their item.

    Returns an ``OrderedDict`` mapping each guid to an AudioGroup, in feed
    order. Only audios that are still ``valid`` are added to their group.
    Items lacking a guid or a description, or whose description cannot be
    parsed, are logged and skipped instead of aborting the whole feed.
    """
    groups = OrderedDict()
    for item in tree.xpath("//item"):
        try:
            guid = item.xpath("guid")[0].text.strip()
            description = item.xpath("description")[0].text
        except (IndexError, AttributeError):
            logging.warning("skipping item without guid or description")
            continue
        if guid not in groups:
            groups[guid] = AudioGroup(guid)
        try:
            audio = get_audio_from_description(description)
        except Exception as exc:
            # best effort: one broken item must not stop the whole feed
            logging.info("error parsing audio of `%s`: %s", guid, exc)
            continue
        # The item's publication date takes precedence, but a date parsed
        # from the description is not thrown away when pubdate is missing.
        item_date = get_item_date(item)
        if item_date is not None or audio.date is None:
            audio.date = item_date
        if audio.valid:
            groups[guid].append(audio)
    return groups
def get_duration(url):
    """Return the duration of the audio at *url*, in whole seconds.

    The value is read from the ``duration=`` line printed by ``ffprobe``.

    Raises ``DurationNotFound`` when ffprobe fails, prints no duration
    line, or reports a value that is not a number (e.g. ``N/A``).
    """
    try:
        output = check_output(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-i", url]
        )
    except CalledProcessError as exc:
        raise DurationNotFound(url) from exc

    for line in output.split(b"\n"):
        if not line.startswith(b"duration="):
            continue
        value = line.split(b"=")[1]
        try:
            return int(float(value))
        except (ValueError, OverflowError) as exc:
            raise DurationNotFound(url) from exc
    # ffprobe succeeded but did not report any duration
    raise DurationNotFound(url)
# Text handed to ArgumentParser as its first positional argument in
# get_parser(). It deliberately ends with "Usage: ".
HELP = """
Collect audio informations from multiple sources (XML feeds).
Audios are (in that order):
1. Collected from feeds; (grouped by article if --group is used)
2. Filtered; everything that does not match with requirements is excluded
3. Sorted; even randomly
4. Sliced; take HOWMANY elements, skipping START elements
5. (if --copy) Copied
Usage: """
def get_parser():
    """Build the command-line parser of the script.

    Options are organised in the groups "parsing", "sources", "filters",
    "fill", "intro" and "general"; one or more URL positionals are required.
    """
    parser = ArgumentParser(HELP)
    # keyword arguments shared by every on/off switch
    switch = {"default": False, "action": "store_true"}

    grp = parser.add_argument_group("parsing", "Feed parsing")
    grp.add_argument(
        "--feed-type", type=str, choices=["customrss", "podcast"], default="customrss"
    )

    grp = parser.add_argument_group("sources", "How to deal with sources")
    grp.add_argument(
        "--source-weights", help='Select only one "source" based on this weights'
    )
    grp.add_argument(
        "--group", help="Group audios that belong to the same article", **switch
    )
    grp.add_argument("--glob", help="Wildcards in filenames are interpreted", **switch)

    grp = parser.add_argument_group(
        "filters", "Select only items that match these conditions"
    )
    grp.add_argument(
        "--min-len",
        default=0,
        type=DurationType,
        help="Exclude any audio that is shorter than MIN_LEN seconds",
    )
    grp.add_argument(
        "--max-len",
        default=0,
        type=DurationType,
        help="Exclude any audio that is longer than MAX_LEN seconds",
    )
    grp.add_argument(
        "--sort-by", default="no", type=str, choices=("random", "date", "duration")
    )
    grp.add_argument(
        "--random-seed",
        default=None,
        help="Initialize the random generator. For debug only",
    )
    grp.add_argument("--reverse", help="Reverse list order", **switch)
    grp.add_argument(
        "--min-age",
        default=datetime.timedelta(),
        type=TimeDeltaType,
        help="Exclude audio more recent than MIN_AGE",
    )
    grp.add_argument(
        "--max-age",
        default=datetime.timedelta(),
        type=TimeDeltaType,
        help="Exclude audio older than MAX_AGE",
    )

    grp = parser.add_argument_group(
        "fill", "Fill a 'block' with as many contents as possible"
    )
    grp.add_argument(
        "--fill",
        default=0,
        type=DurationType,
        help="Fill a block of duration LEN",
        metavar="LEN",
    )
    grp.add_argument(
        "--fill-reverse", help="Reverse list order after the fill algorithm", **switch
    )
    grp.add_argument(
        "--fill-interleave-dir",
        default=None,
        type=str,  # FIXME: does it even work?
        help="Between each item, put a random file from DIR",
    )

    grp = parser.add_argument_group(
        "intro",
        "Add intro/outro to output, but only if at least one audio will be output",
    )
    for option in ("--intro", "--outro"):
        grp.add_argument(option, default=None, type=str, metavar="PATH")

    # slicing options belong to no group
    parser.add_argument(
        "--start",
        default=0,
        type=int,
        help="0-indexed start number. By default, play from most recent",
    )
    parser.add_argument(
        "--howmany", default=1, type=int, help="If not specified, only 1 will be played"
    )
    parser.add_argument(
        "--slotsize", type=int, help="Seconds between each audio. Still unsupported"
    )

    grp = parser.add_argument_group("general", "General options")
    grp.add_argument("--copy", help="Copy files to $TMPDIR", **switch)
    grp.add_argument("--debug", help="Debug messages", **switch)

    parser.add_argument("urls", metavar="URL", nargs="+")
    return parser
def downloader(url, dest):
    """Download *url* into the file *dest*, resuming a partial download.

    When *dest* already exists, only the missing tail is requested with a
    ``Range`` header. The data is appended only if the server actually
    honours the range (status 206); if it sends the whole resource again
    (status 200) the file is rewritten from scratch. Status 416 means
    there is nothing left to fetch.

    Raises ``requests.HTTPError`` on an HTTP error status, so that an
    error page is never stored as if it were audio data.
    """
    headers = {}
    if os.path.exists(dest):
        headers["Range"] = "bytes=%d-" % os.stat(dest).st_size
    # the context manager releases the streamed connection in every case
    with requests.get(url, stream=True, headers=headers) as r:
        if r.status_code == 416:  # range not satisfiable
            return
        r.raise_for_status()
        mode = "ab" if r.status_code == 206 else "wb"
        with open(dest, mode) as f:
            for chunk in r.iter_content(chunk_size=1 << 16):
                f.write(chunk)
def put(audio, copy=False):
    """Print the URLs of *audio*, optionally copying the content first.

    Without *copy* every URL in ``audio.urls`` is printed unchanged. With
    *copy*, ``http(s)`` URLs are downloaded and ``file:///`` URLs are
    copied into ``$TMPDIR`` (the current directory if unset), and the
    ``file://`` URL of the local copy is printed instead. URLs of any
    other kind are printed unchanged.
    """
    if not copy:
        for url in audio.urls:
            print(url)
        return

    destdir = os.environ.get("TMPDIR", ".")
    os.makedirs(destdir, exist_ok=True)
    for url in audio.urls:
        scheme = url.split(":")[0]
        if scheme in ("http", "https"):
            remote_name = posixpath.basename(urlparse(url).path)
            # sanitize: keep only alphanumerics and a few safe symbols
            safe_name = "".join(
                ch for ch in remote_name if ch.isalnum() or ch in "._-"
            )
            dest = os.path.join(destdir, "feed-" + safe_name)
            downloader(url, dest)
        elif url.startswith("file:///"):
            src = url[len("file://"):]
            dest = os.path.join(destdir, os.path.basename(src))
            shutil.copy(src, dest)
        else:
            # what's that? let's just copy it
            print(url)
            continue
        print("file://%s" % os.path.realpath(dest))
def retrieve(url, args):
    """
    Collect the audios available at *url*.

    *url* may be a directory, a single audio file, or a feed (an
    ``http(s)`` URL or a local file). Returns an iterable of Audios or,
    when ``args.group`` is set, an iterable of AudioGroups. An unsupported
    *url* is logged and yields an empty list.
    """
    grouped = args.group

    if os.path.isdir(url):
        dir_audios = get_audio_from_dir(url)
        if not grouped:
            return dir_audios
        # one single-audio group per file, described by the file name
        dir_groups = []
        for audio in dir_audios:
            group = AudioGroup(os.path.basename(audio.url))
            group.append(audio)
            dir_groups.append(group)
        return dir_groups

    if os.path.isfile(url) and is_audio_file(url):
        if not grouped:
            return get_audio_from_file(url)
        only_audio = get_audio_from_file(url)[0]
        group = AudioGroup(url)
        group.append(only_audio)
        return [group]

    if url.startswith("http:") or url.startswith("https:") or os.path.isfile(url):
        if grouped:
            return get_grouped_urls(get_tree(url)).values()
        getter = get_urls_factory(url, args)
        return getter(get_tree(url))

    logging.info("unsupported url `%s`", url)
    return []
def audio_passes_filters(audio, args):
    """Tell whether *audio* (an Audio or an AudioGroup) satisfies the filters.

    Checked, in order: validity, ``--max-len``, ``--fill``, ``--min-len``,
    ``--min-age`` and ``--max-age``. A length or age filter set to zero is
    disabled. The duration is only requested when a length filter is
    active, because obtaining it may require probing the audio.

    An audio whose duration cannot be determined is rejected. So is an
    audio of unknown age when an age filter is active, since the
    requirement cannot be verified.
    """
    try:
        # AudioGroup has no end_date attribute: do not fail on it
        logging.debug(getattr(audio, "end_date", None))
        if not audio.valid:
            return False
        if args.max_len and audio.duration > args.max_len:
            return False
        if args.fill and audio.duration > args.fill:
            return False
        if args.min_len and audio.duration < args.min_len:
            return False

        check_min_age = bool(args.min_age.total_seconds())
        check_max_age = bool(args.max_age.total_seconds())
        if check_min_age or check_max_age:
            age = audio.age
            if age is None:
                return False
            if check_min_age and age < args.min_age:
                return False
            if check_max_age and age > args.max_age:
                return False
        return True
    except DurationNotFound:
        return False
def expand_glob(sources: list, weights: list) -> tuple:
    '''
    Expand wildcard sources, sharing each weight among the matches.

    Let's say that sources=["foo", "bar*"] and weights=[2, 3] and on the
    filesystem there are bar1 and bar2.
    Result: (["foo", "bar1", "bar2"], [2.0, 1.5, 1.5])

    Only sources that contain "*" and are not http(s) URLs are expanded.
    Matches are sorted, so the result does not depend on directory order.
    A pattern that matches nothing contributes no source at all.
    '''
    new_sources = []
    new_weights = []
    for src, weight in zip(sources, weights):
        if not src.startswith(('http://', 'https://')) and '*' in src:
            expanded_source = sorted(glob.glob(src))
        else:
            expanded_source = [src]
        logging.debug("glob: %s -> %s", src, expanded_source)
        if not expanded_source:
            # nothing matched: there is nothing to share the weight among
            continue
        share = weight / len(expanded_source)
        new_sources += expanded_source
        new_weights += [share] * len(expanded_source)
    return new_sources, new_weights
def get_audio_by_source(args, parser) -> tuple[OrderedDict, list]:
    """Collect the filtered audios of every source, then pick one source.

    Args:
        args: parsed command line; ``urls``, ``source_weights`` and
            ``glob`` are read here.
        parser: used to exit with status 2 on invalid ``--source-weights``.

    Returns:
        ``(audio_by_source, sources)``: a mapping from each source to the
        audios passing the filters, and a list holding the single source
        chosen by weighted random choice. Sources left with no audio get
        weight zero and cannot be chosen. Both are empty when no source
        can be chosen.
    """
    sources = args.urls
    if args.source_weights:
        try:
            weights = [int(w) for w in args.source_weights.split(":")]
        except ValueError:
            parser.exit(
                status=2,
                message="Weights must be colon-separated integers\n",
            )
        if len(weights) != len(sources):
            parser.exit(
                status=2,
                message="Weight must be in the same number as sources\n",
            )
        if any(w < 0 for w in weights):
            parser.exit(status=2, message="Weights cannot be negative\n")
    else:
        weights = [1] * len(sources)
    if sum(weights) == 0:
        return OrderedDict(), []

    if args.glob:
        sources, weights = expand_glob(sources, weights)

    audio_by_source = OrderedDict()
    for i, url in enumerate(sources):
        url_audios = list(retrieve(url, args))
        logging.debug("Found %d audios in %s", len(url_audios), url)
        url_audios = [au for au in url_audios if audio_passes_filters(au, args)]
        logging.debug("%d of those are passing filters", len(url_audios))
        audio_by_source[url] = url_audios
        if not url_audios:
            # an empty source must never be picked
            weights[i] = 0
    if sum(weights) == 0:
        return OrderedDict(), []

    sources = [weighted_choice(sources, weights)]
    return audio_by_source, sources
def add_intro_outro(audios: list, args) -> list:
    """Return *audios* wrapped with the configured intro and outro.

    ``args.intro`` is placed first and ``args.outro`` last, each only when
    set. An empty *audios* is returned as is, so nothing is ever output
    for an empty selection; otherwise a new list is built and the input
    is left untouched.
    """
    if not audios:
        return audios
    head = [Audio.from_trusted(args.intro)] if args.intro else []
    tail = [Audio.from_trusted(args.outro)] if args.outro else []
    return head + audios + tail
def main():
    """Command-line entry point.

    Parses the arguments, collects the audios of the chosen source, then
    sorts, slices and (with --fill) packs them into the requested block,
    adds intro/outro, and finally outputs every audio with put(), or with
    debug() when --debug is given.
    """
    parser = get_parser()
    args = parser.parse_args()
    if not args.debug:
        logging.basicConfig(level=logging.WARNING)
    else:
        global DEBUG
        DEBUG = True
        logging.basicConfig(level=logging.DEBUG)
    if args.random_seed is not None:
        random.seed(args.random_seed)
    audio_by_source, sources = get_audio_by_source(args, parser)
    audios = []
    for source_url in sources:
        audios += audio_by_source[source_url]
    logging.debug("Found %d audios", len(audios))
    # sort
    # (any other --sort-by value, including the default "no", keeps the order)
    if args.sort_by == "random":
        random.shuffle(audios)
    elif args.sort_by == "date":
        audios.sort(key=lambda x: x.age)
    elif args.sort_by == "duration":
        audios.sort(key=lambda x: x.duration)
    if args.reverse:
        audios.reverse()
    # slice
    audios = audios[args.start :]
    if not args.fill:
        audios = audios[: args.howmany]
    if args.fill and audios:
        # The first audio is always taken; each following one is added only
        # if it (plus its interleaved file, if any) still fits in the block.
        fill_audios = [audios.pop(0)]
        duration = fill_audios[0].duration
        for next_audio in audios:
            next_duration = next_audio.duration
            if args.fill_interleave_dir:
                interleaving = Audio(
                    "file://"
                    + random.choice(list(scan_dir_audio(args.fill_interleave_dir)))
                )
                # logging.info("%r", interleaving)
                next_duration += interleaving.duration
            if args.fill - duration > next_duration:
                if args.fill_interleave_dir:
                    fill_audios.append(interleaving)
                fill_audios.append(next_audio)
                duration += next_duration
        audios = fill_audios
        if args.fill_reverse:
            audios.reverse()
        debug(f"Filled {duration}s out of {args.fill}s; left {args.fill - duration}s")
    # the for loop excludes the last one
    # this is to support the --slotsize option
    if not audios:
        return
    audios = add_intro_outro(audios, args)
    for audio in audios[:-1]:
        if args.debug:
            debug(repr(audio))
        else:
            put(audio, args.copy)
        if args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
                # TODO: take music from another folder
                print("## musica per {} secondi".format(args.slotsize - duration))
    # finally, the last one
    if args.debug:
        debug(repr(audios[-1]))
    else:
        put(audios[-1], args.copy)
    # else: # grouping; TODO: support slotsize
    # for item in groups:
    # if args.debug:
    # print('#', item, groups[item].duration)
    # print(groups[item])
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|