semplifica audio/audiogroup

ora il codice di filtraggio è uguale
This commit is contained in:
boyska 2020-07-21 13:18:37 +02:00
parent 0ebb62c318
commit daeb2ff3db

389
feed
View file

@ -1,31 +1,31 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
''' """
Feed parser with many features Feed parser with many features
from a feed, it supports filtering, subslicing, random picking from a feed, it supports filtering, subslicing, random picking
Beside feeds, it supports picking files from directories Beside feeds, it supports picking files from directories
''' """
import os import datetime
import logging import logging
from argparse import ArgumentParser, ArgumentTypeError import os
from subprocess import check_output, CalledProcessError
from collections import OrderedDict
import re
import urllib.request
from urllib.parse import urlparse, unquote
import posixpath import posixpath
import random import random
import re
import urllib.request
from argparse import ArgumentParser, ArgumentTypeError
from bisect import bisect from bisect import bisect
import datetime from collections import OrderedDict
from subprocess import CalledProcessError, check_output
from urllib.parse import unquote, urlparse
from lxml import html
import requests import requests
from lxml import html
from pytimeparse.timeparse import timeparse from pytimeparse.timeparse import timeparse
def get_int(s): def get_int(s):
return int(re.findall(r'\d+', s)[0]) return int(re.findall(r"\d+", s)[0])
def DurationType(arg): def DurationType(arg):
@ -34,27 +34,28 @@ def DurationType(arg):
else: else:
secs = timeparse(arg) secs = timeparse(arg)
if secs is None: if secs is None:
raise ArgumentTypeError('%r is not a valid duration' % arg) raise ArgumentTypeError("%r is not a valid duration" % arg)
return secs return secs
def TimeDeltaType(arg): def TimeDeltaType(arg):
if arg.isdecimal(): if arg.isdecimal():
secs = int(arg) secs = int(arg)
else: else:
secs = timeparse(arg) secs = timeparse(arg)
if secs is None: if secs is None:
raise ArgumentTypeError('%r is not a valid time range' % arg) raise ArgumentTypeError("%r is not a valid time range" % arg)
return datetime.timedelta(seconds=secs) return datetime.timedelta(seconds=secs)
def weighted_choice(values, weights): def weighted_choice(values, weights):
''' """
random.choice with weights random.choice with weights
weights must be integers greater than 0. weights must be integers greater than 0.
Their meaning is "relative", that is [1,2,3] is the same as [2,4,6] Their meaning is "relative", that is [1,2,3] is the same as [2,4,6]
''' """
assert len(values) == len(weights) assert len(values) == len(weights)
total = 0 total = 0
cum_weights = [] cum_weights = []
@ -68,19 +69,19 @@ def weighted_choice(values, weights):
def delta_humanreadable(tdelta): def delta_humanreadable(tdelta):
if tdelta is None: if tdelta is None:
return '' return ""
days = tdelta.days days = tdelta.days
hours = (tdelta - datetime.timedelta(days=days)).seconds // 3600 hours = (tdelta - datetime.timedelta(days=days)).seconds // 3600
if days: if days:
return '{}d{}h'.format(days, hours) return "{}d{}h".format(days, hours)
return '{}h'.format(hours) return "{}h".format(hours)
class Audio(object): class Audio(object):
def __init__(self, url, duration=None, date=None): def __init__(self, url, duration=None, date=None):
self.url = url self.url = url
if duration is None: if duration is None:
duration = get_duration(url.encode('utf-8')) duration = get_duration(url.encode("utf-8"))
self.duration = duration self.duration = duration
self.date = date self.date = date
self.end_date = datetime.datetime(9999, 12, 31, tzinfo=datetime.timezone.utc) self.end_date = datetime.datetime(9999, 12, 31, tzinfo=datetime.timezone.utc)
@ -89,8 +90,9 @@ class Audio(object):
return self.url return self.url
def __repr__(self): def __repr__(self):
return '<Audio {} ({} {})>'.format(self.url, self.duration, return "<Audio {} ({} {})>".format(
delta_humanreadable(self.age)) self.url, self.duration, delta_humanreadable(self.age)
)
@property @property
def urls(self): def urls(self):
@ -106,12 +108,14 @@ class Audio(object):
@property @property
def valid(self): def valid(self):
return self.end_date >= datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc) return self.end_date >= datetime.datetime.utcnow().replace(
tzinfo=datetime.timezone.utc
)
class AudioGroup(list): class AudioGroup(list):
def __init__(self, description=None): def __init__(self, description=None):
self.description = description or '' self.description = description or ""
self.audios = [] self.audios = []
def __len__(self): def __len__(self):
@ -121,13 +125,15 @@ class AudioGroup(list):
self.audios.append(arg) self.audios.append(arg)
def __str__(self): def __str__(self):
return '\n'.join(str(a) for a in self.audios) return "\n".join(str(a) for a in self.audios)
def __repr__(self): def __repr__(self):
return '<AudioGroup "{}" ({} {})\n{} >'.\ return '<AudioGroup "{}" ({} {})\n{} >'.format(
format(self.description, self.duration, self.description,
delta_humanreadable(self.age), self.duration,
'\n'.join(' ' + repr(a) for a in self.audios)) delta_humanreadable(self.age),
"\n".join(" " + repr(a) for a in self.audios),
)
@property @property
def duration(self): def duration(self):
@ -140,7 +146,7 @@ class AudioGroup(list):
@property @property
def date(self): def date(self):
for a in self.audios: for a in self.audios:
if hasattr(a, 'date'): if hasattr(a, "date"):
return a.date return a.date
return None return None
@ -157,9 +163,8 @@ class AudioGroup(list):
return len(self.audios) > 0 return len(self.audios) > 0
def get_tree(feed_url): def get_tree(feed_url):
if feed_url.startswith('http:') or feed_url.startswith('https:'): if feed_url.startswith("http:") or feed_url.startswith("https:"):
tree = html.fromstring(requests.get(feed_url).content) tree = html.fromstring(requests.get(feed_url).content)
else: else:
if not os.path.exists(feed_url): if not os.path.exists(feed_url):
@ -170,70 +175,76 @@ def get_tree(feed_url):
def get_audio_from_description(text): def get_audio_from_description(text):
# non-empty lines # non-empty lines
lines = [line.strip() lines = [line.strip() for line in text.split("\n") if line.strip()]
for line in text.split('\n')
if line.strip()]
url = lines[0] url = lines[0]
duration = None duration = None
metadata = {} metadata = {}
for line in text.split('\n')[1:]: for line in text.split("\n")[1:]:
if line.strip() and '=' in line: if line.strip() and "=" in line:
metadata[line.split('=')[0]] = line.split('=')[1] metadata[line.split("=")[0]] = line.split("=")[1]
if 'durata' in metadata: if "durata" in metadata:
metadata['durata'] = get_int(metadata['durata']) metadata["durata"] = get_int(metadata["durata"])
if 'txdate' in metadata: if "txdate" in metadata:
try: try:
metadata['txdate'] = datetime.datetime.strptime( metadata["txdate"] = datetime.datetime.strptime(
metadata['txdate'], '%Y-%m-%dT%H:%M:%S%z') metadata["txdate"], "%Y-%m-%dT%H:%M:%S%z"
)
except ValueError: except ValueError:
logging.warning('could not parse txdate %s', metadata['txdate']) logging.warning("could not parse txdate %s", metadata["txdate"])
del metadata['txdate'] del metadata["txdate"]
a = Audio(unquote(url), a = Audio(
duration=metadata.get('durata', None), unquote(url),
date=metadata.get('txdate', None)) duration=metadata.get("durata", None),
date=metadata.get("txdate", None),
)
if 'txdate' in metadata and 'replica' in metadata: if "txdate" in metadata and "replica" in metadata:
if metadata['replica'].endswith('g'): if metadata["replica"].endswith("g"):
a.end_date = metadata['txdate'] + datetime.timedelta( a.end_date = metadata["txdate"] + datetime.timedelta(
days=get_int(metadata['replica'])) days=get_int(metadata["replica"])
)
return a return a
# copied from larigira.fsutils # copied from larigira.fsutils
def scan_dir_audio(dirname, extensions=('mp3', 'oga', 'wav', 'ogg')): def scan_dir_audio(dirname, extensions=("mp3", "oga", "wav", "ogg")):
for root, dirnames, filenames in os.walk(dirname): for root, dirnames, filenames in os.walk(dirname):
for fname in filenames: for fname in filenames:
if fname.split('.')[-1].lower() in extensions: if fname.split(".")[-1].lower() in extensions:
yield os.path.join(root, fname) yield os.path.join(root, fname)
def get_audio_from_dir(dirpath): def get_audio_from_dir(dirpath):
fpaths = scan_dir_audio(dirpath) fpaths = scan_dir_audio(dirpath)
return [Audio('file://' + os.path.realpath(u), return [
date=datetime.datetime.fromtimestamp(os.path.getmtime(u)). Audio(
replace(tzinfo=datetime.timezone.utc)) "file://" + os.path.realpath(u),
for u in fpaths] date=datetime.datetime.fromtimestamp(os.path.getmtime(u)).replace(
tzinfo=datetime.timezone.utc
),
)
for u in fpaths
]
def get_item_date(el): def get_item_date(el):
el_date = el.find('pubdate') el_date = el.find("pubdate")
if el_date is not None: if el_date is not None:
return datetime.datetime.strptime( return datetime.datetime.strptime(el_date.text, "%Y-%m-%dT%H:%M:%S%z")
el_date.text, '%Y-%m-%dT%H:%M:%S%z')
return None return None
def get_urls(tree): def get_urls(tree):
items = tree.xpath('//item') items = tree.xpath("//item")
for it in items: for it in items:
title = it.find('title').text title = it.find("title").text
el_body = it.find('description') el_body = it.find("description")
if el_body is not None: if el_body is not None:
url = el_body.text url = el_body.text
try: try:
audio = get_audio_from_description(url) audio = get_audio_from_description(url)
except Exception as exc: except Exception as exc:
logging.info('error getting duration for `%s`' % title) logging.info("error getting duration for `%s`" % title)
continue continue
if audio.date is None: if audio.date is None:
audio.date = get_item_date(it) audio.date = get_item_date(it)
@ -242,12 +253,12 @@ def get_urls(tree):
def get_grouped_urls(tree): def get_grouped_urls(tree):
groups = OrderedDict() groups = OrderedDict()
items = tree.xpath('//item') items = tree.xpath("//item")
for item in items: for item in items:
guid = item.xpath('guid')[0].text.strip() guid = item.xpath("guid")[0].text.strip()
if guid not in groups: if guid not in groups:
groups[guid] = AudioGroup(guid) groups[guid] = AudioGroup(guid)
audio = get_audio_from_description(item.xpath('description')[0].text) audio = get_audio_from_description(item.xpath("description")[0].text)
audio.date = get_item_date(item) audio.date = get_item_date(item)
if audio.valid: if audio.valid:
groups[guid].append(audio) groups[guid].append(audio)
@ -256,17 +267,17 @@ def get_grouped_urls(tree):
def get_duration(url): def get_duration(url):
try: try:
lineout = check_output(['ffprobe', '-v', 'error', lineout = check_output(
'-show_entries', 'format=duration', ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-i", url]
'-i', url]).split(b'\n') ).split(b"\n")
except CalledProcessError as exc: except CalledProcessError as exc:
raise ValueError('error probing `%s`' % url) from exc raise ValueError("error probing `%s`" % url) from exc
duration = next(l for l in lineout if l.startswith(b'duration=')) duration = next(l for l in lineout if l.startswith(b"duration="))
value = duration.split(b'=')[1] value = duration.split(b"=")[1]
return int(float(value)) return int(float(value))
HELP = ''' HELP = """
Collect audio informations from multiple sources (XML feeds). Collect audio informations from multiple sources (XML feeds).
Audios are (in that order): Audios are (in that order):
1. Collected from feeds; (grouped by article if --group is used) 1. Collected from feeds; (grouped by article if --group is used)
@ -274,52 +285,79 @@ Audios are (in that order):
3. Sorted; even randomly 3. Sorted; even randomly
4. Sliced; take HOWMANY elements, skipping START elements 4. Sliced; take HOWMANY elements, skipping START elements
5. (if --copy) Copied 5. (if --copy) Copied
Usage: ''' Usage: """
def get_parser(): def get_parser():
p = ArgumentParser(HELP) p = ArgumentParser(HELP)
src = p.add_argument_group('sources', 'How to deal with sources') src = p.add_argument_group("sources", "How to deal with sources")
src.add_argument('--source-weights', src.add_argument(
help='Select only one "source" based on this weights') "--source-weights", help='Select only one "source" based on this weights'
src.add_argument('--group', default=False, action='store_true', )
help='Group audios that belong to the same article') src.add_argument(
"--group",
default=False,
action="store_true",
help="Group audios that belong to the same article",
)
filters = p.add_argument_group('filters', 'Select only items that match ' filters = p.add_argument_group(
'these conditions') "filters", "Select only items that match " "these conditions"
filters.add_argument('--min-len', default=0, type=DurationType, )
help='Exclude any audio that is shorter ' filters.add_argument(
'than MIN_LEN seconds') "--min-len",
filters.add_argument('--max-len', default=0, type=DurationType, default=0,
help='Exclude any audio that is longer ' type=DurationType,
'than MAX_LEN seconds') help="Exclude any audio that is shorter " "than MIN_LEN seconds",
filters.add_argument('--sort-by', default='no', type=str, )
choices=('random', 'date', 'duration')) filters.add_argument(
filters.add_argument('--reverse', default=False, "--max-len",
action='store_true', help='Reverse list order') default=0,
type=DurationType,
help="Exclude any audio that is longer " "than MAX_LEN seconds",
)
filters.add_argument(
"--sort-by", default="no", type=str, choices=("random", "date", "duration")
)
filters.add_argument(
"--reverse", default=False, action="store_true", help="Reverse list order"
)
filters.add_argument('--min-age', default=datetime.timedelta(), filters.add_argument(
type=TimeDeltaType, "--min-age",
help='Exclude audio more recent than MIN_AGE') default=datetime.timedelta(),
filters.add_argument('--max-age', default=datetime.timedelta(), type=TimeDeltaType,
type=TimeDeltaType, help="Exclude audio more recent than MIN_AGE",
help='Exclude audio older than MAX_AGE') )
filters.add_argument(
"--max-age",
default=datetime.timedelta(),
type=TimeDeltaType,
help="Exclude audio older than MAX_AGE",
)
p.add_argument('--start', default=0, type=int, p.add_argument(
help='0-indexed start number. ' "--start",
'By default, play from most recent') default=0,
p.add_argument('--howmany', default=1, type=int, type=int,
help='If not specified, only 1 will be played') help="0-indexed start number. " "By default, play from most recent",
p.add_argument('--slotsize', type=int, )
help='Seconds between each audio. Still unsupported') p.add_argument(
"--howmany", default=1, type=int, help="If not specified, only 1 will be played"
)
p.add_argument(
"--slotsize", type=int, help="Seconds between each audio. Still unsupported"
)
general = p.add_argument_group('general', 'General options') general = p.add_argument_group("general", "General options")
general.add_argument('--copy', help='Copy files to $TMPDIR', default=False, general.add_argument(
action='store_true') "--copy", help="Copy files to $TMPDIR", default=False, action="store_true"
general.add_argument('--debug', help='Debug messages', default=False, )
action='store_true') general.add_argument(
"--debug", help="Debug messages", default=False, action="store_true"
)
p.add_argument('urls', metavar='URL', nargs='+') p.add_argument("urls", metavar="URL", nargs="+")
return p return p
@ -329,21 +367,66 @@ def put(audio, copy=False):
print(url) print(url)
else: else:
for url in audio.urls: for url in audio.urls:
if url.split(':')[0] in ('http', 'https'): if url.split(":")[0] in ("http", "https"):
destdir = (os.environ.get('TMPDIR', '.')) destdir = os.environ.get("TMPDIR", ".")
fname = posixpath.basename(urlparse(url).path) fname = posixpath.basename(urlparse(url).path)
# sanitize # sanitize
fname = "".join(c for c in fname fname = "".join(
if c.isalnum() or c in list('._-')).rstrip() c for c in fname if c.isalnum() or c in list("._-")
).rstrip()
dest = os.path.join(destdir, fname) dest = os.path.join(destdir, fname)
os.makedirs(destdir, exist_ok=True) os.makedirs(destdir, exist_ok=True)
fname, headers = urllib.request.urlretrieve(url, dest) fname, headers = urllib.request.urlretrieve(url, dest)
print('file://%s' % os.path.realpath(fname)) print("file://%s" % os.path.realpath(fname))
else: else:
# FIXME: file:// urls are just copied # FIXME: file:// urls are just copied
print(url) print(url)
def retrieve(url, args):
    """
    Collect the playable items found at a single source.

    The kind of source is detected from ``url``:

    * an existing directory is scanned with ``get_audio_from_dir``;
    * an ``http:``/``https:`` URL, or an existing local file, is parsed as a
      feed through ``get_tree``;
    * anything else is reported through ``logging.info`` and yields nothing.

    :param url: directory path, feed file path or feed URL
    :param args: parsed command line options; only ``args.group`` is read
    :returns: always a list. It holds ``Audio`` objects when ``args.group``
              is false and ``AudioGroup`` objects when it is true. For a
              directory every file becomes its own single-element group,
              described by the basename of the audio URL.
    """
    if os.path.isdir(url):
        found = get_audio_from_dir(url)
        if not args.group:
            return found
        agroups = []
        for audio in found:
            # a plain directory has no notion of "article": one group per file
            group = AudioGroup(os.path.basename(audio.url))
            group.append(audio)
            agroups.append(group)
        return agroups

    if url.startswith(("http:", "https:")) or os.path.isfile(url):
        tree = get_tree(url)
        if args.group:
            # materialise the dict view so callers really get a list
            return list(get_grouped_urls(tree).values())
        return list(get_urls(tree))

    logging.info("unsupported url `%s`", url)
    return []
def audio_passes_filters(audio, args):
    """
    Tell whether ``audio`` satisfies every filter given on the command line.

    ``audio`` only needs the ``valid``, ``duration`` and ``age`` attributes.
    A filter whose option is left at its "zero" default (``0`` for the
    lengths, an empty ``timedelta`` for the ages) is disabled.

    :param audio: the item to check
    :param args: parsed options; ``max_len``, ``min_len``, ``min_age`` and
                 ``max_age`` are read
    :returns: ``True`` if the item must be kept, ``False`` otherwise
    """
    if not audio.valid:
        return False

    if args.max_len and audio.duration > args.max_len:
        return False
    if args.min_len and audio.duration < args.min_len:
        return False

    # a timedelta is falsy exactly when it equals timedelta(0)
    if args.min_age or args.max_age:
        age = audio.age
        # NOTE(review): `age` is defined outside this view; it looks like it
        # can be None for undated items -- confirm. Comparing None with a
        # timedelta raises TypeError, so an item whose age is unknown cannot
        # be shown to satisfy an age filter and is excluded.
        if age is None:
            return False
        if args.min_age and age < args.min_age:
            return False
        if args.max_age and age > args.max_age:
            return False

    return True
def main(): def main():
parser = get_parser() parser = get_parser()
args = parser.parse_args() args = parser.parse_args()
@ -354,75 +437,32 @@ def main():
sources = args.urls sources = args.urls
if args.source_weights: if args.source_weights:
weights = tuple(map(int, args.source_weights.split(':'))) weights = tuple(map(int, args.source_weights.split(":")))
if len(weights) != len(sources): if len(weights) != len(sources):
parser.exit(status=2, message='Weight must be in the' parser.exit(
' same number as sources\n') status=2, message="Weight must be in the" " same number as sources\n"
)
sources = [weighted_choice(sources, weights)] sources = [weighted_choice(sources, weights)]
audios = [] audios = []
for url in sources: for url in sources:
if not args.group: url_audios = retrieve(url, args)
if os.path.isdir(url): audios += [au for au in url_audios if audio_passes_filters(au, args)]
audiodir = get_audio_from_dir(url)
audios += audiodir
elif url.startswith('http:') or url.startswith('https:') \
or os.path.isfile(url):
audios += get_urls(get_tree(url))
else:
logging.info('unsupported url `%s`', url)
audios = [audio for audio in audios if
(audio.valid) and
(args.max_len == 0 or
audio.duration <= args.max_len) and
(args.min_len == 0 or
audio.duration >= args.min_len) and
(args.min_age.total_seconds() == 0 or
audio.age >= args.min_age) and
(args.max_age.total_seconds() == 0 or
audio.age <= args.max_age)
]
else: # group
if os.path.isdir(url):
audiodir = get_audio_from_dir(url)
agroups = []
for a in audiodir:
ag = AudioGroup(os.path.basename(a.url))
ag.append(a)
agroups.append(ag)
elif url.startswith('http:') or url.startswith('https:') \
or os.path.isfile(url):
groups = get_grouped_urls(get_tree(url))
agroups = groups.values()
else:
logging.info('unsupported url `%s`', url)
audios += [g for g in agroups
if
(g.valid) and
(args.max_len == 0 or
g.duration <= args.max_len) and
(args.min_len == 0 or
g.duration >= args.max_len) and
(args.min_age.total_seconds() == 0 or
g.age >= args.min_age) and
(args.max_age.total_seconds() == 0 or
g.age <= args.max_age)
]
# sort # sort
if args.sort_by == 'random': if args.sort_by == "random":
random.shuffle(audios) random.shuffle(audios)
elif args.sort_by == 'date': elif args.sort_by == "date":
audios.sort(key=lambda x: x.age) audios.sort(key=lambda x: x.age)
elif args.sort_by == 'duration': elif args.sort_by == "duration":
audios.sort(key=lambda x: x.duration) audios.sort(key=lambda x: x.duration)
if args.reverse: if args.reverse:
audios.reverse() audios.reverse()
# slice # slice
audios = audios[args.start:] audios = audios[args.start :]
audios = audios[:args.howmany] audios = audios[: args.howmany]
# the for loop excludes the last one # the for loop excludes the last one
# this is to support the --slotsize option # this is to support the --slotsize option
@ -436,13 +476,14 @@ def main():
if args.slotsize is not None: if args.slotsize is not None:
duration = audio.duration duration = audio.duration
if duration < args.slotsize: if duration < args.slotsize:
print('## musica per {} secondi' print("## musica per {} secondi".format(args.slotsize - duration))
.format(args.slotsize - duration))
# finally, the last one # finally, the last one
if args.debug: if args.debug:
print(repr(audios[-1])) print(repr(audios[-1]))
else: else:
put(audios[-1], args.copy) put(audios[-1], args.copy)
# else: # grouping; TODO: support slotsize # else: # grouping; TODO: support slotsize
# for item in groups: # for item in groups:
# if args.debug: # if args.debug:
@ -450,5 +491,5 @@ def main():
# print(groups[item]) # print(groups[item])
if __name__ == '__main__': if __name__ == "__main__":
main() main()