boyska
af4ff802ff
la txdate puo' essere pescata dal corpo (attualmente la cosa e' parziale) in base al flag replica= viene ricavata una data di fine validita per un audio. Sia gli audio sia gli audiogroup hanno un flag "valid"
452 lines
14 KiB
Python
Executable file
452 lines
14 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
'''
|
|
Feed parser with many features
|
|
|
|
from a feed, it supports filtering, subslicing, random picking
|
|
|
|
Besides feeds, it supports picking files from directories
|
|
'''
|
|
import os
|
|
import logging
|
|
from argparse import ArgumentParser, ArgumentTypeError
|
|
from subprocess import check_output, CalledProcessError
|
|
from collections import OrderedDict
|
|
import re
|
|
import urllib.request
|
|
from urllib.parse import urlparse, unquote
|
|
import posixpath
|
|
import random
|
|
from bisect import bisect
|
|
import datetime
|
|
|
|
from lxml import html
|
|
import requests
|
|
from pytimeparse.timeparse import timeparse
|
|
|
|
|
|
def get_int(s):
    '''
    Return the first run of decimal digits found in ``s`` as an int.

    Raises IndexError when ``s`` contains no digit at all.
    '''
    digit_runs = re.findall(r'\d+', s)
    return int(digit_runs[0])
|
|
|
|
|
|
def DurationType(arg):
    '''
    argparse "type" callable: turn ``arg`` into a number of seconds.

    A string made only of decimal digits is taken as seconds; anything else
    is handed to ``timeparse``. ArgumentTypeError is raised when ``timeparse``
    cannot understand the value (i.e. it returns None).
    '''
    seconds = int(arg) if arg.isdecimal() else timeparse(arg)
    if seconds is None:
        raise ArgumentTypeError('%r is not a valid duration' % arg)
    return seconds
|
|
|
|
def TimeDeltaType(arg):
    '''
    argparse "type" callable: turn ``arg`` into a ``datetime.timedelta``.

    A string made only of decimal digits is taken as seconds; anything else
    is handed to ``timeparse``. ArgumentTypeError is raised when ``timeparse``
    cannot understand the value (i.e. it returns None).
    '''
    seconds = int(arg) if arg.isdecimal() else timeparse(arg)
    if seconds is None:
        raise ArgumentTypeError('%r is not a valid time range' % arg)
    return datetime.timedelta(seconds=seconds)
|
|
|
|
|
|
def weighted_choice(values, weights):
    '''
    random.choice with weights

    weights must be integers greater than 0.

    Only their proportions matter: [1,2,3] behaves like [2,4,6].
    '''
    assert len(values) == len(weights)
    # Running totals: cumulative[i] is the sum of weights[0..i].
    cumulative = []
    running = 0
    for weight in weights:
        running += weight
        cumulative.append(running)
    # Draw a point in [0, total) and find the bucket it falls into.
    threshold = random.random() * running
    return values[bisect(cumulative, threshold)]
|
|
|
|
|
|
def delta_humanreadable(tdelta):
    '''
    Format a timedelta as "<days>d<hours>h", or "<hours>h" when days is 0.

    None is rendered as an empty string. Minutes and seconds are dropped.
    '''
    if tdelta is None:
        return ''
    # timedelta keeps days separately, so .seconds is the sub-day remainder
    whole_hours = tdelta.seconds // 3600
    if not tdelta.days:
        return '{}h'.format(whole_hours)
    return '{}d{}h'.format(tdelta.days, whole_hours)
|
|
|
|
|
|
class Audio(object):
    '''
    A single audio resource identified by its URL.

    Attributes:
        url: location of the audio.
        duration: length as given by the caller, or probed via get_duration()
            when not supplied.
        date: timezone-aware datetime of the audio, or None when unknown.
        end_date: timezone-aware datetime after which the audio is no longer
            valid; defaults to the end of year 9999 (i.e. never expires).
    '''

    def __init__(self, url, duration=None, date=None):
        self.url = url
        if duration is None:
            # no duration supplied: probe the resource itself
            duration = get_duration(url.encode('utf-8'))
        self.duration = duration
        self.date = date
        self.end_date = datetime.datetime(9999, 12, 31,
                                          tzinfo=datetime.timezone.utc)

    def __str__(self):
        return self.url

    def __repr__(self):
        return '<Audio {} ({} {})>'.format(self.url, self.duration,
                                           delta_humanreadable(self.age))

    @staticmethod
    def _now():
        '''Current time as a timezone-aware UTC datetime.'''
        # datetime.utcnow() is deprecated; now(tz) gives the same aware value
        return datetime.datetime.now(datetime.timezone.utc)

    @property
    def urls(self):
        '''One-element list holding this audio's URL.'''
        return [self.url]

    @property
    def age(self):
        '''Time elapsed since ``date``, or None when the date is unknown.'''
        if self.date is None:
            return None
        return self._now() - self.date

    @property
    def valid(self):
        '''True while ``end_date`` has not been passed.'''
        return self.end_date >= self._now()
|
|
|
|
|
|
class AudioGroup(list):
    '''
    An ordered collection of audios that belong together.

    The members live in ``self.audios``; the inherited list storage is left
    empty, so every container operation used on a group is delegated to
    ``self.audios`` explicitly.

    Attributes:
        description: free-text label of the group ('' when not given).
        audios: list of the member audio objects.
    '''

    def __init__(self, description=None):
        super().__init__()
        self.description = description or ''
        self.audios = []

    def __len__(self):
        return len(self.audios)

    def __iter__(self):
        # without this, iterating a group would walk the (empty) base list
        return iter(self.audios)

    def __getitem__(self, index):
        return self.audios[index]

    def __contains__(self, item):
        return item in self.audios

    def append(self, arg):
        self.audios.append(arg)

    def __str__(self):
        return '\n'.join(str(a) for a in self.audios)

    def __repr__(self):
        return '<AudioGroup "{}" ({} {})\n{} >'.\
            format(self.description, self.duration,
                   delta_humanreadable(self.age),
                   '\n'.join('   ' + repr(a) for a in self.audios))

    @property
    def duration(self):
        '''Sum of the members' durations, ignoring unknown (None) ones.'''
        return sum(a.duration for a in self.audios if a.duration is not None)

    @property
    def urls(self):
        '''URLs of all members, in insertion order.'''
        return [a.url for a in self.audios]

    @property
    def date(self):
        '''Date of the first member that has one, or None.'''
        for a in self.audios:
            # skip members whose date is missing or unset
            found = getattr(a, 'date', None)
            if found is not None:
                return found
        return None

    @property
    def age(self):
        '''Time elapsed since ``date``, or None when no member is dated.'''
        when = self.date
        if when is None:
            return None
        # datetime.utcnow() is deprecated; now(tz) gives the same aware value
        return datetime.datetime.now(datetime.timezone.utc) - when

    @property
    def valid(self):
        '''A group is valid as long as it has at least one member.'''
        return len(self.audios) > 0
|
|
|
|
|
|
|
|
def get_tree(feed_url):
    '''
    Load a feed and return it parsed by lxml.html.

    ``feed_url`` is either an http(s) URL, fetched with requests, or a path
    to a local file. Note that the two cases return different lxml objects
    (``html.fromstring`` vs ``html.parse``); callers only rely on ``xpath``.

    Raises ValueError when a local path does not exist.
    '''
    if feed_url.startswith(('http:', 'https:')):
        return html.fromstring(requests.get(feed_url).content)
    if not os.path.exists(feed_url):
        raise ValueError("file not found: {}".format(feed_url))
    # parsing is complete when html.parse returns, so the file can be closed
    with open(feed_url) as feed_file:
        return html.parse(feed_file)
|
|
|
|
|
|
def get_audio_from_description(text):
    '''
    Build an Audio from the text of a feed item's description.

    The first non-empty line is the (percent-encoded) audio URL; every
    following line of the form ``key=value`` is collected as metadata.
    Recognized keys:

    - ``durata``: duration; its first integer is used
    - ``txdate``: date in ``%Y-%m-%dT%H:%M:%S%z`` format; dropped with a
      warning when it cannot be parsed
    - ``replica``: when it ends with ``g`` (presumably "giorni", days; TODO
      confirm) and ``txdate`` is known, the audio stops being valid that
      many days after ``txdate``

    Raises ValueError when the text contains no URL line.
    '''
    # non-empty lines, stripped
    lines = [line.strip()
             for line in (text or '').split('\n')
             if line.strip()]
    if not lines:
        raise ValueError('description does not contain any audio URL')
    url = lines[0]

    metadata = {}
    for line in lines[1:]:
        # partition keeps any further '=' inside the value
        key, sep, value = line.partition('=')
        if sep:
            metadata[key.strip()] = value.strip()

    if 'durata' in metadata:
        metadata['durata'] = get_int(metadata['durata'])
    if 'txdate' in metadata:
        try:
            metadata['txdate'] = datetime.datetime.strptime(
                metadata['txdate'], '%Y-%m-%dT%H:%M:%S%z')
        except ValueError:
            logging.warning('could not parse txdate %s', metadata['txdate'])
            del metadata['txdate']

    a = Audio(unquote(url), metadata.get('durata', None))

    if 'txdate' in metadata and 'replica' in metadata:
        if metadata['replica'].endswith('g'):
            a.end_date = metadata['txdate'] + datetime.timedelta(
                days=get_int(metadata['replica']))
    return a
|
|
|
|
|
|
# copied from larigira.fsutils
|
|
def scan_dir_audio(dirname, extensions=('mp3', 'oga', 'wav', 'ogg')):
    '''
    Walk ``dirname`` recursively and yield the path of every audio file.

    A file counts as audio when the text after its last dot (or its whole
    name, if it has no dot), lowercased, is one of ``extensions``.
    '''
    for folder, _subdirs, names in os.walk(dirname):
        for name in names:
            suffix = name.rsplit('.', 1)[-1].lower()
            if suffix in extensions:
                yield os.path.join(folder, name)
|
|
|
|
|
|
def get_audio_from_dir(dirpath):
    '''
    Return an Audio for every audio file found (recursively) in ``dirpath``.

    Each Audio gets a ``file://`` URL built from the file's real path, and
    its date is the file's modification time as an aware UTC datetime.
    '''
    return [
        Audio('file://' + os.path.realpath(fpath),
              # convert the timestamp straight to UTC: going through a naive
              # local datetime and tagging it as UTC would shift the date by
              # the local UTC offset
              date=datetime.datetime.fromtimestamp(
                  os.path.getmtime(fpath), tz=datetime.timezone.utc))
        for fpath in scan_dir_audio(dirpath)
    ]
|
|
|
|
|
|
def get_item_date(el):
    '''
    Return the date of a feed item, read from its ``pubdate`` child.

    The text must match ``%Y-%m-%dT%H:%M:%S%z``. None is returned when the
    item has no ``pubdate`` element.
    '''
    # TODO: pick from txdate=
    pub = el.find('pubdate')
    if pub is None:
        return None
    return datetime.datetime.strptime(pub.text, '%Y-%m-%dT%H:%M:%S%z')
|
|
|
|
|
|
def get_urls(tree):
    '''
    Yield an Audio for every ``item`` of the feed that has a description.

    Items without a description are skipped. Items whose description cannot
    be turned into an Audio are skipped too, after logging the failure: one
    broken item must not prevent the rest of the feed from being read.
    Each yielded Audio has its date set from the item.
    '''
    for it in tree.xpath('//item'):
        el_body = it.find('description')
        if el_body is None:
            continue
        # the title is only used for logging: tolerate its absence
        el_title = it.find('title')
        title = el_title.text if el_title is not None else None
        try:
            audio = get_audio_from_description(el_body.text)
        except Exception:
            # deliberate best-effort: any failure just skips this item
            logging.info('error getting duration for `%s`', title)
            logging.debug('details for `%s`', title, exc_info=True)
            continue
        audio.date = get_item_date(it)
        yield audio
|
|
|
|
|
|
def get_grouped_urls(tree):
    '''
    Collect the feed's audios into AudioGroups, one per item ``guid``.

    Returns an OrderedDict mapping guid -> AudioGroup, in order of first
    appearance. Only audios that are currently valid are added, so a group
    may end up empty. Items lacking a guid or a description, or whose
    description cannot be turned into an Audio, are skipped (and logged)
    rather than aborting the whole feed.
    '''
    groups = OrderedDict()
    for item in tree.xpath('//item'):
        guid_els = item.xpath('guid')
        guid = (guid_els[0].text or '').strip() if guid_els else ''
        if not guid:
            logging.info('skipping item without guid')
            continue
        if guid not in groups:
            groups[guid] = AudioGroup(guid)
        desc_els = item.xpath('description')
        if not desc_els:
            continue
        try:
            audio = get_audio_from_description(desc_els[0].text)
        except Exception:
            # deliberate best-effort: any failure just skips this item
            logging.info('error getting duration for `%s`', guid)
            logging.debug('details for `%s`', guid, exc_info=True)
            continue
        audio.date = get_item_date(item)
        if audio.valid:
            groups[guid].append(audio)
    return groups
|
|
|
|
|
|
def get_duration(url):
    '''
    Return the duration of the media at ``url``, truncated to an int.

    The value is read from the ``duration=`` line printed by
    ``ffprobe -show_entries format=duration``.

    Raises ValueError when ffprobe exits with an error or when its output
    has no ``duration=`` line.
    '''
    try:
        output = check_output(['ffprobe', '-v', 'error',
                               '-show_entries', 'format=duration',
                               '-i', url])
    except CalledProcessError as exc:
        raise ValueError('error probing `%s`' % url) from exc
    for line in output.split(b'\n'):
        if line.startswith(b'duration='):
            return int(float(line.split(b'=')[1]))
    # an explicit error instead of a StopIteration leaking to the caller
    raise ValueError('no duration reported for `%s`' % url)
|
|
|
|
|
|
HELP = '''
|
|
Collect audio informations from multiple sources (XML feeds).
|
|
Audios are (in that order):
|
|
1. Collected from feeds; (grouped by article if --group is used)
|
|
2. Filtered; everything that does not match with requirements is excluded
|
|
3. Sorted; even randomly
|
|
4. Sliced; take HOWMANY elements, skipping START elements
|
|
5. (if --copy) Copied
|
|
Usage: '''
|
|
|
|
|
|
def get_parser():
    '''
    Build and return the command-line ArgumentParser.

    Options are split into "sources", "filters" and "general" groups, plus
    the slicing options and the positional list of source URLs.
    '''
    parser = ArgumentParser(HELP)

    source_opts = parser.add_argument_group(
        'sources', 'How to deal with sources')
    source_opts.add_argument(
        '--source-weights',
        help='Select only one "source" based on this weights',
    )
    source_opts.add_argument(
        '--group',
        action='store_true',
        default=False,
        help='Group audios that belong to the same article',
    )

    filter_opts = parser.add_argument_group(
        'filters', 'Select only items that match these conditions')
    filter_opts.add_argument(
        '--min-len',
        type=DurationType,
        default=0,
        help='Exclude any audio that is shorter than MIN_LEN seconds',
    )
    filter_opts.add_argument(
        '--max-len',
        type=DurationType,
        default=0,
        help='Exclude any audio that is longer than MAX_LEN seconds',
    )
    filter_opts.add_argument(
        '--sort-by',
        type=str,
        default='no',
        choices=('random', 'date', 'duration'),
    )
    filter_opts.add_argument(
        '--reverse',
        action='store_true',
        default=False,
        help='Reverse list order',
    )
    filter_opts.add_argument(
        '--min-age',
        type=TimeDeltaType,
        default=datetime.timedelta(),
        help='Exclude audio more recent than MIN_AGE',
    )
    filter_opts.add_argument(
        '--max-age',
        type=TimeDeltaType,
        default=datetime.timedelta(),
        help='Exclude audio older than MAX_AGE',
    )

    # slicing options
    parser.add_argument(
        '--start',
        type=int,
        default=0,
        help='0-indexed start number. By default, play from most recent',
    )
    parser.add_argument(
        '--howmany',
        type=int,
        default=1,
        help='If not specified, only 1 will be played',
    )
    parser.add_argument(
        '--slotsize',
        type=int,
        help='Seconds between each audio. Still unsupported',
    )

    general_opts = parser.add_argument_group('general', 'General options')
    general_opts.add_argument(
        '--copy',
        action='store_true',
        default=False,
        help='Copy files to $TMPDIR',
    )
    general_opts.add_argument(
        '--debug',
        action='store_true',
        default=False,
        help='Debug messages',
    )

    parser.add_argument('urls', metavar='URL', nargs='+')
    return parser
|
|
|
|
|
|
def put(audio, copy=False):
    '''
    Print every URL of ``audio``, one per line.

    With ``copy`` set, http(s) URLs are first downloaded into $TMPDIR (or
    the current directory when unset) under a sanitized file name, and the
    ``file://`` URL of the downloaded copy is printed instead. Any other
    URL is printed unchanged.
    '''
    for url in audio.urls:
        scheme = url.split(':')[0]
        if not (copy and scheme in ('http', 'https')):
            # FIXME: file:// urls are just copied
            print(url)
            continue
        target_dir = os.environ.get('TMPDIR', '.')
        basename = posixpath.basename(urlparse(url).path)
        # sanitize
        safe_name = "".join(
            ch for ch in basename if ch.isalnum() or ch in '._-'
        ).rstrip()
        target = os.path.join(target_dir, safe_name)
        os.makedirs(target_dir, exist_ok=True)
        saved_path, _headers = urllib.request.urlretrieve(url, target)
        print('file://%s' % os.path.realpath(saved_path))
|
|
|
|
|
|
def _passes_filters(item, args):
    '''Tell whether an Audio/AudioGroup satisfies the validity, length and
    age filters given on the command line (0 disables a filter).'''
    if not item.valid:
        return False
    if args.max_len != 0 and item.duration > args.max_len:
        return False
    if args.min_len != 0 and item.duration < args.min_len:
        return False
    check_min_age = args.min_age.total_seconds() != 0
    check_max_age = args.max_age.total_seconds() != 0
    if check_min_age or check_max_age:
        age = item.age
        if age is None:
            # an undated item cannot satisfy an age requirement
            return False
        if check_min_age and age < args.min_age:
            return False
        if check_max_age and age > args.max_age:
            return False
    return True


def _age_sort_key(item):
    '''Sort key by age that places undated items last instead of failing.'''
    age = item.age
    return (age is None, age if age is not None else datetime.timedelta())


def main():
    '''
    Command-line entry point.

    Collects audios (or audio groups, with --group) from every source,
    filters, sorts and slices them, then prints (or copies, with --copy) the
    result. With --debug the repr of each selected item is printed instead.
    '''
    parser = get_parser()
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.WARNING)
    sources = args.urls

    if args.source_weights:
        weights = tuple(map(int, args.source_weights.split(':')))
        if len(weights) != len(sources):
            parser.exit(status=2, message='Weight must be in the'
                        ' same number as sources\n')
        sources = [weighted_choice(sources, weights)]

    audios = []
    for url in sources:
        if os.path.isdir(url):
            candidates = get_audio_from_dir(url)
            if args.group:
                # every file of a directory is a group on its own
                agroups = []
                for a in candidates:
                    ag = AudioGroup(os.path.basename(a.url))
                    ag.append(a)
                    agroups.append(ag)
                candidates = agroups
        elif url.startswith(('http:', 'https:')) or os.path.isfile(url):
            tree = get_tree(url)
            if args.group:
                candidates = get_grouped_urls(tree).values()
            else:
                candidates = get_urls(tree)
        else:
            logging.info('unsupported url `%s`', url)
            # nothing was collected from this source
            continue
        # the same filter applies to single audios and to groups
        audios += [c for c in candidates if _passes_filters(c, args)]

    # sort
    if args.sort_by == 'random':
        random.shuffle(audios)
    elif args.sort_by == 'date':
        audios.sort(key=_age_sort_key)
    elif args.sort_by == 'duration':
        audios.sort(key=lambda x: x.duration)

    if args.reverse:
        audios.reverse()

    # slice
    audios = audios[args.start:]
    audios = audios[:args.howmany]

    # the for loop excludes the last one
    # this is to support the --slotsize option
    if not audios:
        return
    for audio in audios[:-1]:
        if args.debug:
            print(repr(audio))
        else:
            put(audio, args.copy)
        if args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
                print('## musica per {} secondi'
                      .format(args.slotsize - duration))
    # finally, the last one
    if args.debug:
        print(repr(audios[-1]))
    else:
        put(audios[-1], args.copy)
|
|
# else: # grouping; TODO: support slotsize
|
|
# for item in groups:
|
|
# if args.debug:
|
|
# print('#', item, groups[item].duration)
|
|
# print(groups[item])
|
|
|
|
|
|
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == '__main__':
    main()
|