123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454 |
- #!/usr/bin/env python3
'''
Feed parser with many features.
Given a feed, it supports filtering, slicing and random picking.
Besides feeds, it can also pick audio files from directories.
'''
- import os
- import logging
- from argparse import ArgumentParser, ArgumentTypeError
- from subprocess import check_output, CalledProcessError
- from collections import OrderedDict
- import re
- import urllib.request
- from urllib.parse import urlparse, unquote
- import posixpath
- import random
- from bisect import bisect
- import datetime
- from lxml import html
- import requests
- from pytimeparse.timeparse import timeparse
def get_int(s):
    '''
    Return the first run of decimal digits found in `s`, as an int.

    Everything around the digits is ignored, so values such as "7g" can be
    passed as they are.

    Raises ValueError if `s` does not contain any digit.
    '''
    match = re.search(r'\d+', s)
    if match is None:
        # same exception type int() would raise on a malformed number
        raise ValueError('no integer found in %r' % (s,))
    return int(match.group())
def DurationType(arg):
    '''
    argparse `type=` converter for a duration.

    A string made only of decimal digits is read as a plain number;
    anything else is handed over to `timeparse`.

    Raises ArgumentTypeError when the parsed result is None.
    '''
    secs = int(arg) if arg.isdecimal() else timeparse(arg)
    if secs is not None:
        return secs
    raise ArgumentTypeError('%r is not a valid duration' % arg)
def TimeDeltaType(arg):
    '''
    argparse `type=` converter for a time range.

    A string made only of decimal digits is read as a number of seconds;
    anything else is handed over to `timeparse`. The result is wrapped in
    a datetime.timedelta.

    Raises ArgumentTypeError when the parsed result is None.
    '''
    secs = int(arg) if arg.isdecimal() else timeparse(arg)
    if secs is not None:
        return datetime.timedelta(seconds=secs)
    raise ArgumentTypeError('%r is not a valid time range' % arg)
def weighted_choice(values, weights):
    '''
    random.choice with weights

    Weights are "relative", that is [1,2,3] is the same as [2,4,6].
    They must not be negative and must add up to more than 0.

    Raises ValueError if `values` and `weights` differ in length, if a
    weight is negative, or if the total weight is not positive (which
    includes the empty case).
    '''
    # explicit checks instead of assert: assertions vanish under `python -O`
    if len(values) != len(weights):
        raise ValueError('values and weights must have the same length')
    total = 0
    cum_weights = []
    for w in weights:
        if w < 0:
            raise ValueError('weights must not be negative')
        total += w
        cum_weights.append(total)
    if total <= 0:
        raise ValueError('total of weights must be greater than zero')
    x = random.random() * total
    # The search is capped at the last index, so that rounding in the
    # multiplication above can never select a position past the end.
    i = bisect(cum_weights, x, 0, len(cum_weights) - 1)
    return values[i]
def delta_humanreadable(tdelta):
    '''
    Render a timedelta compactly: "<days>d<hours>h", or just "<hours>h"
    when the days component is 0. Anything below the hour is dropped.
    None is rendered as the empty string.
    '''
    if tdelta is None:
        return ''
    # timedelta keeps `seconds` within one day, so this is exactly what is
    # left of the delta once the whole days are taken out
    hours = tdelta.seconds // 3600
    if not tdelta.days:
        return '{}h'.format(hours)
    return '{}d{}h'.format(tdelta.days, hours)
class Audio:
    '''
    A single audio, identified by its URL.

    Attributes:
    url      -- location of the audio
    duration -- length of the audio; when not given to the constructor it
                is probed by calling get_duration() on the UTF-8 encoded URL
    date     -- datetime the audio refers to, or None if unknown. It must
                be timezone-aware, because `age` subtracts it from an
                aware "now"
    end_date -- moment after which the audio is no longer `valid`;
                set to the end of year 9999 (UTC) by the constructor
    '''

    def __init__(self, url, duration=None, date=None):
        self.url = url
        if duration is None:
            duration = get_duration(url.encode('utf-8'))
        self.duration = duration
        self.date = date
        self.end_date = datetime.datetime(9999, 12, 31,
                                          tzinfo=datetime.timezone.utc)

    def __str__(self):
        return self.url

    def __repr__(self):
        return '<Audio {} ({} {})>'.format(self.url, self.duration,
                                           delta_humanreadable(self.age))

    @property
    def urls(self):
        '''List holding the only URL of this audio.'''
        return [self.url]

    @property
    def age(self):
        '''Time elapsed since `date` (a timedelta), or None without a date.'''
        if self.date is None:
            return None
        # now(timezone.utc) is the non-deprecated way to get an aware
        # "now"; utcnow() is deprecated since Python 3.12
        return datetime.datetime.now(datetime.timezone.utc) - self.date

    @property
    def valid(self):
        '''True as long as `end_date` has not passed yet.'''
        return self.end_date >= datetime.datetime.now(datetime.timezone.utc)
class AudioGroup(list):
    '''
    An ordered bunch of audios handled as a single item.

    It offers the same duration/urls/date/age/valid properties as Audio,
    so that the two can be filtered, sorted and printed by the same code.

    NOTE: although this is a list subclass, the members live in
    `self.audios`; the inherited list storage is never filled. len() and
    iteration are redirected to `self.audios`.
    '''

    def __init__(self, description=None):
        super().__init__()
        self.description = description or ''
        self.audios = []

    def __len__(self):
        return len(self.audios)

    def __iter__(self):
        # keep iteration consistent with __len__ and append
        return iter(self.audios)

    def append(self, arg):
        self.audios.append(arg)

    def __str__(self):
        return '\n'.join(str(a) for a in self.audios)

    def __repr__(self):
        return '<AudioGroup "{}" ({} {})\n{} >'.\
            format(self.description, self.duration,
                   delta_humanreadable(self.age),
                   '\n'.join(' ' + repr(a) for a in self.audios))

    @property
    def duration(self):
        '''Sum of the members' durations; unknown (None) ones are skipped.'''
        return sum(a.duration for a in self.audios if a.duration is not None)

    @property
    def urls(self):
        '''URLs of the members, in insertion order.'''
        return [a.url for a in self.audios]

    @property
    def date(self):
        '''Date of the first member that has one, or None.'''
        for a in self.audios:
            # a member can carry `date = None`: move on to the next one
            # instead of reporting the whole group as undated
            a_date = getattr(a, 'date', None)
            if a_date is not None:
                return a_date
        return None

    @property
    def age(self):
        '''Time elapsed since `date` (a timedelta), or None without a date.'''
        if self.date is None:
            return None
        # now(timezone.utc) is the non-deprecated way to get an aware
        # "now"; utcnow() is deprecated since Python 3.12
        return datetime.datetime.now(datetime.timezone.utc) - self.date

    @property
    def valid(self):
        '''A group is valid as soon as it has at least one member.'''
        return len(self.audios) > 0
def get_tree(feed_url, timeout=30):
    '''
    Fetch a feed and parse it with lxml.

    feed_url -- an http(s) URL, or the path of a local file
    timeout  -- seconds to wait for the server before giving up
                (only used for http(s) URLs)

    Remote feeds go through html.fromstring, local files through
    html.parse, so the type of the returned object differs between the
    two; callers in this file only call .xpath() on it.

    Raises ValueError if feed_url is a local path that does not exist.
    '''
    if feed_url.startswith(('http:', 'https:')):
        # without a timeout a stalled server would hang the program forever
        response = requests.get(feed_url, timeout=timeout)
        return html.fromstring(response.content)
    if not os.path.exists(feed_url):
        raise ValueError("file not found: {}".format(feed_url))
    # parsing is complete when html.parse returns, so the file can be
    # closed right away instead of being left to the garbage collector
    with open(feed_url) as feed_file:
        return html.parse(feed_file)
def get_audio_from_description(text):
    '''
    Build an Audio out of the text of an item description.

    The first non-empty line is the URL of the audio (percent-encoded; it
    is unquoted here). Each of the following lines can hold a `key=value`
    pair; lines without "=" are ignored. Known keys:

    durata  -- duration of the audio; the first integer in the value is used
    txdate  -- date of the audio, as %Y-%m-%dT%H:%M:%S%z; if it cannot be
               parsed a warning is logged and the audio gets no date
    replica -- when it ends with "g" (and txdate is usable), the audio
               expires that many days after txdate

    Without `durata`, Audio probes the duration by itself.

    Raises ValueError if `text` has no non-empty line.
    '''
    # non-empty lines, stripped
    lines = [line.strip()
             for line in (text or '').split('\n')
             if line.strip()]
    if not lines:
        raise ValueError('empty description')
    url = lines[0]

    # Metadata is read from the lines *after the URL*: slicing the raw
    # text instead would parse the URL itself as key=value whenever the
    # description starts with a blank line and the URL contains "=".
    metadata = {}
    for line in lines[1:]:
        key, sep, value = line.partition('=')
        if sep:
            metadata[key.strip()] = value.strip()

    duration = None
    if 'durata' in metadata:
        duration = get_int(metadata['durata'])

    date = None
    if 'txdate' in metadata:
        try:
            date = datetime.datetime.strptime(
                metadata['txdate'], '%Y-%m-%dT%H:%M:%S%z')
        except ValueError:
            logging.warning('could not parse txdate %s', metadata['txdate'])

    audio = Audio(unquote(url), duration=duration, date=date)

    replica = metadata.get('replica', '')
    if date is not None and replica.endswith('g'):
        audio.end_date = date + datetime.timedelta(days=get_int(replica))
    return audio
- # copied from larigira.fsutils
def scan_dir_audio(dirname, extensions=('mp3', 'oga', 'wav', 'ogg')):
    '''
    Walk `dirname` recursively and yield the path of every file whose
    name, lowercased, ends with one of `extensions` after its last dot.

    A name without any dot is compared as a whole, so a file called just
    "mp3" is yielded too.
    '''
    for folder, _subdirs, names in os.walk(dirname):
        yield from (os.path.join(folder, name)
                    for name in names
                    if name.rsplit('.', 1)[-1].lower() in extensions)
def get_audio_from_dir(dirpath):
    '''
    Return a list with one Audio for each audio file found by
    scan_dir_audio() under `dirpath`.

    Each Audio gets a file:// URL built on the real path of the file, and
    the modification time of the file (timezone-aware, UTC) as its date.
    No duration is passed, so Audio probes it.
    '''
    audios = []
    for fpath in scan_dir_audio(dirpath):
        # A timestamp is an absolute instant: convert it to UTC directly.
        # Building a naive *local* datetime and then labelling it as UTC
        # shifts every date by the local UTC offset.
        mtime = datetime.datetime.fromtimestamp(os.path.getmtime(fpath),
                                                tz=datetime.timezone.utc)
        audios.append(Audio('file://' + os.path.realpath(fpath), date=mtime))
    return audios
def get_item_date(el):
    '''
    Return the date found in the `pubdate` child of `el`, or None.

    The date must look like %Y-%m-%dT%H:%M:%S%z; whitespace around it is
    ignored. None is returned when the child is missing or empty. A date
    that cannot be parsed is logged as a warning and treated as missing,
    which is how get_audio_from_description deals with a bad txdate.
    '''
    el_date = el.find('pubdate')
    if el_date is None:
        return None
    raw = (el_date.text or '').strip()
    if not raw:
        return None
    try:
        return datetime.datetime.strptime(raw, '%Y-%m-%dT%H:%M:%S%z')
    except ValueError:
        logging.warning('could not parse pubdate %s', raw)
        return None
def get_urls(tree):
    '''
    Yield an Audio for each `item` of `tree` whose description can be
    turned into one.

    Items without a description are skipped silently; items whose
    description cannot be processed are skipped with a log message. When
    the description provides no date, the date of the item is used.
    '''
    for item in tree.xpath('//item'):
        el_body = item.find('description')
        if el_body is None:
            continue
        try:
            audio = get_audio_from_description(el_body.text)
        except Exception as exc:
            # Deliberately broad: one broken item must not stop the whole
            # feed. The title is looked up only here, and defensively, so
            # that an item without <title> cannot crash the loop.
            el_title = item.find('title')
            title = el_title.text if el_title is not None else None
            logging.info('error getting duration for `%s`: %s', title, exc)
            continue
        if audio.date is None:
            audio.date = get_item_date(item)
        yield audio
def get_grouped_urls(tree):
    '''
    Collect the audios of `tree` into AudioGroups, one per item guid.

    Returns an OrderedDict mapping each guid (stripped) to its AudioGroup,
    in order of first appearance. Only audios that are `valid` are added,
    so a group can end up empty.

    Like in get_urls, an item that cannot be processed (no guid, no
    description, unusable description) is skipped with a log message
    instead of aborting the whole feed.
    '''
    groups = OrderedDict()
    for item in tree.xpath('//item'):
        el_guids = item.xpath('guid')
        el_bodies = item.xpath('description')
        if not el_guids or not el_bodies or el_guids[0].text is None:
            logging.info('skipping item without guid or description')
            continue
        guid = el_guids[0].text.strip()
        if guid not in groups:
            groups[guid] = AudioGroup(guid)
        try:
            audio = get_audio_from_description(el_bodies[0].text)
        except Exception as exc:
            # deliberately broad, as in get_urls: best effort per item
            logging.info('error getting audio for `%s`: %s', guid, exc)
            continue
        # NOTE(review): unlike get_urls, the item date always replaces the
        # one coming from the description; kept as it was, confirm intent
        audio.date = get_item_date(item)
        if audio.valid:
            groups[guid].append(audio)
    return groups
def get_duration(url):
    '''
    Return the duration of the audio at `url` as an int (the value
    reported by ffprobe, truncated).

    url -- str or bytes; it is handed to ffprobe as it is

    Raises ValueError if ffprobe cannot be run, if it exits with an
    error, or if its output holds no usable `duration=` line.
    '''
    cmd = ['ffprobe', '-v', 'error',
           '-show_entries', 'format=duration',
           '-i', url]
    try:
        output = check_output(cmd)
    except (CalledProcessError, OSError) as exc:
        # OSError covers a missing or non-executable ffprobe binary
        raise ValueError('error probing `%s`' % (url,)) from exc
    for line in output.split(b'\n'):
        if not line.startswith(b'duration='):
            continue
        value = line.split(b'=')[1]
        try:
            return int(float(value))
        except (ValueError, OverflowError) as exc:
            raise ValueError('invalid duration %r for `%s`'
                             % (value, url)) from exc
    # an explicit error instead of a StopIteration leaking out of next()
    raise ValueError('no duration reported for `%s`' % (url,))
# Overview of the processing pipeline, for the command-line help.
# get_parser() passes this text to ArgumentParser as its first positional
# argument.
# NOTE(review): that slot is presumably argparse's `prog`, which would
# explain the trailing "Usage: " -- confirm before touching either side.
HELP = '''
Collect audio informations from multiple sources (XML feeds).
Audios are (in that order):
1. Collected from feeds; (grouped by article if --group is used)
2. Filtered; everything that does not match with requirements is excluded
3. Sorted; even randomly
4. Sliced; take HOWMANY elements, skipping START elements
5. (if --copy) Copied
Usage: '''
def get_parser():
    '''
    Build and return the ArgumentParser of the command line.

    Options are split in three groups (sources, filters, general) plus
    the slicing options (--start, --howmany, --slotsize) and the
    positional list of URLs.
    '''
    parser = ArgumentParser(HELP)

    sources = parser.add_argument_group('sources', 'How to deal with sources')
    sources.add_argument(
        '--source-weights',
        help='Select only one "source" based on this weights')
    sources.add_argument(
        '--group', action='store_true', default=False,
        help='Group audios that belong to the same article')

    filters = parser.add_argument_group(
        'filters', 'Select only items that match these conditions')
    for flag, adjective, placeholder in (('--min-len', 'shorter', 'MIN_LEN'),
                                         ('--max-len', 'longer', 'MAX_LEN')):
        filters.add_argument(
            flag, type=DurationType, default=0,
            help='Exclude any audio that is {} than {} seconds'.format(
                adjective, placeholder))
    filters.add_argument(
        '--sort-by', type=str, default='no',
        choices=('random', 'date', 'duration'))
    filters.add_argument(
        '--reverse', action='store_true', default=False,
        help='Reverse list order')
    for flag, adjective, placeholder in (('--min-age', 'more recent', 'MIN_AGE'),
                                         ('--max-age', 'older', 'MAX_AGE')):
        filters.add_argument(
            flag, type=TimeDeltaType, default=datetime.timedelta(),
            help='Exclude audio {} than {}'.format(adjective, placeholder))

    # slicing options: not part of any named group
    parser.add_argument(
        '--start', type=int, default=0,
        help='0-indexed start number. By default, play from most recent')
    parser.add_argument(
        '--howmany', type=int, default=1,
        help='If not specified, only 1 will be played')
    parser.add_argument(
        '--slotsize', type=int,
        help='Seconds between each audio. Still unsupported')

    general = parser.add_argument_group('general', 'General options')
    general.add_argument(
        '--copy', action='store_true', default=False,
        help='Copy files to $TMPDIR')
    general.add_argument(
        '--debug', action='store_true', default=False,
        help='Debug messages')

    parser.add_argument('urls', metavar='URL', nargs='+')
    return parser
def put(audio, copy=False):
    '''
    Print the URLs of `audio` (anything with a `urls` attribute), one
    per line.

    With copy=True, http(s) URLs are first downloaded into $TMPDIR (the
    current directory when unset) and the file:// URL of the local copy
    is printed in their place. Any other URL is printed unchanged.
    '''
    for url in audio.urls:
        if not copy or url.split(':')[0] not in ('http', 'https'):
            # FIXME: file:// urls are not copied, just printed
            print(url)
            continue
        destdir = os.environ.get('TMPDIR', '.')
        fname = posixpath.basename(urlparse(url).path)
        # sanitize
        fname = ''.join(c for c in fname if c.isalnum() or c in '._-')
        if not fname.strip('.'):
            # Nothing usable is left (the URL path ends with "/", or the
            # name is made of dots only): joining it to destdir would
            # point at a directory, so fall back on a fixed name.
            fname = 'audio'
        dest = os.path.join(destdir, fname)
        os.makedirs(destdir, exist_ok=True)
        local_path, _headers = urllib.request.urlretrieve(url, dest)
        print('file://%s' % os.path.realpath(local_path))
def _matches_filters(item, args):
    '''Tell whether an Audio or AudioGroup passes validity, length and age filters.'''
    if not item.valid:
        return False
    # for every filter, 0 means "filter not requested"
    if args.max_len != 0 and item.duration > args.max_len:
        return False
    if args.min_len != 0 and item.duration < args.min_len:
        return False
    check_min_age = args.min_age.total_seconds() != 0
    check_max_age = args.max_age.total_seconds() != 0
    if check_min_age or check_max_age:
        age = item.age
        if age is None:
            # unknown date: it cannot be shown to match an age requirement
            return False
        if check_min_age and age < args.min_age:
            return False
        if check_max_age and age > args.max_age:
            return False
    return True


def _collect(url, group):
    '''Return the Audios (AudioGroups if `group`) found at one source; [] if unsupported.'''
    if os.path.isdir(url):
        audios = get_audio_from_dir(url)
        if not group:
            return audios
        # in a directory, every file is a group of its own
        agroups = []
        for audio in audios:
            agroup = AudioGroup(os.path.basename(audio.url))
            agroup.append(audio)
            agroups.append(agroup)
        return agroups
    if url.startswith(('http:', 'https:')) or os.path.isfile(url):
        tree = get_tree(url)
        if group:
            return list(get_grouped_urls(tree).values())
        return list(get_urls(tree))
    logging.info('unsupported url `%s`', url)
    return []


def main():
    '''
    Command-line entry point: collect, filter, sort, slice and print.

    Audios (or groups of audios, with --group) are collected from every
    source, or from a single source picked at random when
    --source-weights is given. They are then filtered, sorted, sliced
    with --start/--howmany and printed through put(), or as repr() with
    --debug. With --slotsize, a "## musica per N secondi" line follows
    every item but the last that is shorter than the slot.
    '''
    parser = get_parser()
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.WARNING)

    sources = args.urls
    if args.source_weights:
        try:
            weights = tuple(map(int, args.source_weights.split(':')))
        except ValueError:
            parser.exit(status=2, message='Weights must be integers'
                        ' separated by ":"\n')
        if len(weights) != len(sources):
            parser.exit(status=2, message='Weight must be in the'
                        ' same number as sources\n')
        sources = [weighted_choice(sources, weights)]

    # One shared filter for both modes: the two hand-written copies had
    # drifted apart (the grouped one checked min_len against max_len).
    audios = []
    for url in sources:
        audios += [item for item in _collect(url, args.group)
                   if _matches_filters(item, args)]

    # sort
    if args.sort_by == 'random':
        random.shuffle(audios)
    elif args.sort_by == 'date':
        # items with unknown age go after the dated ones instead of
        # breaking the comparison
        audios.sort(key=lambda x: (x.age is None,
                                   x.age or datetime.timedelta()))
    elif args.sort_by == 'duration':
        audios.sort(key=lambda x: x.duration)
    if args.reverse:
        audios.reverse()

    # slice
    audios = audios[args.start:]
    audios = audios[:args.howmany]
    if not audios:
        return

    # the for loop excludes the last one
    # this is to support the --slotsize option
    for audio in audios[:-1]:
        if args.debug:
            print(repr(audio))
        else:
            put(audio, args.copy)
        if args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
                print('## musica per {} secondi'
                      .format(args.slotsize - duration))

    # finally, the last one
    if args.debug:
        print(repr(audios[-1]))
    else:
        put(audios[-1], args.copy)
    # TODO: dedicated --slotsize support when grouping
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
|