123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278 |
- #!/usr/bin/env python3
'''
Feed parser with many features.
Given a feed, it supports filtering, subslicing and random picking.
Besides feeds, it also supports picking files from directories.
'''
- import os
- import logging
- from argparse import ArgumentParser
- from subprocess import check_output
- from collections import OrderedDict
- import re
- import urllib.request
- from urllib.parse import urlparse, unquote
- import posixpath
- import random
- from bisect import bisect
- from lxml import html
- import requests
def weighted_choice(values, weights):
    '''
    random.choice with weights.

    `values` and `weights` are parallel sequences of the same length.
    Weights are expected to be positive integers and are relative to each
    other: [1, 2, 3] is the same as [2, 4, 6].

    Returns one element of `values`.
    Raises ValueError if the two sequences differ in length, are empty, or
    if the weights do not add up to a positive number.
    '''
    # Explicit checks instead of `assert`: assertions vanish under `-O`.
    if len(values) != len(weights):
        raise ValueError('values and weights must have the same length')
    if not values:
        raise ValueError('cannot choose from an empty sequence')

    total = 0
    cum_weights = []
    for w in weights:
        total += w
        cum_weights.append(total)
    if total <= 0:
        raise ValueError('weights must add up to a positive number')

    x = random.random() * total
    # bisect returns the first slot whose cumulative weight exceeds x; the
    # clamp protects against x rounding up to `total` in floating point.
    i = min(bisect(cum_weights, x), len(values) - 1)
    return values[i]
class Audio:
    '''
    A single audio track, identified by its URL.

    When no duration is given, it is obtained by calling get_duration()
    on the UTF-8 encoded URL.
    '''

    def __init__(self, url, duration=None):
        self.url = url
        # Only probe the track when the caller did not supply a duration.
        self.duration = (get_duration(url.encode('utf-8'))
                         if duration is None else duration)

    def __str__(self):
        return self.url

    def __repr__(self):
        return '<Audio {} ({})>'.format(self.url, self.duration)

    @property
    def urls(self):
        '''One-element list, mirroring the `urls` property of AudioGroup.'''
        return [self.url]
class AudioGroup(list):
    '''
    An ordered collection of audios sharing a description.

    Although this subclasses `list`, members are kept in `self.audios`:
    `append` and `len` operate on that attribute, not on the inherited
    list storage.
    '''

    def __init__(self, description=None):
        self.audios = []
        self.description = description if description else ''

    def __len__(self):
        return len(self.audios)

    def append(self, arg):
        self.audios.append(arg)

    def __str__(self):
        # One member per line, using each member's own str().
        return '\n'.join(map(str, self.audios))

    def __repr__(self):
        members = '\n'.join(' ' + repr(member) for member in self.audios)
        return '<AudioGroup "{}" ({})\n{} >'.format(
            self.description, self.duration, members)

    @property
    def duration(self):
        '''Sum of the members' durations; members without one are skipped.'''
        known = [member.duration for member in self.audios
                 if member.duration is not None]
        return sum(known)

    @property
    def urls(self):
        '''URLs of all members, in insertion order.'''
        return [member.url for member in self.audios]
def get_tree(feed_url):
    '''
    Load a feed and return it parsed by lxml.

    `feed_url` is either an http(s) URL, which is downloaded with
    `requests`, or the path of a local file.

    Returns whatever `html.fromstring` (remote) or `html.parse` (local)
    produce; callers in this file only call `.xpath()` on the result.
    Raises ValueError if a local path does not exist.
    '''
    if feed_url.startswith(('http:', 'https:')):
        return html.fromstring(requests.get(feed_url).content)

    if not os.path.exists(feed_url):
        raise ValueError("file not found: {}".format(feed_url))
    # Binary mode lets the parser deal with the document encoding itself;
    # the context manager closes the handle once parsing is done.
    with open(feed_url, 'rb') as feed_file:
        return html.parse(feed_file)
def get_audio_from_description(text):
    '''
    Build an Audio from the text of an item description.

    The first non-empty line is the (percent-encoded) audio URL. If a
    second non-empty line exists and has the form `<key>=<value>`, the
    first run of digits in the value is used as the duration.

    When the duration line is missing or cannot be parsed, the duration is
    left as None, so Audio determines it on its own.

    Raises ValueError if `text` is None or holds no non-empty line.
    '''
    # non-empty lines, stripped
    lines = [line.strip()
             for line in (text or '').split('\n')
             if line.strip()]
    if not lines:
        raise ValueError('empty description: no audio URL found')

    url = lines[0]
    duration = None
    if len(lines) > 1:
        parts = lines[1].split('=')
        if len(parts) > 1:
            digits = re.search(r'\d+', parts[1])
            if digits is not None:
                duration = int(digits.group())
    return Audio(unquote(url), duration)
# originally copied from larigira.fsutils
def scan_dir_audio(dirname, extensions=('mp3', 'oga', 'wav', 'ogg')):
    '''
    Walk `dirname` recursively and yield the path of every audio file.

    A file counts as audio when the part of its name after the last dot,
    compared case-insensitively, is one of `extensions` (given without the
    dot). Names containing no dot at all are skipped, so a file literally
    called `ogg` is not mistaken for an audio file.
    '''
    for root, _dirnames, filenames in os.walk(dirname):
        for fname in filenames:
            _stem, dot, ext = fname.rpartition('.')
            if dot and ext.lower() in extensions:
                yield os.path.join(root, fname)
def get_audio_from_dir(dirpath):
    '''
    Return a list with one Audio per audio file found under `dirpath`.

    Each file is referenced as `file://` followed by its real path. No
    duration is passed, so Audio works it out by itself.
    '''
    found = []
    for fpath in scan_dir_audio(dirpath):
        found.append(Audio('file://' + os.path.realpath(fpath)))
    return found
def get_urls(tree):
    '''
    Lazily yield an Audio for every `//item/description` node of `tree`.

    The text of each node is handed to get_audio_from_description().
    '''
    yield from (get_audio_from_description(node.text)
                for node in tree.xpath('//item/description'))
def get_grouped_urls(tree):
    '''
    Group the audios of a feed by the guid of their item.

    Returns an OrderedDict mapping each stripped guid to an AudioGroup, in
    order of first appearance. Items sharing a guid end up in one group.
    '''
    groups = OrderedDict()
    for item in tree.xpath('//item'):
        guid = item.xpath('guid')[0].text.strip()
        group = groups.get(guid)
        if group is None:
            # First time this guid shows up: open a new group for it.
            group = groups[guid] = AudioGroup(guid)
        description = item.xpath('description')[0].text
        group.append(get_audio_from_description(description))
    return groups
def get_duration(url):
    '''
    Ask `ffprobe` for the duration of `url`, in whole seconds.

    `url` is passed to ffprobe as-is (Audio hands over UTF-8 bytes).
    The fractional part of the reported duration is truncated.

    Raises ValueError if the output has no `duration=` line or its value
    is not a number; errors from running ffprobe propagate unchanged.
    '''
    output = check_output(['ffprobe', '-v', 'error',
                           '-show_entries', 'format=duration',
                           '-i', url])
    for line in output.splitlines():
        if line.startswith(b'duration='):
            value = line.split(b'=')[1]
            return int(float(value))
    # An explicit error instead of a StopIteration escaping from next(),
    # which would turn into RuntimeError inside a generator.
    raise ValueError('ffprobe reported no duration for {!r}'.format(url))
def get_parser():
    '''
    Build the command-line parser of the script.

    Options are split in three places: the "sources" group
    (--source-weights), the "filters" group (--max-len, --random) and the
    general options (--start, --howmany, --slotsize, --group, --copy,
    --debug). At least one positional URL is required.
    '''
    # The text must go in `description`: the first positional parameter of
    # ArgumentParser is `prog`, which would put it in the usage line.
    p = ArgumentParser(
        description='Get music from a (well-specified) xml feed')

    src = p.add_argument_group('sources', 'How to deal with sources')
    src.add_argument('--source-weights',
                     help='Select only one "source" based on this weights')

    filters = p.add_argument_group(
        'filters', 'Select only items that match these conditions')
    filters.add_argument('--max-len', default=0, type=int,
                         help='Exclude any audio that is longer '
                         'than MAXLEN seconds')
    filters.add_argument('--random', default=False,
                         action='store_true', help='Pick randomly')

    p.add_argument('--start', default=0, type=int,
                   help='0-indexed start number. '
                   'By default, play from most recent')
    p.add_argument('--howmany', default=1, type=int,
                   help='If not specified, only 1 will be played')
    p.add_argument('--slotsize', help='Seconds between each audio', type=int)
    p.add_argument('--group', help='Group articles', default=False,
                   action='store_true')
    p.add_argument('--copy', help='Copy files to $TMPDIR', default=False,
                   action='store_true')
    p.add_argument('--debug', help='Debug messages', default=False,
                   action='store_true')
    p.add_argument('urls', metavar='URL', nargs='+')
    return p
def put(audio, copy=False):
    '''
    Print, one per line, the URLs of `audio` (an Audio or an AudioGroup).

    With `copy` set, every http/https URL is first downloaded into $TMPDIR
    (current directory when unset) and the `file://` URL of the local copy
    is printed instead. Any other URL is printed untouched.
    '''
    for url in audio.urls:
        if not copy or url.split(':')[0] not in ('http', 'https'):
            # FIXME: with copy, non-http(s) URLs (e.g. file://) are only
            # echoed, not actually copied
            print(url)
            continue

        destdir = os.environ.get('TMPDIR', '.')
        remote_name = posixpath.basename(urlparse(url).path)
        # sanitize: keep alphanumerics plus dot, underscore and dash
        safe_name = ''.join(ch for ch in remote_name
                            if ch.isalnum() or ch in '._-').rstrip()
        dest = os.path.join(destdir, safe_name)
        os.makedirs(destdir, exist_ok=True)
        local_path, _headers = urllib.request.urlretrieve(url, dest)
        print('file://%s' % os.path.realpath(local_path))
def main():
    '''
    Command-line entry point.

    Steps: parse the arguments, optionally narrow the sources to a single
    weighted-random one, collect audios (or groups of audios) from every
    source, apply --max-len / --start / --random / --howmany, then print
    the selection.
    '''
    parser = get_parser()
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.WARNING)

    sources = args.urls
    if args.source_weights:
        weights = tuple(int(w) for w in args.source_weights.split(':'))
        if len(weights) != len(sources):
            parser.exit(status=2, message='Weight must be in the'
                        ' same number as sources\n')
        sources = [weighted_choice(sources, weights)]

    def short_enough(item):
        # --max-len 0 (the default) disables the length filter
        return args.max_len == 0 or item.duration <= args.max_len

    audios = []
    for url in sources:
        is_feed = (url.startswith('http:') or url.startswith('https:')
                   or os.path.isfile(url))
        if is_feed:
            # download (or read) the feed
            tree = get_tree(url)
            if args.group:
                candidates = get_grouped_urls(tree).values()
            else:
                candidates = get_urls(tree)
            # drop the entries that are too long
            audios.extend([c for c in candidates if short_enough(c)])
        elif os.path.isdir(url):
            for found in get_audio_from_dir(url):
                if not args.group:
                    audios.append(found)
                    continue
                # with --group every file gets a group of its own
                single = AudioGroup(os.path.basename(found.url))
                single.append(found)
                audios.append(single)
        else:
            logging.info('unsupported url `%s`', url)

    audios = audios[args.start:]
    if args.random:
        random.shuffle(audios)
    audios = audios[:args.howmany]

    # The filler line required by --slotsize goes *between* entries, so
    # it is never printed after the last one.
    last_index = len(audios) - 1
    for index, audio in enumerate(audios):
        if args.debug:
            print(repr(audio))
        else:
            put(audio, args.copy)
        if index == last_index:
            break
        if args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
                # runtime text is Italian for "music for N seconds"
                print('## musica per {} secondi'
                      .format(args.slotsize - duration))
    # TODO: give grouped output its own --slotsize handling
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == '__main__':
    main()
|