larigira-scripts/feed
2017-05-26 12:57:15 +02:00

182 lines
5.7 KiB
Python
Executable file

#!/usr/bin/env python3
# pip install lxml requests
import os
from argparse import ArgumentParser
from subprocess import check_output
from collections import OrderedDict
import re
import urllib.request
from urllib.parse import urlparse, unquote
import posixpath
from lxml import html
import requests
class Audio:
    """A single audio file: its URL plus its duration in seconds."""

    def __init__(self, url, durata=None):
        """Remember ``url`` and ``durata``.

        When ``durata`` is not supplied, it is measured by calling
        ``get_duration`` on the UTF-8 encoded URL.
        """
        self.url = url
        self.durata = (get_duration(url.encode('utf-8'))
                       if durata is None
                       else durata)

    def __str__(self):
        """Return the bare URL."""
        return self.url

    def __repr__(self):
        """Return a debug representation showing URL and duration."""
        template = '<Audio {} ({})>'
        return template.format(self.url, self.durata)
class AudioGroup(list):
    """An ordered group of audios sharing one description.

    The group itself is the list of its members, so ``len()``,
    iteration, indexing, ``in`` and ``append()`` all look at the same
    storage. The ``audios`` attribute is kept as an alias of the group
    for code that accesses members through it.
    """

    def __init__(self, description=None):
        """Create an empty group labelled ``description`` ('' if omitted)."""
        super().__init__()
        self.description = description or ''

    @property
    def audios(self):
        """The members of the group (the group object itself)."""
        return self

    @audios.setter
    def audios(self, value):
        # Replace the contents in place so that assignment keeps working.
        self[:] = value

    def __str__(self):
        """Return the members' string forms, one per line."""
        return '\n'.join(str(a) for a in self)

    def __repr__(self):
        """Return a debug representation: description, total duration,
        then the repr of every member on its own line."""
        return '<AudioGroup "{}" ({})\n{}>'.format(
            self.description,
            self.durata,
            '\n'.join(repr(a) for a in self))

    @property
    def durata(self):
        """Sum of the members' durations; members whose ``durata`` is
        None are skipped."""
        return sum(a.durata for a in self if a.durata is not None)
def get_tree(feed_url):
    """Load the feed at ``feed_url`` and return it parsed by ``lxml.html``.

    ``feed_url`` is fetched with ``requests`` when it starts with
    ``http:`` or ``https:``; otherwise it is treated as a local path.

    Raises:
        ValueError: if ``feed_url`` is a local path that does not exist.
    """
    if feed_url.startswith(('http:', 'https:')):
        return html.fromstring(requests.get(feed_url).content)
    if not os.path.exists(feed_url):
        raise ValueError("file not found: {}".format(feed_url))
    # Parse inside the ``with`` so the file handle is closed afterwards
    # instead of being leaked.
    with open(feed_url) as feed_file:
        return html.parse(feed_file)
def get_audio_from_description(text):
    """Build an ``Audio`` from the text of an item description.

    The first non-empty line is the (possibly percent-encoded) URL.
    An optional second non-empty line of the form ``key=<number>...``
    carries the duration: the first run of digits after the ``=`` is
    used. If that line is missing or cannot be parsed, the duration is
    left as None, so ``Audio`` measures it itself.

    Raises:
        ValueError: if ``text`` is None or contains no non-empty line.
    """
    # non-empty lines
    lines = [line.strip()
             for line in (text or '').split('\n')
             if line.strip()]
    if not lines:
        raise ValueError('empty item description: no audio URL found')
    url = lines[0]
    durata = None
    if len(lines) > 1:
        fields = lines[1].split('=')
        if len(fields) > 1:
            digits = re.search(r'\d+', fields[1])
            if digits is not None:
                durata = int(digits.group())
    return Audio(unquote(url), durata)
def get_urls(tree):
    """Yield one audio for every ``//item/description`` element of ``tree``.

    Each audio is built by ``get_audio_from_description`` from the
    element's text.
    """
    descriptions = tree.xpath('//item/description')
    yield from (get_audio_from_description(node.text)
                for node in descriptions)
def get_grouped_urls(tree):
    """Collect the audios of ``tree`` into groups keyed by item guid.

    Returns an ``OrderedDict`` mapping each stripped ``guid`` text to an
    ``AudioGroup`` (described by that guid) holding the audios of all
    ``//item`` elements with that guid, in document order.
    """
    by_guid = OrderedDict()
    for entry in tree.xpath('//item'):
        key = entry.xpath('guid')[0].text.strip()
        group = by_guid.get(key)
        if group is None:
            group = by_guid[key] = AudioGroup(key)
        description = entry.xpath('description')[0].text
        group.append(get_audio_from_description(description))
    return by_guid
def get_duration(url):
    """Return the duration of the media at ``url`` in whole seconds.

    Runs ``ffprobe`` asking for the ``format=duration`` entry and
    truncates the value printed on the ``duration=`` line to an int.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits with an error.
        ValueError: if ffprobe prints no ``duration=`` line, or its value
            is not a number.
    """
    output = check_output(['ffprobe', '-v', 'error',
                           '-show_entries', 'format=duration',
                           '-i', url])
    for line in output.split(b'\n'):
        if not line.startswith(b'duration='):
            continue
        value = line.split(b'=')[1]
        try:
            return int(float(value))
        except ValueError:
            raise ValueError('ffprobe reported an unusable duration {!r} '
                             'for {!r}'.format(value, url)) from None
    # Raise explicitly instead of letting a StopIteration escape: this
    # function is reached from inside generators, where a stray
    # StopIteration ends the iteration early or becomes a RuntimeError.
    raise ValueError('ffprobe reported no duration for {!r}'.format(url))
def get_parser():
    """Build the command-line parser for this script.

    Returns an ``ArgumentParser`` with the positional ``url`` and the
    options ``--start``, ``--max-len``, ``--howmany``, ``--slotsize``,
    ``--group``, ``--copy`` and ``--debug``.
    """
    # The text is the description; passed positionally it would be taken
    # as the program name and shown in the usage line.
    p = ArgumentParser(
        description='Get music from a (well-specified) xml feed')
    p.add_argument('--start', default=0, type=int,
                   help='0-indexed start number. '
                   'By default, play from most recent')
    p.add_argument('--max-len', default=0, type=int,
                   help='Exclude any audio that is longer '
                   'than MAX_LEN seconds')
    p.add_argument('--howmany', default=1, type=int,
                   help='If not specified, only 1 will be played')
    p.add_argument('--slotsize', help='Seconds between each audio', type=int)
    p.add_argument('--group', help='Group articles', default=False,
                   action='store_true')
    p.add_argument('--copy', help='Copy files to $TMPDIR', default=False,
                   action='store_true')
    p.add_argument('--debug', help='Debug messages', default=False,
                   action='store_true')
    p.add_argument('url')
    return p
def put(audio, copy=False):
    """Print where ``audio`` can be played from.

    ``audio`` is either a single audio (an object with a ``url``) or a
    group (an object with an ``audios`` sequence); for a group every
    member is emitted in order.

    With ``copy`` false the URL is printed as is. With ``copy`` true the
    file is downloaded into ``$TMPDIR`` (current directory if unset) and
    a ``file://`` URL of the local copy is printed instead.
    """
    members = getattr(audio, 'audios', None)
    if members is not None:
        # A group has no ``url`` of its own: emit each member instead.
        for member in members:
            put(member, copy)
        return
    if not copy:
        print(audio.url)
        return
    destdir = os.environ.get('TMPDIR', '.')
    fname = posixpath.basename(urlparse(audio.url).path)
    # sanitize: keep only characters that are safe in a file name
    fname = ''.join(c for c in fname if c.isalnum() or c in '._-')
    if not fname.strip('.'):
        # Nothing usable left (e.g. the URL path ends with '/'); without
        # a name the destination would be a directory.
        fname = 'audio'
    dest = os.path.join(destdir, fname)
    os.makedirs(destdir, exist_ok=True)
    fname, headers = urllib.request.urlretrieve(audio.url, dest)
    print('file://%s' % os.path.realpath(fname))
def main():
    """Command-line entry point.

    Parses the arguments, loads the feed, keeps the audios (or, with
    ``--group``, the audio groups) that pass the ``--max-len`` filter,
    takes ``--howmany`` of them starting at ``--start`` and emits each
    one: its repr with ``--debug``, otherwise through ``put``. With
    ``--slotsize``, every emitted item except the last is followed by a
    filler line when it is shorter than the slot.
    """
    args = get_parser().parse_args()
    # download the feed
    tree = get_tree(args.url)

    def short_enough(item):
        # a --max-len of 0 disables the length filter
        return args.max_len == 0 or item.durata <= args.max_len

    window = slice(args.start, args.start + args.howmany)
    if args.group:
        grouped = get_grouped_urls(tree)
        audios = [group for group in grouped.values()
                  if short_enough(group)][window]
    else:
        audios = [audio for audio in get_urls(tree)
                  if short_enough(audio)][window]

    if not audios:
        return

    final_index = len(audios) - 1
    for index, audio in enumerate(audios):
        if args.debug:
            print(repr(audio))
        else:
            put(audio, args.copy)
        if index == final_index:
            # no filler after the last item: this is what makes
            # --slotsize describe the time *between* audios
            break
        if args.slotsize is not None:
            duration = audio.durata
            if duration < args.slotsize:
                print('## musica per {} secondi'
                      .format(args.slotsize - duration))
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == '__main__':
    main()