larigira-scripts/feed

402 lines
13 KiB
Text
Raw Normal View History

#!/usr/bin/env python3
'''
Feed parser with many features.
Given a feed, it supports filtering, subslicing and random picking.
Besides feeds, it also supports picking files from directories.
'''
import datetime
import itertools
import logging
import os
import posixpath
import random
import re
import urllib.request
from argparse import ArgumentParser, ArgumentTypeError
from bisect import bisect
from collections import OrderedDict
from subprocess import check_output
from urllib.parse import urlparse, unquote

import requests
from lxml import html
from pytimeparse.timeparse import timeparse
def DurationType(arg):
    '''
    argparse "type" callable that turns a command-line string into a duration.

    A string made only of decimal digits is converted with int(); any
    other string is handed to timeparse().

    Raises ArgumentTypeError when timeparse() returns None.
    '''
    if arg.isdecimal():
        return int(arg)
    parsed = timeparse(arg)
    if parsed is None:
        raise ArgumentTypeError('%r is not a valid duration' % arg)
    return parsed
def TimeDeltaType(arg):
    '''
    argparse "type" callable that turns a command-line string into a
    datetime.timedelta.

    A string made only of decimal digits is taken as a number of seconds;
    any other string is handed to timeparse().

    Raises ArgumentTypeError when timeparse() returns None.
    '''
    if arg.isdecimal():
        seconds = int(arg)
        return datetime.timedelta(seconds=seconds)
    seconds = timeparse(arg)
    if seconds is None:
        raise ArgumentTypeError('%r is not a valid time range' % arg)
    return datetime.timedelta(seconds=seconds)
def weighted_choice(values, weights):
    '''
    random.choice with weights

    weights are expected to be integers greater than 0; a weight of 0 is
    tolerated and means that the matching value is never picked.
    Their meaning is "relative", that is [1,2,3] is the same as [2,4,6]

    Raises ValueError if values and weights differ in length, if any weight
    is negative, or if the weights do not add up to a positive number
    (which includes the case of empty input).
    '''
    # Explicit checks instead of `assert`: asserts vanish under `python -O`.
    if len(values) != len(weights):
        raise ValueError('values and weights must have the same length')
    if any(w < 0 for w in weights):
        raise ValueError('weights must not be negative')
    cum_weights = list(itertools.accumulate(weights))
    if not cum_weights or cum_weights[-1] <= 0:
        raise ValueError('weights must add up to a positive number')
    total = cum_weights[-1]
    # x falls in [0, total); bisect finds the first cumulative weight
    # strictly greater than x, i.e. the bucket x belongs to.
    x = random.random() * total
    return values[bisect(cum_weights, x)]
def delta_humanreadable(tdelta):
    '''
    Render a timedelta as a short string made of days and hours.

    Returns '' for None, 'Nh' when the day count is zero and 'NdMh'
    otherwise. Minutes and seconds are dropped.
    '''
    if tdelta is None:
        return ''
    whole_days = tdelta.days
    remainder = tdelta - datetime.timedelta(days=whole_days)
    whole_hours = remainder.seconds // 3600
    if not whole_days:
        return '{}h'.format(whole_hours)
    return '{}d{}h'.format(whole_days, whole_hours)
class Audio:
    '''
    A single audio resource: its url, its duration and its date.
    '''

    def __init__(self, url, duration=None, date=None):
        '''
        url: location of the audio, as a string
        duration: length of the audio; when None, it is obtained by calling
                  get_duration() on the UTF-8 encoded url
        date: datetime of the audio, or None when unknown. It must be
              timezone-aware, because `age` subtracts it from an aware
              "now".
        '''
        self.url = url
        if duration is None:
            duration = get_duration(url.encode('utf-8'))
        self.duration = duration
        self.date = date

    def __str__(self):
        return self.url

    def __repr__(self):
        return '<Audio {} ({} {})>'.format(self.url, self.duration,
                                           delta_humanreadable(self.age))

    @property
    def urls(self):
        '''List holding the only url (same interface as AudioGroup.urls)'''
        return [self.url]

    @property
    def age(self):
        '''Time elapsed since `date` as a timedelta; None if date is unknown'''
        if self.date is None:
            return None
        # datetime.utcnow() is deprecated; this yields the same aware value
        now = datetime.datetime.now(datetime.timezone.utc)
        return now - self.date
class AudioGroup(list):
    '''
    A group of audios that are handled as a single item.

    NOTE: although this subclasses list, members are kept in `self.audios`
    and only `len()` and `append()` are redirected there; the underlying
    list itself stays empty.
    '''

    def __init__(self, description=None):
        '''description: free text identifying the group ('' when omitted)'''
        super().__init__()
        self.description = description or ''
        self.audios = []

    def __len__(self):
        return len(self.audios)

    def append(self, arg):
        self.audios.append(arg)

    def __str__(self):
        return '\n'.join(str(a) for a in self.audios)

    def __repr__(self):
        return '<AudioGroup "{}" ({} {})\n{} >'.\
            format(self.description, self.duration,
                   delta_humanreadable(self.age),
                   '\n'.join(' ' + repr(a) for a in self.audios))

    @property
    def duration(self):
        '''Sum of the members' durations; members with no duration are skipped'''
        return sum(a.duration for a in self.audios if a.duration is not None)

    @property
    def urls(self):
        '''Urls of all members, in insertion order'''
        return [a.url for a in self.audios]

    @property
    def date(self):
        '''
        Date of the first member that has one, or None.

        Members whose date is None are skipped, so that an undated first
        member does not hide the date carried by a later one.
        '''
        for a in self.audios:
            date = getattr(a, 'date', None)
            if date is not None:
                return date
        return None

    @property
    def age(self):
        '''Time elapsed since `date` as a timedelta; None if date is unknown'''
        if self.date is None:
            return None
        # datetime.utcnow() is deprecated; this yields the same aware value
        now = datetime.datetime.now(datetime.timezone.utc)
        return now - self.date
def get_tree(feed_url):
    '''
    Load a feed and return it parsed by lxml.html.

    feed_url: an http:/https: url, which is downloaded with requests, or
              the path of a local file.

    Raises ValueError if feed_url is a local path that does not exist.
    '''
    if feed_url.startswith(('http:', 'https:')):
        return html.fromstring(requests.get(feed_url).content)
    if not os.path.exists(feed_url):
        raise ValueError("file not found: {}".format(feed_url))
    # the context manager makes sure the file is closed after parsing
    with open(feed_url) as feed_file:
        return html.parse(feed_file)
def get_audio_from_description(text):
    '''
    Build an Audio from the text of an item description.

    The first non-empty line is the audio url (percent-escapes are decoded
    with unquote). If a second non-empty line exists and looks like
    `something=<digits>`, the first run of digits after the first `=` is
    used as duration. Otherwise duration is left to None, which makes Audio
    compute it by itself.

    Raises ValueError if text holds no non-empty line at all.
    '''
    # non-empty lines
    lines = [line.strip()
             for line in (text or '').split('\n')
             if line.strip()]
    if not lines:
        raise ValueError('empty description: no audio url found')
    url = lines[0]
    duration = None
    if len(lines) > 1:
        parts = lines[1].split('=')
        # only the chunk right after the first '=' is inspected
        digits = re.findall(r'\d+', parts[1]) if len(parts) > 1 else []
        if digits:
            duration = int(digits[0])
    return Audio(unquote(url), duration)
# copied from larigira.fsutils
def scan_dir_audio(dirname, extensions=('mp3', 'oga', 'wav', 'ogg')):
    '''
    Walk dirname recursively and yield the path of every audio file.

    A file is selected when the text after its last dot (the whole name,
    if there is no dot), lowercased, is found in `extensions`.
    '''
    for root, _subdirs, filenames in os.walk(dirname):
        for fname in filenames:
            suffix = fname.rsplit('.', 1)[-1].lower()
            if suffix not in extensions:
                continue
            yield os.path.join(root, fname)
def get_audio_from_dir(dirpath):
    '''
    Return a list with one Audio for each audio file found under dirpath.

    Each url is 'file://' followed by the real path of the file. No duration
    is passed, so Audio works it out by itself.
    '''
    audios = []
    for fpath in scan_dir_audio(dirpath):
        audios.append(Audio('file://' + os.path.realpath(fpath)))
    return audios
def get_item_date(el):
    '''
    Return the date of a feed item as a datetime, or None.

    The date is read from the `pubdate` child of `el` and must be in the
    form `YYYY-MM-DDTHH:MM:SS` followed by a UTC offset; leading and
    trailing whitespace is ignored.
    None is returned when the child is missing or has no text.

    Raises ValueError (from strptime) if the text is in another format.
    '''
    el_date = el.find('pubdate')
    if el_date is None or el_date.text is None:
        return None
    text = el_date.text.strip()
    if not text:
        return None
    return datetime.datetime.strptime(text, '%Y-%m-%dT%H:%M:%S%z')
def get_urls(tree):
    '''
    Yield one Audio for every `item` of the tree that has a `description`.

    The Audio is built from the description text and its date is taken
    from the item itself. Items with no description are skipped.
    '''
    for item in tree.xpath('//item'):
        description = item.find('description')
        if description is None:
            continue
        audio = get_audio_from_description(description.text)
        audio.date = get_item_date(item)
        yield audio
def get_grouped_urls(tree):
    '''
    Collect the audios of the tree into AudioGroups, one for each guid.

    Returns an OrderedDict mapping each (stripped) guid text to its
    AudioGroup, in order of first appearance. Every `item` is expected to
    have both a `guid` and a `description` child.
    '''
    by_guid = OrderedDict()
    for entry in tree.xpath('//item'):
        key = entry.xpath('guid')[0].text.strip()
        group = by_guid.get(key)
        if group is None:
            group = by_guid[key] = AudioGroup(key)
        member = get_audio_from_description(entry.xpath('description')[0].text)
        member.date = get_item_date(entry)
        group.append(member)
    return by_guid
def get_duration(url):
    '''
    Ask ffprobe for the duration of an audio and return it as an int
    (the fractional part is dropped).

    url: what is passed to `ffprobe -i`

    Raises ValueError if the ffprobe output has no `duration=` line or its
    value is not a number; errors of check_output() are propagated.
    '''
    output = check_output(['ffprobe', '-v', 'error',
                           '-show_entries', 'format=duration',
                           '-i', url])
    for line in output.splitlines():
        if line.startswith(b'duration='):
            value = line.split(b'=')[1]
            return int(float(value))
    # An explicit error instead of the StopIteration of a bare next():
    # this function is reached from inside generators, where a stray
    # StopIteration either ends the iteration silently or turns into
    # RuntimeError.
    raise ValueError('ffprobe reported no duration for {!r}'.format(url))
# Text describing what the tool does; get_parser() passes it as the first
# positional argument of ArgumentParser.
# NOTE(review): that positional parameter is `prog` in argparse, which would
# explain why the text ends with "Usage: " -- confirm before rewording.
HELP = '''
Collect audio informations from multiple sources (XML feeds).
Audios are (in that order):
1. Collected from feeds; (grouped by article if --group is used)
2. Filtered; everything that does not match with requirements is excluded
3. Sorted; even randomly
4. Sliced; take HOWMANY elements, skipping START elements
5. (if --copy) Copied
Usage: '''
def get_parser():
    '''
    Build and return the ArgumentParser of the command line interface.

    Options are split among the "sources", "filters" and "general" groups;
    --start, --howmany, --slotsize and the positional URLs are added to the
    parser itself.
    '''
    # HELP is deliberately given as first positional argument, as it has
    # always been
    parser = ArgumentParser(HELP)

    sources = parser.add_argument_group('sources', 'How to deal with sources')
    sources.add_argument(
        '--source-weights',
        help='Select only one "source" based on this weights')
    sources.add_argument(
        '--group', action='store_true',
        help='Group audios that belong to the same article')

    filters = parser.add_argument_group(
        'filters', 'Select only items that match these conditions')
    filters.add_argument(
        '--max-len', type=DurationType, default=0,
        help='Exclude any audio that is longer than MAX_LEN seconds')
    filters.add_argument(
        '--sort-by', choices=('random', 'date'), default='no')
    filters.add_argument(
        '--reverse', action='store_true', help='Reverse list order')
    filters.add_argument(
        '--min-len', type=DurationType, default=0,
        help='Exclude any audio that is shorter than MIN_LEN seconds')
    filters.add_argument(
        '--min-age', type=TimeDeltaType, default=datetime.timedelta(),
        help='Exclude audio more recent than MIN_AGE')
    filters.add_argument(
        '--max-age', type=TimeDeltaType, default=datetime.timedelta(),
        help='Exclude audio older than MAX_AGE')

    parser.add_argument(
        '--start', type=int, default=0,
        help='0-indexed start number. By default, play from most recent')
    parser.add_argument(
        '--howmany', type=int, default=1,
        help='If not specified, only 1 will be played')
    parser.add_argument(
        '--slotsize', type=int,
        help='Seconds between each audio. Still unsupported')

    general = parser.add_argument_group('general', 'General options')
    general.add_argument(
        '--copy', action='store_true', help='Copy files to $TMPDIR')
    general.add_argument(
        '--debug', action='store_true', help='Debug messages')

    parser.add_argument('urls', metavar='URL', nargs='+')
    return parser
def put(audio, copy=False):
    '''
    Print the urls of audio (an Audio or an AudioGroup), one per line.

    If copy is true, every http/https url is first downloaded to the
    directory named by $TMPDIR (the current directory if unset) and the
    `file://` url of the downloaded file is printed in its place. Any other
    url is printed as it is.
    '''
    if not copy:
        for url in audio.urls:
            print(url)
        return
    for url in audio.urls:
        if url.split(':')[0] not in ('http', 'https'):
            # FIXME: other urls (file:// included) are printed, not copied
            print(url)
            continue
        destdir = os.environ.get('TMPDIR', '.')
        fname = posixpath.basename(urlparse(url).path)
        # sanitize: keep only characters that are harmless in a file name
        fname = ''.join(c for c in fname if c.isalnum() or c in '._-')
        if not fname.strip('.'):
            # nothing usable is left (empty, '.' or '..'): without a
            # fallback the destination would be the directory itself
            fname = 'download'
        dest = os.path.join(destdir, fname)
        os.makedirs(destdir, exist_ok=True)
        fname, headers = urllib.request.urlretrieve(url, dest)
        print('file://%s' % os.path.realpath(fname))
def _passes_filters(item, args):
    '''
    Tell whether item (an Audio or an AudioGroup) satisfies the
    --max-len, --min-len, --min-age and --max-age options.

    A filter left at its default (0, or an empty timedelta) is disabled.
    An item whose age is unknown (None) fails any enabled age filter.
    '''
    if args.max_len != 0 and item.duration > args.max_len:
        return False
    if args.min_len != 0 and item.duration < args.min_len:
        return False
    check_min_age = args.min_age.total_seconds() != 0
    check_max_age = args.max_age.total_seconds() != 0
    if check_min_age or check_max_age:
        age = item.age
        if age is None:
            return False
        if check_min_age and age < args.min_age:
            return False
        if check_max_age and age > args.max_age:
            return False
    return True


def _age_sort_key(item):
    '''Sort key by age (most recent first) that puts undated items last.'''
    age = item.age
    if age is None:
        return (True, datetime.timedelta())
    return (False, age)


def main():
    '''
    Command line entry point: collect audios from the given sources,
    filter, sort and slice them, then print (or copy) the result.
    '''
    parser = get_parser()
    args = parser.parse_args()
    if not args.debug:
        logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.DEBUG)

    sources = args.urls
    if args.source_weights:
        weights = tuple(map(int, args.source_weights.split(':')))
        if len(weights) != len(sources):
            parser.exit(status=2, message='Weight must be in the'
                        ' same number as sources\n')
        sources = [weighted_choice(sources, weights)]

    audios = []
    for url in sources:
        if url.startswith(('http:', 'https:')) or os.path.isfile(url):
            # download (or read) the feed
            tree = get_tree(url)
            if not args.group:
                candidates = get_urls(tree)
            else:
                candidates = get_grouped_urls(tree).values()
            # the same filters apply to single audios and to groups
            audios += [item for item in candidates
                       if _passes_filters(item, args)]
        elif os.path.isdir(url):
            # NOTE: audios coming from a directory are not filtered
            audiodir = get_audio_from_dir(url)
            if not args.group:
                audios += audiodir
            else:
                for a in audiodir:
                    ag = AudioGroup(os.path.basename(a.url))
                    ag.append(a)
                    audios.append(ag)
        else:
            logging.info('unsupported url `%s`', url)

    # sort
    if args.sort_by == 'random':
        random.shuffle(audios)
    elif args.sort_by == 'date':
        audios.sort(key=_age_sort_key)
    if args.reverse:
        audios.reverse()

    # slice
    audios = audios[args.start:]
    audios = audios[:args.howmany]

    # TODO: support slotsize when grouping
    last = len(audios) - 1
    for index, audio in enumerate(audios):
        if args.debug:
            print(repr(audio))
        else:
            put(audio, args.copy)
        # the filler is only needed between audios, hence not after the
        # last one; this is to support the --slotsize option
        if index != last and args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
                print('## musica per {} secondi'
                      .format(args.slotsize - duration))
# Run the command line tool only when this file is executed as a script
if __name__ == '__main__':
    main()