feed: pesca audio dai feed con molte magie

2017-04-19 11:18:45 +02:00 · 2017-04-19 11:18:45 +02:00 · 235d7a938f
commit 235d7a938f
1 changed files with 161 additions and 0 deletions
--- a/161
+++ b/161
@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+import os
+from argparse import ArgumentParser
+from subprocess import check_output
+from collections import OrderedDict
+import re
+
+from lxml import html
+import requests
+
+
+class Audio(object):
+    """A single audio track, identified by its URL.
+
+    ``durata`` holds the duration of the track. When the caller does not
+    supply one, it is obtained by calling ``get_duration`` on the UTF-8
+    encoded URL.
+    """
+
+    def __init__(self, url, durata=None):
+        self.url = url
+        # Only probe the media when no duration was handed in.
+        self.durata = (get_duration(url.encode('utf-8'))
+                       if durata is None else durata)
+
+    def __str__(self):
+        """Return the bare URL."""
+        return self.url
+
+    def __repr__(self):
+        """Return a debug form showing both the URL and the duration."""
+        details = (self.url, self.durata)
+        return '<Audio {} ({})>'.format(*details)
+
+
+class AudioGroup(list):
+    """An ordered group of audio objects sharing one description.
+
+    The members are stored in the list itself, so iteration, indexing,
+    slicing and ``in`` all agree with ``len()``.
+    """
+
+    def __init__(self, description=None):
+        # Initialise the underlying list explicitly; members are added
+        # afterwards through append().
+        super().__init__()
+        self.description = description or ''
+
+    @property
+    def audios(self):
+        """The members of the group.
+
+        Kept for backward compatibility: it is the group itself, so
+        ``group.audios.append(x)`` and ``len(group.audios)`` keep working.
+        """
+        return self
+
+    def __str__(self):
+        """Return the ``str()`` of every member, one per line."""
+        return '\n'.join(str(a) for a in self)
+
+    def __repr__(self):
+        """Return a debug form: description, total duration, members."""
+        return '<AudioGroup "{}" ({})\n{}>'.format(
+            self.description,
+            self.durata,
+            '\n'.join(repr(a) for a in self))
+
+    @property
+    def durata(self):
+        """Sum of the members' ``durata``, skipping members where it is None."""
+        return sum(a.durata for a in self if a.durata is not None)
+
+
+def get_tree(feed_url):
+    """Fetch and parse the feed found at ``feed_url``.
+
+    ``feed_url`` is downloaded with ``requests`` when it starts with
+    ``http:`` or ``https:``; anything else is treated as a local file path.
+
+    Returns the parsed lxml tree. Raises ``ValueError`` when a local path
+    does not exist.
+    """
+    if feed_url.startswith(('http:', 'https:')):
+        return html.fromstring(requests.get(feed_url).content)
+    if not os.path.exists(feed_url):
+        raise ValueError("file not found: {}".format(feed_url))
+    # Close the file as soon as parsing is done instead of leaking the handle.
+    with open(feed_url) as feed_file:
+        return html.parse(feed_file)
+
+
+def get_audio_from_description(text):
+    """Build an ``Audio`` from the text of an item description.
+
+    The first non-empty line is the audio URL. An optional second non-empty
+    line of the form ``<name>=<value>`` carries the duration: the first run
+    of digits found after the ``=`` is used.
+
+    When the second line is missing or cannot be parsed, the duration is
+    left as ``None`` and ``Audio`` works it out on its own.
+
+    Raises ``ValueError`` when ``text`` is ``None`` or has no non-empty line.
+    """
+    # non-empty lines
+    lines = [line.strip()
+             for line in (text or '').split('\n')
+             if line.strip()]
+    if not lines:
+        raise ValueError('description does not contain an audio url')
+    url = lines[0]
+    durata = None
+    if len(lines) > 1:
+        fields = lines[1].split('=')
+        if len(fields) > 1:
+            # Same field as before (the text between the first and the
+            # second '='), but a malformed line no longer raises IndexError.
+            digits = re.search(r'\d+', fields[1])
+            if digits is not None:
+                durata = int(digits.group())
+    return Audio(url, durata)
+
+
+def get_urls(tree):
+    """Yield one audio per ``//item/description`` element found in ``tree``.
+
+    Each element's text is converted with ``get_audio_from_description``;
+    the conversion happens lazily, as the generator is consumed.
+    """
+    for description in tree.xpath('//item/description'):
+        audio = get_audio_from_description(description.text)
+        yield audio
+
+
+def get_grouped_urls(tree):
+    """Collect the audios of every ``//item`` in ``tree``, grouped by guid.
+
+    Returns an ``OrderedDict`` mapping each stripped guid text to an
+    ``AudioGroup`` described by that guid; groups appear in the order in
+    which their guid is first met.
+    """
+    by_guid = OrderedDict()
+    for entry in tree.xpath('//item'):
+        key = entry.xpath('guid')[0].text.strip()
+        # Create the group the first time its guid shows up.
+        group = by_guid.get(key)
+        if group is None:
+            group = by_guid[key] = AudioGroup(key)
+        description = entry.xpath('description')[0].text
+        group.append(get_audio_from_description(description))
+    return by_guid
+
+
+def get_duration(url):
+    """Return the duration of the media at ``url`` as reported by ffprobe.
+
+    ``url`` is handed to ``ffprobe`` as its ``-i`` input and the
+    ``duration=`` line of its output is parsed. The reported value is
+    truncated to an ``int``.
+
+    Raises ``ValueError`` when the output has no ``duration=`` line or its
+    value is not a number, and ``subprocess.CalledProcessError`` when
+    ffprobe exits with a non-zero status.
+    """
+    output = check_output(['ffprobe', '-v', 'error',
+                           '-show_entries', 'format=duration',
+                           '-i', url])
+    for line in output.splitlines():
+        if line.startswith(b'duration='):
+            return int(float(line.split(b'=')[1]))
+    # Raise a real error rather than letting a bare next() raise
+    # StopIteration: this function is reached from the get_urls generator,
+    # where a stray StopIteration would be turned into a RuntimeError.
+    raise ValueError('ffprobe reported no duration for {!r}'.format(url))
+
+
+def get_parser():
+    """Build the command-line parser for this script.
+
+    Options: ``--start``, ``--max-len``, ``--howmany``, ``--slotsize``,
+    ``--group`` and ``--debug``, plus the positional feed ``url``.
+    """
+    # The sentence is the parser's description. Passed positionally it
+    # would be taken as ``prog`` and replace the program name in the
+    # usage line.
+    p = ArgumentParser(
+        description='Get music from a (well-specified) xml feed')
+    p.add_argument('--start', default=0, type=int,
+                   help='0-indexed start number. '
+                   'By default, play from most recent')
+    p.add_argument('--max-len', default=0, type=int,
+                   help='Exclude any audio that is longer than MAXLEN seconds')
+    p.add_argument('--howmany', default=1, type=int,
+                   help='If not specified, only 1 will be played')
+    p.add_argument('--slotsize', help='Seconds between each audio', type=int)
+    p.add_argument('--group', help='Group articles', default=False,
+                   action='store_true')
+    p.add_argument('--debug', help='Debug messages', default=False,
+                   action='store_true')
+    p.add_argument('url')
+    return p
+
+
+def main():
+    """Print the selected audios (or audio groups) of a feed.
+
+    The feed is loaded from the ``url`` argument, entries longer than
+    ``--max-len`` are dropped (0 disables the limit) and the
+    ``--start`` / ``--howmany`` window is applied to what is left.
+    With ``--slotsize``, every printed entry but the last is followed by
+    a filler line when it is shorter than the slot.
+    """
+    args = get_parser().parse_args()
+    # download the feed
+    tree = get_tree(args.url)
+    if args.group:
+        candidates = get_grouped_urls(tree).values()
+    else:
+        candidates = get_urls(tree)
+
+    # drop the entries that are too long, then apply the requested window
+    first = args.start
+    selected = [entry for entry in candidates
+                if args.max_len == 0 or
+                entry.durata <= args.max_len][first:first + args.howmany]
+    if not selected:
+        return
+
+    render = repr if args.debug else str
+    # the last entry is printed apart: no filler ever follows it
+    *leading, last = selected
+    for entry in leading:
+        print(render(entry))
+        if args.slotsize is not None:
+            missing = args.slotsize - entry.durata
+            if missing > 0:
+                print('## musica per {} secondi'
+                      .format(missing))
+    print(render(last))
+
+
+# Run main() only when the file is executed as a script, not when imported.
+if __name__ == '__main__':
+    main()