Browse Source

semplifica audio/audiogroup

ora il codice di filtraggio è uguale
boyska 3 years ago
parent
commit
daeb2ff3db
1 changed file with 221 additions and 180 deletions
  1. 221 180
      feed

+ 221 - 180
feed

@@ -1,31 +1,31 @@
 #!/usr/bin/env python3
-'''
+"""
 Feed parser with many features
 
 from a feed, it supports filtering, subslicing, random picking
 
 Beside feeds, it supports picking files from directories
-'''
-import os
+"""
+import datetime
 import logging
-from argparse import ArgumentParser, ArgumentTypeError
-from subprocess import check_output, CalledProcessError
-from collections import OrderedDict
-import re
-import urllib.request
-from urllib.parse import urlparse, unquote
+import os
 import posixpath
 import random
+import re
+import urllib.request
+from argparse import ArgumentParser, ArgumentTypeError
 from bisect import bisect
-import datetime
+from collections import OrderedDict
+from subprocess import CalledProcessError, check_output
+from urllib.parse import unquote, urlparse
 
-from lxml import html
 import requests
+from lxml import html
 from pytimeparse.timeparse import timeparse
 
 
 def get_int(s):
-    return int(re.findall(r'\d+', s)[0])
+    return int(re.findall(r"\d+", s)[0])
 
 
 def DurationType(arg):
@@ -34,27 +34,28 @@ def DurationType(arg):
     else:
         secs = timeparse(arg)
         if secs is None:
-            raise ArgumentTypeError('%r is not a valid duration' % arg)
+            raise ArgumentTypeError("%r is not a valid duration" % arg)
     return secs
 
+
 def TimeDeltaType(arg):
     if arg.isdecimal():
         secs = int(arg)
     else:
         secs = timeparse(arg)
         if secs is None:
-            raise ArgumentTypeError('%r is not a valid time range' % arg)
+            raise ArgumentTypeError("%r is not a valid time range" % arg)
     return datetime.timedelta(seconds=secs)
 
 
 def weighted_choice(values, weights):
-    '''
+    """
     random.choice with weights
 
     weights must be integers greater than 0.
 
     Their meaning is "relative", that is [1,2,3] is the same as [2,4,6]
-    '''
+    """
     assert len(values) == len(weights)
     total = 0
     cum_weights = []
@@ -68,19 +69,19 @@ def weighted_choice(values, weights):
 
 def delta_humanreadable(tdelta):
     if tdelta is None:
-        return ''
+        return ""
     days = tdelta.days
     hours = (tdelta - datetime.timedelta(days=days)).seconds // 3600
     if days:
-        return '{}d{}h'.format(days, hours)
-    return '{}h'.format(hours)
+        return "{}d{}h".format(days, hours)
+    return "{}h".format(hours)
 
 
 class Audio(object):
     def __init__(self, url, duration=None, date=None):
         self.url = url
         if duration is None:
-            duration = get_duration(url.encode('utf-8'))
+            duration = get_duration(url.encode("utf-8"))
         self.duration = duration
         self.date = date
         self.end_date = datetime.datetime(9999, 12, 31, tzinfo=datetime.timezone.utc)
@@ -89,8 +90,9 @@ class Audio(object):
         return self.url
 
     def __repr__(self):
-        return '<Audio {} ({} {})>'.format(self.url, self.duration,
-                                           delta_humanreadable(self.age))
+        return "<Audio {} ({} {})>".format(
+            self.url, self.duration, delta_humanreadable(self.age)
+        )
 
     @property
     def urls(self):
@@ -106,12 +108,14 @@ class Audio(object):
 
     @property
     def valid(self):
-        return self.end_date >= datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
+        return self.end_date >= datetime.datetime.utcnow().replace(
+            tzinfo=datetime.timezone.utc
+        )
 
 
 class AudioGroup(list):
     def __init__(self, description=None):
-        self.description = description or ''
+        self.description = description or ""
         self.audios = []
 
     def __len__(self):
@@ -121,13 +125,15 @@ class AudioGroup(list):
         self.audios.append(arg)
 
     def __str__(self):
-        return '\n'.join(str(a) for a in self.audios)
+        return "\n".join(str(a) for a in self.audios)
 
     def __repr__(self):
-        return '<AudioGroup "{}" ({} {})\n{} >'.\
-                format(self.description, self.duration,
-                       delta_humanreadable(self.age),
-                       '\n'.join('   ' + repr(a) for a in self.audios))
+        return '<AudioGroup "{}" ({} {})\n{} >'.format(
+            self.description,
+            self.duration,
+            delta_humanreadable(self.age),
+            "\n".join("   " + repr(a) for a in self.audios),
+        )
 
     @property
     def duration(self):
@@ -140,7 +146,7 @@ class AudioGroup(list):
     @property
     def date(self):
         for a in self.audios:
-            if hasattr(a, 'date'):
+            if hasattr(a, "date"):
                 return a.date
         return None
 
@@ -157,9 +163,8 @@ class AudioGroup(list):
         return len(self.audios) > 0
 
 
-
 def get_tree(feed_url):
-    if feed_url.startswith('http:') or feed_url.startswith('https:'):
+    if feed_url.startswith("http:") or feed_url.startswith("https:"):
         tree = html.fromstring(requests.get(feed_url).content)
     else:
         if not os.path.exists(feed_url):
@@ -170,70 +175,76 @@ def get_tree(feed_url):
 
 def get_audio_from_description(text):
     # non-empty lines
-    lines = [line.strip()
-             for line in text.split('\n')
-             if line.strip()]
+    lines = [line.strip() for line in text.split("\n") if line.strip()]
     url = lines[0]
     duration = None
     metadata = {}
-    for line in text.split('\n')[1:]:
-        if line.strip() and '=' in line:
-            metadata[line.split('=')[0]] = line.split('=')[1]
-    if 'durata' in metadata:
-        metadata['durata'] = get_int(metadata['durata'])
-    if 'txdate' in metadata:
+    for line in text.split("\n")[1:]:
+        if line.strip() and "=" in line:
+            metadata[line.split("=")[0]] = line.split("=")[1]
+    if "durata" in metadata:
+        metadata["durata"] = get_int(metadata["durata"])
+    if "txdate" in metadata:
         try:
-            metadata['txdate'] = datetime.datetime.strptime(
-                metadata['txdate'], '%Y-%m-%dT%H:%M:%S%z')
+            metadata["txdate"] = datetime.datetime.strptime(
+                metadata["txdate"], "%Y-%m-%dT%H:%M:%S%z"
+            )
         except ValueError:
-            logging.warning('could not parse txdate %s', metadata['txdate'])
-            del metadata['txdate']
-    a = Audio(unquote(url),
-              duration=metadata.get('durata', None),
-              date=metadata.get('txdate', None))
-
-    if 'txdate' in metadata and 'replica' in metadata:
-        if metadata['replica'].endswith('g'):
-            a.end_date = metadata['txdate'] + datetime.timedelta(
-                days=get_int(metadata['replica']))
+            logging.warning("could not parse txdate %s", metadata["txdate"])
+            del metadata["txdate"]
+    a = Audio(
+        unquote(url),
+        duration=metadata.get("durata", None),
+        date=metadata.get("txdate", None),
+    )
+
+    if "txdate" in metadata and "replica" in metadata:
+        if metadata["replica"].endswith("g"):
+            a.end_date = metadata["txdate"] + datetime.timedelta(
+                days=get_int(metadata["replica"])
+            )
     return a
 
 
 # copied from larigira.fsutils
-def scan_dir_audio(dirname, extensions=('mp3', 'oga', 'wav', 'ogg')):
+def scan_dir_audio(dirname, extensions=("mp3", "oga", "wav", "ogg")):
     for root, dirnames, filenames in os.walk(dirname):
         for fname in filenames:
-            if fname.split('.')[-1].lower() in extensions:
+            if fname.split(".")[-1].lower() in extensions:
                 yield os.path.join(root, fname)
 
 
 def get_audio_from_dir(dirpath):
     fpaths = scan_dir_audio(dirpath)
-    return [Audio('file://' + os.path.realpath(u),
-                  date=datetime.datetime.fromtimestamp(os.path.getmtime(u)).
-                  replace(tzinfo=datetime.timezone.utc))
-            for u in fpaths]
+    return [
+        Audio(
+            "file://" + os.path.realpath(u),
+            date=datetime.datetime.fromtimestamp(os.path.getmtime(u)).replace(
+                tzinfo=datetime.timezone.utc
+            ),
+        )
+        for u in fpaths
+    ]
 
 
 def get_item_date(el):
-    el_date = el.find('pubdate')
+    el_date = el.find("pubdate")
     if el_date is not None:
-        return datetime.datetime.strptime(
-            el_date.text, '%Y-%m-%dT%H:%M:%S%z')
+        return datetime.datetime.strptime(el_date.text, "%Y-%m-%dT%H:%M:%S%z")
     return None
 
 
 def get_urls(tree):
-    items = tree.xpath('//item')
+    items = tree.xpath("//item")
     for it in items:
-        title = it.find('title').text
-        el_body = it.find('description')
+        title = it.find("title").text
+        el_body = it.find("description")
         if el_body is not None:
             url = el_body.text
             try:
                 audio = get_audio_from_description(url)
             except Exception as exc:
-                logging.info('error getting duration for `%s`' % title)
+                logging.info("error getting duration for `%s`" % title)
                 continue
             if audio.date is None:
                 audio.date = get_item_date(it)
@@ -242,12 +253,12 @@ def get_urls(tree):
 
 def get_grouped_urls(tree):
     groups = OrderedDict()
-    items = tree.xpath('//item')
+    items = tree.xpath("//item")
     for item in items:
-        guid = item.xpath('guid')[0].text.strip()
+        guid = item.xpath("guid")[0].text.strip()
         if guid not in groups:
             groups[guid] = AudioGroup(guid)
-        audio = get_audio_from_description(item.xpath('description')[0].text)
+        audio = get_audio_from_description(item.xpath("description")[0].text)
         audio.date = get_item_date(item)
         if audio.valid:
             groups[guid].append(audio)
@@ -256,17 +267,17 @@ def get_grouped_urls(tree):
 
 def get_duration(url):
     try:
-        lineout = check_output(['ffprobe', '-v', 'error',
-                                '-show_entries', 'format=duration',
-                                '-i', url]).split(b'\n')
+        lineout = check_output(
+            ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-i", url]
+        ).split(b"\n")
     except CalledProcessError as exc:
-        raise ValueError('error probing `%s`' % url) from exc
-    duration = next(l for l in lineout if l.startswith(b'duration='))
-    value = duration.split(b'=')[1]
+        raise ValueError("error probing `%s`" % url) from exc
+    duration = next(l for l in lineout if l.startswith(b"duration="))
+    value = duration.split(b"=")[1]
     return int(float(value))
 
 
-HELP = '''
+HELP = """
 Collect audio informations from multiple sources (XML feeds).
 Audios are (in that order):
  1. Collected from feeds; (grouped by article if --group is used)
@@ -274,52 +285,79 @@ Audios are (in that order):
  3. Sorted; even randomly
  4. Sliced; take HOWMANY elements, skipping START elements
  5. (if --copy) Copied
-Usage: '''
+Usage: """
 
 
 def get_parser():
     p = ArgumentParser(HELP)
-    src = p.add_argument_group('sources', 'How to deal with sources')
-    src.add_argument('--source-weights',
-                     help='Select only one "source" based on this weights')
-    src.add_argument('--group', default=False, action='store_true',
-                     help='Group audios that belong to the same article')
-
-    filters = p.add_argument_group('filters', 'Select only items that match '
-                                   'these conditions')
-    filters.add_argument('--min-len', default=0, type=DurationType,
-                         help='Exclude any audio that is shorter '
-                         'than MIN_LEN seconds')
-    filters.add_argument('--max-len', default=0, type=DurationType,
-                         help='Exclude any audio that is longer '
-                         'than MAX_LEN seconds')
-    filters.add_argument('--sort-by', default='no', type=str,
-                         choices=('random', 'date', 'duration'))
-    filters.add_argument('--reverse', default=False,
-                         action='store_true', help='Reverse list order')
-
-    filters.add_argument('--min-age', default=datetime.timedelta(),
-                         type=TimeDeltaType,
-                         help='Exclude audio more recent than MIN_AGE')
-    filters.add_argument('--max-age', default=datetime.timedelta(),
-                         type=TimeDeltaType,
-                         help='Exclude audio older than MAX_AGE')
-
-    p.add_argument('--start', default=0, type=int,
-                   help='0-indexed start number. '
-                   'By default, play from most recent')
-    p.add_argument('--howmany', default=1, type=int,
-                   help='If not specified, only 1 will be played')
-    p.add_argument('--slotsize', type=int,
-                   help='Seconds between each audio. Still unsupported')
-
-    general = p.add_argument_group('general', 'General options')
-    general.add_argument('--copy', help='Copy files to $TMPDIR', default=False,
-                         action='store_true')
-    general.add_argument('--debug', help='Debug messages', default=False,
-                         action='store_true')
-
-    p.add_argument('urls', metavar='URL', nargs='+')
+    src = p.add_argument_group("sources", "How to deal with sources")
+    src.add_argument(
+        "--source-weights", help='Select only one "source" based on this weights'
+    )
+    src.add_argument(
+        "--group",
+        default=False,
+        action="store_true",
+        help="Group audios that belong to the same article",
+    )
+
+    filters = p.add_argument_group(
+        "filters", "Select only items that match " "these conditions"
+    )
+    filters.add_argument(
+        "--min-len",
+        default=0,
+        type=DurationType,
+        help="Exclude any audio that is shorter " "than MIN_LEN seconds",
+    )
+    filters.add_argument(
+        "--max-len",
+        default=0,
+        type=DurationType,
+        help="Exclude any audio that is longer " "than MAX_LEN seconds",
+    )
+    filters.add_argument(
+        "--sort-by", default="no", type=str, choices=("random", "date", "duration")
+    )
+    filters.add_argument(
+        "--reverse", default=False, action="store_true", help="Reverse list order"
+    )
+
+    filters.add_argument(
+        "--min-age",
+        default=datetime.timedelta(),
+        type=TimeDeltaType,
+        help="Exclude audio more recent than MIN_AGE",
+    )
+    filters.add_argument(
+        "--max-age",
+        default=datetime.timedelta(),
+        type=TimeDeltaType,
+        help="Exclude audio older than MAX_AGE",
+    )
+
+    p.add_argument(
+        "--start",
+        default=0,
+        type=int,
+        help="0-indexed start number. " "By default, play from most recent",
+    )
+    p.add_argument(
+        "--howmany", default=1, type=int, help="If not specified, only 1 will be played"
+    )
+    p.add_argument(
+        "--slotsize", type=int, help="Seconds between each audio. Still unsupported"
+    )
+
+    general = p.add_argument_group("general", "General options")
+    general.add_argument(
+        "--copy", help="Copy files to $TMPDIR", default=False, action="store_true"
+    )
+    general.add_argument(
+        "--debug", help="Debug messages", default=False, action="store_true"
+    )
+
+    p.add_argument("urls", metavar="URL", nargs="+")
     return p
 
 
@@ -329,21 +367,66 @@ def put(audio, copy=False):
             print(url)
     else:
         for url in audio.urls:
-            if url.split(':')[0] in ('http', 'https'):
-                destdir = (os.environ.get('TMPDIR', '.'))
+            if url.split(":")[0] in ("http", "https"):
+                destdir = os.environ.get("TMPDIR", ".")
                 fname = posixpath.basename(urlparse(url).path)
                 # sanitize
-                fname = "".join(c for c in fname
-                                if c.isalnum() or c in list('._-')).rstrip()
+                fname = "".join(
+                    c for c in fname if c.isalnum() or c in list("._-")
+                ).rstrip()
                 dest = os.path.join(destdir, fname)
                 os.makedirs(destdir, exist_ok=True)
                 fname, headers = urllib.request.urlretrieve(url, dest)
-                print('file://%s' % os.path.realpath(fname))
+                print("file://%s" % os.path.realpath(fname))
             else:
                 # FIXME: file:// urls are just copied
                 print(url)
 
 
+def retrieve(url, args):
+    """
+    returns a list of Audios or a list of AudioGroups
+    """
+    if not args.group:
+        if os.path.isdir(url):
+            audiodir = get_audio_from_dir(url)
+            return audiodir
+        elif url.startswith("http:") or url.startswith("https:") or os.path.isfile(url):
+            return get_urls(get_tree(url))
+        else:
+            logging.info("unsupported url `%s`", url)
+            return []
+    else:  # group
+        if os.path.isdir(url):
+            audiodir = get_audio_from_dir(url)
+            agroups = []
+            for a in audiodir:
+                ag = AudioGroup(os.path.basename(a.url))
+                ag.append(a)
+                agroups.append(ag)
+            return agroups
+        elif url.startswith("http:") or url.startswith("https:") or os.path.isfile(url):
+            groups = get_grouped_urls(get_tree(url))
+            return groups.values()
+        else:
+            logging.info("unsupported url `%s`", url)
+            return []
+
+
+def audio_passes_filters(audio, args):
+    if not audio.valid:
+        return False
+    if args.max_len and audio.duration > args.max_len:
+        return False
+    if args.min_len and audio.duration < args.min_len:
+        return False
+    if args.min_age.total_seconds() and audio.age < args.min_age:
+        return False
+    if args.max_age.total_seconds() and audio.age > args.max_age:
+        return False
+    return True
+
+
 def main():
     parser = get_parser()
     args = parser.parse_args()
@@ -354,75 +437,32 @@ def main():
     sources = args.urls
 
     if args.source_weights:
-        weights = tuple(map(int, args.source_weights.split(':')))
+        weights = tuple(map(int, args.source_weights.split(":")))
         if len(weights) != len(sources):
-            parser.exit(status=2, message='Weight must be in the'
-                        ' same number as sources\n')
+            parser.exit(
+                status=2, message="Weight must be in the" " same number as sources\n"
+            )
         sources = [weighted_choice(sources, weights)]
 
     audios = []
     for url in sources:
-        if not args.group:
-            if os.path.isdir(url):
-                audiodir = get_audio_from_dir(url)
-                audios += audiodir
-            elif url.startswith('http:') or url.startswith('https:') \
-                    or os.path.isfile(url):
-                audios += get_urls(get_tree(url))
-            else:
-                logging.info('unsupported url `%s`', url)
-            audios = [audio for audio in audios if
-                      (audio.valid) and
-                      (args.max_len == 0 or
-                       audio.duration <= args.max_len) and
-                      (args.min_len == 0 or
-                       audio.duration >= args.min_len) and
-                      (args.min_age.total_seconds() == 0 or
-                       audio.age >= args.min_age) and
-                      (args.max_age.total_seconds() == 0 or
-                       audio.age <= args.max_age)
-                      ]
-        else:  # group
-            if os.path.isdir(url):
-                audiodir = get_audio_from_dir(url)
-                agroups = []
-                for a in audiodir:
-                    ag = AudioGroup(os.path.basename(a.url))
-                    ag.append(a)
-                    agroups.append(ag)
-            elif url.startswith('http:') or url.startswith('https:') \
-                    or os.path.isfile(url):
-                groups = get_grouped_urls(get_tree(url))
-                agroups = groups.values()
-            else:
-                logging.info('unsupported url `%s`', url)
-            audios += [g for g in agroups
-                       if
-                       (g.valid) and
-                       (args.max_len == 0 or
-                        g.duration <= args.max_len) and
-                       (args.min_len == 0 or
-                        g.duration >= args.max_len) and
-                       (args.min_age.total_seconds() == 0 or
-                        g.age >= args.min_age) and
-                       (args.max_age.total_seconds() == 0 or
-                        g.age <= args.max_age)
-                       ]
+        url_audios = retrieve(url, args)
+        audios += [au for au in url_audios if audio_passes_filters(au, args)]
 
     # sort
-    if args.sort_by == 'random':
+    if args.sort_by == "random":
         random.shuffle(audios)
-    elif args.sort_by == 'date':
+    elif args.sort_by == "date":
         audios.sort(key=lambda x: x.age)
-    elif args.sort_by == 'duration':
+    elif args.sort_by == "duration":
         audios.sort(key=lambda x: x.duration)
 
     if args.reverse:
         audios.reverse()
 
     # slice
-    audios = audios[args.start:]
-    audios = audios[:args.howmany]
+    audios = audios[args.start :]
+    audios = audios[: args.howmany]
 
     # the for loop excludes the last one
     # this is to support  the --slotsize option
@@ -436,13 +476,14 @@ def main():
         if args.slotsize is not None:
             duration = audio.duration
             if duration < args.slotsize:
-                print('## musica per {} secondi'
-                      .format(args.slotsize - duration))
+                print("## musica per {} secondi".format(args.slotsize - duration))
     # finally, the last one
     if args.debug:
         print(repr(audios[-1]))
     else:
         put(audios[-1], args.copy)
+
+
 #     else:  # grouping; TODO: support slotsize
 #         for item in groups:
 #             if args.debug:
@@ -450,5 +491,5 @@ def main():
 #             print(groups[item])
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()