semplifica audio/audiogroup

ora il codice di filtraggio è uguale
2020-07-21 13:18:37 +02:00 · 2020-07-21 13:18:37 +02:00 · daeb2ff3db
commit daeb2ff3db
parent 0ebb62c318
1 changed file with 215 additions and 174 deletions
--- a/389
+++ b/389
@@ -1,31 +1,31 @@
 #!/usr/bin/env python3
-'''
+"""
 Feed parser with many features

 from a feed, it supports filtering, subslicing, random picking

 Beside feeds, it supports picking files from directories
-'''
-import os
+"""
+import datetime
 import logging
-from argparse import ArgumentParser, ArgumentTypeError
-from subprocess import check_output, CalledProcessError
-from collections import OrderedDict
-import re
-import urllib.request
-from urllib.parse import urlparse, unquote
+import os
 import posixpath
 import random
+import re
+import urllib.request
+from argparse import ArgumentParser, ArgumentTypeError
 from bisect import bisect
-import datetime
+from collections import OrderedDict
+from subprocess import CalledProcessError, check_output
+from urllib.parse import unquote, urlparse

-from lxml import html
 import requests
+from lxml import html
 from pytimeparse.timeparse import timeparse


 def get_int(s):
-    return int(re.findall(r'\d+', s)[0])
+    return int(re.findall(r"\d+", s)[0])


 def DurationType(arg):
@@ -34,27 +34,28 @@ def DurationType(arg):
    else:
        secs = timeparse(arg)
        if secs is None:
-            raise ArgumentTypeError('%r is not a valid duration' % arg)
+            raise ArgumentTypeError("%r is not a valid duration" % arg)
    return secs

+
 def TimeDeltaType(arg):
    if arg.isdecimal():
        secs = int(arg)
    else:
        secs = timeparse(arg)
        if secs is None:
-            raise ArgumentTypeError('%r is not a valid time range' % arg)
+            raise ArgumentTypeError("%r is not a valid time range" % arg)
    return datetime.timedelta(seconds=secs)


 def weighted_choice(values, weights):
-    '''
+    """
    random.choice with weights

    weights must be integers greater than 0.

    Their meaning is "relative", that is [1,2,3] is the same as [2,4,6]
-    '''
+    """
    assert len(values) == len(weights)
    total = 0
    cum_weights = []
@@ -68,19 +69,19 @@ def weighted_choice(values, weights):

 def delta_humanreadable(tdelta):
    if tdelta is None:
-        return ''
+        return ""
    days = tdelta.days
    hours = (tdelta - datetime.timedelta(days=days)).seconds // 3600
    if days:
-        return '{}d{}h'.format(days, hours)
-    return '{}h'.format(hours)
+        return "{}d{}h".format(days, hours)
+    return "{}h".format(hours)


 class Audio(object):
    def __init__(self, url, duration=None, date=None):
        self.url = url
        if duration is None:
-            duration = get_duration(url.encode('utf-8'))
+            duration = get_duration(url.encode("utf-8"))
        self.duration = duration
        self.date = date
        self.end_date = datetime.datetime(9999, 12, 31, tzinfo=datetime.timezone.utc)
@@ -89,8 +90,9 @@ class Audio(object):
        return self.url

    def __repr__(self):
-        return '<Audio {} ({} {})>'.format(self.url, self.duration,
-                                           delta_humanreadable(self.age))
+        return "<Audio {} ({} {})>".format(
+            self.url, self.duration, delta_humanreadable(self.age)
+        )

    @property
    def urls(self):
@@ -106,12 +108,14 @@ class Audio(object):

    @property
    def valid(self):
-        return self.end_date >= datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
+        return self.end_date >= datetime.datetime.utcnow().replace(
+            tzinfo=datetime.timezone.utc
+        )


 class AudioGroup(list):
    def __init__(self, description=None):
-        self.description = description or ''
+        self.description = description or ""
        self.audios = []

    def __len__(self):
@@ -121,13 +125,15 @@ class AudioGroup(list):
        self.audios.append(arg)

    def __str__(self):
-        return '\n'.join(str(a) for a in self.audios)
+        return "\n".join(str(a) for a in self.audios)

    def __repr__(self):
-        return '<AudioGroup "{}" ({} {})\n{} >'.\
-                format(self.description, self.duration,
-                       delta_humanreadable(self.age),
-                       '\n'.join('   ' + repr(a) for a in self.audios))
+        return '<AudioGroup "{}" ({} {})\n{} >'.format(
+            self.description,
+            self.duration,
+            delta_humanreadable(self.age),
+            "\n".join("   " + repr(a) for a in self.audios),
+        )

    @property
    def duration(self):
@@ -140,7 +146,7 @@ class AudioGroup(list):
    @property
    def date(self):
        for a in self.audios:
-            if hasattr(a, 'date'):
+            if hasattr(a, "date"):
                return a.date
        return None

@@ -157,9 +163,8 @@ class AudioGroup(list):
        return len(self.audios) > 0


-
 def get_tree(feed_url):
-    if feed_url.startswith('http:') or feed_url.startswith('https:'):
+    if feed_url.startswith("http:") or feed_url.startswith("https:"):
        tree = html.fromstring(requests.get(feed_url).content)
    else:
        if not os.path.exists(feed_url):
@@ -170,70 +175,76 @@ def get_tree(feed_url):

 def get_audio_from_description(text):
    # non-empty lines
-    lines = [line.strip()
-             for line in text.split('\n')
-             if line.strip()]
+    lines = [line.strip() for line in text.split("\n") if line.strip()]
    url = lines[0]
    duration = None
    metadata = {}
-    for line in text.split('\n')[1:]:
-        if line.strip() and '=' in line:
-            metadata[line.split('=')[0]] = line.split('=')[1]
-    if 'durata' in metadata:
-        metadata['durata'] = get_int(metadata['durata'])
-    if 'txdate' in metadata:
+    for line in text.split("\n")[1:]:
+        if line.strip() and "=" in line:
+            metadata[line.split("=")[0]] = line.split("=")[1]
+    if "durata" in metadata:
+        metadata["durata"] = get_int(metadata["durata"])
+    if "txdate" in metadata:
        try:
-            metadata['txdate'] = datetime.datetime.strptime(
-                metadata['txdate'], '%Y-%m-%dT%H:%M:%S%z')
+            metadata["txdate"] = datetime.datetime.strptime(
+                metadata["txdate"], "%Y-%m-%dT%H:%M:%S%z"
+            )
        except ValueError:
-            logging.warning('could not parse txdate %s', metadata['txdate'])
-            del metadata['txdate']
-    a = Audio(unquote(url),
-              duration=metadata.get('durata', None),
-              date=metadata.get('txdate', None))
+            logging.warning("could not parse txdate %s", metadata["txdate"])
+            del metadata["txdate"]
+    a = Audio(
+        unquote(url),
+        duration=metadata.get("durata", None),
+        date=metadata.get("txdate", None),
+    )

-    if 'txdate' in metadata and 'replica' in metadata:
-        if metadata['replica'].endswith('g'):
-            a.end_date = metadata['txdate'] + datetime.timedelta(
-                days=get_int(metadata['replica']))
+    if "txdate" in metadata and "replica" in metadata:
+        if metadata["replica"].endswith("g"):
+            a.end_date = metadata["txdate"] + datetime.timedelta(
+                days=get_int(metadata["replica"])
+            )
    return a


 # copied from larigira.fsutils
-def scan_dir_audio(dirname, extensions=('mp3', 'oga', 'wav', 'ogg')):
+def scan_dir_audio(dirname, extensions=("mp3", "oga", "wav", "ogg")):
    for root, dirnames, filenames in os.walk(dirname):
        for fname in filenames:
-            if fname.split('.')[-1].lower() in extensions:
+            if fname.split(".")[-1].lower() in extensions:
                yield os.path.join(root, fname)


 def get_audio_from_dir(dirpath):
    fpaths = scan_dir_audio(dirpath)
-    return [Audio('file://' + os.path.realpath(u),
-                  date=datetime.datetime.fromtimestamp(os.path.getmtime(u)).
-                  replace(tzinfo=datetime.timezone.utc))
-            for u in fpaths]
+    return [
+        Audio(
+            "file://" + os.path.realpath(u),
+            date=datetime.datetime.fromtimestamp(os.path.getmtime(u)).replace(
+                tzinfo=datetime.timezone.utc
+            ),
+        )
+        for u in fpaths
+    ]


 def get_item_date(el):
-    el_date = el.find('pubdate')
+    el_date = el.find("pubdate")
    if el_date is not None:
-        return datetime.datetime.strptime(
-            el_date.text, '%Y-%m-%dT%H:%M:%S%z')
+        return datetime.datetime.strptime(el_date.text, "%Y-%m-%dT%H:%M:%S%z")
    return None


 def get_urls(tree):
-    items = tree.xpath('//item')
+    items = tree.xpath("//item")
    for it in items:
-        title = it.find('title').text
-        el_body = it.find('description')
+        title = it.find("title").text
+        el_body = it.find("description")
        if el_body is not None:
            url = el_body.text
            try:
                audio = get_audio_from_description(url)
            except Exception as exc:
-                logging.info('error getting duration for `%s`' % title)
+                logging.info("error getting duration for `%s`" % title)
                continue
            if audio.date is None:
                audio.date = get_item_date(it)
@@ -242,12 +253,12 @@ def get_urls(tree):

 def get_grouped_urls(tree):
    groups = OrderedDict()
-    items = tree.xpath('//item')
+    items = tree.xpath("//item")
    for item in items:
-        guid = item.xpath('guid')[0].text.strip()
+        guid = item.xpath("guid")[0].text.strip()
        if guid not in groups:
            groups[guid] = AudioGroup(guid)
-        audio = get_audio_from_description(item.xpath('description')[0].text)
+        audio = get_audio_from_description(item.xpath("description")[0].text)
        audio.date = get_item_date(item)
        if audio.valid:
            groups[guid].append(audio)
@@ -256,17 +267,17 @@ def get_grouped_urls(tree):

 def get_duration(url):
    try:
-        lineout = check_output(['ffprobe', '-v', 'error',
-                                '-show_entries', 'format=duration',
-                                '-i', url]).split(b'\n')
+        lineout = check_output(
+            ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-i", url]
+        ).split(b"\n")
    except CalledProcessError as exc:
-        raise ValueError('error probing `%s`' % url) from exc
-    duration = next(l for l in lineout if l.startswith(b'duration='))
-    value = duration.split(b'=')[1]
+        raise ValueError("error probing `%s`" % url) from exc
+    duration = next(l for l in lineout if l.startswith(b"duration="))
+    value = duration.split(b"=")[1]
    return int(float(value))


-HELP = '''
+HELP = """
 Collect audio informations from multiple sources (XML feeds).
 Audios are (in that order):
 1. Collected from feeds; (grouped by article if --group is used)
@@ -274,52 +285,79 @@ Audios are (in that order):
 3. Sorted; even randomly
 4. Sliced; take HOWMANY elements, skipping START elements
 5. (if --copy) Copied
-Usage: '''
+Usage: """


 def get_parser():
    p = ArgumentParser(HELP)
-    src = p.add_argument_group('sources', 'How to deal with sources')
-    src.add_argument('--source-weights',
-                     help='Select only one "source" based on this weights')
-    src.add_argument('--group', default=False, action='store_true',
-                     help='Group audios that belong to the same article')
+    src = p.add_argument_group("sources", "How to deal with sources")
+    src.add_argument(
+        "--source-weights", help='Select only one "source" based on this weights'
+    )
+    src.add_argument(
+        "--group",
+        default=False,
+        action="store_true",
+        help="Group audios that belong to the same article",
+    )

-    filters = p.add_argument_group('filters', 'Select only items that match '
-                                   'these conditions')
-    filters.add_argument('--min-len', default=0, type=DurationType,
-                         help='Exclude any audio that is shorter '
-                         'than MIN_LEN seconds')
-    filters.add_argument('--max-len', default=0, type=DurationType,
-                         help='Exclude any audio that is longer '
-                         'than MAX_LEN seconds')
-    filters.add_argument('--sort-by', default='no', type=str,
-                         choices=('random', 'date', 'duration'))
-    filters.add_argument('--reverse', default=False,
-                         action='store_true', help='Reverse list order')
+    filters = p.add_argument_group(
+        "filters", "Select only items that match " "these conditions"
+    )
+    filters.add_argument(
+        "--min-len",
+        default=0,
+        type=DurationType,
+        help="Exclude any audio that is shorter " "than MIN_LEN seconds",
+    )
+    filters.add_argument(
+        "--max-len",
+        default=0,
+        type=DurationType,
+        help="Exclude any audio that is longer " "than MAX_LEN seconds",
+    )
+    filters.add_argument(
+        "--sort-by", default="no", type=str, choices=("random", "date", "duration")
+    )
+    filters.add_argument(
+        "--reverse", default=False, action="store_true", help="Reverse list order"
+    )

-    filters.add_argument('--min-age', default=datetime.timedelta(),
-                         type=TimeDeltaType,
-                         help='Exclude audio more recent than MIN_AGE')
-    filters.add_argument('--max-age', default=datetime.timedelta(),
-                         type=TimeDeltaType,
-                         help='Exclude audio older than MAX_AGE')
+    filters.add_argument(
+        "--min-age",
+        default=datetime.timedelta(),
+        type=TimeDeltaType,
+        help="Exclude audio more recent than MIN_AGE",
+    )
+    filters.add_argument(
+        "--max-age",
+        default=datetime.timedelta(),
+        type=TimeDeltaType,
+        help="Exclude audio older than MAX_AGE",
+    )

-    p.add_argument('--start', default=0, type=int,
-                   help='0-indexed start number. '
-                   'By default, play from most recent')
-    p.add_argument('--howmany', default=1, type=int,
-                   help='If not specified, only 1 will be played')
-    p.add_argument('--slotsize', type=int,
-                   help='Seconds between each audio. Still unsupported')
+    p.add_argument(
+        "--start",
+        default=0,
+        type=int,
+        help="0-indexed start number. " "By default, play from most recent",
+    )
+    p.add_argument(
+        "--howmany", default=1, type=int, help="If not specified, only 1 will be played"
+    )
+    p.add_argument(
+        "--slotsize", type=int, help="Seconds between each audio. Still unsupported"
+    )

-    general = p.add_argument_group('general', 'General options')
-    general.add_argument('--copy', help='Copy files to $TMPDIR', default=False,
-                         action='store_true')
-    general.add_argument('--debug', help='Debug messages', default=False,
-                         action='store_true')
+    general = p.add_argument_group("general", "General options")
+    general.add_argument(
+        "--copy", help="Copy files to $TMPDIR", default=False, action="store_true"
+    )
+    general.add_argument(
+        "--debug", help="Debug messages", default=False, action="store_true"
+    )

-    p.add_argument('urls', metavar='URL', nargs='+')
+    p.add_argument("urls", metavar="URL", nargs="+")
    return p


@@ -329,21 +367,66 @@ def put(audio, copy=False):
            print(url)
    else:
        for url in audio.urls:
-            if url.split(':')[0] in ('http', 'https'):
-                destdir = (os.environ.get('TMPDIR', '.'))
+            if url.split(":")[0] in ("http", "https"):
+                destdir = os.environ.get("TMPDIR", ".")
                fname = posixpath.basename(urlparse(url).path)
                # sanitize
-                fname = "".join(c for c in fname
-                                if c.isalnum() or c in list('._-')).rstrip()
+                fname = "".join(
+                    c for c in fname if c.isalnum() or c in list("._-")
+                ).rstrip()
                dest = os.path.join(destdir, fname)
                os.makedirs(destdir, exist_ok=True)
                fname, headers = urllib.request.urlretrieve(url, dest)
-                print('file://%s' % os.path.realpath(fname))
+                print("file://%s" % os.path.realpath(fname))
            else:
                # FIXME: file:// urls are just copied
                print(url)


+def retrieve(url, args):
+    """
+    returns a list of Audios or a list of AudioGroups
+    """
+    if not args.group:
+        if os.path.isdir(url):
+            audiodir = get_audio_from_dir(url)
+            return audiodir
+        elif url.startswith("http:") or url.startswith("https:") or os.path.isfile(url):
+            return get_urls(get_tree(url))
+        else:
+            logging.info("unsupported url `%s`", url)
+            return []
+    else:  # group
+        if os.path.isdir(url):
+            audiodir = get_audio_from_dir(url)
+            agroups = []
+            for a in audiodir:
+                ag = AudioGroup(os.path.basename(a.url))
+                ag.append(a)
+                agroups.append(ag)
+            return agroups
+        elif url.startswith("http:") or url.startswith("https:") or os.path.isfile(url):
+            groups = get_grouped_urls(get_tree(url))
+            return groups.values()
+        else:
+            logging.info("unsupported url `%s`", url)
+            return []
+
+
+def audio_passes_filters(audio, args):
+    if not audio.valid:
+        return False
+    if args.max_len and audio.duration > args.max_len:
+        return False
+    if args.min_len and audio.duration < args.min_len:
+        return False
+    if args.min_age.total_seconds() and audio.age < args.min_age:
+        return False
+    if args.max_age.total_seconds() and audio.age > args.max_age:
+        return False
+    return True
+
+
 def main():
    parser = get_parser()
    args = parser.parse_args()
@@ -354,75 +437,32 @@ def main():
    sources = args.urls

    if args.source_weights:
-        weights = tuple(map(int, args.source_weights.split(':')))
+        weights = tuple(map(int, args.source_weights.split(":")))
        if len(weights) != len(sources):
-            parser.exit(status=2, message='Weight must be in the'
-                        ' same number as sources\n')
+            parser.exit(
+                status=2, message="Weight must be in the" " same number as sources\n"
+            )
        sources = [weighted_choice(sources, weights)]

    audios = []
    for url in sources:
-        if not args.group:
-            if os.path.isdir(url):
-                audiodir = get_audio_from_dir(url)
-                audios += audiodir
-            elif url.startswith('http:') or url.startswith('https:') \
-                    or os.path.isfile(url):
-                audios += get_urls(get_tree(url))
-            else:
-                logging.info('unsupported url `%s`', url)
-            audios = [audio for audio in audios if
-                      (audio.valid) and
-                      (args.max_len == 0 or
-                       audio.duration <= args.max_len) and
-                      (args.min_len == 0 or
-                       audio.duration >= args.min_len) and
-                      (args.min_age.total_seconds() == 0 or
-                       audio.age >= args.min_age) and
-                      (args.max_age.total_seconds() == 0 or
-                       audio.age <= args.max_age)
-                      ]
-        else:  # group
-            if os.path.isdir(url):
-                audiodir = get_audio_from_dir(url)
-                agroups = []
-                for a in audiodir:
-                    ag = AudioGroup(os.path.basename(a.url))
-                    ag.append(a)
-                    agroups.append(ag)
-            elif url.startswith('http:') or url.startswith('https:') \
-                    or os.path.isfile(url):
-                groups = get_grouped_urls(get_tree(url))
-                agroups = groups.values()
-            else:
-                logging.info('unsupported url `%s`', url)
-            audios += [g for g in agroups
-                       if
-                       (g.valid) and
-                       (args.max_len == 0 or
-                        g.duration <= args.max_len) and
-                       (args.min_len == 0 or
-                        g.duration >= args.max_len) and
-                       (args.min_age.total_seconds() == 0 or
-                        g.age >= args.min_age) and
-                       (args.max_age.total_seconds() == 0 or
-                        g.age <= args.max_age)
-                       ]
+        url_audios = retrieve(url, args)
+        audios += [au for au in url_audios if audio_passes_filters(au, args)]

    # sort
-    if args.sort_by == 'random':
+    if args.sort_by == "random":
        random.shuffle(audios)
-    elif args.sort_by == 'date':
+    elif args.sort_by == "date":
        audios.sort(key=lambda x: x.age)
-    elif args.sort_by == 'duration':
+    elif args.sort_by == "duration":
        audios.sort(key=lambda x: x.duration)

    if args.reverse:
        audios.reverse()

    # slice
-    audios = audios[args.start:]
-    audios = audios[:args.howmany]
+    audios = audios[args.start :]
+    audios = audios[: args.howmany]

    # the for loop excludes the last one
    # this is to support  the --slotsize option
@@ -436,13 +476,14 @@ def main():
        if args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
-                print('## musica per {} secondi'
-                      .format(args.slotsize - duration))
+                print("## musica per {} secondi".format(args.slotsize - duration))
    # finally, the last one
    if args.debug:
        print(repr(audios[-1]))
    else:
        put(audios[-1], args.copy)
+
+
 #     else:  # grouping; TODO: support slotsize
 #         for item in groups:
 #             if args.debug:
@@ -450,5 +491,5 @@ def main():
 #             print(groups[item])


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()