semplifica audio/audiogroup

ora il codice di filtraggio è uguale
2020-07-21 13:18:37 +02:00 · 2020-07-21 13:18:37 +02:00 · daeb2ff3db
commit daeb2ff3db
parent 0ebb62c318
1 changed files with 215 additions and 174 deletions
--- a/389
+++ b/389
@ -1,31 +1,31 @@
 #!/usr/bin/env python3
-'''
+"""
 Feed parser with many features
 from a feed, it supports filtering, subslicing, random picking
 Beside feeds, it supports picking files from directories
-'''
+"""
-import os
+import datetime
 import logging
-from argparse import ArgumentParser, ArgumentTypeError
+import os
 from subprocess import check_output, CalledProcessError
 from collections import OrderedDict
 import re
 import urllib.request
 from urllib.parse import urlparse, unquote
 import posixpath
 import random
 import re
 import urllib.request
 from argparse import ArgumentParser, ArgumentTypeError
 from bisect import bisect
-import datetime
+from collections import OrderedDict
 from subprocess import CalledProcessError, check_output
 from urllib.parse import unquote, urlparse
 from lxml import html
 import requests
 from lxml import html
 from pytimeparse.timeparse import timeparse
 def get_int(s):
-    return int(re.findall(r'\d+', s)[0])
+    return int(re.findall(r"\d+", s)[0])
 def DurationType(arg):
@ -34,27 +34,28 @@ def DurationType(arg):
    else:
        secs = timeparse(arg)
        if secs is None:
-            raise ArgumentTypeError('%r is not a valid duration' % arg)
+            raise ArgumentTypeError("%r is not a valid duration" % arg)
    return secs
 def TimeDeltaType(arg):
    if arg.isdecimal():
        secs = int(arg)
    else:
        secs = timeparse(arg)
        if secs is None:
-            raise ArgumentTypeError('%r is not a valid time range' % arg)
+            raise ArgumentTypeError("%r is not a valid time range" % arg)
    return datetime.timedelta(seconds=secs)
 def weighted_choice(values, weights):
-    '''
+    """
    random.choice with weights
    weights must be integers greater than 0.
    Their meaning is "relative", that is [1,2,3] is the same as [2,4,6]
-    '''
+    """
    assert len(values) == len(weights)
    total = 0
    cum_weights = []
@ -68,19 +69,19 @@ def weighted_choice(values, weights):
 def delta_humanreadable(tdelta):
    if tdelta is None:
-        return ''
+        return ""
    days = tdelta.days
    hours = (tdelta - datetime.timedelta(days=days)).seconds // 3600
    if days:
-        return '{}d{}h'.format(days, hours)
+        return "{}d{}h".format(days, hours)
-    return '{}h'.format(hours)
+    return "{}h".format(hours)
 class Audio(object):
    def __init__(self, url, duration=None, date=None):
        self.url = url
        if duration is None:
-            duration = get_duration(url.encode('utf-8'))
+            duration = get_duration(url.encode("utf-8"))
        self.duration = duration
        self.date = date
        self.end_date = datetime.datetime(9999, 12, 31, tzinfo=datetime.timezone.utc)
@ -89,8 +90,9 @@ class Audio(object):
        return self.url
    def __repr__(self):
-        return '<Audio {} ({} {})>'.format(self.url, self.duration,
+        return "<Audio {} ({} {})>".format(
-                                           delta_humanreadable(self.age))
+            self.url, self.duration, delta_humanreadable(self.age)
        )
    @property
    def urls(self):
@ -106,12 +108,14 @@ class Audio(object):
    @property
    def valid(self):
-        return self.end_date >= datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
+        return self.end_date >= datetime.datetime.utcnow().replace(
            tzinfo=datetime.timezone.utc
        )
 class AudioGroup(list):
    def __init__(self, description=None):
-        self.description = description or ''
+        self.description = description or ""
        self.audios = []
    def __len__(self):
@ -121,13 +125,15 @@ class AudioGroup(list):
        self.audios.append(arg)
    def __str__(self):
-        return '\n'.join(str(a) for a in self.audios)
+        return "\n".join(str(a) for a in self.audios)
    def __repr__(self):
-        return '<AudioGroup "{}" ({} {})\n{} >'.\
+        return '<AudioGroup "{}" ({} {})\n{} >'.format(
-                format(self.description, self.duration,
+            self.description,
-                       delta_humanreadable(self.age),
+            self.duration,
-                       '\n'.join('   ' + repr(a) for a in self.audios))
+            delta_humanreadable(self.age),
            "\n".join("   " + repr(a) for a in self.audios),
        )
    @property
    def duration(self):
@ -140,7 +146,7 @@ class AudioGroup(list):
    @property
    def date(self):
        for a in self.audios:
-            if hasattr(a, 'date'):
+            if hasattr(a, "date"):
                return a.date
        return None
@ -157,9 +163,8 @@ class AudioGroup(list):
        return len(self.audios) > 0
 def get_tree(feed_url):
-    if feed_url.startswith('http:') or feed_url.startswith('https:'):
+    if feed_url.startswith("http:") or feed_url.startswith("https:"):
        tree = html.fromstring(requests.get(feed_url).content)
    else:
        if not os.path.exists(feed_url):
@ -170,70 +175,76 @@ def get_tree(feed_url):
 def get_audio_from_description(text):
    # non-empty lines
-    lines = [line.strip()
+    lines = [line.strip() for line in text.split("\n") if line.strip()]
             for line in text.split('\n')
             if line.strip()]
    url = lines[0]
    duration = None
    metadata = {}
-    for line in text.split('\n')[1:]:
+    for line in text.split("\n")[1:]:
-        if line.strip() and '=' in line:
+        if line.strip() and "=" in line:
-            metadata[line.split('=')[0]] = line.split('=')[1]
+            metadata[line.split("=")[0]] = line.split("=")[1]
-    if 'durata' in metadata:
+    if "durata" in metadata:
-        metadata['durata'] = get_int(metadata['durata'])
+        metadata["durata"] = get_int(metadata["durata"])
-    if 'txdate' in metadata:
+    if "txdate" in metadata:
        try:
-            metadata['txdate'] = datetime.datetime.strptime(
+            metadata["txdate"] = datetime.datetime.strptime(
-                metadata['txdate'], '%Y-%m-%dT%H:%M:%S%z')
+                metadata["txdate"], "%Y-%m-%dT%H:%M:%S%z"
            )
        except ValueError:
-            logging.warning('could not parse txdate %s', metadata['txdate'])
+            logging.warning("could not parse txdate %s", metadata["txdate"])
-            del metadata['txdate']
+            del metadata["txdate"]
-    a = Audio(unquote(url),
+    a = Audio(
-              duration=metadata.get('durata', None),
+        unquote(url),
-              date=metadata.get('txdate', None))
+        duration=metadata.get("durata", None),
        date=metadata.get("txdate", None),
    )
-    if 'txdate' in metadata and 'replica' in metadata:
+    if "txdate" in metadata and "replica" in metadata:
-        if metadata['replica'].endswith('g'):
+        if metadata["replica"].endswith("g"):
-            a.end_date = metadata['txdate'] + datetime.timedelta(
+            a.end_date = metadata["txdate"] + datetime.timedelta(
-                days=get_int(metadata['replica']))
+                days=get_int(metadata["replica"])
            )
    return a
 # copied from larigira.fsutils
-def scan_dir_audio(dirname, extensions=('mp3', 'oga', 'wav', 'ogg')):
+def scan_dir_audio(dirname, extensions=("mp3", "oga", "wav", "ogg")):
    for root, dirnames, filenames in os.walk(dirname):
        for fname in filenames:
-            if fname.split('.')[-1].lower() in extensions:
+            if fname.split(".")[-1].lower() in extensions:
                yield os.path.join(root, fname)
 def get_audio_from_dir(dirpath):
    fpaths = scan_dir_audio(dirpath)
-    return [Audio('file://' + os.path.realpath(u),
+    return [
-                  date=datetime.datetime.fromtimestamp(os.path.getmtime(u)).
+        Audio(
-                  replace(tzinfo=datetime.timezone.utc))
+            "file://" + os.path.realpath(u),
-            for u in fpaths]
+            date=datetime.datetime.fromtimestamp(os.path.getmtime(u)).replace(
                tzinfo=datetime.timezone.utc
            ),
        )
        for u in fpaths
    ]
 def get_item_date(el):
-    el_date = el.find('pubdate')
+    el_date = el.find("pubdate")
    if el_date is not None:
-        return datetime.datetime.strptime(
+        return datetime.datetime.strptime(el_date.text, "%Y-%m-%dT%H:%M:%S%z")
            el_date.text, '%Y-%m-%dT%H:%M:%S%z')
    return None
 def get_urls(tree):
-    items = tree.xpath('//item')
+    items = tree.xpath("//item")
    for it in items:
-        title = it.find('title').text
+        title = it.find("title").text
-        el_body = it.find('description')
+        el_body = it.find("description")
        if el_body is not None:
            url = el_body.text
            try:
                audio = get_audio_from_description(url)
            except Exception as exc:
-                logging.info('error getting duration for `%s`' % title)
+                logging.info("error getting duration for `%s`" % title)
                continue
            if audio.date is None:
                audio.date = get_item_date(it)
@ -242,12 +253,12 @@ def get_urls(tree):
 def get_grouped_urls(tree):
    groups = OrderedDict()
-    items = tree.xpath('//item')
+    items = tree.xpath("//item")
    for item in items:
-        guid = item.xpath('guid')[0].text.strip()
+        guid = item.xpath("guid")[0].text.strip()
        if guid not in groups:
            groups[guid] = AudioGroup(guid)
-        audio = get_audio_from_description(item.xpath('description')[0].text)
+        audio = get_audio_from_description(item.xpath("description")[0].text)
        audio.date = get_item_date(item)
        if audio.valid:
            groups[guid].append(audio)
@ -256,17 +267,17 @@ def get_grouped_urls(tree):
 def get_duration(url):
    try:
-        lineout = check_output(['ffprobe', '-v', 'error',
+        lineout = check_output(
-                                '-show_entries', 'format=duration',
+            ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-i", url]
-                                '-i', url]).split(b'\n')
+        ).split(b"\n")
    except CalledProcessError as exc:
-        raise ValueError('error probing `%s`' % url) from exc
+        raise ValueError("error probing `%s`" % url) from exc
-    duration = next(l for l in lineout if l.startswith(b'duration='))
+    duration = next(l for l in lineout if l.startswith(b"duration="))
-    value = duration.split(b'=')[1]
+    value = duration.split(b"=")[1]
    return int(float(value))
-HELP = '''
+HELP = """
 Collect audio informations from multiple sources (XML feeds).
 Audios are (in that order):
 1. Collected from feeds; (grouped by article if --group is used)
@ -274,52 +285,79 @@ Audios are (in that order):
 3. Sorted; even randomly
 4. Sliced; take HOWMANY elements, skipping START elements
 5. (if --copy) Copied
-Usage: '''
+Usage: """
 def get_parser():
    p = ArgumentParser(HELP)
-    src = p.add_argument_group('sources', 'How to deal with sources')
+    src = p.add_argument_group("sources", "How to deal with sources")
-    src.add_argument('--source-weights',
+    src.add_argument(
-                     help='Select only one "source" based on this weights')
+        "--source-weights", help='Select only one "source" based on this weights'
-    src.add_argument('--group', default=False, action='store_true',
+    )
-                     help='Group audios that belong to the same article')
+    src.add_argument(
        "--group",
        default=False,
        action="store_true",
        help="Group audios that belong to the same article",
    )
-    filters = p.add_argument_group('filters', 'Select only items that match '
+    filters = p.add_argument_group(
-                                   'these conditions')
+        "filters", "Select only items that match " "these conditions"
-    filters.add_argument('--min-len', default=0, type=DurationType,
+    )
-                         help='Exclude any audio that is shorter '
+    filters.add_argument(
-                         'than MIN_LEN seconds')
+        "--min-len",
-    filters.add_argument('--max-len', default=0, type=DurationType,
+        default=0,
-                         help='Exclude any audio that is longer '
+        type=DurationType,
-                         'than MAX_LEN seconds')
+        help="Exclude any audio that is shorter " "than MIN_LEN seconds",
-    filters.add_argument('--sort-by', default='no', type=str,
+    )
-                         choices=('random', 'date', 'duration'))
+    filters.add_argument(
-    filters.add_argument('--reverse', default=False,
+        "--max-len",
-                         action='store_true', help='Reverse list order')
+        default=0,
        type=DurationType,
        help="Exclude any audio that is longer " "than MAX_LEN seconds",
    )
    filters.add_argument(
        "--sort-by", default="no", type=str, choices=("random", "date", "duration")
    )
    filters.add_argument(
        "--reverse", default=False, action="store_true", help="Reverse list order"
    )
-    filters.add_argument('--min-age', default=datetime.timedelta(),
+    filters.add_argument(
-                         type=TimeDeltaType,
+        "--min-age",
-                         help='Exclude audio more recent than MIN_AGE')
+        default=datetime.timedelta(),
-    filters.add_argument('--max-age', default=datetime.timedelta(),
+        type=TimeDeltaType,
-                         type=TimeDeltaType,
+        help="Exclude audio more recent than MIN_AGE",
-                         help='Exclude audio older than MAX_AGE')
+    )
    filters.add_argument(
        "--max-age",
        default=datetime.timedelta(),
        type=TimeDeltaType,
        help="Exclude audio older than MAX_AGE",
    )
-    p.add_argument('--start', default=0, type=int,
+    p.add_argument(
-                   help='0-indexed start number. '
+        "--start",
-                   'By default, play from most recent')
+        default=0,
-    p.add_argument('--howmany', default=1, type=int,
+        type=int,
-                   help='If not specified, only 1 will be played')
+        help="0-indexed start number. " "By default, play from most recent",
-    p.add_argument('--slotsize', type=int,
+    )
-                   help='Seconds between each audio. Still unsupported')
+    p.add_argument(
        "--howmany", default=1, type=int, help="If not specified, only 1 will be played"
    )
    p.add_argument(
        "--slotsize", type=int, help="Seconds between each audio. Still unsupported"
    )
-    general = p.add_argument_group('general', 'General options')
+    general = p.add_argument_group("general", "General options")
-    general.add_argument('--copy', help='Copy files to $TMPDIR', default=False,
+    general.add_argument(
-                         action='store_true')
+        "--copy", help="Copy files to $TMPDIR", default=False, action="store_true"
-    general.add_argument('--debug', help='Debug messages', default=False,
+    )
-                         action='store_true')
+    general.add_argument(
        "--debug", help="Debug messages", default=False, action="store_true"
    )
-    p.add_argument('urls', metavar='URL', nargs='+')
+    p.add_argument("urls", metavar="URL", nargs="+")
    return p
@ -329,21 +367,66 @@ def put(audio, copy=False):
            print(url)
    else:
        for url in audio.urls:
-            if url.split(':')[0] in ('http', 'https'):
+            if url.split(":")[0] in ("http", "https"):
-                destdir = (os.environ.get('TMPDIR', '.'))
+                destdir = os.environ.get("TMPDIR", ".")
                fname = posixpath.basename(urlparse(url).path)
                # sanitize
-                fname = "".join(c for c in fname
+                fname = "".join(
-                                if c.isalnum() or c in list('._-')).rstrip()
+                    c for c in fname if c.isalnum() or c in list("._-")
                ).rstrip()
                dest = os.path.join(destdir, fname)
                os.makedirs(destdir, exist_ok=True)
                fname, headers = urllib.request.urlretrieve(url, dest)
-                print('file://%s' % os.path.realpath(fname))
+                print("file://%s" % os.path.realpath(fname))
            else:
                # FIXME: file:// urls are just copied
                print(url)
 def retrieve(url, args):
    """
    Collect the audios available at a single source.

    The source may be a local directory (scanned through
    get_audio_from_dir) or a feed, given either as an http(s) URL or as
    the path of an existing file (parsed through get_tree).

    url:  the source location
    args: parsed command-line options; only ``args.group`` is read here

    Always returns a real list: a list of Audio when ``args.group`` is
    false, a list of AudioGroup otherwise.  An unsupported source is
    logged and produces an empty list.
    """
    if os.path.isdir(url):
        audiodir = get_audio_from_dir(url)
        if not args.group:
            return audiodir
        # a directory has no notion of "article": every file becomes
        # its own single-element group, described by its file name
        agroups = []
        for a in audiodir:
            ag = AudioGroup(os.path.basename(a.url))
            ag.append(a)
            agroups.append(ag)
        return agroups

    if url.startswith(("http:", "https:")) or os.path.isfile(url):
        tree = get_tree(url)
        if args.group:
            # .values() is only a view on the dict: materialize it so
            # that the documented "list" return type actually holds
            return list(get_grouped_urls(tree).values())
        # list() keeps the return type uniform whatever kind of
        # iterable get_urls produces
        return list(get_urls(tree))

    logging.info("unsupported url `%s`", url)
    return []
 def audio_passes_filters(audio, args):
    """
    Tell whether ``audio`` survives the filters selected in ``args``.

    ``audio`` must expose ``valid``, ``duration`` and ``age``; ``args``
    must expose ``max_len``, ``min_len``, ``min_age`` and ``max_age``.
    A length limit of 0 and an age limit of zero seconds mean "filter
    disabled": in that case the matching attribute of ``audio`` is not
    even read.

    Returns True when the audio must be kept, False otherwise.
    """
    # checks are chained with short-circuit operators, so they run in
    # this exact order and stop at the first one that rejects the audio
    return bool(
        audio.valid
        and not (args.max_len and audio.duration > args.max_len)
        and not (args.min_len and audio.duration < args.min_len)
        and not (args.min_age.total_seconds() and audio.age < args.min_age)
        and not (args.max_age.total_seconds() and audio.age > args.max_age)
    )
 def main():
    parser = get_parser()
    args = parser.parse_args()
@ -354,75 +437,32 @@ def main():
    sources = args.urls
    if args.source_weights:
-        weights = tuple(map(int, args.source_weights.split(':')))
+        weights = tuple(map(int, args.source_weights.split(":")))
        if len(weights) != len(sources):
-            parser.exit(status=2, message='Weight must be in the'
+            parser.exit(
-                        ' same number as sources\n')
+                status=2, message="Weight must be in the" " same number as sources\n"
            )
        sources = [weighted_choice(sources, weights)]
    audios = []
    for url in sources:
-        if not args.group:
+        url_audios = retrieve(url, args)
-            if os.path.isdir(url):
+        audios += [au for au in url_audios if audio_passes_filters(au, args)]
                audiodir = get_audio_from_dir(url)
                audios += audiodir
            elif url.startswith('http:') or url.startswith('https:') \
                    or os.path.isfile(url):
                audios += get_urls(get_tree(url))
            else:
                logging.info('unsupported url `%s`', url)
            audios = [audio for audio in audios if
                      (audio.valid) and
                      (args.max_len == 0 or
                       audio.duration <= args.max_len) and
                      (args.min_len == 0 or
                       audio.duration >= args.min_len) and
                      (args.min_age.total_seconds() == 0 or
                       audio.age >= args.min_age) and
                      (args.max_age.total_seconds() == 0 or
                       audio.age <= args.max_age)
                      ]
        else:  # group
            if os.path.isdir(url):
                audiodir = get_audio_from_dir(url)
                agroups = []
                for a in audiodir:
                    ag = AudioGroup(os.path.basename(a.url))
                    ag.append(a)
                    agroups.append(ag)
            elif url.startswith('http:') or url.startswith('https:') \
                    or os.path.isfile(url):
                groups = get_grouped_urls(get_tree(url))
                agroups = groups.values()
            else:
                logging.info('unsupported url `%s`', url)
            audios += [g for g in agroups
                       if
                       (g.valid) and
                       (args.max_len == 0 or
                        g.duration <= args.max_len) and
                       (args.min_len == 0 or
                        g.duration >= args.max_len) and
                       (args.min_age.total_seconds() == 0 or
                        g.age >= args.min_age) and
                       (args.max_age.total_seconds() == 0 or
                        g.age <= args.max_age)
                       ]
    # sort
-    if args.sort_by == 'random':
+    if args.sort_by == "random":
        random.shuffle(audios)
-    elif args.sort_by == 'date':
+    elif args.sort_by == "date":
        audios.sort(key=lambda x: x.age)
-    elif args.sort_by == 'duration':
+    elif args.sort_by == "duration":
        audios.sort(key=lambda x: x.duration)
    if args.reverse:
        audios.reverse()
    # slice
-    audios = audios[args.start:]
+    audios = audios[args.start :]
-    audios = audios[:args.howmany]
+    audios = audios[: args.howmany]
    # the for loop excludes the last one
    # this is to support  the --slotsize option
@ -436,13 +476,14 @@ def main():
        if args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
-                print('## musica per {} secondi'
+                print("## musica per {} secondi".format(args.slotsize - duration))
                      .format(args.slotsize - duration))
    # finally, the last one
    if args.debug:
        print(repr(audios[-1]))
    else:
        put(audios[-1], args.copy)
 #     else:  # grouping; TODO: support slotsize
 #         for item in groups:
 #             if args.debug:
@ -450,5 +491,5 @@ def main():
 #             print(groups[item])
-if __name__ == '__main__':
+if __name__ == "__main__":
    main()