feed 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. #!/usr/bin/env python3
'''
Feed parser with many features.

Given a feed, it supports filtering, subslicing and random picking.
Besides feeds, it supports picking files from directories.
'''
  7. import os
  8. import logging
  9. from argparse import ArgumentParser
  10. from subprocess import check_output
  11. from collections import OrderedDict
  12. import re
  13. import urllib.request
  14. from urllib.parse import urlparse, unquote
  15. import posixpath
  16. import random
  17. from bisect import bisect
  18. from lxml import html
  19. import requests
  20. def weighted_choice(values, weights):
  21. '''
  22. random.choice with weights
  23. weights must be integers greater than 0.
  24. Their meaning is "relative", that is [1,2,3] is the same as [2,4,6]
  25. '''
  26. assert len(values) == len(weights)
  27. total = 0
  28. cum_weights = []
  29. for w in weights:
  30. total += w
  31. cum_weights.append(total)
  32. x = random.random() * total
  33. i = bisect(cum_weights, x)
  34. return values[i]
  35. class Audio(object):
  36. def __init__(self, url, duration=None):
  37. self.url = url
  38. if duration is None:
  39. duration = get_duration(url.encode('utf-8'))
  40. self.duration = duration
  41. def __str__(self):
  42. return self.url
  43. def __repr__(self):
  44. return '<Audio {} ({})>'.format(self.url, self.duration)
  45. @property
  46. def urls(self):
  47. return [self.url]
  48. class AudioGroup(list):
  49. def __init__(self, description=None):
  50. self.description = description or ''
  51. self.audios = []
  52. def __len__(self):
  53. return len(self.audios)
  54. def append(self, arg):
  55. self.audios.append(arg)
  56. def __str__(self):
  57. return '\n'.join(str(a) for a in self.audios)
  58. def __repr__(self):
  59. return '<AudioGroup "{}" ({})\n{} >'.\
  60. format(self.description, self.duration,
  61. '\n'.join(' ' + repr(a) for a in self.audios))
  62. @property
  63. def duration(self):
  64. return sum(a.duration for a in self.audios if a.duration is not None)
  65. @property
  66. def urls(self):
  67. return [a.url for a in self.audios]
  68. def get_tree(feed_url):
  69. if feed_url.startswith('http:') or feed_url.startswith('https:'):
  70. tree = html.fromstring(requests.get(feed_url).content)
  71. else:
  72. if not os.path.exists(feed_url):
  73. raise ValueError("file not found: {}".format(feed_url))
  74. tree = html.parse(open(feed_url))
  75. return tree
  76. def get_audio_from_description(text):
  77. # non-empty lines
  78. lines = [line.strip()
  79. for line in text.split('\n')
  80. if line.strip()]
  81. url = lines[0]
  82. duration = None
  83. if len(lines) > 1:
  84. duration = int(re.findall(r'\d+', lines[1].split('=')[1].strip())[0])
  85. return Audio(unquote(url), duration)
  86. # copied from larigira.fsutils
  87. def scan_dir_audio(dirname, extensions=('mp3', 'oga', 'wav', 'ogg')):
  88. for root, dirnames, filenames in os.walk(dirname):
  89. for fname in filenames:
  90. if fname.split('.')[-1].lower() in extensions:
  91. yield os.path.join(root, fname)
  92. def get_audio_from_dir(dirpath):
  93. fpaths = scan_dir_audio(dirpath)
  94. return [Audio('file://' + os.path.realpath(u)) for u in fpaths]
  95. def get_urls(tree):
  96. urls = tree.xpath('//item/description')
  97. for url_elem in urls:
  98. yield get_audio_from_description(url_elem.text)
  99. def get_grouped_urls(tree):
  100. groups = OrderedDict()
  101. items = tree.xpath('//item')
  102. for item in items:
  103. guid = item.xpath('guid')[0].text.strip()
  104. if guid not in groups:
  105. groups[guid] = AudioGroup(guid)
  106. groups[guid].append(get_audio_from_description(
  107. item.xpath('description')[0].text))
  108. return groups
  109. def get_duration(url):
  110. lineout = check_output(['ffprobe', '-v', 'error',
  111. '-show_entries', 'format=duration',
  112. '-i', url]).split(b'\n')
  113. duration = next(l for l in lineout if l.startswith(b'duration='))
  114. value = duration.split(b'=')[1]
  115. return int(float(value))
# Text for the command-line parser (see get_parser); the numbered steps
# document the processing pipeline in the order it is applied.
HELP = '''
Collect audio informations from multiple sources (XML feeds).
Audios are (in that order):
1. Collected from feeds; (grouped by article if --group is used)
2. Filtered; everything that does not match with requirements is excluded
3. Sorted; even randomly
4. Sliced; take HOWMANY elements, skipping START elements
5. (if --copy) Copied
Usage: '''
  125. def get_parser():
  126. p = ArgumentParser(HELP)
  127. src = p.add_argument_group('sources', 'How to deal with sources')
  128. src.add_argument('--source-weights',
  129. help='Select only one "source" based on this weights')
  130. src.add_argument('--group', default=False, action='store_true',
  131. help='Group audios that belong to the same article')
  132. filters = p.add_argument_group('filters', 'Select only items that match '
  133. 'these conditions')
  134. filters.add_argument('--max-len', default=0, type=int,
  135. help='Exclude any audio that is longer '
  136. 'than MAX_LEN seconds')
  137. filters.add_argument('--random', default=False,
  138. action='store_true', help='Pick randomly')
  139. filters.add_argument('--min-len', default=0, type=int,
  140. help='Exclude any audio that is shorter '
  141. 'than MIN_LEN seconds')
  142. p.add_argument('--start', default=0, type=int,
  143. help='0-indexed start number. '
  144. 'By default, play from most recent')
  145. p.add_argument('--howmany', default=1, type=int,
  146. help='If not specified, only 1 will be played')
  147. p.add_argument('--slotsize', type=int,
  148. help='Seconds between each audio. Still unsupported')
  149. general = p.add_argument_group('general', 'General options')
  150. general.add_argument('--copy', help='Copy files to $TMPDIR', default=False,
  151. action='store_true')
  152. general.add_argument('--debug', help='Debug messages', default=False,
  153. action='store_true')
  154. p.add_argument('urls', metavar='URL', nargs='+')
  155. return p
  156. def put(audio, copy=False):
  157. if not copy:
  158. for url in audio.urls:
  159. print(url)
  160. else:
  161. for url in audio.urls:
  162. if url.split(':')[0] in ('http', 'https'):
  163. destdir = (os.environ.get('TMPDIR', '.'))
  164. fname = posixpath.basename(urlparse(url).path)
  165. # sanitize
  166. fname = "".join(c for c in fname
  167. if c.isalnum() or c in list('._-')).rstrip()
  168. dest = os.path.join(destdir, fname)
  169. os.makedirs(destdir, exist_ok=True)
  170. fname, headers = urllib.request.urlretrieve(url, dest)
  171. print('file://%s' % os.path.realpath(fname))
  172. else:
  173. # FIXME: file:// urls are just copied
  174. print(url)
def main():
    '''Command-line entry point.

    Collects Audio/AudioGroup items from every source (feed URL, local
    feed file or directory), filters them by duration, slices/shuffles
    the result and prints it (downloading first, with --copy).
    '''
    parser = get_parser()
    args = parser.parse_args()
    # --debug also switches the output below from plain URLs to repr()
    if not args.debug:
        logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.DEBUG)
    sources = args.urls
    if args.source_weights:
        # one integer weight per source, colon-separated
        weights = tuple(map(int, args.source_weights.split(':')))
        if len(weights) != len(sources):
            parser.exit(status=2, message='Weight must be in the'
                        ' same number as sources\n')
        # keep a single source, picked at random with the given weights
        sources = [weighted_choice(sources, weights)]
    audios = []
    for url in sources:
        if url.startswith('http:') or url.startswith('https:') \
                or os.path.isfile(url):
            # download the feed
            tree = get_tree(url)
            if not args.group:
                # get audio urls, removing those that are too long/short
                audios += [audio for audio in get_urls(tree) if
                           (args.max_len == 0 or
                            audio.duration <= args.max_len) and
                           (args.min_len == 0 or
                            audio.duration >= args.min_len)
                           ]
            else:
                # NOTE(review): grouped mode only honours --max-len;
                # --min-len is not applied to groups — confirm intended
                groups = get_grouped_urls(tree)
                audios += [groups[g] for g in groups.keys()
                           if args.max_len == 0 or
                           groups[g].duration <= args.max_len
                           ]
        elif os.path.isdir(url):
            # local directory: every audio file found becomes a candidate
            audiodir = get_audio_from_dir(url)
            if not args.group:
                audios += audiodir
            else:
                # one single-audio group per file
                for a in audiodir:
                    ag = AudioGroup(os.path.basename(a.url))
                    ag.append(a)
                    audios.append(ag)
        else:
            logging.info('unsupported url `%s`', url)
    # slice: skip START items, then keep at most HOWMANY
    audios = audios[args.start:]
    if args.random:
        random.shuffle(audios)
    audios = audios[:args.howmany]
    # the for loop excludes the last one
    # this is to support the --slotsize option
    if not audios:
        return
    for audio in audios[:-1]:
        if args.debug:
            print(repr(audio))
        else:
            put(audio, args.copy)
        if args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
                # filler hint, in Italian: "music for N seconds"
                print('## musica per {} secondi'
                      .format(args.slotsize - duration))
    # finally, the last one
    if args.debug:
        print(repr(audios[-1]))
    else:
        put(audios[-1], args.copy)
    # else:  # grouping; TODO: support slotsize
    #     for item in groups:
    #         if args.debug:
    #             print('#', item, groups[item].duration)
    #         print(groups[item])
# script entry point; keeps the module importable without side effects
if __name__ == '__main__':
    main()