larigira-scripts/feed

#!/usr/bin/env python3
'''
Feed parser with many features

From a feed, it supports filtering, subslicing and random picking.

Besides feeds, it supports picking files from directories.
'''
import os
import logging
from argparse import ArgumentParser, ArgumentTypeError
from subprocess import check_output, CalledProcessError
from collections import OrderedDict
import re
import urllib.request
from urllib.parse import urlparse, unquote
import posixpath
import random
from bisect import bisect
import datetime

from lxml import html
import requests
from pytimeparse.timeparse import timeparse


def get_int(s):
    '''
    Return the first run of decimal digits found in `s`, as an integer

    Anything around the digits is ignored ('30 sec' -> 30, '7g' -> 7).
    Raises IndexError when `s` contains no digit at all.
    '''
    digit_runs = re.findall(r'\d+', s)
    first_run = digit_runs[0]
    return int(first_run)


def DurationType(arg):
    '''
    argparse type for a duration

    A string made only of decimal digits is taken as a number of seconds;
    anything else is handed to timeparse (from pytimeparse), whose result is
    returned unchanged.

    Raises ArgumentTypeError when timeparse cannot understand `arg`.
    '''
    if arg.isdecimal():
        return int(arg)
    parsed = timeparse(arg)
    if parsed is None:
        raise ArgumentTypeError('%r is not a valid duration' % arg)
    return parsed

def TimeDeltaType(arg):
    '''
    argparse type for a time range, returned as a datetime.timedelta

    A string made only of decimal digits is taken as a number of seconds;
    anything else is handed to timeparse (from pytimeparse).

    Raises ArgumentTypeError when timeparse cannot understand `arg`.
    '''
    if arg.isdecimal():
        return datetime.timedelta(seconds=int(arg))
    parsed = timeparse(arg)
    if parsed is None:
        raise ArgumentTypeError('%r is not a valid time range' % arg)
    return datetime.timedelta(seconds=parsed)


def weighted_choice(values, weights):
    '''
    random.choice with weights

    `weights` must have one entry per value. Weights must not be negative
    and at least one of them must be greater than 0; a value whose weight
    is 0 is never picked.

    Their meaning is "relative", that is [1,2,3] is the same as [2,4,6]

    Raises ValueError if the two sequences differ in length, if a weight is
    negative or if the weights sum to 0 (which includes empty input).
    '''
    # explicit checks instead of assert: asserts disappear with `python -O`
    if len(values) != len(weights):
        raise ValueError('values and weights must have the same length')
    total = 0
    cum_weights = []
    for w in weights:
        if w < 0:
            # a negative weight would make cum_weights unsorted, breaking bisect
            raise ValueError('weights must not be negative')
        total += w
        cum_weights.append(total)
    if total <= 0:
        raise ValueError('at least one weight must be greater than 0')
    # x falls in [0, total); the chosen value is the first one whose
    # cumulative weight is strictly greater than x
    x = random.random() * total
    i = bisect(cum_weights, x)
    return values[i]


def delta_humanreadable(tdelta):
    '''
    Format a timedelta compactly, as '<days>d<hours>h' or just '<hours>h'

    Minutes and seconds are dropped. None gives an empty string.
    '''
    if tdelta is None:
        return ''
    # .seconds is only the part of the delta that exceeds the whole days
    whole_hours = tdelta.seconds // 3600
    if not tdelta.days:
        return '{}h'.format(whole_hours)
    return '{}d{}h'.format(tdelta.days, whole_hours)


class Audio(object):
    '''
    A single audio, identified by its URL

    Attributes:
      url: where the audio is
      duration: length of the audio; probed with get_duration when not given
      date: timezone-aware datetime of the audio, or None if unknown
      end_date: timezone-aware datetime after which the audio is no longer
                valid
    '''

    def __init__(self, url, duration=None, date=None):
        self.url = url
        if duration is None:
            # unknown duration: ask get_duration (which runs ffprobe)
            duration = get_duration(url.encode('utf-8'))
        self.duration = duration
        self.date = date
        # far-future default, so that an audio stays valid unless a caller
        # assigns an earlier end_date
        self.end_date = datetime.datetime(9999, 12, 31,
                                          tzinfo=datetime.timezone.utc)

    @staticmethod
    def _now():
        '''Current time as a timezone-aware UTC datetime'''
        # datetime.utcnow() is deprecated; this gives the same instant
        return datetime.datetime.now(datetime.timezone.utc)

    def __str__(self):
        return self.url

    def __repr__(self):
        return '<Audio {} ({} {})>'.format(self.url, self.duration,
                                           delta_humanreadable(self.age))

    @property
    def urls(self):
        '''List with the only URL; same interface as AudioGroup.urls'''
        return [self.url]

    @property
    def age(self):
        '''Time elapsed since `date` as a timedelta; None if date is unknown'''
        if self.date is None:
            return None
        return self._now() - self.date

    @property
    def valid(self):
        '''True until `end_date` is passed'''
        return self.end_date >= self._now()


class AudioGroup(list):
    '''
    Several audios that are handled as a single item

    It offers the same properties as Audio (duration, urls, date, age,
    valid) so that both can be filtered, sorted and printed alike.

    NOTE: although this inherits from list, the audios are kept in
    `self.audios`; only len() and append() are redirected there, so
    iterating over the group itself yields nothing.
    '''

    def __init__(self, description=None):
        super().__init__()
        self.description = description or ''
        self.audios = []

    def __len__(self):
        return len(self.audios)

    def append(self, arg):
        self.audios.append(arg)

    def __str__(self):
        return '\n'.join(str(a) for a in self.audios)

    def __repr__(self):
        return '<AudioGroup "{}" ({} {})\n{} >'.\
                format(self.description, self.duration,
                       delta_humanreadable(self.age),
                       '\n'.join('   ' + repr(a) for a in self.audios))

    @property
    def duration(self):
        '''Sum of the durations of the audios; unknown (None) ones are skipped'''
        return sum(a.duration for a in self.audios if a.duration is not None)

    @property
    def urls(self):
        '''URLs of the audios, in insertion order'''
        return [a.url for a in self.audios]

    @property
    def date(self):
        '''Date of the first audio that has one; None if no audio is dated'''
        for a in self.audios:
            # every Audio has a `date` attribute, possibly set to None: what
            # matters is the value, not the mere presence of the attribute
            found = getattr(a, 'date', None)
            if found is not None:
                return found
        return None

    @property
    def age(self):
        '''Time elapsed since `date` as a timedelta; None if date is unknown'''
        if self.date is None:
            return None
        # datetime.utcnow() is deprecated; this gives the same instant
        now = datetime.datetime.now(datetime.timezone.utc)
        return now - self.date

    @property
    def valid(self):
        '''A group is valid as long as it contains at least one audio'''
        return len(self.audios) > 0


def get_tree(feed_url):
    '''
    Load a feed and parse it with lxml.html

    `feed_url` is either an http(s) URL, which is downloaded with requests,
    or the path of a local file.

    Raises ValueError if `feed_url` is not an http(s) URL and no such file
    exists.
    '''
    if feed_url.startswith(('http:', 'https:')):
        return html.fromstring(requests.get(feed_url).content)
    if not os.path.exists(feed_url):
        raise ValueError("file not found: {}".format(feed_url))
    # parsing reads the whole document, so the file can (and must) be closed
    # as soon as html.parse returns
    with open(feed_url) as feed_file:
        return html.parse(feed_file)


def get_audio_from_description(text):
    '''
    Build an Audio from the text of an item description

    The first non-empty line is the (percent-encoded) URL of the audio.
    Each of the following non-empty lines in the form `key=value` is a
    metadata; lines without `=` are ignored. Known keys:

      durata: duration of the audio; the first integer in the value is used.
              When missing, Audio probes the duration by itself
      txdate: a date like 2020-01-31T12:00:00+0100; dropped, with a warning,
              if it cannot be parsed
      replica: when it ends with `g` and txdate is known, the audio expires
               that many days after txdate (presumably `g` is for "giorni",
               days -- confirm against whoever produces the feed)

    Raises ValueError if `text` has no non-empty line.
    '''
    # non-empty lines
    lines = [line.strip()
             for line in text.split('\n')
             if line.strip()]
    if not lines:
        raise ValueError('empty description: no url found')
    url = lines[0]

    # metadata is read from the cleaned lines that follow the url, so that
    # blank lines and indentation do not change what gets parsed
    metadata = {}
    for line in lines[1:]:
        key, sep, value = line.partition('=')
        if sep:
            # partition splits at the first `=` only: values may contain `=`
            metadata[key.strip()] = value.strip()

    if 'durata' in metadata:
        metadata['durata'] = get_int(metadata['durata'])
    if 'txdate' in metadata:
        try:
            metadata['txdate'] = datetime.datetime.strptime(
                metadata['txdate'], '%Y-%m-%dT%H:%M:%S%z')
        except ValueError:
            logging.warning('could not parse txdate %s', metadata['txdate'])
            del metadata['txdate']
    a = Audio(unquote(url), metadata.get('durata', None))

    if 'txdate' in metadata and 'replica' in metadata:
        if metadata['replica'].endswith('g'):
            a.end_date = metadata['txdate'] + datetime.timedelta(
                days=get_int(metadata['replica']))
    return a


# copied from larigira.fsutils
def scan_dir_audio(dirname, extensions=('mp3', 'oga', 'wav', 'ogg')):
    '''
    Walk `dirname` recursively, yielding the path of every audio file

    A file is an audio if what follows the last dot of its name, lowercased,
    is in `extensions`. A name without any dot is compared as a whole.
    '''
    for folder, _subdirs, names in os.walk(dirname):
        for name in names:
            suffix = name.rsplit('.', 1)[-1].lower()
            if suffix not in extensions:
                continue
            yield os.path.join(folder, name)


def get_audio_from_dir(dirpath):
    '''
    Return one Audio for every audio file found (recursively) in `dirpath`

    Each Audio has a file:// URL pointing to the real path of the file, and
    the modification time of the file as its date.
    '''
    audios = []
    for fpath in scan_dir_audio(dirpath):
        # convert the timestamp straight to UTC: going through naive local
        # time and labelling it as UTC would shift the date by the local
        # UTC offset
        mtime = datetime.datetime.fromtimestamp(os.path.getmtime(fpath),
                                                tz=datetime.timezone.utc)
        audios.append(Audio('file://' + os.path.realpath(fpath), date=mtime))
    return audios


def get_item_date(el):
    '''
    Return the publication date of a feed item, or None if unknown

    The date is read from the `pubdate` child of `el` and must look like
    2020-01-31T12:00:00+0100. A missing element gives None; an empty or
    malformed date is reported with a warning and gives None too, the same
    way an unparseable txdate is treated.
    '''
    # TODO: pick from txdate=
    el_date = el.find('pubdate')
    if el_date is None:
        return None
    try:
        return datetime.datetime.strptime(
            el_date.text, '%Y-%m-%dT%H:%M:%S%z')
    except (TypeError, ValueError):
        # TypeError: element without text; ValueError: unexpected format
        logging.warning('could not parse pubdate %s', el_date.text)
        return None


def get_urls(tree):
    '''
    Yield an Audio for every item of the feed that describes one

    Items without a description are skipped. Items whose description cannot
    be turned into an Audio are skipped too, leaving a log message: one
    broken item must not prevent the others from being used.
    '''
    for it in tree.xpath('//item'):
        el_body = it.find('description')
        if el_body is None:
            continue
        try:
            audio = get_audio_from_description(el_body.text)
        except Exception:
            # deliberately broad: parsing and probing can fail in many ways,
            # and the answer is always to skip this item.
            # The title is only needed here; an item may lack it
            el_title = it.find('title')
            title = el_title.text if el_title is not None else None
            logging.info('error getting duration for `%s`', title,
                         exc_info=True)
            continue
        audio.date = get_item_date(it)
        yield audio


def get_grouped_urls(tree):
    '''
    Collect the audios of a feed, grouped by the guid of their item

    Returns an OrderedDict mapping each guid to an AudioGroup, in the order
    in which guids are first met. A group is created as soon as its guid is
    seen, but only audios that are still valid are added to it, so a group
    may end up empty.
    '''
    by_guid = OrderedDict()
    for entry in tree.xpath('//item'):
        key = entry.xpath('guid')[0].text.strip()
        group = by_guid.get(key)
        # explicit None check: an empty AudioGroup is falsy
        if group is None:
            group = by_guid[key] = AudioGroup(key)
        description = entry.xpath('description')[0].text
        audio = get_audio_from_description(description)
        audio.date = get_item_date(entry)
        if not audio.valid:
            continue
        group.append(audio)
    return by_guid


def get_duration(url):
    '''
    Return the duration of an audio, as reported by ffprobe

    `url` is handed to ffprobe as its input. The `duration=` value of the
    format section is returned, truncated to an integer (this is what the
    --min-len/--max-len filters, expressed in seconds, are compared to).

    Raises ValueError if ffprobe fails, or if it does not report a usable
    duration.
    '''
    try:
        output = check_output(['ffprobe', '-v', 'error',
                               '-show_entries', 'format=duration',
                               '-i', url])
    except CalledProcessError as exc:
        raise ValueError('error probing `%s`' % url) from exc
    for line in output.split(b'\n'):
        if line.startswith(b'duration='):
            value = line.split(b'=')[1]
            break
    else:
        # a bare next() here would leak StopIteration to the callers
        raise ValueError('no duration reported for `%s`' % url)
    try:
        return int(float(value))
    except ValueError as exc:
        # e.g. when the reported value is not a number
        raise ValueError('invalid duration %r for `%s`' % (value, url)) \
            from exc


# Text shown by the command line parser built in get_parser, which passes it
# as the first positional argument of ArgumentParser (the program name).
# The program name is printed at the start of the usage line: presumably this
# is why the text ends with "Usage: ".
HELP = '\n'.join((
    '',
    'Collect audio informations from multiple sources (XML feeds).',
    'Audios are (in that order):',
    ' 1. Collected from feeds; (grouped by article if --group is used)',
    ' 2. Filtered; everything that does not match with requirements is excluded',
    ' 3. Sorted; even randomly',
    ' 4. Sliced; take HOWMANY elements, skipping START elements',
    ' 5. (if --copy) Copied',
    'Usage: ',
))


def get_parser():
    '''
    Build the command line parser

    Options are split in three groups (sources, filters, general) plus the
    slicing options and the positional list of sources. A value of 0 (or an
    empty timedelta) for --min-len, --max-len, --min-age and --max-age is
    the default and is what main treats as "filter disabled".
    '''
    # HELP is used as the program name, which opens the usage line
    parser = ArgumentParser(prog=HELP)

    sources = parser.add_argument_group('sources', 'How to deal with sources')
    sources.add_argument(
        '--source-weights',
        help='Select only one "source" based on this weights')
    sources.add_argument(
        '--group', action='store_true', default=False,
        help='Group audios that belong to the same article')

    filtering = parser.add_argument_group(
        'filters', 'Select only items that match these conditions')
    filtering.add_argument(
        '--min-len', type=DurationType, default=0,
        help='Exclude any audio that is shorter than MIN_LEN seconds')
    filtering.add_argument(
        '--max-len', type=DurationType, default=0,
        help='Exclude any audio that is longer than MAX_LEN seconds')
    filtering.add_argument(
        '--sort-by', type=str, default='no',
        choices=('random', 'date', 'duration'))
    filtering.add_argument(
        '--reverse', action='store_true', default=False,
        help='Reverse list order')
    filtering.add_argument(
        '--min-age', type=TimeDeltaType, default=datetime.timedelta(),
        help='Exclude audio more recent than MIN_AGE')
    filtering.add_argument(
        '--max-age', type=TimeDeltaType, default=datetime.timedelta(),
        help='Exclude audio older than MAX_AGE')

    # slicing options are not part of any named group
    parser.add_argument(
        '--start', type=int, default=0,
        help='0-indexed start number. By default, play from most recent')
    parser.add_argument(
        '--howmany', type=int, default=1,
        help='If not specified, only 1 will be played')
    parser.add_argument(
        '--slotsize', type=int,
        help='Seconds between each audio. Still unsupported')

    misc = parser.add_argument_group('general', 'General options')
    misc.add_argument(
        '--copy', action='store_true', default=False,
        help='Copy files to $TMPDIR')
    misc.add_argument(
        '--debug', action='store_true', default=False,
        help='Debug messages')

    parser.add_argument('urls', metavar='URL', nargs='+')
    return parser


def put(audio, copy=False):
    '''
    Print the URLs of `audio` (an Audio or an AudioGroup), one per line

    With `copy`, every http(s) URL is first downloaded into $TMPDIR (or the
    current directory if TMPDIR is not set) and the file:// URL of the local
    copy is printed in its place. Any other URL is printed untouched.
    '''
    allowed_punctuation = '._-'
    for url in audio.urls:
        remote = copy and url.split(':')[0] in ('http', 'https')
        if not remote:
            # FIXME: file:// urls are just copied
            print(url)
            continue
        target_dir = os.environ.get('TMPDIR', '.')
        # sanitize: only letters, digits and a few safe characters survive
        raw_name = posixpath.basename(urlparse(url).path)
        safe_name = ''.join(ch for ch in raw_name
                            if ch.isalnum() or ch in allowed_punctuation)
        target = os.path.join(target_dir, safe_name)
        os.makedirs(target_dir, exist_ok=True)
        local_path = urllib.request.urlretrieve(url, target)[0]
        print('file://%s' % os.path.realpath(local_path))


def _load_source(url, group):
    '''
    Return what one source contains: Audio objects, or AudioGroup if `group`

    A source is a directory, an http(s) URL or a feed file. None is returned
    for anything else.
    '''
    if os.path.isdir(url):
        found = get_audio_from_dir(url)
        if not group:
            return found
        wrapped = []
        for audio in found:
            # files in a directory have no article: each one is its own group
            single = AudioGroup(os.path.basename(audio.url))
            single.append(audio)
            wrapped.append(single)
        return wrapped
    if url.startswith(('http:', 'https:')) or os.path.isfile(url):
        tree = get_tree(url)
        if group:
            return list(get_grouped_urls(tree).values())
        return list(get_urls(tree))
    return None


def _matches_filters(item, args):
    '''
    Tell whether `item` (Audio or AudioGroup) satisfies the filter options

    A length or age limit of 0 means that the limit is disabled. When an age
    limit is enabled, items whose age is unknown are excluded: there is no
    way to tell whether they satisfy it.
    '''
    if not item.valid:
        return False
    if args.max_len != 0 and item.duration > args.max_len:
        return False
    if args.min_len != 0 and item.duration < args.min_len:
        return False
    check_min_age = args.min_age.total_seconds() != 0
    check_max_age = args.max_age.total_seconds() != 0
    if check_min_age or check_max_age:
        age = item.age
        if age is None:
            return False
        if check_min_age and age < args.min_age:
            return False
        if check_max_age and age > args.max_age:
            return False
    return True


def _age_sort_key(item):
    '''Sort key by age, most recent first; items of unknown age go last'''
    age = item.age
    return (age is None, age if age is not None else datetime.timedelta())


def main():
    '''
    Command line entry point

    Collects audios (or groups of audios, with --group) from every source,
    drops those that do not match the filters, sorts, slices and finally
    prints them, either as URLs or, with --debug, as their repr.
    '''
    parser = get_parser()
    args = parser.parse_args()
    if not args.debug:
        logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.DEBUG)
    sources = args.urls

    if args.source_weights:
        weights = tuple(map(int, args.source_weights.split(':')))
        if len(weights) != len(sources):
            parser.exit(status=2, message='Weight must be in the'
                        ' same number as sources\n')
        sources = [weighted_choice(sources, weights)]

    # collect + filter
    audios = []
    for url in sources:
        candidates = _load_source(url, args.group)
        if candidates is None:
            # nothing to add: in particular, do not reuse what a previous
            # source produced
            logging.info('unsupported url `%s`', url)
            continue
        audios += [item for item in candidates
                   if _matches_filters(item, args)]

    # sort
    if args.sort_by == 'random':
        random.shuffle(audios)
    elif args.sort_by == 'date':
        audios.sort(key=_age_sort_key)
    elif args.sort_by == 'duration':
        audios.sort(key=lambda x: x.duration)

    if args.reverse:
        audios.reverse()

    # slice
    audios = audios[args.start:]
    audios = audios[:args.howmany]

    last = len(audios) - 1
    for position, audio in enumerate(audios):
        if args.debug:
            print(repr(audio))
        else:
            put(audio, args.copy)
        # --slotsize: fill the gap up to the slot size, but not after the
        # last item
        if position < last and args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
                print('## musica per {} secondi'
                      .format(args.slotsize - duration))


# Run the command line interface only when this file is executed as a script,
# not when it is imported
if __name__ == '__main__':
    main()