larigira-scripts/feed

#!/usr/bin/env python3
"""
Feed parser with many features

from a feed, it supports filtering, subslicing, random picking

Besides feeds, it supports picking files from directories
"""
import datetime
import logging
import os
import glob
import posixpath
import random
import re
import sys
import urllib.request
from argparse import ArgumentParser, ArgumentTypeError
from bisect import bisect
from collections import OrderedDict
from subprocess import CalledProcessError, check_output
from urllib.parse import unquote, urlparse
import shutil

import requests
from lxml import html
from pytimeparse.timeparse import timeparse

# Module-level switch read by debug(): while it is False, debug() prints nothing.
DEBUG = False

class UnsupportedFeedtype(Exception):
    """Raised when the requested feed type has no matching URL extractor."""

class DurationNotFound(Exception):
    """Raised when the duration of an audio cannot be probed."""


class EmptySequenceError(Exception):
    """Raised when a weighted choice is requested on an empty sequence."""

class WeightZeroError(Exception):
    """Raised when a weighted choice is requested but all weights sum to zero."""


def debug(*args, **kwargs):
    """Forward the arguments to ``print`` when the module-level DEBUG flag is set.

    Output goes to stderr unless the caller passes an explicit ``file=``.
    Does nothing while DEBUG is falsy.
    """
    if DEBUG:
        if "file" not in kwargs:
            kwargs["file"] = sys.stderr
        print(*args, **kwargs)


def get_int(s):
    """Return the first run of decimal digits found in ``s`` as an int.

    Raises IndexError when ``s`` contains no digit at all.
    """
    digit_runs = re.findall(r"\d+", s)
    first_run = digit_runs[0]
    return int(first_run)


def DurationType(arg):
    """argparse ``type=`` callable turning a duration string into seconds.

    A purely decimal string is taken as a number of seconds; anything else is
    handed to ``timeparse``. Raises ArgumentTypeError when ``timeparse``
    cannot interpret the string (it returns None).
    """
    if arg.isdecimal():
        return int(arg)
    parsed = timeparse(arg)
    if parsed is None:
        raise ArgumentTypeError("%r is not a valid duration" % arg)
    return parsed


def TimeDeltaType(arg):
    """argparse ``type=`` callable turning a time-range string into a timedelta.

    A purely decimal string is taken as a number of seconds; anything else is
    handed to ``timeparse``. Raises ArgumentTypeError when ``timeparse``
    cannot interpret the string (it returns None).
    """
    if arg.isdecimal():
        return datetime.timedelta(seconds=int(arg))
    parsed = timeparse(arg)
    if parsed is None:
        raise ArgumentTypeError("%r is not a valid time range" % arg)
    return datetime.timedelta(seconds=parsed)

def weighted_choice(values, weights):
    """
    Pick one element of ``values`` at random, biased by ``weights``.

    ``weights[k]`` is the relative weight of ``values[k]``: only the ratios
    matter, so [1, 2, 3] behaves exactly like [2, 4, 6]. An entry whose weight
    is 0 is never picked, as long as some other weight is positive.

    Raises EmptySequenceError when ``values`` is empty and WeightZeroError
    when the weights add up to zero.
    """
    assert len(weights) == len(values)
    if not values:
        raise EmptySequenceError()  # nothing to choose from
    if sum(weights) == 0:
        raise WeightZeroError()  # every candidate has weight 0
    # Running totals: running_totals[k] is the sum of weights[0..k].
    running_totals = []
    acc = 0
    for weight in weights:
        acc = acc + weight
        running_totals.append(acc)
    # A uniform point in [0, total) falls into exactly one cumulative bucket.
    threshold = random.random() * acc
    return values[bisect(running_totals, threshold)]


def delta_humanreadable(tdelta):
    """Format a timedelta as ``<days>d<hours>h`` (or just ``<hours>h``).

    Returns an empty string for None. Minutes and seconds are dropped.
    """
    if tdelta is None:
        return ""
    whole_days = tdelta.days
    # timedelta keeps the sub-day part in .seconds, separately from .days
    leftover_hours = tdelta.seconds // 3600
    if not whole_days:
        return "{}h".format(leftover_hours)
    return "{}d{}h".format(whole_days, leftover_hours)


def duration_humanreadable(seconds):
    """Format a number of seconds as ``<h>h<m>m<s>s``; hours are omitted when zero."""
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    if hours > 0:
        return "{}h{}m{}s".format(hours, minutes, secs)
    return "{}m{}s".format(minutes, secs)


class Audio:
    """A single audio, identified by its URL.

    Attributes:
        url: location of the audio.
        date: datetime associated with the audio, or None when unknown.
        end_date: moment after which the audio is no longer ``valid``;
            defaults to the end of year 9999 (UTC).
    """

    def __init__(self, url, duration=None, date=None):
        self.url = url
        # None means "not known yet": the ``duration`` property probes it lazily.
        self._duration = duration
        self.date = date
        self.end_date = datetime.datetime(9999, 12, 31, tzinfo=datetime.timezone.utc)

    @staticmethod
    def _now():
        """Return the current time as a timezone-aware UTC datetime."""
        # datetime.utcnow() is deprecated since Python 3.12; this is the
        # aware equivalent of utcnow().replace(tzinfo=utc).
        return datetime.datetime.now(datetime.timezone.utc)

    @classmethod
    def from_trusted(cls, url_or_path) -> 'Audio':
        """Build an Audio from a URL, or from a path starting with ``/``.

        A value starting with ``/`` is prefixed with ``file://``; anything
        else is used as the URL unchanged.
        """
        if url_or_path.startswith('/'):
            return cls('file://' + url_or_path)
        return cls(url_or_path)

    def __str__(self):
        return self.url

    def __repr__(self):
        # NOTE: reading self.duration may call get_duration() when the
        # duration was not supplied to the constructor.
        return "<Audio {} ({} {})>".format(
            self.url,
            duration_humanreadable(self.duration),
            delta_humanreadable(self.age),
        )

    @property
    def duration(self):
        """Duration as given to the constructor, else probed once and cached."""
        if self._duration is None:
            self._duration = get_duration(self.url)
        return self._duration

    @property
    def urls(self):
        """One-element list holding ``url`` (same attribute name as AudioGroup.urls)."""
        return [self.url]

    @property
    def age(self):
        """Time elapsed since ``date``, or None when ``date`` is unknown."""
        if self.date is None:
            return None
        return self._now() - self.date

    @property
    def valid(self):
        """True while ``end_date`` has not passed yet."""
        return self.end_date >= self._now()


class AudioGroup(list):
    """A described collection of audios that belong together.

    NOTE: although this subclasses ``list``, members are kept in
    ``self.audios``; only ``append`` and ``len()`` are redirected to it, so
    iterating or indexing the group itself does not reach the members.
    """

    def __init__(self, description=None):
        super().__init__()
        self.description = description or ""
        self.audios = []

    @staticmethod
    def _now():
        """Return the current time as a timezone-aware UTC datetime."""
        # datetime.utcnow() is deprecated since Python 3.12; this is the
        # aware equivalent of utcnow().replace(tzinfo=utc).
        return datetime.datetime.now(datetime.timezone.utc)

    def __len__(self):
        return len(self.audios)

    def append(self, arg):
        self.audios.append(arg)

    def __str__(self):
        return "\n".join(str(a) for a in self.audios)

    def __repr__(self):
        return '<AudioGroup "{}" ({} {})\n{} >'.format(
            self.description,
            duration_humanreadable(self.duration),
            delta_humanreadable(self.age),
            "\n".join("   " + repr(a) for a in self.audios),
        )

    @property
    def duration(self):
        """Sum of the members' durations, ignoring members whose duration is None."""
        durations = (a.duration for a in self.audios)
        return sum(d for d in durations if d is not None)

    @property
    def urls(self):
        """URLs of all members, in insertion order."""
        return [a.url for a in self.audios]

    @property
    def date(self):
        """Date of the first member that has one, or None.

        Members whose ``date`` is None are skipped, so one undated member
        does not hide the date carried by a later one.
        """
        for a in self.audios:
            member_date = getattr(a, "date", None)
            if member_date is not None:
                return member_date
        return None

    @property
    def age(self):
        """Time elapsed since ``date``, or None when no member is dated."""
        group_date = self.date
        if group_date is None:
            return None
        return self._now() - group_date

    @property
    def valid(self):
        """True when the group holds at least one audio."""
        return len(self.audios) > 0


def get_tree(feed_url):
    """Fetch and parse a feed, returning an lxml tree.

    ``feed_url`` is either an ``http:``/``https:`` URL, downloaded with
    ``requests``, or a local file path.

    Raises FileNotFoundError when a local path does not exist.
    """
    if feed_url.startswith(("http:", "https:")):
        return html.fromstring(requests.get(feed_url).content)
    if not os.path.exists(feed_url):
        raise FileNotFoundError(feed_url)
    # The document is parsed during the html.parse() call, so the file can be
    # closed as soon as it returns instead of leaking the handle.
    with open(feed_url) as feed_file:
        return html.parse(feed_file)


def get_audio_from_description(text):
    """Build an Audio from the text of a feed item's description.

    The first non-empty line is the (possibly percent-encoded) URL. Every
    following non-empty line of the form ``key=value`` is metadata; keys and
    values are stripped of surrounding whitespace, and the value is everything
    after the first ``=``. Recognised keys:

    - ``durata``: duration; the first integer found in the value is used.
    - ``txdate``: date in ``%Y-%m-%dT%H:%M:%S%z`` format.
    - ``replica``: when it ends with ``g`` and ``txdate`` is valid, the audio
      expires that many days after ``txdate``.

    Unparsable ``durata``/``txdate``/``replica`` values are logged and ignored.

    Raises IndexError when ``text`` has no non-empty line.
    """
    # non-empty lines, already stripped
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    url = lines[0]

    metadata = {}
    # Only the lines after the URL: a URL with a query string contains "=" too.
    for line in lines[1:]:
        key, sep, value = line.partition("=")
        if sep:
            metadata[key.strip()] = value.strip()

    if "durata" in metadata:
        try:
            metadata["durata"] = get_int(metadata["durata"])
        except (IndexError, ValueError) as exc:
            logging.info("Could not get duration: %s", exc)
            del metadata["durata"]

    if "txdate" in metadata:
        try:
            metadata["txdate"] = datetime.datetime.strptime(
                metadata["txdate"], "%Y-%m-%dT%H:%M:%S%z"
            )
        except ValueError:
            logging.warning("could not parse txdate %s", metadata["txdate"])
            del metadata["txdate"]

    a = Audio(
        unquote(url),
        duration=metadata.get("durata"),
        date=metadata.get("txdate"),
    )

    if "txdate" in metadata and "replica" in metadata:
        replica = metadata["replica"]
        if replica.endswith("g"):
            try:
                days = get_int(replica)
            except IndexError:
                # e.g. a bare "g": keep the default (far future) end_date
                logging.warning("could not parse replica %s", replica)
            else:
                a.end_date = metadata["txdate"] + datetime.timedelta(days=days)
    return a


def is_audio_file(fpath, extensions=("mp3", "oga", "wav", "ogg")):
    """Tell whether the text after the last ``.`` in ``fpath`` is a known audio extension.

    The comparison is case-insensitive. A name without any dot is compared
    as a whole.
    """
    suffix = fpath.rsplit(".", 1)[-1]
    return suffix.lower() in extensions

# copied from larigira.fsutils
def scan_dir_audio(dirname):
    """Walk ``dirname`` recursively, yielding the path of every audio file found."""
    for folder, _subdirs, names in os.walk(dirname):
        yield from (
            os.path.join(folder, name) for name in names if is_audio_file(name)
        )


def get_audio_from_file(fpath):
    """Return a one-element list with the Audio for the local file ``fpath``.

    The audio's URL is ``file://`` plus the resolved path, and its date is
    the file's modification time.
    """
    # Convert the timestamp straight to UTC. fromtimestamp() without ``tz``
    # yields *local* wall-clock time, and stamping that as UTC skews the date
    # by the local UTC offset.
    mtime = datetime.datetime.fromtimestamp(
        os.path.getmtime(fpath), tz=datetime.timezone.utc
    )
    return [Audio("file://" + os.path.realpath(fpath), date=mtime)]
def get_audio_from_dir(dirpath):
    """Return an Audio for every audio file found under ``dirpath``.

    Each audio's URL is ``file://`` plus the resolved path and its date is the
    file's modification time. Files whose modification time cannot be read or
    converted are skipped.
    """
    ret = []
    for fpath in scan_dir_audio(dirpath):
        try:
            # Convert straight to UTC: fromtimestamp() without ``tz`` yields
            # local wall-clock time, which must not be labelled as UTC.
            mtime = datetime.datetime.fromtimestamp(
                os.path.getmtime(fpath), tz=datetime.timezone.utc
            )
        except (OSError, ValueError, OverflowError):
            # e.g. a dangling symlink, or a timestamp out of range
            continue
        ret.append(Audio("file://" + os.path.realpath(fpath), date=mtime))
    return ret


def get_item_date(el):
    """Return the publication date of a feed item, or None.

    Looks for a ``pubdate`` child of ``el`` and tries, in order, the RFC 822
    style (``Wed, 15 Jan 2020 22:45:33 +0000``) and the ISO-like style
    (``2020-01-15T22:45:33+0000``). Returns None when the child is missing,
    empty, or matches neither format.
    """
    el_date = el.find("pubdate")
    if el_date is None or el_date.text is None:
        # an empty element has text None, which strptime() cannot take
        return None
    raw = el_date.text.strip()
    formats = ("%a, %d %b %Y %H:%M:%S %z", "%Y-%m-%dT%H:%M:%S%z")
    for fmt in formats:
        try:
            return datetime.datetime.strptime(raw, fmt)
        except ValueError:
            continue
    return None


def get_urls_generic(tree, url_selector="description[text()]", metadata_in_body=True):
    """Yield an Audio for every ``item`` of the feed ``tree``.

    When ``metadata_in_body`` is true and the item has a ``description``, the
    description text is parsed with get_audio_from_description(); items whose
    description cannot be parsed are logged and skipped. Otherwise the URL is
    the first result of the ``url_selector`` XPath, evaluated on the item;
    items with no result are logged and skipped.

    The audio's date falls back to the item's publication date when the
    description did not provide one.
    """
    for item in tree.xpath("//item"):
        # The title is only used in log messages: a missing <title> must not
        # abort the whole feed.
        title_el = item.find("title")
        title = title_el.text if title_el is not None else None
        body_el = item.find("description")

        if metadata_in_body and body_el is not None:
            try:
                audio = get_audio_from_description(body_el.text)
            except Exception:
                # deliberately broad: one malformed item is skipped, not fatal
                logging.info("error getting duration for `%s`", title)
                continue
            if audio.date is None:
                audio.date = get_item_date(item)
            yield audio
            continue

        matches = item.xpath(url_selector)
        if not matches:
            logging.warning("no audio found in %s", title)
            continue
        audio = Audio(matches[0])
        audio.date = get_item_date(item)
        yield audio


def get_urls_from_podcast(tree):
    """Extract audios from a podcast feed: the URL is each item's ``enclosure/@url``."""
    options = {"url_selector": "enclosure/@url", "metadata_in_body": False}
    return get_urls_generic(tree, **options)


def get_urls_from_custom_feed(tree):
    """Extract audios from a custom feed: URL and metadata are in each item's description."""
    audios = get_urls_generic(tree, metadata_in_body=True)
    return audios


def get_urls_factory(url, args):
    """Return the extractor function matching ``args.feed_type``.

    ``url`` is accepted but not used. Raises UnsupportedFeedtype for any feed
    type other than ``customrss`` and ``podcast``.
    """
    kind = args.feed_type
    if kind not in ("customrss", "podcast"):
        raise UnsupportedFeedtype(kind)
    return get_urls_from_podcast if kind == "podcast" else get_urls_from_custom_feed


def get_grouped_urls(tree):
    """Group the audios of a feed by item ``guid``.

    Returns an OrderedDict mapping each guid (in order of first appearance) to
    an AudioGroup. Every item's description is parsed into an Audio whose date
    is set to the item's publication date; only audios that are still valid
    are added to their group.
    """
    grouped = OrderedDict()
    for entry in tree.xpath("//item"):
        key = entry.xpath("guid")[0].text.strip()
        if key not in grouped:
            grouped[key] = AudioGroup(key)
        description = entry.xpath("description")[0].text
        parsed = get_audio_from_description(description)
        parsed.date = get_item_date(entry)
        if not parsed.valid:
            continue
        grouped[key].append(parsed)
    return grouped


def get_duration(url):
    """Return the duration of the audio at ``url`` in whole seconds, using ffprobe.

    Raises DurationNotFound when ffprobe exits with an error, when its output
    has no ``duration=`` line, or when the reported value is not a number.
    """
    try:
        output = check_output(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-i", url]
        )
    except CalledProcessError as exc:
        raise DurationNotFound(url) from exc

    for line in output.split(b"\n"):
        if not line.startswith(b"duration="):
            continue
        value = line.split(b"=")[1]
        try:
            return int(float(value))
        except ValueError as exc:
            # the value is present but not numeric
            raise DurationNotFound(url) from exc
    # no duration line at all: report it like any other probing failure
    raise DurationNotFound(url)


# Text handed to ArgumentParser as its first positional argument in get_parser().
# NOTE(review): that parameter is argparse's `prog`, so this text shows up where
# the program name normally appears in the usage line -- presumably why it ends
# with "Usage: "; confirm this is intended.
HELP = """
Collect audio informations from multiple sources (XML feeds).
Audios are (in that order):
 1. Collected from feeds; (grouped by article if --group is used)
 2. Filtered; everything that does not match with requirements is excluded
 3. Sorted; even randomly
 4. Sliced; take HOWMANY elements, skipping START elements
 5. (if --copy) Copied
Usage: """


def get_parser():
    """Build and return the command-line ArgumentParser of this script.

    Options are organised in argument groups (parsing, sources, filters,
    fill, intro, general) plus a few ungrouped ones; the positional ``urls``
    argument takes one or more sources.
    """
    p = ArgumentParser(HELP)
    parsing = p.add_argument_group("parsing", "Feed parsing")
    parsing.add_argument(
        "--feed-type", type=str, choices=["customrss", "podcast"], default="customrss"
    )
    src = p.add_argument_group("sources", "How to deal with sources")
    # Colon-separated integers, one per source (parsed in get_audio_by_source).
    src.add_argument(
        "--source-weights", help='Select only one "source" based on this weights'
    )
    src.add_argument(
        "--group",
        default=False,
        action="store_true",
        help="Group audios that belong to the same article",
    )
    src.add_argument(
            "--glob",
            default=False,
            action="store_true",
            help="Wildcards in filenames are interpreted",
            )

    filters = p.add_argument_group(
        "filters", "Select only items that match " "these conditions"
    )
    # For the length and age filters below, the default (0 / empty timedelta)
    # means "filter disabled": audio_passes_filters() skips falsy values.
    filters.add_argument(
        "--min-len",
        default=0,
        type=DurationType,
        help="Exclude any audio that is shorter " "than MIN_LEN seconds",
    )
    filters.add_argument(
        "--max-len",
        default=0,
        type=DurationType,
        help="Exclude any audio that is longer " "than MAX_LEN seconds",
    )
    # The default "no" is not one of the choices: main() sorts only for the
    # three listed values, so the default leaves the order untouched.
    filters.add_argument(
        "--sort-by", default="no", type=str, choices=("random", "date", "duration")
    )
    filters.add_argument(
            '--random-seed', default=None, help='Initialize the random generator. For debug only')
    filters.add_argument(
        "--reverse", default=False, action="store_true", help="Reverse list order"
    )

    filters.add_argument(
        "--min-age",
        default=datetime.timedelta(),
        type=TimeDeltaType,
        help="Exclude audio more recent than MIN_AGE",
    )
    filters.add_argument(
        "--max-age",
        default=datetime.timedelta(),
        type=TimeDeltaType,
        help="Exclude audio older than MAX_AGE",
    )

    fill = p.add_argument_group(
        "fill", "Fill a 'block' with as many contents as possible"
    )
    fill.add_argument(
        "--fill",
        default=0,
        type=DurationType,
        help="Fill a block of duration LEN",
        metavar="LEN",
    )
    fill.add_argument(
        "--fill-reverse",
        default=False,
        action="store_true",
        help="Reverse list order after the fill algorithm",
    )
    fill.add_argument(
        "--fill-interleave-dir",
        default=None,
        type=str,  # FIXME: does it even work?
        help="Between each item, put a random file from DIR",
    )

    intro = p.add_argument_group(
        "intro", "Add intro/outro to output, but only if at least one audio will be output"
    )
    intro.add_argument("--intro", default=None, type=str, metavar="PATH")
    intro.add_argument("--outro", default=None, type=str, metavar="PATH")

    p.add_argument(
        "--start",
        default=0,
        type=int,
        help="0-indexed start number. " "By default, play from most recent",
    )
    p.add_argument(
        "--howmany", default=1, type=int, help="If not specified, only 1 will be played"
    )
    p.add_argument(
        "--slotsize", type=int, help="Seconds between each audio. Still unsupported"
    )

    general = p.add_argument_group("general", "General options")
    general.add_argument(
        "--copy", help="Copy files to $TMPDIR", default=False, action="store_true"
    )
    general.add_argument(
        "--debug", help="Debug messages", default=False, action="store_true"
    )

    # One or more sources; retrieve() accepts directories, audio files, feed
    # files and http(s) URLs.
    p.add_argument("urls", metavar="URL", nargs="+")
    return p


def downloader(url, dest):
    """Download ``url`` into the file ``dest``, resuming a partial download.

    When ``dest`` already exists, only the missing tail is requested with an
    HTTP ``Range`` header. A 416 answer (range not satisfiable) leaves the
    file untouched. The tail is appended only when the server answers
    206 Partial Content; any other answer is written from scratch, because
    the body is then not a continuation of what is already on disk.

    NOTE(review): other HTTP error statuses are not checked, so an error
    page would be saved as if it were the audio -- confirm whether that
    should raise instead.
    """
    headers = {}
    resuming = os.path.exists(dest)
    if resuming:
        headers["Range"] = "bytes=%d-" % os.stat(dest).st_size
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, headers=headers) as r:
        if r.status_code == 416:  # range not satisfiable
            return
        mode = "ab" if resuming and r.status_code == 206 else "wb"
        with open(dest, mode) as f:
            for chunk in r.iter_content(chunk_size=1 << 16):
                f.write(chunk)


def put(audio, copy=False):
    """Print the URLs of ``audio`` on stdout, optionally copying them locally first.

    Without ``copy`` every URL is printed as is. With ``copy``, files are
    placed in ``$TMPDIR`` (current directory when unset):

    - http/https URLs are downloaded, under a sanitized version of the last
      path component of the URL;
    - ``file:///`` URLs are copied, keeping their base name;
    - anything else is printed unchanged.

    For downloaded and copied files the printed URL is ``file://`` plus the
    resolved destination path.
    """
    if not copy:
        for url in audio.urls:
            print(url)
        return

    destdir = os.environ.get("TMPDIR", ".")
    os.makedirs(destdir, exist_ok=True)
    for url in audio.urls:
        scheme = url.split(":")[0]
        if scheme in ("http", "https"):
            remote_name = posixpath.basename(urlparse(url).path)
            # sanitize: keep only alphanumerics, dots, underscores and dashes
            kept = [c for c in remote_name if c.isalnum() or c in "._-"]
            dest = os.path.join(destdir, "".join(kept).rstrip())
            downloader(url, dest)
        elif url.startswith("file:///"):
            src = url[len('file://'):]
            dest = os.path.join(destdir, os.path.basename(src))
            shutil.copy(src, dest)
        else:
            # what's that? let's just pass it through
            print(url)
            continue
        print("file://%s" % os.path.realpath(dest))


def retrieve(url, args):
    """
    Collect the audios offered by one source.

    The source can be a directory, a single audio file, or a feed (an
    http/https URL or a local file). Returns an iterable of Audios, or of
    AudioGroups when ``args.group`` is set. Unsupported sources are logged
    and produce an empty list.
    """
    grouping = args.group

    if os.path.isdir(url):
        found = get_audio_from_dir(url)
        if not grouping:
            return found
        # one group per file, described by the file's base name
        wrapped = []
        for audio in found:
            group = AudioGroup(os.path.basename(audio.url))
            group.append(audio)
            wrapped.append(group)
        return wrapped

    if os.path.isfile(url) and is_audio_file(url):
        if not grouping:
            return get_audio_from_file(url)
        single = get_audio_from_file(url)[0]
        group = AudioGroup(url)
        group.append(single)
        return [group]

    if url.startswith("http:") or url.startswith("https:") or os.path.isfile(url):
        if grouping:
            return get_grouped_urls(get_tree(url)).values()
        # pick the extractor first, so a bad feed type fails before any fetch
        extract = get_urls_factory(url, args)
        return extract(get_tree(url))

    logging.info("unsupported url `%s`", url)
    return []


def audio_passes_filters(audio, args):
    """Tell whether ``audio`` (an Audio or an AudioGroup) satisfies the filters in ``args``.

    An item is rejected when it is not valid, when its duration violates
    ``--max-len``/``--min-len`` or exceeds the ``--fill`` block, or when its
    age violates ``--min-age``/``--max-age``. A filter whose value is 0 (or an
    empty timedelta) is disabled. When an age filter is active, items whose
    age is unknown (None) are rejected, since they cannot be shown to match.

    The duration is only read when a length filter is active, because
    reading it may require probing the file.
    """
    # AudioGroup has no end_date attribute: do not crash on it just for a log line.
    logging.debug(getattr(audio, "end_date", None))
    if not audio.valid:
        return False
    if args.max_len and audio.duration > args.max_len:
        return False
    if args.fill and audio.duration > args.fill:
        return False
    if args.min_len and audio.duration < args.min_len:
        return False

    check_min_age = bool(args.min_age.total_seconds())
    check_max_age = bool(args.max_age.total_seconds())
    if check_min_age or check_max_age:
        age = audio.age
        if age is None:
            # undated item: None cannot be compared with a timedelta
            return False
        if check_min_age and age < args.min_age:
            return False
        if check_max_age and age > args.max_age:
            return False
    return True


def expand_glob(sources: list, weights: list) -> tuple:
    '''
    Expand wildcards in local sources, splitting each weight among the matches.

    Let's say that sources=["foo", "bar*"] and weights=[2, 3] and on the
    filesystem there are bar1 and bar2.

    Result: ["foo", "bar1", "bar2"], [2.0, 1.5, 1.5]

    Only sources containing "*" that are not http:// or https:// URLs are
    expanded. A pattern that matches nothing is dropped, with a warning,
    together with its weight.

    Returns the tuple (new_sources, new_weights); the two lists have the same
    length.
    '''
    new_sources = []
    new_weights = []

    for src, weight in zip(sources, weights):
        is_remote = src.startswith(('http://', 'https://'))
        if not is_remote and '*' in src:
            expanded_source = glob.glob(src)
        else:
            expanded_source = [src]
        logging.debug("glob: %s -> %s", src, expanded_source)
        if not expanded_source:
            # nothing matched: there is nobody to share the weight with
            logging.warning("glob: %s does not match any file", src)
            continue
        share = weight / len(expanded_source)

        new_sources.extend(expanded_source)
        new_weights.extend([share] * len(expanded_source))

    return new_sources, new_weights


def get_audio_by_source(args, parser):
    """Retrieve and filter the audios of every source, then pick one source.

    Weights come from ``--source-weights`` (colon-separated integers, one per
    source) and default to 1 for each source; a count mismatch is reported
    through ``parser.exit`` with status 2. With ``--glob`` the sources are
    expanded first. A source left with no audio after filtering gets weight 0.

    Returns None when no source has any audio. Otherwise returns the tuple
    ``(audio_by_source, sources)``: an OrderedDict mapping every source to its
    filtered audios, and a one-element list with the randomly chosen source.
    """
    sources = args.urls
    if not args.source_weights:
        weights = [1] * len(sources)
    else:
        weights = [int(w) for w in args.source_weights.split(":")]
        if len(sources) != len(weights):
            parser.exit(
                status=2,
                message="Weight must be in the same number as sources\n",
            )

    if args.glob:
        sources, weights = expand_glob(sources, weights)

    audio_by_source = OrderedDict()
    for position, url in enumerate(sources):
        candidates = list(retrieve(url, args))
        logging.debug("Found %d audios in %s", len(candidates), url)
        kept = [c for c in candidates if audio_passes_filters(c, args)]
        logging.debug("%d of those are passing filters", len(kept))
        audio_by_source[url] = kept
        if not kept:
            # an empty source must never be the one that gets picked
            weights[position] = 0

    if sum(weights) == 0:
        return None
    return audio_by_source, [weighted_choice(sources, weights)]

def add_intro_outro(audios: list, args) -> list:
    """Return the audios surrounded by ``args.intro`` / ``args.outro``, when given.

    An empty ``audios`` is returned as is: intro and outro are only added when
    there is something to play. Otherwise a new list is returned and the
    input list is left untouched.
    """
    if not audios:
        return audios
    head = [Audio.from_trusted(args.intro)] if args.intro else []
    tail = [Audio.from_trusted(args.outro)] if args.outro else []
    return head + audios + tail

def _age_sort_key(audio):
    """Sort key for ``--sort-by date``: ascending age, undated items last."""
    age = audio.age
    if age is None:
        # None cannot be compared with a timedelta; park undated items at the end
        return (True, datetime.timedelta())
    return (False, age)


def main():
    """Command-line entry point: collect, filter, sort, slice and output audios.

    Steps, in order: parse arguments; pick one source and its filtered audios;
    sort (``--sort-by``, ``--reverse``); skip ``--start`` items; then either
    keep ``--howmany`` items or, with ``--fill``, greedily pack items into a
    block of the given duration; add intro/outro; finally output each item
    (``--debug`` prints its repr on stderr instead of outputting it).
    """
    global DEBUG
    parser = get_parser()
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.WARNING)
    # debug() is gated by the module-level flag: without this, --debug would
    # silently print nothing in the output loop below.
    DEBUG = args.debug
    if args.random_seed is not None:
        random.seed(args.random_seed)

    found = get_audio_by_source(args, parser)
    if found is None:
        # no source has any audio passing the filters: nothing to output
        logging.debug("Found %d audios", 0)
        return
    audio_by_source, sources = found

    audios = []
    for source_url in sources:
        audios += audio_by_source[source_url]

    logging.debug("Found %d audios", len(audios))

    # sort
    if args.sort_by == "random":
        random.shuffle(audios)
    elif args.sort_by == "date":
        audios.sort(key=_age_sort_key)
    elif args.sort_by == "duration":
        audios.sort(key=lambda x: x.duration)

    if args.reverse:
        audios.reverse()

    # slice
    audios = audios[args.start :]
    if not args.fill:
        audios = audios[: args.howmany]

    if args.fill and audios:
        # The interleave directory does not change while filling: scan it once.
        interleave_pool = []
        if args.fill_interleave_dir:
            interleave_pool = list(scan_dir_audio(args.fill_interleave_dir))
            if not interleave_pool:
                logging.warning("no audio found in %s", args.fill_interleave_dir)

        # The first audio always goes in; the others only if they still fit.
        fill_audios = [audios.pop(0)]
        duration = fill_audios[0].duration
        for next_audio in audios:
            next_duration = next_audio.duration
            interleaving = None
            if interleave_pool:
                # realpath: a relative directory would give a broken file:// URL
                interleaving = Audio(
                    "file://" + os.path.realpath(random.choice(interleave_pool))
                )
                next_duration += interleaving.duration

            if args.fill - duration > next_duration:
                if interleaving is not None:
                    fill_audios.append(interleaving)
                fill_audios.append(next_audio)
                duration += next_duration
        audios = fill_audios
        if args.fill_reverse:
            audios.reverse()

    if not audios:
        return
    audios = add_intro_outro(audios, args)

    # The slot filler is emitted after every audio except the last one;
    # this is to support the --slotsize option.
    last_index = len(audios) - 1
    for index, audio in enumerate(audios):
        if args.debug:
            debug(repr(audio))
        else:
            put(audio, args.copy)
        if index != last_index and args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
                # TODO: take music from another folder
                print("## musica per {} secondi".format(args.slotsize - duration))


#     else:  # grouping; TODO: support slotsize
#         for item in groups:
#             if args.debug:
#                 print('#', item, groups[item].duration)
#             print(groups[item])


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()