#!/usr/bin/env python3
"""
Feed parser with many features.

Given a feed, it supports filtering, sub-slicing and random picking.
Besides feeds, it also supports picking files from local directories.
"""
import datetime
import logging
import os
import posixpath
import random
import re
import sys
import urllib.request
from argparse import ArgumentParser, ArgumentTypeError
from bisect import bisect
from collections import OrderedDict
from subprocess import CalledProcessError, check_output
from urllib.parse import unquote, urlparse

import requests
from lxml import html
from pytimeparse.timeparse import timeparse


def debug(*args, **kwargs):
    kwargs.setdefault("file", sys.stderr)
    print(*args, **kwargs)


def get_int(s):
    return int(re.findall(r"\d+", s)[0])


def DurationType(arg):
    if arg.isdecimal():
        secs = int(arg)
    else:
        secs = timeparse(arg)
    if secs is None:
        raise ArgumentTypeError("%r is not a valid duration" % arg)
    return secs


def TimeDeltaType(arg):
    if arg.isdecimal():
        secs = int(arg)
    else:
        secs = timeparse(arg)
    if secs is None:
        raise ArgumentTypeError("%r is not a valid time range" % arg)
    return datetime.timedelta(seconds=secs)


def weighted_choice(values, weights):
    """
    random.choice with weights

    weights must be integers greater than 0.
    Their meaning is "relative", that is [1,2,3] is the same as [2,4,6]
    """
    assert len(values) == len(weights)
    if not values:
        raise IndexError("Cannot do weighted choice from an empty sequence")
    if sum(weights) == 0:
        raise IndexError("Cannot do weighted choice when all weights are 0")
    total = 0
    cum_weights = []
    for w in weights:
        total += w
        cum_weights.append(total)
    x = random.random() * total
    i = bisect(cum_weights, x)
    return values[i]
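

# Illustrative usage of weighted_choice (not part of the original script):
# with weights [1, 2, 3] the last value is returned roughly half of the time,
# and [2, 4, 6] behaves identically, because only the ratios matter.
#
#     >>> weighted_choice(["a", "b", "c"], [1, 2, 3])
#     'c'    # about 3 times out of 6
#     >>> weighted_choice(["a", "b", "c"], [2, 4, 6])
#     'b'    # same distribution as above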


def delta_humanreadable(tdelta):
    if tdelta is None:
        return ""
    days = tdelta.days
    hours = (tdelta - datetime.timedelta(days=days)).seconds // 3600
    if days:
        return "{}d{}h".format(days, hours)
    return "{}h".format(hours)


def duration_humanreadable(seconds):
    hours = seconds // 3600
    minutes = (seconds - hours * 3600) // 60
    seconds = seconds % 60
    if hours > 0:
        return "{}h{}m{}s".format(hours, minutes, seconds)
    return "{}m{}s".format(minutes, seconds)


class Audio(object):
    def __init__(self, url, duration=None, date=None):
        self.url = url
        if duration is None:
            duration = get_duration(url.encode("utf-8"))
        self.duration = duration
        self.date = date
        self.end_date = datetime.datetime(9999, 12, 31, tzinfo=datetime.timezone.utc)

    def __str__(self):
        return self.url

    def __repr__(self):
        return "<Audio {} ({} {})>".format(
            self.url,
            duration_humanreadable(self.duration),
            delta_humanreadable(self.age),
        )

    @property
    def urls(self):
        return [self.url]

    @property
    def age(self):
        if self.date is None:
            return None
        now = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
        return now - self.date

    @property
    def valid(self):
        return self.end_date >= datetime.datetime.utcnow().replace(
            tzinfo=datetime.timezone.utc
        )


class AudioGroup(list):
    def __init__(self, description=None):
        self.description = description or ""
        self.audios = []

    def __len__(self):
        return len(self.audios)

    def append(self, arg):
        self.audios.append(arg)

    def __str__(self):
        return "\n".join(str(a) for a in self.audios)

    def __repr__(self):
        return '<AudioGroup "{}" ({} {})\n{} >'.format(
            self.description,
            duration_humanreadable(self.duration),
            delta_humanreadable(self.age),
            "\n".join(" " + repr(a) for a in self.audios),
        )

    @property
    def duration(self):
        return sum(a.duration for a in self.audios if a.duration is not None)

    @property
    def urls(self):
        return [a.url for a in self.audios]

    @property
    def date(self):
        for a in self.audios:
            if hasattr(a, "date"):
                return a.date
        return None

    @property
    def age(self):
        if self.date is None:
            return None
        now = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
        return now - self.date

    @property
    def valid(self):
        return len(self.audios) > 0


def get_tree(feed_url):
    if feed_url.startswith("http:") or feed_url.startswith("https:"):
        tree = html.fromstring(requests.get(feed_url).content)
    else:
        if not os.path.exists(feed_url):
            raise ValueError("file not found: {}".format(feed_url))
        tree = html.parse(open(feed_url))
    return tree


def get_audio_from_description(text):
    # non-empty lines
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    url = lines[0]
    duration = None
    metadata = {}
    for line in text.split("\n")[1:]:
        if line.strip() and "=" in line:
            metadata[line.split("=")[0]] = line.split("=")[1]
    if "durata" in metadata:
        metadata["durata"] = get_int(metadata["durata"])
    if "txdate" in metadata:
        try:
            metadata["txdate"] = datetime.datetime.strptime(
                metadata["txdate"], "%Y-%m-%dT%H:%M:%S%z"
            )
        except ValueError:
            logging.warning("could not parse txdate %s", metadata["txdate"])
            del metadata["txdate"]
    a = Audio(
        unquote(url),
        duration=metadata.get("durata", None),
        date=metadata.get("txdate", None),
    )
    if "txdate" in metadata and "replica" in metadata:
        if metadata["replica"].endswith("g"):
            a.end_date = metadata["txdate"] + datetime.timedelta(
                days=get_int(metadata["replica"])
            )
    return a
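

# Illustrative example of the <description> text this parser expects
# (reconstructed from the code above; the values are made up):
#
#     http://example.org/audio/episode.ogg
#     durata=1800s
#     txdate=2020-01-15T22:45:33+0000
#     replica=7g
#
# The first non-empty line is the audio URL. "durata" is reduced to its first
# integer by get_int (here 1800), "txdate" must match %Y-%m-%dT%H:%M:%S%z, and
# a "replica" value ending in "g" (days) sets end_date to txdate plus that
# many days.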


# copied from larigira.fsutils
def scan_dir_audio(dirname, extensions=("mp3", "oga", "wav", "ogg")):
    for root, dirnames, filenames in os.walk(dirname):
        for fname in filenames:
            if fname.split(".")[-1].lower() in extensions:
                yield os.path.join(root, fname)


def get_audio_from_dir(dirpath):
    fpaths = scan_dir_audio(dirpath)
    ret = []
    for u in fpaths:
        try:
            a = Audio(
                "file://" + os.path.realpath(u),
                date=datetime.datetime.fromtimestamp(os.path.getmtime(u)).replace(
                    tzinfo=datetime.timezone.utc
                ),
            )
        except ValueError:
            continue
        ret.append(a)
    return ret


def get_item_date(el):
    el_date = el.find("pubdate")
    # Wed, 15 Jan 2020 22:45:33 +0000
    formats = ["%a, %d %b %Y %H:%M:%S %z", "%Y-%m-%dT%H:%M:%S%z"]
    if el_date is not None:
        for fmt in formats:
            try:
                return datetime.datetime.strptime(el_date.text, fmt)
            except ValueError:
                pass
    return None


def get_urls_generic(tree, url_selector="description[text()]", metadata_in_body=True):
    items = tree.xpath("//item")
    for it in items:
        title = it.find("title").text
        el_body = it.find("description")
        if metadata_in_body and el_body is not None:
            url = el_body.text
            try:
                audio = get_audio_from_description(url)
            except Exception as exc:
                logging.info("error getting audio from description for `%s`: %s", title, exc)
                continue
            if audio.date is None:
                audio.date = get_item_date(it)
            yield audio
        else:
            url = it.xpath(url_selector)[0]
            audio = Audio(url)
            audio.date = get_item_date(it)
            yield audio


def get_urls_from_podcast(tree):
    return get_urls_generic(tree, url_selector="enclosure/@url", metadata_in_body=False)


def get_urls_from_custom_feed(tree):
    return get_urls_generic(tree, metadata_in_body=True)


def get_urls_factory(url, args):
    if args.feed_type == "customrss":
        return get_urls_from_custom_feed
    if args.feed_type == "podcast":
        return get_urls_from_podcast
    raise ValueError("unsupported feed type %s" % args.feed_type)


def get_grouped_urls(tree):
    groups = OrderedDict()
    items = tree.xpath("//item")
    for item in items:
        guid = item.xpath("guid")[0].text.strip()
        if guid not in groups:
            groups[guid] = AudioGroup(guid)
        audio = get_audio_from_description(item.xpath("description")[0].text)
        audio.date = get_item_date(item)
        if audio.valid:
            groups[guid].append(audio)
    return groups


def get_duration(url):
    try:
        lineout = check_output(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-i", url]
        ).split(b"\n")
    except CalledProcessError as exc:
        raise ValueError("error probing `%s`" % url) from exc
    duration = next(l for l in lineout if l.startswith(b"duration="))
    value = duration.split(b"=")[1]
    return int(float(value))
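

# Note (an assumption about ffprobe's default output, not taken from this
# script): `ffprobe -v error -show_entries format=duration -i FILE` typically
# prints something like
#
#     [FORMAT]
#     duration=1806.837551
#     [/FORMAT]
#
# get_duration() keeps only the line starting with b"duration=" and truncates
# the value to an int (1806 in this example).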


HELP = """
Collect audio information from multiple sources (XML feeds).

Audios are (in this order):
 1. Collected from feeds (grouped by article if --group is used)
 2. Filtered; everything that does not match the requirements is excluded
 3. Sorted, possibly randomly
 4. Sliced; take HOWMANY elements, skipping START elements
 5. Copied (if --copy is used)

Usage: """
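

# Hypothetical invocations (script name, URLs and paths are placeholders, not
# taken from the original source):
#
#   ./feed --max-len 30m --sort-by date http://example.org/podcast.xml
#       pick the most recent audio shorter than 30 minutes
#
#   ./feed --group --fill 1h --copy http://example.org/custom.rss
#       fill a one-hour block with article groups and copy them to $TMPDIR
#
#   ./feed --source-weights 1:3 feed1.xml feed2.xml
#       feed2.xml is picked three times as often as feed1.xml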


def get_parser():
    p = ArgumentParser(HELP)
    parsing = p.add_argument_group("parsing", "Feed parsing")
    parsing.add_argument(
        "--feed-type", type=str, choices=["customrss", "podcast"], default="customrss"
    )
    src = p.add_argument_group("sources", "How to deal with sources")
    src.add_argument(
        "--source-weights", help='Select only one "source" based on these weights'
    )
    src.add_argument(
        "--group",
        default=False,
        action="store_true",
        help="Group audios that belong to the same article",
    )
    filters = p.add_argument_group(
        "filters", "Select only items that match these conditions"
    )
    filters.add_argument(
        "--min-len",
        default=0,
        type=DurationType,
        help="Exclude any audio that is shorter than MIN_LEN seconds",
    )
    filters.add_argument(
        "--max-len",
        default=0,
        type=DurationType,
        help="Exclude any audio that is longer than MAX_LEN seconds",
    )
    filters.add_argument(
        "--sort-by", default="no", type=str, choices=("random", "date", "duration")
    )
    filters.add_argument(
        "--reverse", default=False, action="store_true", help="Reverse list order"
    )
    filters.add_argument(
        "--min-age",
        default=datetime.timedelta(),
        type=TimeDeltaType,
        help="Exclude audio more recent than MIN_AGE",
    )
    filters.add_argument(
        "--max-age",
        default=datetime.timedelta(),
        type=TimeDeltaType,
        help="Exclude audio older than MAX_AGE",
    )
    fill = p.add_argument_group(
        "fill", "Fill a 'block' with as many contents as possible"
    )
    fill.add_argument(
        "--fill",
        default=0,
        type=DurationType,
        help="Fill a block of duration LEN",
        metavar="LEN",
    )
    fill.add_argument(
        "--fill-reverse",
        default=False,
        action="store_true",
        help="Reverse list order after the fill algorithm",
    )
    fill.add_argument(
        "--fill-interleave-dir",
        default=None,
        type=str,  # FIXME: does it even work?
        help="Between each item, put a random file from DIR",
    )
    p.add_argument(
        "--start",
        default=0,
        type=int,
        help="0-indexed start number. By default, play from the most recent",
    )
    p.add_argument(
        "--howmany", default=1, type=int, help="If not specified, only 1 will be played"
    )
    p.add_argument(
        "--slotsize", type=int, help="Seconds between each audio. Still unsupported"
    )
    general = p.add_argument_group("general", "General options")
    general.add_argument(
        "--copy", help="Copy files to $TMPDIR", default=False, action="store_true"
    )
    general.add_argument(
        "--debug", help="Debug messages", default=False, action="store_true"
    )
    p.add_argument("urls", metavar="URL", nargs="+")
    return p


def downloader(url, dest):
    headers = {}
    mode = "wb"
    if os.path.exists(dest):
        headers["Range"] = "bytes=%d-" % os.stat(dest).st_size
        mode = "ab"
    r = requests.get(url, stream=True, headers=headers)
    if r.status_code == 416:  # range not satisfiable
        return
    with open(dest, mode) as f:
        for chunk in r.iter_content(chunk_size=1 << 16):
            f.write(chunk)
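

# Note on downloader(): if `dest` already exists, a header such as
# "Range: bytes=12345-" asks the server for the remaining bytes only and the
# file is opened in append mode; a 416 (Range Not Satisfiable) response is
# taken to mean the local copy is already complete, so nothing is written.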


def put(audio, copy=False):
    if not copy:
        for url in audio.urls:
            print(url)
    else:
        for url in audio.urls:
            if url.split(":")[0] in ("http", "https"):
                destdir = os.environ.get("TMPDIR", ".")
                fname = posixpath.basename(urlparse(url).path)
                # sanitize
                fname = "".join(
                    c for c in fname if c.isalnum() or c in list("._-")
                ).rstrip()
                dest = os.path.join(destdir, fname)
                os.makedirs(destdir, exist_ok=True)
                downloader(url, dest)
                print("file://%s" % os.path.realpath(dest))
            else:
                # FIXME: file:// urls are not copied, just printed
                print(url)


def retrieve(url, args):
    """
    returns a list of Audios or a list of AudioGroups
    """
    if not args.group:
        if os.path.isdir(url):
            audiodir = get_audio_from_dir(url)
            return audiodir
        elif url.startswith("http:") or url.startswith("https:") or os.path.isfile(url):
            getter = get_urls_factory(url, args)
            tree = get_tree(url)
            return getter(tree)
        else:
            logging.info("unsupported url `%s`", url)
            return []
    else:  # group
        if os.path.isdir(url):
            audiodir = get_audio_from_dir(url)
            agroups = []
            for a in audiodir:
                ag = AudioGroup(os.path.basename(a.url))
                ag.append(a)
                agroups.append(ag)
            return agroups
        elif url.startswith("http:") or url.startswith("https:") or os.path.isfile(url):
            groups = get_grouped_urls(get_tree(url))
            return groups.values()
        else:
            logging.info("unsupported url `%s`", url)
            return []


def audio_passes_filters(audio, args):
    logging.debug(audio.end_date)
    if not audio.valid:
        return False
    if args.max_len and audio.duration > args.max_len:
        return False
    if args.fill and audio.duration > args.fill:
        return False
    if args.min_len and audio.duration < args.min_len:
        return False
    if args.min_age.total_seconds() and audio.age < args.min_age:
        return False
    if args.max_age.total_seconds() and audio.age > args.max_age:
        return False
    return True


def main():
    parser = get_parser()
    args = parser.parse_args()
    if not args.debug:
        logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.DEBUG)
    sources = args.urls
    if args.source_weights:
        weights = list(map(int, args.source_weights.split(":")))
        if len(weights) != len(sources):
            parser.exit(
                status=2, message="There must be as many weights as sources\n"
            )
    else:
        weights = [1] * len(sources)

    audio_by_source = OrderedDict()
    for i, url in enumerate(sources):
        url_audios = list(retrieve(url, args))
        logging.debug("Found %d audios in %s", len(url_audios), url)
        url_audios = [au for au in url_audios if audio_passes_filters(au, args)]
        logging.debug("%d of those are passing filters", len(url_audios))
        audio_by_source[url] = url_audios
        if not url_audios:
            weights[i] = 0
    if sum(weights) == 0:
        return
    sources = [weighted_choice(sources, weights)]

    audios = []
    for source_url in sources:
        audios += audio_by_source[source_url]
    logging.debug("Found %d audios", len(audios))

    # sort
    if args.sort_by == "random":
        random.shuffle(audios)
    elif args.sort_by == "date":
        audios.sort(key=lambda x: x.age)
    elif args.sort_by == "duration":
        audios.sort(key=lambda x: x.duration)
    if args.reverse:
        audios.reverse()

    # slice
    audios = audios[args.start :]
    if not args.fill:
        audios = audios[: args.howmany]

    if args.fill and audios:
        fill_audios = [audios.pop(0)]
        duration = fill_audios[0].duration
        for next_audio in audios:
            next_duration = next_audio.duration
            if args.fill_interleave_dir:
                interleaving = Audio(
                    "file://"
                    + random.choice(list(scan_dir_audio(args.fill_interleave_dir)))
                )
                # logging.info("%r", interleaving)
                next_duration += interleaving.duration
            if args.fill - duration > next_duration:
                if args.fill_interleave_dir:
                    fill_audios.append(interleaving)
                fill_audios.append(next_audio)
                duration += next_duration
        audios = fill_audios
        if args.fill_reverse:
            audios.reverse()

    # the for loop excludes the last one
    # this is to support the --slotsize option
    if not audios:
        return
    for audio in audios[:-1]:
        if args.debug:
            debug(repr(audio))
        else:
            put(audio, args.copy)
        if args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
                # TODO: take music from another directory
                print("## music for {} seconds".format(args.slotsize - duration))
    # finally, the last one
    if args.debug:
        debug(repr(audios[-1]))
    else:
        put(audios[-1], args.copy)

    # else:  # grouping; TODO: support slotsize
    #     for item in groups:
    #         if args.debug:
    #             print('#', item, groups[item].duration)
    #         print(groups[item])


if __name__ == "__main__":
    main()