feed 20 KB

  1. #!/usr/bin/env python3
  2. """
  3. Feed parser with many features.
  4. Given a feed, it supports filtering, sub-slicing and random picking.
  5. Besides feeds, it also supports picking files from directories.
  6. """
  7. import datetime
  8. import logging
  9. import os
  10. import glob
  11. import posixpath
  12. import random
  13. import re
  14. import sys
  15. import urllib.request
  16. from argparse import ArgumentParser, ArgumentTypeError
  17. from bisect import bisect
  18. from collections import OrderedDict
  19. from subprocess import CalledProcessError, check_output
  20. from urllib.parse import unquote, urlparse
  21. import requests
  22. from lxml import html
  23. from pytimeparse.timeparse import timeparse
  24. def debug(*args, **kwargs):
  25. kwargs.setdefault("file", sys.stderr)
  26. print(*args, **kwargs)
  27. def get_int(s):
  28. return int(re.findall(r"\d+", s)[0])
  29. def DurationType(arg):
  30. if arg.isdecimal():
  31. secs = int(arg)
  32. else:
  33. secs = timeparse(arg)
  34. if secs is None:
  35. raise ArgumentTypeError("%r is not a valid duration" % arg)
  36. return secs
  37. def TimeDeltaType(arg):
  38. if arg.isdecimal():
  39. secs = int(arg)
  40. else:
  41. secs = timeparse(arg)
  42. if secs is None:
  43. raise ArgumentTypeError("%r is not a valid time range" % arg)
  44. return datetime.timedelta(seconds=secs)
  45. def weighted_choice(values, weights):
  46. """
  47. random.choice with weights
  48. weights must be integers greater than 0.
  49. Their meaning is "relative", that is [1,2,3] is the same as [2,4,6]
  50. """
  51. assert len(values) == len(weights)
  52. if not values:
  53. raise IndexError("Cannot do weighted choice from an empty sequence")
  54. if sum(weights) == 0:
  55. raise IndexError("Cannot do weighted choice where weight=0")
  56. total = 0
  57. cum_weights = []
  58. for w in weights:
  59. total += w
  60. cum_weights.append(total)
  61. x = random.random() * total
  62. i = bisect(cum_weights, x)
  63. return values[i]
  64. def delta_humanreadable(tdelta):
  65. if tdelta is None:
  66. return ""
  67. days = tdelta.days
  68. hours = (tdelta - datetime.timedelta(days=days)).seconds // 3600
  69. if days:
  70. return "{}d{}h".format(days, hours)
  71. return "{}h".format(hours)
  72. def duration_humanreadable(seconds):
  73. hours = seconds // 3600
  74. minutes = (seconds - hours * 3600) // 60
  75. seconds = seconds % 60
  76. if hours > 0:
  77. return "{}h{}m{}s".format(hours, minutes, seconds)
  78. return "{}m{}s".format(minutes, seconds)
  79. class Audio(object):
  80. def __init__(self, url, duration=None, date=None):
  81. self.url = url
  82. self._duration = duration
  83. self.date = date
  84. self.end_date = datetime.datetime(9999, 12, 31, tzinfo=datetime.timezone.utc)
  85. def __str__(self):
  86. return self.url
  87. def __repr__(self):
  88. return "<Audio {} ({} {})>".format(
  89. self.url,
  90. duration_humanreadable(self.duration),
  91. delta_humanreadable(self.age),
  92. )
  93. @property
  94. def duration(self):
  95. if self._duration is None:
  96. self._duration = get_duration(self.url)
  97. return self._duration
  98. @property
  99. def urls(self):
  100. return [self.url]
  101. @property
  102. def age(self):
  103. if self.date is None:
  104. return None
  105. now = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
  106. return now - self.date
  107. @property
  108. def valid(self):
  109. return self.end_date >= datetime.datetime.utcnow().replace(
  110. tzinfo=datetime.timezone.utc
  111. )
  112. class AudioGroup(list):
  113. def __init__(self, description=None):
  114. self.description = description or ""
  115. self.audios = []
  116. def __len__(self):
  117. return len(self.audios)
  118. def append(self, arg):
  119. self.audios.append(arg)
  120. def __str__(self):
  121. return "\n".join(str(a) for a in self.audios)
  122. def __repr__(self):
  123. return '<AudioGroup "{}" ({} {})\n{} >'.format(
  124. self.description,
  125. duration_humanreadable(self.duration),
  126. delta_humanreadable(self.age),
  127. "\n".join(" " + repr(a) for a in self.audios),
  128. )
  129. @property
  130. def duration(self):
  131. return sum(a.duration for a in self.audios if a.duration is not None)
  132. @property
  133. def urls(self):
  134. return [a.url for a in self.audios]
  135. @property
  136. def date(self):
  137. for a in self.audios:
  138. if hasattr(a, "date"):
  139. return a.date
  140. return None
  141. @property
  142. def age(self):
  143. if self.date is None:
  144. return None
  145. now = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
  146. return now - self.date
  147. @property
  148. def valid(self):
  149. return len(self.audios) > 0
  150. def get_tree(feed_url):
  151. if feed_url.startswith("http:") or feed_url.startswith("https:"):
  152. tree = html.fromstring(requests.get(feed_url).content)
  153. else:
  154. if not os.path.exists(feed_url):
  155. raise ValueError("file not found: {}".format(feed_url))
  156. tree = html.parse(open(feed_url))
  157. return tree
  158. def get_audio_from_description(text):
  159. # non-empty lines
  160. lines = [line.strip() for line in text.split("\n") if line.strip()]
  161. url = lines[0]
  162. duration = None
  163. metadata = {}
  164. for line in text.split("\n")[1:]:
  165. if line.strip() and "=" in line:
  166. metadata[line.split("=")[0]] = line.split("=")[1]
  167. if "durata" in metadata:
  168. metadata["durata"] = get_int(metadata["durata"])
  169. if "txdate" in metadata:
  170. try:
  171. metadata["txdate"] = datetime.datetime.strptime(
  172. metadata["txdate"], "%Y-%m-%dT%H:%M:%S%z"
  173. )
  174. except ValueError:
  175. logging.warning("could not parse txdate %s", metadata["txdate"])
  176. del metadata["txdate"]
  177. a = Audio(
  178. unquote(url),
  179. duration=metadata.get("durata", None),
  180. date=metadata.get("txdate", None),
  181. )
  182. if "txdate" in metadata and "replica" in metadata:
  183. if metadata["replica"].endswith("g"):
  184. a.end_date = metadata["txdate"] + datetime.timedelta(
  185. days=get_int(metadata["replica"])
  186. )
  187. return a
  188. def is_audio_file(fpath, extensions=("mp3", "oga", "wav", "ogg")):
  189. if fpath.split(".")[-1].lower() in extensions:
  190. return True
  191. return False
  192. # copied from larigira.fsutils
  193. def scan_dir_audio(dirname):
  194. for root, dirnames, filenames in os.walk(dirname):
  195. for fname in filenames:
  196. if is_audio_file(fname):
  197. path = os.path.join(root, fname)
  198. yield path
  199. def get_audio_from_file(fpath):
  200. a = Audio(
  201. "file://" + os.path.realpath(fpath),
  202. date=datetime.datetime.fromtimestamp(os.path.getmtime(fpath)).replace(
  203. tzinfo=datetime.timezone.utc
  204. ),
  205. )
  206. return [a]
  207. def get_audio_from_dir(dirpath):
  208. fpaths = scan_dir_audio(dirpath)
  209. ret = []
  210. for u in fpaths:
  211. try:
  212. a = Audio(
  213. "file://" + os.path.realpath(u),
  214. date=datetime.datetime.fromtimestamp(os.path.getmtime(u)).replace(
  215. tzinfo=datetime.timezone.utc
  216. ),
  217. )
  218. except ValueError:
  219. continue
  220. ret.append(a)
  221. return ret
  222. def get_item_date(el):
  223. el_date = el.find("pubdate")
  224. # Wed, 15 Jan 2020 22:45:33 +0000
  225. formats = ["%a, %d %b %Y %H:%M:%S %z", "%Y-%m-%dT%H:%M:%S%z"]
  226. if el_date is not None:
  227. for fmt in formats:
  228. try:
  229. return datetime.datetime.strptime(el_date.text, fmt)
  230. except ValueError:
  231. pass
  232. return None
  233. def get_urls_generic(tree, url_selector="description[text()]", metadata_in_body=True):
  234. items = tree.xpath("//item")
  235. for it in items:
  236. title = it.find("title").text
  237. el_body = it.find("description")
  238. if metadata_in_body and el_body is not None:
  239. url = el_body.text
  240. try:
  241. audio = get_audio_from_description(url)
  242. except Exception as exc:
  243. logging.info("error getting duration for `%s`" % title)
  244. continue
  245. if audio.date is None:
  246. audio.date = get_item_date(it)
  247. yield audio
  248. else:
  249. try:
  250. url = it.xpath(url_selector)[0]
  251. except IndexError:
  252. logging.warning("no audio found in %s", title)
  253. else:
  254. audio = Audio(url)
  255. audio.date = get_item_date(it)
  256. yield audio
  257. def get_urls_from_podcast(tree):
  258. return get_urls_generic(tree, url_selector="enclosure/@url", metadata_in_body=False)
  259. def get_urls_from_custom_feed(tree):
  260. return get_urls_generic(tree, metadata_in_body=True)
  261. def get_urls_factory(url, args):
  262. if args.feed_type == "customrss":
  263. return get_urls_from_custom_feed
  264. if args.feed_type == "podcast":
  265. return get_urls_from_podcast
  266. raise ValueError("unsupported feeedtype %s" % args.feed_type)
  267. def get_grouped_urls(tree):
  268. groups = OrderedDict()
  269. items = tree.xpath("//item")
  270. for item in items:
  271. guid = item.xpath("guid")[0].text.strip()
  272. if guid not in groups:
  273. groups[guid] = AudioGroup(guid)
  274. audio = get_audio_from_description(item.xpath("description")[0].text)
  275. audio.date = get_item_date(item)
  276. if audio.valid:
  277. groups[guid].append(audio)
  278. return groups
  279. def get_duration(url):
  280. try:
  281. lineout = check_output(
  282. ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-i", url]
  283. ).split(b"\n")
  284. except CalledProcessError as exc:
  285. raise ValueError("error probing `%s`" % url) from exc
  286. duration = next(l for l in lineout if l.startswith(b"duration="))
  287. value = duration.split(b"=")[1]
  288. return int(float(value))
# Text given to ArgumentParser as its first positional argument in
# get_parser(); argparse treats that argument as ``prog``.
# NOTE(review): the trailing "Usage: " suggests this is deliberate - confirm
# before moving the text to ``description=``.
HELP = """
Collect audio informations from multiple sources (XML feeds).
Audios are (in that order):
1. Collected from feeds; (grouped by article if --group is used)
2. Filtered; everything that does not match with requirements is excluded
3. Sorted; even randomly
4. Sliced; take HOWMANY elements, skipping START elements
5. (if --copy) Copied
Usage: """
  298. def get_parser():
  299. p = ArgumentParser(HELP)
  300. parsing = p.add_argument_group("parsing", "Feed parsing")
  301. parsing.add_argument(
  302. "--feed-type", type=str, choices=["customrss", "podcast"], default="customrss"
  303. )
  304. src = p.add_argument_group("sources", "How to deal with sources")
  305. src.add_argument(
  306. "--source-weights", help='Select only one "source" based on this weights'
  307. )
  308. src.add_argument(
  309. "--group",
  310. default=False,
  311. action="store_true",
  312. help="Group audios that belong to the same article",
  313. )
  314. src.add_argument(
  315. "--glob",
  316. default=False,
  317. action="store_true",
  318. help="Wildcards in filenames are interpreted",
  319. )
  320. filters = p.add_argument_group(
  321. "filters", "Select only items that match " "these conditions"
  322. )
  323. filters.add_argument(
  324. "--min-len",
  325. default=0,
  326. type=DurationType,
  327. help="Exclude any audio that is shorter " "than MIN_LEN seconds",
  328. )
  329. filters.add_argument(
  330. "--max-len",
  331. default=0,
  332. type=DurationType,
  333. help="Exclude any audio that is longer " "than MAX_LEN seconds",
  334. )
  335. filters.add_argument(
  336. "--sort-by", default="no", type=str, choices=("random", "date", "duration")
  337. )
  338. filters.add_argument(
  339. '--random-seed', default=None, help='Initialize the random generator. For debug only')
  340. filters.add_argument(
  341. "--reverse", default=False, action="store_true", help="Reverse list order"
  342. )
  343. filters.add_argument(
  344. "--min-age",
  345. default=datetime.timedelta(),
  346. type=TimeDeltaType,
  347. help="Exclude audio more recent than MIN_AGE",
  348. )
  349. filters.add_argument(
  350. "--max-age",
  351. default=datetime.timedelta(),
  352. type=TimeDeltaType,
  353. help="Exclude audio older than MAX_AGE",
  354. )
  355. fill = p.add_argument_group(
  356. "fill", "Fill a 'block' with as many contents as possible"
  357. )
  358. fill.add_argument(
  359. "--fill",
  360. default=0,
  361. type=DurationType,
  362. help="Fill a block of duration LEN",
  363. metavar="LEN",
  364. )
  365. fill.add_argument(
  366. "--fill-reverse",
  367. default=False,
  368. action="store_true",
  369. help="Reverse list order after the fill algorithm",
  370. )
  371. fill.add_argument(
  372. "--fill-interleave-dir",
  373. default=None,
  374. type=str, # FIXME: does it even work?
  375. help="Between each item, put a random file from DIR",
  376. )
  377. p.add_argument(
  378. "--start",
  379. default=0,
  380. type=int,
  381. help="0-indexed start number. " "By default, play from most recent",
  382. )
  383. p.add_argument(
  384. "--howmany", default=1, type=int, help="If not specified, only 1 will be played"
  385. )
  386. p.add_argument(
  387. "--slotsize", type=int, help="Seconds between each audio. Still unsupported"
  388. )
  389. general = p.add_argument_group("general", "General options")
  390. general.add_argument(
  391. "--copy", help="Copy files to $TMPDIR", default=False, action="store_true"
  392. )
  393. general.add_argument(
  394. "--debug", help="Debug messages", default=False, action="store_true"
  395. )
  396. p.add_argument("urls", metavar="URL", nargs="+")
  397. return p
  398. def downloader(url, dest):
  399. headers = {}
  400. mode = "wb"
  401. if os.path.exists(dest):
  402. headers["Range"] = "bytes=%d-" % os.stat(dest).st_size
  403. mode = "ab"
  404. r = requests.get(url, stream=True, headers=headers)
  405. if r.status_code == 416: # range not satisfiable
  406. return
  407. with open(dest, mode) as f:
  408. for chunk in r.iter_content(chunk_size=1 << 16):
  409. f.write(chunk)
  410. def put(audio, copy=False):
  411. if not copy:
  412. for url in audio.urls:
  413. print(url)
  414. else:
  415. for url in audio.urls:
  416. if url.split(":")[0] in ("http", "https"):
  417. destdir = os.environ.get("TMPDIR", ".")
  418. fname = posixpath.basename(urlparse(url).path)
  419. # sanitize
  420. fname = "".join(
  421. c for c in fname if c.isalnum() or c in list("._-")
  422. ).rstrip()
  423. dest = os.path.join(destdir, fname)
  424. os.makedirs(destdir, exist_ok=True)
  425. downloader(url, dest)
  426. print("file://%s" % os.path.realpath(dest))
  427. else:
  428. # FIXME: file:// urls are just copied
  429. print(url)
  430. def retrieve(url, args):
  431. """
  432. returns a list of Audios or a list of AudioGroups
  433. """
  434. if not args.group:
  435. if os.path.isdir(url):
  436. audiodir = get_audio_from_dir(url)
  437. return audiodir
  438. elif os.path.isfile(url) and is_audio_file(url):
  439. return get_audio_from_file(url)
  440. elif url.startswith("http:") or url.startswith("https:") or os.path.isfile(url):
  441. getter = get_urls_factory(url, args)
  442. tree = get_tree(url)
  443. return getter(tree)
  444. else:
  445. logging.info("unsupported url `%s`", url)
  446. return []
  447. else: # group
  448. if os.path.isdir(url):
  449. audiodir = get_audio_from_dir(url)
  450. agroups = []
  451. for a in audiodir:
  452. ag = AudioGroup(os.path.basename(a.url))
  453. ag.append(a)
  454. agroups.append(ag)
  455. return agroups
  456. elif os.path.isfile(url) and is_audio_file(url):
  457. audio = get_audio_from_file(url)[0]
  458. ag = AudioGroup(url)
  459. ag.append(audio)
  460. return [ag]
  461. elif url.startswith("http:") or url.startswith("https:") or os.path.isfile(url):
  462. groups = get_grouped_urls(get_tree(url))
  463. return groups.values()
  464. else:
  465. logging.info("unsupported url `%s`", url)
  466. return []
  467. def audio_passes_filters(audio, args):
  468. logging.debug(audio.end_date)
  469. if not audio.valid:
  470. return False
  471. if args.max_len and audio.duration > args.max_len:
  472. return False
  473. if args.fill and audio.duration > args.fill:
  474. return False
  475. if args.min_len and audio.duration < args.min_len:
  476. return False
  477. if args.min_age.total_seconds() and audio.age < args.min_age:
  478. return False
  479. if args.max_age.total_seconds() and audio.age > args.max_age:
  480. return False
  481. return True
  482. def expand_glob(sources: list, weights: list) -> tuple:
  483. '''
  484. Let's say that sources=["foo", "bar*"] and weight=["2", "3"] and on filesystem there are bar1 and bar2.
  485. Result: ["foo", "bar1", "bar2"], ["2", "3", "3"]
  486. '''
  487. new_sources = []
  488. new_weights = []
  489. for src, weight in zip(sources, weights):
  490. expanded_source = glob.glob(src)
  491. expanded_weight = [weight] * len(expanded_source)
  492. new_sources += expanded_source
  493. new_weights += expanded_weight
  494. return new_sources, new_weights
  495. def get_audio_by_source(args, parser):
  496. sources = args.urls
  497. if args.source_weights:
  498. weights = list(map(int, args.source_weights.split(":")))
  499. if len(weights) != len(sources):
  500. parser.exit(
  501. status=2,
  502. message="Weight must be in the same number as sources\n",
  503. )
  504. else:
  505. weights = [1] * len(sources)
  506. if args.glob:
  507. sources, weights = expand_glob(sources, weights)
  508. audio_by_source = OrderedDict()
  509. for i, url in enumerate(sources):
  510. url_audios = list(retrieve(url, args))
  511. logging.debug("Found %d audios in %s", len(url_audios), url)
  512. url_audios = [au for au in url_audios if audio_passes_filters(au, args)]
  513. logging.debug("%d of those are passing filters", len(url_audios))
  514. audio_by_source[url] = url_audios
  515. if not url_audios:
  516. weights[i] = 0
  517. if sum(weights) == 0:
  518. return
  519. sources = [weighted_choice(sources, weights)]
  520. return audio_by_source, sources
  521. def main():
  522. parser = get_parser()
  523. args = parser.parse_args()
  524. if not args.debug:
  525. logging.basicConfig(level=logging.WARNING)
  526. else:
  527. logging.basicConfig(level=logging.DEBUG)
  528. if args.random_seed is not None:
  529. random.seed(args.random_seed)
  530. audio_by_source, sources = get_audio_by_source(args, parser)
  531. audios = []
  532. for source_url in sources:
  533. audios += audio_by_source[source_url]
  534. logging.debug("Found %d audios", len(audios))
  535. # sort
  536. if args.sort_by == "random":
  537. random.shuffle(audios)
  538. elif args.sort_by == "date":
  539. audios.sort(key=lambda x: x.age)
  540. elif args.sort_by == "duration":
  541. audios.sort(key=lambda x: x.duration)
  542. if args.reverse:
  543. audios.reverse()
  544. # slice
  545. audios = audios[args.start :]
  546. if not args.fill:
  547. audios = audios[: args.howmany]
  548. if args.fill and audios:
  549. fill_audios = [audios.pop(0)]
  550. duration = fill_audios[0].duration
  551. for next_audio in audios:
  552. next_duration = next_audio.duration
  553. if args.fill_interleave_dir:
  554. interleaving = Audio(
  555. "file://"
  556. + random.choice(list(scan_dir_audio(args.fill_interleave_dir)))
  557. )
  558. # logging.info("%r", interleaving)
  559. next_duration += interleaving.duration
  560. if args.fill - duration > next_duration:
  561. if args.fill_interleave_dir:
  562. fill_audios.append(interleaving)
  563. fill_audios.append(next_audio)
  564. duration += next_duration
  565. audios = fill_audios
  566. if args.fill_reverse:
  567. audios.reverse()
  568. # the for loop excludes the last one
  569. # this is to support the --slotsize option
  570. if not audios:
  571. return
  572. for audio in audios[:-1]:
  573. if args.debug:
  574. debug(repr(audio))
  575. else:
  576. put(audio, args.copy)
  577. if args.slotsize is not None:
  578. duration = audio.duration
  579. if duration < args.slotsize:
  580. # TODO: prendi musica da un'altra cartella
  581. print("## musica per {} secondi".format(args.slotsize - duration))
  582. # finally, the last one
  583. if args.debug:
  584. debug(repr(audios[-1]))
  585. else:
  586. put(audios[-1], args.copy)
  587. # else: # grouping; TODO: support slotsize
  588. # for item in groups:
  589. # if args.debug:
  590. # print('#', item, groups[item].duration)
  591. # print(groups[item])
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()