feed 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559
  1. #!/usr/bin/env python3
  2. """
  3. Feed parser with many features
  4. from a feed, it supports filtering, subslicing, random picking
  5. Beside feeds, it supports picking files from directories
  6. """
  7. import datetime
  8. import logging
  9. import os
  10. import posixpath
  11. import random
  12. import re
  13. import urllib.request
  14. from argparse import ArgumentParser, ArgumentTypeError
  15. from bisect import bisect
  16. from collections import OrderedDict
  17. from subprocess import CalledProcessError, check_output
  18. from urllib.parse import unquote, urlparse
  19. import requests
  20. from lxml import html
  21. from pytimeparse.timeparse import timeparse
  22. def get_int(s):
  23. return int(re.findall(r"\d+", s)[0])
  24. def DurationType(arg):
  25. if arg.isdecimal():
  26. secs = int(arg)
  27. else:
  28. secs = timeparse(arg)
  29. if secs is None:
  30. raise ArgumentTypeError("%r is not a valid duration" % arg)
  31. return secs
  32. def TimeDeltaType(arg):
  33. if arg.isdecimal():
  34. secs = int(arg)
  35. else:
  36. secs = timeparse(arg)
  37. if secs is None:
  38. raise ArgumentTypeError("%r is not a valid time range" % arg)
  39. return datetime.timedelta(seconds=secs)
  40. def weighted_choice(values, weights):
  41. """
  42. random.choice with weights
  43. weights must be integers greater than 0.
  44. Their meaning is "relative", that is [1,2,3] is the same as [2,4,6]
  45. """
  46. assert len(values) == len(weights)
  47. total = 0
  48. cum_weights = []
  49. for w in weights:
  50. total += w
  51. cum_weights.append(total)
  52. x = random.random() * total
  53. i = bisect(cum_weights, x)
  54. return values[i]
  55. def delta_humanreadable(tdelta):
  56. if tdelta is None:
  57. return ""
  58. days = tdelta.days
  59. hours = (tdelta - datetime.timedelta(days=days)).seconds // 3600
  60. if days:
  61. return "{}d{}h".format(days, hours)
  62. return "{}h".format(hours)
  63. def duration_humanreadable(seconds):
  64. hours = seconds // 3600
  65. minutes = (seconds - hours * 3600) // 60
  66. seconds = seconds % 60
  67. if hours > 0:
  68. return "{}h{}m{}s".format(hours, minutes, seconds)
  69. return "{}m{}s".format(minutes, seconds)
  70. class Audio(object):
  71. def __init__(self, url, duration=None, date=None):
  72. self.url = url
  73. if duration is None:
  74. duration = get_duration(url.encode("utf-8"))
  75. self.duration = duration
  76. self.date = date
  77. self.end_date = datetime.datetime(9999, 12, 31, tzinfo=datetime.timezone.utc)
  78. def __str__(self):
  79. return self.url
  80. def __repr__(self):
  81. return "<Audio {} ({} {})>".format(
  82. self.url,
  83. duration_humanreadable(self.duration),
  84. delta_humanreadable(self.age),
  85. )
  86. @property
  87. def urls(self):
  88. return [self.url]
  89. @property
  90. def age(self):
  91. if self.date is None:
  92. return None
  93. now = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
  94. return now - self.date
  95. @property
  96. def valid(self):
  97. return self.end_date >= datetime.datetime.utcnow().replace(
  98. tzinfo=datetime.timezone.utc
  99. )
  100. class AudioGroup(list):
  101. def __init__(self, description=None):
  102. self.description = description or ""
  103. self.audios = []
  104. def __len__(self):
  105. return len(self.audios)
  106. def append(self, arg):
  107. self.audios.append(arg)
  108. def __str__(self):
  109. return "\n".join(str(a) for a in self.audios)
  110. def __repr__(self):
  111. return '<AudioGroup "{}" ({} {})\n{} >'.format(
  112. self.description,
  113. duration_humanreadable(self.duration),
  114. delta_humanreadable(self.age),
  115. "\n".join(" " + repr(a) for a in self.audios),
  116. )
  117. @property
  118. def duration(self):
  119. return sum(a.duration for a in self.audios if a.duration is not None)
  120. @property
  121. def urls(self):
  122. return [a.url for a in self.audios]
  123. @property
  124. def date(self):
  125. for a in self.audios:
  126. if hasattr(a, "date"):
  127. return a.date
  128. return None
  129. @property
  130. def age(self):
  131. if self.date is None:
  132. return None
  133. now = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
  134. return now - self.date
  135. @property
  136. def valid(self):
  137. return len(self.audios) > 0
  138. def get_tree(feed_url):
  139. if feed_url.startswith("http:") or feed_url.startswith("https:"):
  140. tree = html.fromstring(requests.get(feed_url).content)
  141. else:
  142. if not os.path.exists(feed_url):
  143. raise ValueError("file not found: {}".format(feed_url))
  144. tree = html.parse(open(feed_url))
  145. return tree
  146. def get_audio_from_description(text):
  147. # non-empty lines
  148. lines = [line.strip() for line in text.split("\n") if line.strip()]
  149. url = lines[0]
  150. duration = None
  151. metadata = {}
  152. for line in text.split("\n")[1:]:
  153. if line.strip() and "=" in line:
  154. metadata[line.split("=")[0]] = line.split("=")[1]
  155. if "durata" in metadata:
  156. metadata["durata"] = get_int(metadata["durata"])
  157. if "txdate" in metadata:
  158. try:
  159. metadata["txdate"] = datetime.datetime.strptime(
  160. metadata["txdate"], "%Y-%m-%dT%H:%M:%S%z"
  161. )
  162. except ValueError:
  163. logging.warning("could not parse txdate %s", metadata["txdate"])
  164. del metadata["txdate"]
  165. a = Audio(
  166. unquote(url),
  167. duration=metadata.get("durata", None),
  168. date=metadata.get("txdate", None),
  169. )
  170. if "txdate" in metadata and "replica" in metadata:
  171. if metadata["replica"].endswith("g"):
  172. a.end_date = metadata["txdate"] + datetime.timedelta(
  173. days=get_int(metadata["replica"])
  174. )
  175. return a
  176. # copied from larigira.fsutils
  177. def scan_dir_audio(dirname, extensions=("mp3", "oga", "wav", "ogg")):
  178. for root, dirnames, filenames in os.walk(dirname):
  179. for fname in filenames:
  180. if fname.split(".")[-1].lower() in extensions:
  181. yield os.path.join(root, fname)
  182. def get_audio_from_dir(dirpath):
  183. fpaths = scan_dir_audio(dirpath)
  184. return [
  185. Audio(
  186. "file://" + os.path.realpath(u),
  187. date=datetime.datetime.fromtimestamp(os.path.getmtime(u)).replace(
  188. tzinfo=datetime.timezone.utc
  189. ),
  190. )
  191. for u in fpaths
  192. ]
  193. def get_item_date(el):
  194. el_date = el.find("pubdate")
  195. if el_date is not None:
  196. return datetime.datetime.strptime(el_date.text, "%Y-%m-%dT%H:%M:%S%z")
  197. return None
  198. def get_urls(tree):
  199. items = tree.xpath("//item")
  200. for it in items:
  201. title = it.find("title").text
  202. el_body = it.find("description")
  203. if el_body is not None:
  204. url = el_body.text
  205. try:
  206. audio = get_audio_from_description(url)
  207. except Exception as exc:
  208. logging.info("error getting duration for `%s`" % title)
  209. continue
  210. if audio.date is None:
  211. audio.date = get_item_date(it)
  212. yield audio
  213. def get_grouped_urls(tree):
  214. groups = OrderedDict()
  215. items = tree.xpath("//item")
  216. for item in items:
  217. guid = item.xpath("guid")[0].text.strip()
  218. if guid not in groups:
  219. groups[guid] = AudioGroup(guid)
  220. audio = get_audio_from_description(item.xpath("description")[0].text)
  221. audio.date = get_item_date(item)
  222. if audio.valid:
  223. groups[guid].append(audio)
  224. return groups
  225. def get_duration(url):
  226. try:
  227. lineout = check_output(
  228. ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-i", url]
  229. ).split(b"\n")
  230. except CalledProcessError as exc:
  231. raise ValueError("error probing `%s`" % url) from exc
  232. duration = next(l for l in lineout if l.startswith(b"duration="))
  233. value = duration.split(b"=")[1]
  234. return int(float(value))
# Overview of the processing pipeline, shown to the user by argparse.
# get_parser() passes this text as the first positional argument of
# ArgumentParser.
# NOTE(review): that positional parameter is presumably `prog`, which would
# explain the trailing "Usage: " -- confirm this is intentional.
HELP = """
Collect audio informations from multiple sources (XML feeds).
Audios are (in that order):
1. Collected from feeds; (grouped by article if --group is used)
2. Filtered; everything that does not match with requirements is excluded
3. Sorted; even randomly
4. Sliced; take HOWMANY elements, skipping START elements
5. (if --copy) Copied
Usage: """
  244. def get_parser():
  245. p = ArgumentParser(HELP)
  246. src = p.add_argument_group("sources", "How to deal with sources")
  247. src.add_argument(
  248. "--source-weights", help='Select only one "source" based on this weights'
  249. )
  250. src.add_argument(
  251. "--group",
  252. default=False,
  253. action="store_true",
  254. help="Group audios that belong to the same article",
  255. )
  256. filters = p.add_argument_group(
  257. "filters", "Select only items that match " "these conditions"
  258. )
  259. filters.add_argument(
  260. "--min-len",
  261. default=0,
  262. type=DurationType,
  263. help="Exclude any audio that is shorter " "than MIN_LEN seconds",
  264. )
  265. filters.add_argument(
  266. "--max-len",
  267. default=0,
  268. type=DurationType,
  269. help="Exclude any audio that is longer " "than MAX_LEN seconds",
  270. )
  271. filters.add_argument(
  272. "--sort-by", default="no", type=str, choices=("random", "date", "duration")
  273. )
  274. filters.add_argument(
  275. "--reverse", default=False, action="store_true", help="Reverse list order"
  276. )
  277. filters.add_argument(
  278. "--min-age",
  279. default=datetime.timedelta(),
  280. type=TimeDeltaType,
  281. help="Exclude audio more recent than MIN_AGE",
  282. )
  283. filters.add_argument(
  284. "--max-age",
  285. default=datetime.timedelta(),
  286. type=TimeDeltaType,
  287. help="Exclude audio older than MAX_AGE",
  288. )
  289. fill = p.add_argument_group(
  290. "fill", "Fill a 'block' with as many contents as possible"
  291. )
  292. fill.add_argument(
  293. "--fill",
  294. default=0,
  295. type=DurationType,
  296. help="Fill a block of duration LEN",
  297. metavar="LEN",
  298. )
  299. fill.add_argument(
  300. "--fill-reverse",
  301. default=False,
  302. action="store_true",
  303. help="Reverse list order after the fill algorithm",
  304. )
  305. fill.add_argument(
  306. "--fill-interleave-dir",
  307. default=None,
  308. type=str, # FIXME: does it even work?
  309. help="Between each item, put a random file from DIR",
  310. )
  311. p.add_argument(
  312. "--start",
  313. default=0,
  314. type=int,
  315. help="0-indexed start number. " "By default, play from most recent",
  316. )
  317. p.add_argument(
  318. "--howmany", default=1, type=int, help="If not specified, only 1 will be played"
  319. )
  320. p.add_argument(
  321. "--slotsize", type=int, help="Seconds between each audio. Still unsupported"
  322. )
  323. general = p.add_argument_group("general", "General options")
  324. general.add_argument(
  325. "--copy", help="Copy files to $TMPDIR", default=False, action="store_true"
  326. )
  327. general.add_argument(
  328. "--debug", help="Debug messages", default=False, action="store_true"
  329. )
  330. p.add_argument("urls", metavar="URL", nargs="+")
  331. return p
  332. def put(audio, copy=False):
  333. if not copy:
  334. for url in audio.urls:
  335. print(url)
  336. else:
  337. for url in audio.urls:
  338. if url.split(":")[0] in ("http", "https"):
  339. destdir = os.environ.get("TMPDIR", ".")
  340. fname = posixpath.basename(urlparse(url).path)
  341. # sanitize
  342. fname = "".join(
  343. c for c in fname if c.isalnum() or c in list("._-")
  344. ).rstrip()
  345. dest = os.path.join(destdir, fname)
  346. os.makedirs(destdir, exist_ok=True)
  347. fname, headers = urllib.request.urlretrieve(url, dest)
  348. print("file://%s" % os.path.realpath(fname))
  349. else:
  350. # FIXME: file:// urls are just copied
  351. print(url)
  352. def retrieve(url, args):
  353. """
  354. returns a list of Audios or a list of AudioGroups
  355. """
  356. if not args.group:
  357. if os.path.isdir(url):
  358. audiodir = get_audio_from_dir(url)
  359. return audiodir
  360. elif url.startswith("http:") or url.startswith("https:") or os.path.isfile(url):
  361. return get_urls(get_tree(url))
  362. else:
  363. logging.info("unsupported url `%s`", url)
  364. return []
  365. else: # group
  366. if os.path.isdir(url):
  367. audiodir = get_audio_from_dir(url)
  368. agroups = []
  369. for a in audiodir:
  370. ag = AudioGroup(os.path.basename(a.url))
  371. ag.append(a)
  372. agroups.append(ag)
  373. return agroups
  374. elif url.startswith("http:") or url.startswith("https:") or os.path.isfile(url):
  375. groups = get_grouped_urls(get_tree(url))
  376. return groups.values()
  377. else:
  378. logging.info("unsupported url `%s`", url)
  379. return []
  380. def audio_passes_filters(audio, args):
  381. if not audio.valid:
  382. return False
  383. if args.max_len and audio.duration > args.max_len:
  384. return False
  385. if args.fill and audio.duration > args.fill:
  386. return False
  387. if args.min_len and audio.duration < args.min_len:
  388. return False
  389. if args.min_age.total_seconds() and audio.age < args.min_age:
  390. return False
  391. if args.max_age.total_seconds() and audio.age > args.max_age:
  392. return False
  393. return True
def main():
    """Command-line entry point.

    Parses the arguments, collects the audios from every source, then
    filters, sorts, slices and (with --fill) packs them before printing
    them. In --debug mode the repr of each audio is printed; otherwise
    ``put`` is called.
    """
    parser = get_parser()
    args = parser.parse_args()
    if not args.debug:
        logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.DEBUG)
    sources = args.urls
    if args.source_weights:
        # Weights are given as colon-separated integers, one per source;
        # a single source is then drawn at random according to them.
        weights = tuple(map(int, args.source_weights.split(":")))
        if len(weights) != len(sources):
            parser.exit(
                status=2, message="Weight must be in the" " same number as sources\n"
            )
        sources = [weighted_choice(sources, weights)]
    audios = []
    for url in sources:
        url_audios = retrieve(url, args)
        audios += [au for au in url_audios if audio_passes_filters(au, args)]
    # sort
    if args.sort_by == "random":
        random.shuffle(audios)
    elif args.sort_by == "date":
        # Ascending age: most recent first.
        audios.sort(key=lambda x: x.age)
    elif args.sort_by == "duration":
        audios.sort(key=lambda x: x.duration)
    if args.reverse:
        audios.reverse()
    # slice
    audios = audios[args.start :]
    if not args.fill:
        # --howmany only applies when not filling a block.
        audios = audios[: args.howmany]
    if args.fill and audios:
        # The first audio is always taken; the others are added, in order,
        # as long as they fit in what is left of the block.
        fill_audios = [audios.pop(0)]
        duration = fill_audios[0].duration
        for next_audio in audios:
            next_duration = next_audio.duration
            if args.fill_interleave_dir:
                # A random entry of the directory is picked for every
                # candidate, whether or not the candidate ends up fitting.
                interleaving = Audio(
                    # TODO: factorize "pick file"
                    "file://"
                    + os.path.join(
                        args.fill_interleave_dir,
                        random.choice(os.listdir(args.fill_interleave_dir)),
                    )
                )
                # logging.info("%r", interleaving)
                next_duration += interleaving.duration
            # Strict comparison: a candidate that would fill the block
            # exactly is not taken.
            if args.fill - duration > next_duration:
                if args.fill_interleave_dir:
                    fill_audios.append(interleaving)
                fill_audios.append(next_audio)
                duration += next_duration
        audios = fill_audios
        # NOTE(review): the nesting of this check was reconstructed from a
        # paste that lost its indentation -- confirm that --fill-reverse is
        # meant to apply only when --fill is active.
        if args.fill_reverse:
            audios.reverse()
    # the for loop excludes the last one
    # this is to support the --slotsize option
    if not audios:
        return
    for audio in audios[:-1]:
        if args.debug:
            print(repr(audio))
        else:
            put(audio, args.copy)
        if args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
                # Emit a marker line (Italian: "music for N seconds") with
                # the time left in the slot after this audio.
                print("## musica per {} secondi".format(args.slotsize - duration))
    # finally, the last one
    if args.debug:
        print(repr(audios[-1]))
    else:
        put(audios[-1], args.copy)
    # else: # grouping; TODO: support slotsize
    # for item in groups:
    # if args.debug:
    # print('#', item, groups[item].duration)
    # print(groups[item])
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()