audiogen_podcast.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. import datetime
  2. import logging
  3. import os
  4. import random
  5. import sys
  6. from subprocess import CalledProcessError, check_output
  7. import dateutil.parser
  8. import requests
  9. from lxml import html
  10. from pytimeparse.timeparse import timeparse
  11. from larigira.fsutils import download_http
  12. def delta_humanreadable(tdelta):
  13. if tdelta is None:
  14. return ""
  15. days = tdelta.days
  16. hours = (tdelta - datetime.timedelta(days=days)).seconds // 3600
  17. if days:
  18. return "{}d{}h".format(days, hours)
  19. return "{}h".format(hours)
  20. def get_duration(url):
  21. try:
  22. lineout = check_output(
  23. [
  24. "ffprobe",
  25. "-v",
  26. "error",
  27. "-show_entries",
  28. "format=duration",
  29. "-i",
  30. url,
  31. ]
  32. ).split(b"\n")
  33. except CalledProcessError as exc:
  34. raise ValueError("error probing `%s`" % url) from exc
  35. duration = next(l for l in lineout if l.startswith(b"duration="))
  36. value = duration.split(b"=")[1]
  37. return int(float(value))
  38. class Audio(object):
  39. def __init__(self, url, duration=None, date=None):
  40. self.url = url
  41. self._duration = duration
  42. self.date = date
  43. self.end_date = datetime.datetime(
  44. 9999, 12, 31, tzinfo=datetime.timezone.utc
  45. )
  46. def __str__(self):
  47. return self.url
  48. def __repr__(self):
  49. return "<Audio {} ({} {})>".format(
  50. self.url, self._duration, delta_humanreadable(self.age)
  51. )
  52. @property
  53. def duration(self):
  54. """lazy-calculation"""
  55. if self._duration is None:
  56. try:
  57. self._duration = get_duration(self.url.encode("utf-8"))
  58. except:
  59. logging.exception(
  60. "Error while computing duration of %s; set it to 0",
  61. self.url,
  62. )
  63. self._duration = 0
  64. return self._duration
  65. @property
  66. def urls(self):
  67. return [self.url]
  68. @property
  69. def age(self):
  70. if self.date is None:
  71. return None
  72. now = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
  73. return now - self.date
  74. @property
  75. def valid(self):
  76. return self.end_date >= datetime.datetime.utcnow().replace(
  77. tzinfo=datetime.timezone.utc
  78. )
  79. def get_tree(feed_url):
  80. if feed_url.startswith("http:") or feed_url.startswith("https:"):
  81. tree = html.fromstring(requests.get(feed_url).content)
  82. else:
  83. if not os.path.exists(feed_url):
  84. raise ValueError("file not found: {}".format(feed_url))
  85. tree = html.parse(open(feed_url))
  86. return tree
  87. def get_item_date(el):
  88. el_date = el.find("pubdate")
  89. if el_date is None:
  90. return None
  91. for time_format in ("%Y-%m-%dT%H:%M:%S%z", "%a, %d %b %Y %H:%M:%S %z"):
  92. try:
  93. return datetime.datetime.strptime(el_date.text, time_format)
  94. except:
  95. continue
  96. return dateutil.parser.parse(el_date.text)
  97. def get_audio_from_item(item):
  98. encl = item.find("enclosure")
  99. if encl is not None:
  100. url = encl.get("url")
  101. else:
  102. return None
  103. audio_args = {}
  104. if item.find("duration") is not None:
  105. duration_parts = item.findtext("duration").split(":")
  106. total_seconds = 0
  107. for i, num in enumerate(reversed(duration_parts)):
  108. total_seconds += int(float(num)) * (60 ** i)
  109. if total_seconds:
  110. audio_args["duration"] = total_seconds
  111. else:
  112. contents = item.xpath("group/content")
  113. if not contents:
  114. contents = item.xpath("content")
  115. for child in contents:
  116. if child.get("url") == url and child.get("duration") is not None:
  117. audio_args["duration"] = int(float(child.get("duration")))
  118. break
  119. return Audio(url, **audio_args)
  120. def get_urls(tree):
  121. items = tree.xpath("//item")
  122. for i, it in enumerate(items):
  123. try:
  124. audio = get_audio_from_item(it)
  125. except Exception:
  126. logging.error("Could not parse item #%d, skipping", i)
  127. continue
  128. if audio is None:
  129. continue
  130. if audio.date is None:
  131. try:
  132. audio.date = get_item_date(it)
  133. except Exception:
  134. logging.warn("Could not find date for item #%d", i)
  135. yield audio
  136. def parse_duration(arg):
  137. if arg.isdecimal():
  138. secs = int(arg)
  139. else:
  140. secs = timeparse(arg)
  141. if secs is None:
  142. raise ValueError("%r is not a valid duration" % arg)
  143. return secs
  144. def generate(spec):
  145. if "url" not in spec:
  146. raise ValueError("Malformed audiospec: missing 'url'")
  147. audios = list(get_urls(get_tree(spec["url"])))
  148. if spec.get("min_len", False):
  149. audios = [
  150. a for a in audios if a.duration >= parse_duration(spec["min_len"])
  151. ]
  152. if spec.get("max_len", False):
  153. audios = [
  154. a for a in audios if a.duration <= parse_duration(spec["max_len"])
  155. ]
  156. # sort
  157. sort_by = spec.get("sort_by", "none")
  158. if sort_by == "random":
  159. random.shuffle(audios)
  160. elif sort_by == "date":
  161. audios.sort(key=lambda x: x.age)
  162. elif sort_by == "duration":
  163. audios.sort(key=lambda x: x.duration)
  164. if spec.get("reverse", False):
  165. audios.reverse()
  166. # slice
  167. audios = audios[int(spec.get("start", 0)) :]
  168. audios = audios[: int(spec.get("howmany", 1))]
  169. # copy local
  170. local_audios = [
  171. download_http(a.url, copy=spec.get("copy", True), prefix="podcast")
  172. for a in audios
  173. ]
  174. return local_audios
  175. # TODO: testing
  176. # TODO: lxml should maybe be optional?
  177. # TODO: ui
  178. if __name__ == "__main__":
  179. # less than proper testing
  180. logging.basicConfig(level=logging.DEBUG)
  181. for u in get_urls(get_tree(sys.argv[1])):
  182. print(" -", repr(u))