feed 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. #!/usr/bin/env python3
  2. '''
  3. Feed parser with many features
  4. Given a feed, it supports filtering, subslicing and random picking.
  5. Besides feeds, it also supports picking files from directories.
  6. '''
  7. import os
  8. import logging
  9. from argparse import ArgumentParser, ArgumentTypeError
  10. from subprocess import check_output, CalledProcessError
  11. from collections import OrderedDict
  12. import re
  13. import urllib.request
  14. from urllib.parse import urlparse, unquote
  15. import posixpath
  16. import random
  17. from bisect import bisect
  18. import datetime
  19. from lxml import html
  20. import requests
  21. from pytimeparse.timeparse import timeparse
  22. def DurationType(arg):
  23. if arg.isdecimal():
  24. secs = int(arg)
  25. else:
  26. secs = timeparse(arg)
  27. if secs is None:
  28. raise ArgumentTypeError('%r is not a valid duration' % arg)
  29. return secs
  30. def TimeDeltaType(arg):
  31. if arg.isdecimal():
  32. secs = int(arg)
  33. else:
  34. secs = timeparse(arg)
  35. if secs is None:
  36. raise ArgumentTypeError('%r is not a valid time range' % arg)
  37. return datetime.timedelta(seconds=secs)
  38. def weighted_choice(values, weights):
  39. '''
  40. random.choice with weights
  41. weights must be integers greater than 0.
  42. Their meaning is "relative", that is [1,2,3] is the same as [2,4,6]
  43. '''
  44. assert len(values) == len(weights)
  45. total = 0
  46. cum_weights = []
  47. for w in weights:
  48. total += w
  49. cum_weights.append(total)
  50. x = random.random() * total
  51. i = bisect(cum_weights, x)
  52. return values[i]
  53. def delta_humanreadable(tdelta):
  54. if tdelta is None:
  55. return ''
  56. days = tdelta.days
  57. hours = (tdelta - datetime.timedelta(days=days)).seconds // 3600
  58. if days:
  59. return '{}d{}h'.format(days, hours)
  60. return '{}h'.format(hours)
  61. class Audio(object):
  62. def __init__(self, url, duration=None, date=None):
  63. self.url = url
  64. if duration is None:
  65. duration = get_duration(url.encode('utf-8'))
  66. self.duration = duration
  67. self.date = date
  68. def __str__(self):
  69. return self.url
  70. def __repr__(self):
  71. return '<Audio {} ({} {})>'.format(self.url, self.duration,
  72. delta_humanreadable(self.age))
  73. @property
  74. def urls(self):
  75. return [self.url]
  76. @property
  77. def age(self):
  78. if self.date is None:
  79. return None
  80. now = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
  81. return now - self.date
  82. class AudioGroup(list):
  83. def __init__(self, description=None):
  84. self.description = description or ''
  85. self.audios = []
  86. def __len__(self):
  87. return len(self.audios)
  88. def append(self, arg):
  89. self.audios.append(arg)
  90. def __str__(self):
  91. return '\n'.join(str(a) for a in self.audios)
  92. def __repr__(self):
  93. return '<AudioGroup "{}" ({} {})\n{} >'.\
  94. format(self.description, self.duration,
  95. delta_humanreadable(self.age),
  96. '\n'.join(' ' + repr(a) for a in self.audios))
  97. @property
  98. def duration(self):
  99. return sum(a.duration for a in self.audios if a.duration is not None)
  100. @property
  101. def urls(self):
  102. return [a.url for a in self.audios]
  103. @property
  104. def date(self):
  105. for a in self.audios:
  106. if hasattr(a, 'date'):
  107. return a.date
  108. return None
  109. @property
  110. def age(self):
  111. if self.date is None:
  112. return None
  113. now = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
  114. return now - self.date
  115. def get_tree(feed_url):
  116. if feed_url.startswith('http:') or feed_url.startswith('https:'):
  117. tree = html.fromstring(requests.get(feed_url).content)
  118. else:
  119. if not os.path.exists(feed_url):
  120. raise ValueError("file not found: {}".format(feed_url))
  121. tree = html.parse(open(feed_url))
  122. return tree
  123. def get_audio_from_description(text):
  124. # non-empty lines
  125. lines = [line.strip()
  126. for line in text.split('\n')
  127. if line.strip()]
  128. url = lines[0]
  129. duration = None
  130. if len(lines) > 1:
  131. parts = lines[1].split('=')
  132. if len(parts) > 1 and parts[1]:
  133. duration = int(re.findall(r'\d+', parts[1].strip())[0])
  134. return Audio(unquote(url), duration)
  135. # copied from larigira.fsutils
  136. def scan_dir_audio(dirname, extensions=('mp3', 'oga', 'wav', 'ogg')):
  137. for root, dirnames, filenames in os.walk(dirname):
  138. for fname in filenames:
  139. if fname.split('.')[-1].lower() in extensions:
  140. yield os.path.join(root, fname)
  141. def get_audio_from_dir(dirpath):
  142. fpaths = scan_dir_audio(dirpath)
  143. return [Audio('file://' + os.path.realpath(u),
  144. date=datetime.datetime.fromtimestamp(os.path.getmtime(u)).
  145. replace(tzinfo=datetime.timezone.utc))
  146. for u in fpaths]
  147. def get_item_date(el):
  148. el_date = el.find('pubdate')
  149. if el_date is not None:
  150. return datetime.datetime.strptime(
  151. el_date.text, '%Y-%m-%dT%H:%M:%S%z')
  152. return None
  153. def get_urls(tree):
  154. items = tree.xpath('//item')
  155. for it in items:
  156. title = it.find('title').text
  157. el_body = it.find('description')
  158. if el_body is not None:
  159. url = el_body.text
  160. try:
  161. audio = get_audio_from_description(url)
  162. except Exception as exc:
  163. logging.info('error getting duration for `%s`' % title)
  164. continue
  165. audio.date = get_item_date(it)
  166. yield audio
  167. def get_grouped_urls(tree):
  168. groups = OrderedDict()
  169. items = tree.xpath('//item')
  170. for item in items:
  171. guid = item.xpath('guid')[0].text.strip()
  172. if guid not in groups:
  173. groups[guid] = AudioGroup(guid)
  174. audio = get_audio_from_description(item.xpath('description')[0].text)
  175. audio.date = get_item_date(item)
  176. groups[guid].append(audio)
  177. return groups
  178. def get_duration(url):
  179. try:
  180. lineout = check_output(['ffprobe', '-v', 'error',
  181. '-show_entries', 'format=duration',
  182. '-i', url]).split(b'\n')
  183. except CalledProcessError as exc:
  184. raise ValueError('error probing `%s`' % url) from exc
  185. duration = next(l for l in lineout if l.startswith(b'duration='))
  186. value = duration.split(b'=')[1]
  187. return int(float(value))
# Overview text for the command line help; get_parser() hands it to
# ArgumentParser as its first positional argument.
# NOTE(review): that parameter is `prog`, so this text is printed in place
# of the program name in the usage line, which is presumably why it ends
# with "Usage: " -- confirm this is intended rather than description=HELP.
HELP = '''
Collect audio informations from multiple sources (XML feeds).
Audios are (in that order):
1. Collected from feeds; (grouped by article if --group is used)
2. Filtered; everything that does not match with requirements is excluded
3. Sorted; even randomly
4. Sliced; take HOWMANY elements, skipping START elements
5. (if --copy) Copied
Usage: '''
  197. def get_parser():
  198. p = ArgumentParser(HELP)
  199. src = p.add_argument_group('sources', 'How to deal with sources')
  200. src.add_argument('--source-weights',
  201. help='Select only one "source" based on this weights')
  202. src.add_argument('--group', default=False, action='store_true',
  203. help='Group audios that belong to the same article')
  204. filters = p.add_argument_group('filters', 'Select only items that match '
  205. 'these conditions')
  206. filters.add_argument('--min-len', default=0, type=DurationType,
  207. help='Exclude any audio that is shorter '
  208. 'than MIN_LEN seconds')
  209. filters.add_argument('--max-len', default=0, type=DurationType,
  210. help='Exclude any audio that is longer '
  211. 'than MAX_LEN seconds')
  212. filters.add_argument('--sort-by', default='no', type=str,
  213. choices=('random', 'date', 'duration'))
  214. filters.add_argument('--reverse', default=False,
  215. action='store_true', help='Reverse list order')
  216. filters.add_argument('--min-age', default=datetime.timedelta(),
  217. type=TimeDeltaType,
  218. help='Exclude audio more recent than MIN_AGE')
  219. filters.add_argument('--max-age', default=datetime.timedelta(),
  220. type=TimeDeltaType,
  221. help='Exclude audio older than MAX_AGE')
  222. p.add_argument('--start', default=0, type=int,
  223. help='0-indexed start number. '
  224. 'By default, play from most recent')
  225. p.add_argument('--howmany', default=1, type=int,
  226. help='If not specified, only 1 will be played')
  227. p.add_argument('--slotsize', type=int,
  228. help='Seconds between each audio. Still unsupported')
  229. general = p.add_argument_group('general', 'General options')
  230. general.add_argument('--copy', help='Copy files to $TMPDIR', default=False,
  231. action='store_true')
  232. general.add_argument('--debug', help='Debug messages', default=False,
  233. action='store_true')
  234. p.add_argument('urls', metavar='URL', nargs='+')
  235. return p
  236. def put(audio, copy=False):
  237. if not copy:
  238. for url in audio.urls:
  239. print(url)
  240. else:
  241. for url in audio.urls:
  242. if url.split(':')[0] in ('http', 'https'):
  243. destdir = (os.environ.get('TMPDIR', '.'))
  244. fname = posixpath.basename(urlparse(url).path)
  245. # sanitize
  246. fname = "".join(c for c in fname
  247. if c.isalnum() or c in list('._-')).rstrip()
  248. dest = os.path.join(destdir, fname)
  249. os.makedirs(destdir, exist_ok=True)
  250. fname, headers = urllib.request.urlretrieve(url, dest)
  251. print('file://%s' % os.path.realpath(fname))
  252. else:
  253. # FIXME: file:// urls are just copied
  254. print(url)
  255. def main():
  256. parser = get_parser()
  257. args = parser.parse_args()
  258. if not args.debug:
  259. logging.basicConfig(level=logging.WARNING)
  260. else:
  261. logging.basicConfig(level=logging.DEBUG)
  262. sources = args.urls
  263. if args.source_weights:
  264. weights = tuple(map(int, args.source_weights.split(':')))
  265. if len(weights) != len(sources):
  266. parser.exit(status=2, message='Weight must be in the'
  267. ' same number as sources\n')
  268. sources = [weighted_choice(sources, weights)]
  269. audios = []
  270. for url in sources:
  271. if url.startswith('http:') or url.startswith('https:') \
  272. or os.path.isfile(url):
  273. # download the feed
  274. tree = get_tree(url)
  275. # filtering
  276. if not args.group:
  277. # get audio urls, removing those that are too long
  278. audios += [audio for audio in get_urls(tree) if
  279. (args.max_len == 0 or
  280. audio.duration <= args.max_len) and
  281. (args.min_len == 0 or
  282. audio.duration >= args.min_len) and
  283. (args.min_age.total_seconds() == 0 or
  284. audio.age >= args.min_age) and
  285. (args.max_age.total_seconds() == 0 or
  286. audio.age <= args.max_age)
  287. ]
  288. else:
  289. groups = get_grouped_urls(tree)
  290. audios += [groups[g] for g in groups.keys()
  291. if
  292. (args.max_len == 0 or
  293. groups[g].duration <= args.max_len) and
  294. (args.min_len == 0 or
  295. groups[g].duration >= args.max_len) and
  296. (args.min_age.total_seconds() == 0 or
  297. groups[g].age >= args.min_age) and
  298. (args.max_age.total_seconds() == 0 or
  299. groups[g].age <= args.max_age)
  300. ]
  301. elif os.path.isdir(url):
  302. audiodir = get_audio_from_dir(url)
  303. if not args.group:
  304. audios += audiodir
  305. else:
  306. for a in audiodir:
  307. ag = AudioGroup(os.path.basename(a.url))
  308. ag.append(a)
  309. audios.append(ag)
  310. else:
  311. logging.info('unsupported url `%s`', url)
  312. # sort
  313. if args.sort_by == 'random':
  314. random.shuffle(audios)
  315. elif args.sort_by == 'date':
  316. audios.sort(key=lambda x: x.age)
  317. elif args.sort_by == 'duration':
  318. audios.sort(key=lambda x: x.duration)
  319. if args.reverse:
  320. audios.reverse()
  321. # slice
  322. audios = audios[args.start:]
  323. audios = audios[:args.howmany]
  324. # the for loop excludes the last one
  325. # this is to support the --slotsize option
  326. if not audios:
  327. return
  328. for audio in audios[:-1]:
  329. if args.debug:
  330. print(repr(audio))
  331. else:
  332. put(audio, args.copy)
  333. if args.slotsize is not None:
  334. duration = audio.duration
  335. if duration < args.slotsize:
  336. print('## musica per {} secondi'
  337. .format(args.slotsize - duration))
  338. # finally, the last one
  339. if args.debug:
  340. print(repr(audios[-1]))
  341. else:
  342. put(audios[-1], args.copy)
  343. # else: # grouping; TODO: support slotsize
  344. # for item in groups:
  345. # if args.debug:
  346. # print('#', item, groups[item].duration)
  347. # print(groups[item])
# run the command line interface only when executed as a script
if __name__ == '__main__':
    main()