feed 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. #!/usr/bin/env python3
'''
Feed parser with many features.

Given a feed, it supports filtering, subslicing and random picking.
Besides feeds, it supports picking files from directories.
'''
  7. import os
  8. import logging
  9. from argparse import ArgumentParser
  10. from subprocess import check_output
  11. from collections import OrderedDict
  12. import re
  13. import urllib.request
  14. from urllib.parse import urlparse, unquote
  15. import posixpath
  16. import random
  17. from bisect import bisect
  18. from lxml import html
  19. import requests
  20. def weighted_choice(values, weights):
  21. '''
  22. random.choice with weights
  23. weights must be integers greater than 0.
  24. Their meaning is "relative", that is [1,2,3] is the same as [2,4,6]
  25. '''
  26. assert len(values) == len(weights)
  27. total = 0
  28. cum_weights = []
  29. for w in weights:
  30. total += w
  31. cum_weights.append(total)
  32. x = random.random() * total
  33. i = bisect(cum_weights, x)
  34. return values[i]
  35. class Audio(object):
  36. def __init__(self, url, duration=None):
  37. self.url = url
  38. if duration is None:
  39. duration = get_duration(url.encode('utf-8'))
  40. self.duration = duration
  41. def __str__(self):
  42. return self.url
  43. def __repr__(self):
  44. return '<Audio {} ({})>'.format(self.url, self.duration)
  45. @property
  46. def urls(self):
  47. return [self.url]
  48. class AudioGroup(list):
  49. def __init__(self, description=None):
  50. self.description = description or ''
  51. self.audios = []
  52. def __len__(self):
  53. return len(self.audios)
  54. def append(self, arg):
  55. self.audios.append(arg)
  56. def __str__(self):
  57. return '\n'.join(str(a) for a in self.audios)
  58. def __repr__(self):
  59. return '<AudioGroup "{}" ({})\n{} >'.\
  60. format(self.description, self.duration,
  61. '\n'.join(' ' + repr(a) for a in self.audios))
  62. @property
  63. def duration(self):
  64. return sum(a.duration for a in self.audios if a.duration is not None)
  65. @property
  66. def urls(self):
  67. return [a.url for a in self.audios]
  68. def get_tree(feed_url):
  69. if feed_url.startswith('http:') or feed_url.startswith('https:'):
  70. tree = html.fromstring(requests.get(feed_url).content)
  71. else:
  72. if not os.path.exists(feed_url):
  73. raise ValueError("file not found: {}".format(feed_url))
  74. tree = html.parse(open(feed_url))
  75. return tree
  76. def get_audio_from_description(text):
  77. # non-empty lines
  78. lines = [line.strip()
  79. for line in text.split('\n')
  80. if line.strip()]
  81. url = lines[0]
  82. duration = None
  83. if len(lines) > 1:
  84. duration = int(re.findall(r'\d+', lines[1].split('=')[1].strip())[0])
  85. return Audio(unquote(url), duration)
  86. # copied from larigira.fsutils
  87. def scan_dir_audio(dirname, extensions=('mp3', 'oga', 'wav', 'ogg')):
  88. for root, dirnames, filenames in os.walk(dirname):
  89. for fname in filenames:
  90. if fname.split('.')[-1].lower() in extensions:
  91. yield os.path.join(root, fname)
  92. def get_audio_from_dir(dirpath):
  93. fpaths = scan_dir_audio(dirpath)
  94. return [Audio('file://' + os.path.realpath(u)) for u in fpaths]
  95. def get_urls(tree):
  96. urls = tree.xpath('//item/description')
  97. for url_elem in urls:
  98. yield get_audio_from_description(url_elem.text)
  99. def get_grouped_urls(tree):
  100. groups = OrderedDict()
  101. items = tree.xpath('//item')
  102. for item in items:
  103. guid = item.xpath('guid')[0].text.strip()
  104. if guid not in groups:
  105. groups[guid] = AudioGroup(guid)
  106. groups[guid].append(get_audio_from_description(
  107. item.xpath('description')[0].text))
  108. return groups
  109. def get_duration(url):
  110. lineout = check_output(['ffprobe', '-v', 'error',
  111. '-show_entries', 'format=duration',
  112. '-i', url]).split(b'\n')
  113. duration = next(l for l in lineout if l.startswith(b'duration='))
  114. value = duration.split(b'=')[1]
  115. return int(float(value))
# Text for the command-line parser (see get_parser); the numbered steps
# document the processing pipeline in the order it is applied.
HELP = '''
Collect audio informations from multiple sources (XML feeds).
Audios are (in that order):
1. Collected from feeds; (grouped by article if --group is used)
2. Filtered; everything that does not match with requirements is excluded
3. Sorted; even randomly
4. Sliced; take HOWMANY elements, skipping START elements
5. (if --copy) Copied
Usage: '''
  125. def get_parser():
  126. p = ArgumentParser(HELP)
  127. src = p.add_argument_group('sources', 'How to deal with sources')
  128. src.add_argument('--source-weights',
  129. help='Select only one "source" based on this weights')
  130. src.add_argument('--group', default=False, action='store_true',
  131. help='Group audios that belong to the same article')
  132. filters = p.add_argument_group('filters', 'Select only items that match '
  133. 'these conditions')
  134. filters.add_argument('--max-len', default=0, type=int,
  135. help='Exclude any audio that is longer '
  136. 'than MAX_LEN seconds')
  137. filters.add_argument('--random', default=False,
  138. action='store_true', help='Pick randomly')
  139. filters.add_argument('--min-len', default=0, type=int,
  140. help='Exclude any audio that is shorter '
  141. 'than MIN_LEN seconds')
  142. p.add_argument('--start', default=0, type=int,
  143. help='0-indexed start number. '
  144. 'By default, play from most recent')
  145. p.add_argument('--howmany', default=1, type=int,
  146. help='If not specified, only 1 will be played')
  147. p.add_argument('--slotsize', type=int,
  148. help='Seconds between each audio. Still unsupported')
  149. general = p.add_argument_group('general', 'General options')
  150. general.add_argument('--copy', help='Copy files to $TMPDIR', default=False,
  151. action='store_true')
  152. general.add_argument('--debug', help='Debug messages', default=False,
  153. action='store_true')
  154. p.add_argument('urls', metavar='URL', nargs='+')
  155. return p
  156. def put(audio, copy=False):
  157. if not copy:
  158. for url in audio.urls:
  159. print(url)
  160. else:
  161. for url in audio.urls:
  162. if url.split(':')[0] in ('http', 'https'):
  163. destdir = (os.environ.get('TMPDIR', '.'))
  164. fname = posixpath.basename(urlparse(url).path)
  165. # sanitize
  166. fname = "".join(c for c in fname
  167. if c.isalnum() or c in list('._-')).rstrip()
  168. dest = os.path.join(destdir, fname)
  169. os.makedirs(destdir, exist_ok=True)
  170. fname, headers = urllib.request.urlretrieve(url, dest)
  171. print('file://%s' % os.path.realpath(fname))
  172. else:
  173. # FIXME: file:// urls are just copied
  174. print(url)
def main():
    '''Command-line entry point.

    Collects Audio/AudioGroup items from every source (feed URL, local
    feed file or directory), filters them by duration, slices/shuffles
    the result and prints it (downloading first, with --copy).
    '''
    parser = get_parser()
    args = parser.parse_args()
    # --debug also switches the output below from plain URLs to repr()
    if not args.debug:
        logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.DEBUG)
    sources = args.urls
    if args.source_weights:
        # one integer weight per source, colon-separated
        weights = tuple(map(int, args.source_weights.split(':')))
        if len(weights) != len(sources):
            parser.exit(status=2, message='Weight must be in the'
                        ' same number as sources\n')
        # keep a single source, picked at random with the given weights
        sources = [weighted_choice(sources, weights)]
    audios = []
    for url in sources:
        if url.startswith('http:') or url.startswith('https:') \
                or os.path.isfile(url):
            # download the feed
            tree = get_tree(url)
            if not args.group:
                # get audio urls, removing those that are too long/short
                audios += [audio for audio in get_urls(tree) if
                           (args.max_len == 0 or
                            audio.duration <= args.max_len) and
                           (args.min_len == 0 or
                            audio.duration >= args.min_len)
                           ]
            else:
                # NOTE(review): grouped mode only honours --max-len;
                # --min-len is not applied to groups — confirm intended
                groups = get_grouped_urls(tree)
                audios += [groups[g] for g in groups.keys()
                           if args.max_len == 0 or
                           groups[g].duration <= args.max_len
                           ]
        elif os.path.isdir(url):
            # local directory: every audio file found becomes a candidate
            audiodir = get_audio_from_dir(url)
            if not args.group:
                audios += audiodir
            else:
                # one single-audio group per file
                for a in audiodir:
                    ag = AudioGroup(os.path.basename(a.url))
                    ag.append(a)
                    audios.append(ag)
        else:
            logging.info('unsupported url `%s`', url)
    # slice: skip START items, then keep at most HOWMANY
    audios = audios[args.start:]
    if args.random:
        random.shuffle(audios)
    audios = audios[:args.howmany]
    # the for loop excludes the last one
    # this is to support the --slotsize option
    if not audios:
        return
    for audio in audios[:-1]:
        if args.debug:
            print(repr(audio))
        else:
            put(audio, args.copy)
        if args.slotsize is not None:
            duration = audio.duration
            if duration < args.slotsize:
                # filler hint, in Italian: "music for N seconds"
                print('## musica per {} secondi'
                      .format(args.slotsize - duration))
    # finally, the last one
    if args.debug:
        print(repr(audios[-1]))
    else:
        put(audios[-1], args.copy)
    # else:  # grouping; TODO: support slotsize
    #     for item in groups:
    #         if args.debug:
    #             print('#', item, groups[item].duration)
    #         print(groups[item])
# script entry point; keeps the module importable without side effects
if __name__ == '__main__':
    main()