#!/usr/bin/env python3
'''
Feed parser with many features.

From a feed, it supports filtering, subslicing and random picking.
Besides feeds, it supports picking files from directories.
'''
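
# Example invocations (illustrative only: the feed URL and directory below are
# placeholders, and the script is assumed to be saved as `feed` and executable;
# the options are defined in get_parser() further down):
#   ./feed --max-len 600 --howmany 2 --slotsize 900 https://example.org/podcast.xml
#   ./feed --random --howmany 3 /path/to/audio/dir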
import os
import logging
from argparse import ArgumentParser
from subprocess import check_output
from collections import OrderedDict
import re
import urllib.request
from urllib.parse import urlparse, unquote
import posixpath
import random

from lxml import html
import requests


class Audio(object):
    def __init__(self, url, durata=None):
        # durata is the duration in seconds; if not given, probe it with ffprobe
        self.url = url
        if durata is None:
            durata = get_duration(url.encode('utf-8'))
        self.durata = durata

    def __str__(self):
        return self.url

    def __repr__(self):
        return '<Audio {} ({})>'.format(self.url, self.durata)

    @property
    def urls(self):
        return [self.url]


class AudioGroup(list):
    def __init__(self, description=None):
        self.description = description or ''
        self.audios = []

    def __len__(self):
        return len(self.audios)

    def append(self, arg):
        self.audios.append(arg)

    def __str__(self):
        return '\n'.join(str(a) for a in self.audios)

    def __repr__(self):
        return '<AudioGroup "{}" ({})\n{} >'.format(
            self.description, self.durata,
            '\n'.join(' ' + repr(a) for a in self.audios))

    @property
    def durata(self):
        return sum(a.durata for a in self.audios if a.durata is not None)

    @property
    def urls(self):
        return [a.url for a in self.audios]
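
# Note: AudioGroup deliberately mirrors Audio's interface (durata, urls), so
# that put() and main() below can treat single audios and groups uniformly.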


def get_tree(feed_url):
    if feed_url.startswith('http:') or feed_url.startswith('https:'):
        tree = html.fromstring(requests.get(feed_url).content)
    else:
        if not os.path.exists(feed_url):
            raise ValueError("file not found: {}".format(feed_url))
        tree = html.parse(open(feed_url))
    return tree


def get_audio_from_description(text):
    # non-empty lines
    lines = [line.strip()
             for line in text.split('\n')
             if line.strip()]
    url = lines[0]
    durata = None
    if len(lines) > 1:
        durata = int(re.findall(r'\d+', lines[1].split('=')[1].strip())[0])
    return Audio(unquote(url), durata)
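
# A feed item's <description> is expected to look roughly like the following
# (an assumption inferred from the parsing above: the audio URL comes first,
# and an optional second line carries the duration in seconds after a '='):
#
#   https://example.org/audio/episode42.ogg
#   durata=1843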


# copied from larigira.fsutils
def scan_dir_audio(dirname, extensions=('mp3', 'oga', 'wav', 'ogg')):
    for root, dirnames, filenames in os.walk(dirname):
        for fname in filenames:
            if fname.split('.')[-1].lower() in extensions:
                yield os.path.join(root, fname)


def get_audio_from_dir(dirpath):
    fpaths = scan_dir_audio(dirpath)
    return [Audio('file://' + os.path.realpath(u)) for u in fpaths]


def get_urls(tree):
    urls = tree.xpath('//item/description')
    for url_elem in urls:
        yield get_audio_from_description(url_elem.text)


def get_grouped_urls(tree):
    # group items by their <guid>, preserving feed order
    groups = OrderedDict()
    items = tree.xpath('//item')
    for item in items:
        guid = item.xpath('guid')[0].text.strip()
        if guid not in groups:
            groups[guid] = AudioGroup(guid)
        groups[guid].append(get_audio_from_description(
            item.xpath('description')[0].text))
    return groups


def get_duration(url):
    lineout = check_output(['ffprobe', '-v', 'error',
                            '-show_entries', 'format=duration',
                            '-i', url]).split(b'\n')
    duration = next(l for l in lineout if l.startswith(b'duration='))
    value = duration.split(b'=')[1]
    return int(float(value))
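
# For reference, `ffprobe -v error -show_entries format=duration -i FILE`
# normally prints something like (exact framing can vary between versions):
#   [FORMAT]
#   duration=1843.123456
#   [/FORMAT]
# get_duration() picks the duration= line and truncates it to whole seconds.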


def get_parser():
    p = ArgumentParser(description='Get music from a (well-specified) xml feed')
    p.add_argument('--start', default=0, type=int,
                   help='0-indexed start number. '
                   'By default, play from the most recent')
    p.add_argument('--max-len', default=0, type=int,
                   help='Exclude any audio longer than MAX_LEN seconds')
    p.add_argument('--random', default=False,
                   action='store_true', help='Pick randomly')
    p.add_argument('--howmany', default=1, type=int,
                   help='How many audios to pick; defaults to 1')
    p.add_argument('--slotsize', help='Seconds between each audio', type=int)
    p.add_argument('--group', help='Group articles', default=False,
                   action='store_true')
    p.add_argument('--copy', help='Copy remote files to $TMPDIR', default=False,
                   action='store_true')
    p.add_argument('--debug', help='Debug messages', default=False,
                   action='store_true')
    p.add_argument('urls', metavar='URL', nargs='+')
    return p


def put(audio, copy=False):
    if not copy:
        for url in audio.urls:
            print(url)
    else:
        for url in audio.urls:
            if url.split(':')[0] in ('http', 'https'):
                destdir = os.environ.get('TMPDIR', '.')
                fname = posixpath.basename(urlparse(url).path)
                # sanitize the file name
                fname = "".join(c for c in fname
                                if c.isalnum() or c in list('._-')).rstrip()
                dest = os.path.join(destdir, fname)
                os.makedirs(destdir, exist_ok=True)
                fname, headers = urllib.request.urlretrieve(url, dest)
                print('file://%s' % os.path.realpath(fname))
            else:
                # FIXME: file:// (and other non-http) urls are not copied,
                # they are just printed unchanged
                print(url)
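
# Sketch of put()'s behaviour (paths are illustrative):
#   without --copy: the URL is printed unchanged
#   with --copy:    http(s) URLs are downloaded into $TMPDIR and printed as
#                   local file:// URLs, e.g. file:///tmp/episode42.ogg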


def main():
    args = get_parser().parse_args()
    if not args.debug:
        logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.DEBUG)
    audios = []
    for url in args.urls:
        if url.startswith('http:') or url.startswith('https:') \
                or os.path.isfile(url):
            # download and parse the feed
            tree = get_tree(url)
            if not args.group:
                # get audio urls, removing those that are too long
                audios += [audio for audio in get_urls(tree)
                           if args.max_len == 0 or
                           audio.durata <= args.max_len]
            else:
                groups = get_grouped_urls(tree)
                audios += [groups[g] for g in groups.keys()
                           if args.max_len == 0 or
                           groups[g].durata <= args.max_len]
        elif os.path.isdir(url):
            audiodir = get_audio_from_dir(url)
            if not args.group:
                audios += audiodir
            else:
                # one group per file
                for a in audiodir:
                    ag = AudioGroup(os.path.basename(a.url))
                    ag.append(a)
                    audios.append(ag)
        else:
            logging.info('unsupported url `%s`', url)
    audios = audios[args.start:]
    if args.random:
        random.shuffle(audios)
    audios = audios[:args.howmany]
    if not audios:
        return
    # the for loop excludes the last audio:
    # this is needed to support the --slotsize option
    for audio in audios[:-1]:
        if args.debug:
            print(repr(audio))
        else:
            put(audio, args.copy)
        if args.slotsize is not None:
            duration = audio.durata
            if duration < args.slotsize:
                # "musica per N secondi" means "music for N seconds"
                print('## musica per {} secondi'
                      .format(args.slotsize - duration))
    # finally, the last one
    if args.debug:
        print(repr(audios[-1]))
    else:
        put(audios[-1], args.copy)
    # else:  # grouping; TODO: support slotsize
    #     for item in groups:
    #         if args.debug:
    #             print('#', item, groups[item].durata)
    #         print(groups[item])


if __name__ == '__main__':
    main()