feed 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
#!/usr/bin/env python3
# pip install lxml requests
import os
import re
import urllib.request
from argparse import ArgumentParser
from collections import OrderedDict
from subprocess import check_output

import requests
from lxml import html
  11. class Audio(object):
  12. def __init__(self, url, durata=None):
  13. self.url = url
  14. if durata is None:
  15. durata = get_duration(url.encode('utf-8'))
  16. self.durata = durata
  17. def __str__(self):
  18. return self.url
  19. def __repr__(self):
  20. return '<Audio {} ({})>'.format(self.url, self.durata)
  21. class AudioGroup(list):
  22. def __init__(self, description=None):
  23. self.description = description or ''
  24. self.audios = []
  25. def __len__(self):
  26. return len(self.audios)
  27. def append(self, arg):
  28. self.audios.append(arg)
  29. def __str__(self):
  30. return '\n'.join(str(a) for a in self.audios)
  31. def __repr__(self):
  32. return '<AudioGroup "{}" ({})\n{}>'.format(self.description,
  33. self.durata,
  34. '\n'.join(repr(a) for a in self.audios))
  35. @property
  36. def durata(self):
  37. return sum(a.durata for a in self.audios if a.durata is not None)
  38. def get_tree(feed_url):
  39. if feed_url.startswith('http:') or feed_url.startswith('https:'):
  40. tree = html.fromstring(requests.get(feed_url).content)
  41. else:
  42. if not os.path.exists(feed_url):
  43. raise ValueError("file not found: {}".format(feed_url))
  44. tree = html.parse(open(feed_url))
  45. return tree
  46. def get_audio_from_description(text):
  47. # non-empty lines
  48. lines = [line.strip()
  49. for line in text.split('\n')
  50. if line.strip()]
  51. url = lines[0]
  52. durata = None
  53. if len(lines) > 1:
  54. durata = int(re.findall(r'\d+', lines[1].split('=')[1].strip())[0])
  55. return Audio(url, durata)
  56. def get_urls(tree):
  57. urls = tree.xpath('//item/description')
  58. for url_elem in urls:
  59. yield get_audio_from_description(url_elem.text)
  60. def get_grouped_urls(tree):
  61. groups = OrderedDict()
  62. items = tree.xpath('//item')
  63. for item in items:
  64. guid = item.xpath('guid')[0].text.strip()
  65. if guid not in groups:
  66. groups[guid] = AudioGroup(guid)
  67. groups[guid].append(get_audio_from_description(
  68. item.xpath('description')[0].text))
  69. return groups
  70. def get_duration(url):
  71. lineout = check_output(['ffprobe', '-v', 'error',
  72. '-show_entries', 'format=duration',
  73. '-i', url]).split(b'\n')
  74. duration = next(l for l in lineout if l.startswith(b'duration='))
  75. value = duration.split(b'=')[1]
  76. return int(float(value))
  77. def get_parser():
  78. p = ArgumentParser('Get music from a (well-specified) xml feed')
  79. p.add_argument('--start', default=0, type=int,
  80. help='0-indexed start number. '
  81. 'By default, play from most recent')
  82. p.add_argument('--max-len', default=0, type=int,
  83. help='Exclude any audio that is longer than MAXLEN seconds')
  84. p.add_argument('--howmany', default=1, type=int,
  85. help='If not specified, only 1 will be played')
  86. p.add_argument('--slotsize', help='Seconds between each audio', type=int)
  87. p.add_argument('--group', help='Group articles', default=False,
  88. action='store_true')
  89. p.add_argument('--copy', help='Copy files to $TMPDIR', default=False,
  90. action='store_true')
  91. p.add_argument('--debug', help='Debug messages', default=False,
  92. action='store_true')
  93. p.add_argument('url')
  94. return p
  95. def put(audio, copy=False):
  96. if not copy:
  97. print(audio.url)
  98. else:
  99. destdir = (os.environ.get('TMPDIR', '.'))
  100. dest = os.path.join(destdir, audio.url.split('/')[-1])
  101. os.makedirs(destdir, exist_ok=True)
  102. fname, headers = urllib.request.urlretrieve(audio.url, dest)
  103. print(fname)
  104. def main():
  105. args = get_parser().parse_args()
  106. # download the feed
  107. tree = get_tree(args.url)
  108. if not args.group:
  109. # get audio urls, removing those that are too long
  110. audios = [audio for audio in get_urls(tree)
  111. if args.max_len == 0 or
  112. audio.durata <= args.max_len]
  113. audios = audios[args.start:args.start+args.howmany]
  114. else:
  115. groups = get_grouped_urls(tree)
  116. audios = [groups[g] for g in groups.keys()
  117. if args.max_len == 0 or
  118. groups[g].durata <= args.max_len
  119. ][args.start:args.start+args.howmany]
  120. # the for loop excludes the last one
  121. # this is to support the --slotsize option
  122. if not audios:
  123. return
  124. for audio in audios[:-1]:
  125. if args.debug:
  126. print(repr(audio))
  127. else:
  128. put(audio, args.copy)
  129. if args.slotsize is not None:
  130. duration = audio.durata
  131. if duration < args.slotsize:
  132. print('## musica per {} secondi'
  133. .format(args.slotsize - duration))
  134. # finally, the last one
  135. if args.debug:
  136. print(repr(audios[-1]))
  137. else:
  138. put(audios[-1], args.copy)
  139. # else: # grouping; TODO: support slotsize
  140. # for item in groups:
  141. # if args.debug:
  142. # print('#', item, groups[item].durata)
  143. # print(groups[item])
  144. if __name__ == '__main__':
  145. main()