feed 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. #!/usr/bin/env python3
  2. # pip install lxml requests
  3. import os
  4. from argparse import ArgumentParser
  5. from subprocess import check_output
  6. from collections import OrderedDict
  7. import re
  8. import urllib.request
  9. from urllib.parse import urlparse, unquote
  10. import posixpath
  11. from lxml import html
  12. import requests
  13. class Audio(object):
  14. def __init__(self, url, durata=None):
  15. self.url = url
  16. if durata is None:
  17. durata = get_duration(url.encode('utf-8'))
  18. self.durata = durata
  19. def __str__(self):
  20. return self.url
  21. def __repr__(self):
  22. return '<Audio {} ({})>'.format(self.url, self.durata)
  23. class AudioGroup(list):
  24. def __init__(self, description=None):
  25. self.description = description or ''
  26. self.audios = []
  27. def __len__(self):
  28. return len(self.audios)
  29. def append(self, arg):
  30. self.audios.append(arg)
  31. def __str__(self):
  32. return '\n'.join(str(a) for a in self.audios)
  33. def __repr__(self):
  34. return '<AudioGroup "{}" ({})\n{}>'.format(self.description,
  35. self.durata,
  36. '\n'.join(repr(a) for a in self.audios))
  37. @property
  38. def durata(self):
  39. return sum(a.durata for a in self.audios if a.durata is not None)
  40. def get_tree(feed_url):
  41. if feed_url.startswith('http:') or feed_url.startswith('https:'):
  42. tree = html.fromstring(requests.get(feed_url).content)
  43. else:
  44. if not os.path.exists(feed_url):
  45. raise ValueError("file not found: {}".format(feed_url))
  46. tree = html.parse(open(feed_url))
  47. return tree
  48. def get_audio_from_description(text):
  49. # non-empty lines
  50. lines = [line.strip()
  51. for line in text.split('\n')
  52. if line.strip()]
  53. url = lines[0]
  54. durata = None
  55. if len(lines) > 1:
  56. durata = int(re.findall(r'\d+', lines[1].split('=')[1].strip())[0])
  57. return Audio(unquote(url), durata)
  58. def get_urls(tree):
  59. urls = tree.xpath('//item/description')
  60. for url_elem in urls:
  61. yield get_audio_from_description(url_elem.text)
  62. def get_grouped_urls(tree):
  63. groups = OrderedDict()
  64. items = tree.xpath('//item')
  65. for item in items:
  66. guid = item.xpath('guid')[0].text.strip()
  67. if guid not in groups:
  68. groups[guid] = AudioGroup(guid)
  69. groups[guid].append(get_audio_from_description(
  70. item.xpath('description')[0].text))
  71. return groups
  72. def get_duration(url):
  73. lineout = check_output(['ffprobe', '-v', 'error',
  74. '-show_entries', 'format=duration',
  75. '-i', url]).split(b'\n')
  76. duration = next(l for l in lineout if l.startswith(b'duration='))
  77. value = duration.split(b'=')[1]
  78. return int(float(value))
  79. def get_parser():
  80. p = ArgumentParser('Get music from a (well-specified) xml feed')
  81. p.add_argument('--start', default=0, type=int,
  82. help='0-indexed start number. '
  83. 'By default, play from most recent')
  84. p.add_argument('--max-len', default=0, type=int,
  85. help='Exclude any audio that is longer than MAXLEN seconds')
  86. p.add_argument('--howmany', default=1, type=int,
  87. help='If not specified, only 1 will be played')
  88. p.add_argument('--slotsize', help='Seconds between each audio', type=int)
  89. p.add_argument('--group', help='Group articles', default=False,
  90. action='store_true')
  91. p.add_argument('--copy', help='Copy files to $TMPDIR', default=False,
  92. action='store_true')
  93. p.add_argument('--debug', help='Debug messages', default=False,
  94. action='store_true')
  95. p.add_argument('url')
  96. return p
  97. def put(audio, copy=False):
  98. if not copy:
  99. print(audio.url)
  100. else:
  101. destdir = (os.environ.get('TMPDIR', '.'))
  102. fname = posixpath.basename(urlparse(audio.url).path)
  103. # sanitize
  104. fname = "".join(c for c in fname
  105. if c.isalnum() or c in list('._-')).rstrip()
  106. dest = os.path.join(destdir, fname)
  107. os.makedirs(destdir, exist_ok=True)
  108. fname, headers = urllib.request.urlretrieve(audio.url, dest)
  109. print(fname)
  110. def main():
  111. args = get_parser().parse_args()
  112. # download the feed
  113. tree = get_tree(args.url)
  114. if not args.group:
  115. # get audio urls, removing those that are too long
  116. audios = [audio for audio in get_urls(tree)
  117. if args.max_len == 0 or
  118. audio.durata <= args.max_len]
  119. audios = audios[args.start:args.start+args.howmany]
  120. else:
  121. groups = get_grouped_urls(tree)
  122. audios = [groups[g] for g in groups.keys()
  123. if args.max_len == 0 or
  124. groups[g].durata <= args.max_len
  125. ][args.start:args.start+args.howmany]
  126. # the for loop excludes the last one
  127. # this is to support the --slotsize option
  128. if not audios:
  129. return
  130. for audio in audios[:-1]:
  131. if args.debug:
  132. print(repr(audio))
  133. else:
  134. put(audio, args.copy)
  135. if args.slotsize is not None:
  136. duration = audio.durata
  137. if duration < args.slotsize:
  138. print('## musica per {} secondi'
  139. .format(args.slotsize - duration))
  140. # finally, the last one
  141. if args.debug:
  142. print(repr(audios[-1]))
  143. else:
  144. put(audios[-1], args.copy)
  145. # else: # grouping; TODO: support slotsize
  146. # for item in groups:
  147. # if args.debug:
  148. # print('#', item, groups[item].durata)
  149. # print(groups[item])
  150. if __name__ == '__main__':
  151. main()