readold.py

import sys
import json
from pprint import pprint
from lxml import html, etree


def get_postinfo(article):
    # A post is a <table> with exactly three rows: title, body text, links.
    subelems = article.xpath('tr')
    if len(subelems) != 3:
        return None
    title = subelems[0].text_content().strip()
    # text = etree.tostring(subelems[1])
    text = subelems[1].text_content().strip()
    # Keep only absolute links that point to .mp3 files.
    urls = [e.get('href') for e in subelems[2].xpath('.//a')]
    urls = [url for url in urls
            if url is not None
            and url.startswith('http')
            and url.lower().endswith('.mp3')]
    return dict(title=title, text=text, urls=urls)


if len(sys.argv) != 3:
    print("Wrong usage", file=sys.stderr)
    sys.exit(1)

# Read the saved HTML page as bytes and let lxml handle the encoding.
content = open(sys.argv[1], 'rb').read()
assert type(content) is bytes
tree = html.fromstring(content)

# Each post lives in its own <table cellspacing="2" width="100%"> element.
articles = tree.xpath('//table[@cellspacing="2"][@width="100%"]')

allinfo = []
for a in articles:
    info = get_postinfo(a)
    if info is not None:
        allinfo.append(info)
        # pprint(info)

json.dump(allinfo, open(sys.argv[2], 'w'))
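
# Usage sketch (the filenames below are hypothetical examples, not part of the
# original script): the first argument is the saved HTML page to parse, the
# second is the JSON file to write.
#
#   python readold.py old_blog_page.html posts.json
#
# The output is a JSON list of objects, one per post, each with the keys
# "title", "text" and "urls" (the list of .mp3 links found in that post).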