readold.py

#!/usr/bin/env python3
import sys
import json
from datetime import datetime
from pprint import pprint
from lxml import html, etree
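

# Each post on the old page is a <table> with exactly three <tr> rows:
# a title row, a body row whose first line is a "%b %d, %Y" date, and a
# links row (assumed layout, inferred from the parsing below).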
def get_postinfo(article):
    """Extract title, text, date and .mp3 links from one post table;
    return None if the table does not have the expected three rows."""
    subelems = article.xpath('tr')
    if len(subelems) != 3:
        return None
    title = subelems[0].text_content().strip()
    # text = etree.tostring(subelems[1])
    text = subelems[1].text_content().strip()
    try:
        # The first line of the body is the post date, e.g. "Jan 01, 2010".
        date = datetime.strptime(text.split('\n')[0].strip(),
                                 '%b %d, %Y')
    except ValueError:
        date = None
    else:
        date = date.timestamp()
    # Collect every link in the body and links rows...
    urls = [e.get('href')
            for cont in (subelems[1], subelems[2])
            for e in cont.xpath('.//a')]
    # ...and keep only absolute URLs that point at .mp3 files.
    urls = [url for url in urls
            if url is not None
            and url.startswith('http')
            and url.lower().endswith('.mp3')]
    return dict(title=title, text=text, urls=urls, date=date)
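

# Script entry point: argv[1] is the saved HTML page to read,
# argv[2] is the path of the JSON file to write.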
if len(sys.argv) != 3:
    print("Wrong usage", file=sys.stderr)
    sys.exit(1)

content = open(sys.argv[1], 'rb').read()
assert type(content) is bytes
tree = html.fromstring(content)
# Each post sits in its own table carrying these exact attributes.
articles = tree.xpath('//table[@cellspacing="2"][@width="100%"]')
allinfo = []
for a in articles:
    info = get_postinfo(a)
    if info is not None:
        allinfo.append(info)
        # pprint(info)

json.dump(allinfo, open(sys.argv[2], 'w'), indent=2)
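
# The output is a JSON list of objects shaped like
#   {"title": ..., "text": ..., "urls": ["http://...mp3", ...],
#    "date": <Unix timestamp, or null when the date line did not parse>}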