12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
- #!/usr/bin/env python3
- import sys
- import json
- from datetime import datetime
- from pprint import pprint
- from lxml import html, etree
def get_postinfo(article):
    """Extract post metadata from one article table element.

    *article* is an lxml element expected to contain exactly three ``tr``
    rows: title, body text (whose first line holds a ``%b %d, %Y`` date),
    and a footer; anchors in the last two rows may link to MP3 files.

    Returns a dict with keys ``title``, ``text``, ``urls`` (absolute
    ``.mp3`` links) and ``date`` (POSIX timestamp, or None when the date
    line does not parse), or None when the row count is not three.
    """
    rows = article.xpath('tr')
    if len(rows) != 3:
        return None

    title = rows[0].text_content().strip()
    text = rows[1].text_content().strip()

    # The first line of the body is expected to carry the post date.
    first_line = text.split('\n')[0].strip()
    try:
        parsed = datetime.strptime(first_line, '%b %d, %Y')
    except ValueError:
        stamp = None
    else:
        stamp = parsed.timestamp()

    # Collect MP3 links from anchors in the body and footer rows.
    links = []
    for container in (rows[1], rows[2]):
        for anchor in container.xpath('.//a'):
            href = anchor.get('href')
            if (href is not None
                    and href.startswith('http')
                    and href.lower().endswith('.mp3')):
                links.append(href)

    return dict(title=title, text=text, urls=links, date=stamp)
# --- Script entry: parse an HTML dump and write extracted posts as JSON. ---
# Usage: script.py INPUT_HTML OUTPUT_JSON
if len(sys.argv) != 3:
    print("Wrong usage", file=sys.stderr)
    sys.exit(1)

# Read raw bytes so lxml can detect the document's declared encoding
# itself; 'rb' mode already guarantees a bytes object, so no assert needed.
with open(sys.argv[1], 'rb') as infile:
    content = infile.read()
tree = html.fromstring(content)

# Posts are laid out as tables carrying this exact attribute pair.
articles = tree.xpath('//table[@cellspacing="2"][@width="100%"]')

allinfo = []
for article in articles:
    info = get_postinfo(article)
    if info is not None:
        allinfo.append(info)

# Context manager ensures the output file is flushed and closed.
with open(sys.argv[2], 'w') as outfile:
    json.dump(allinfo, outfile, indent=2)
|