#!/usr/bin/env python3
"""Extract post metadata (title, body text, date, .mp3 links) from a saved
HTML page and write it to a JSON file.

Usage: script.py INPUT_HTML OUTPUT_JSON
"""
import sys
import json
from datetime import datetime

from lxml import html


def get_postinfo(article):
    """Parse one post <table> element into a metadata dict.

    article -- an lxml HTML element expected to contain exactly three <tr>
               rows: a title row, a body-text row, and a links row.

    Returns None when the table does not have exactly three rows; otherwise
    a dict with keys:
      title -- stripped text of the first row
      text  -- stripped text of the second row
      date  -- POSIX timestamp parsed from the first line of `text` in
               '%b %d, %Y' format, or None when that line is not a date
      urls  -- http(s) hrefs ending in '.mp3' found in the last two rows
    """
    rows = article.xpath('tr')
    if len(rows) != 3:
        return None
    title = rows[0].text_content().strip()
    text = rows[1].text_content().strip()
    try:
        parsed = datetime.strptime(text.split('\n')[0].strip(), '%b %d, %Y')
    except ValueError:
        # First line of the body is not a date; leave the field unset.
        date = None
    else:
        # NOTE(review): naive-datetime .timestamp() interprets the date in
        # the local timezone — confirm that is the intended behavior.
        date = parsed.timestamp()
    # Gather candidate links from the body and link rows, then keep only
    # absolute http(s) URLs that point at .mp3 files.
    hrefs = [a.get('href')
             for row in (rows[1], rows[2])
             for a in row.xpath('.//a')]
    urls = [url for url in hrefs
            if url is not None
            and url.startswith('http')
            and url.lower().endswith('.mp3')]
    return dict(title=title, text=text, urls=urls, date=date)


def main():
    """CLI entry point: parse INPUT_HTML, dump post info to OUTPUT_JSON."""
    if len(sys.argv) != 3:
        print("Wrong usage", file=sys.stderr)
        sys.exit(1)
    # Read raw bytes so lxml can detect the document's declared encoding.
    with open(sys.argv[1], 'rb') as infile:
        content = infile.read()
    tree = html.fromstring(content)
    # Posts are marked by this exact table styling on the source page.
    articles = tree.xpath('//table[@cellspacing="2"][@width="100%"]')
    allinfo = [info for info in map(get_postinfo, articles)
               if info is not None]
    with open(sys.argv[2], 'w') as outfile:
        json.dump(allinfo, outfile, indent=2)


if __name__ == '__main__':
    main()