ror-old-migrate/readold.py

#!/usr/bin/env python3
import sys
import json
from datetime import datetime
from pprint import pprint

from lxml import html, etree


def get_postinfo(article):
    """Extract title, text, date and MP3 links from one article element.

    ``article`` must expose ``xpath``; its direct ``tr`` children are
    treated as (title row, body row, trailing row).

    Returns ``None`` unless there are exactly three such rows.  Otherwise
    returns a dict with the keys:

    * ``title`` -- stripped text content of the first row.
    * ``text``  -- stripped text content of the second row.
    * ``urls``  -- ``href`` values of every ``a`` below the second and
      third rows that start with ``http`` and end (case-insensitively)
      with ``.mp3``.
    * ``date``  -- POSIX timestamp parsed from the first line of ``text``
      using the ``'%b %d, %Y'`` format, or ``None`` when that line does
      not match.  The parsed datetime is naive, so ``timestamp()``
      interprets it in the local time zone.
    """
    rows = article.xpath('tr')
    if len(rows) != 3:
        return None
    title_row, body_row, last_row = rows

    title = title_row.text_content().strip()
    text = body_row.text_content().strip()

    # Only the first line of the body is tried as the post date.
    first_line = text.split('\n')[0].strip()
    try:
        parsed = datetime.strptime(first_line, '%b %d, %Y')
    except ValueError:
        date = None
    else:
        date = parsed.timestamp()

    # Keep absolute links to MP3 files; anchors without href are skipped.
    urls = []
    for row in (body_row, last_row):
        for anchor in row.xpath('.//a'):
            href = anchor.get('href')
            if href is None:
                continue
            if href.startswith('http') and href.lower().endswith('.mp3'):
                urls.append(href)

    return {'title': title, 'text': text, 'urls': urls, 'date': date}


# Script body.  Expects exactly two arguments: the HTML file to read
# (argv[1]) and the JSON file to write (argv[2]).
if len(sys.argv) != 3:
    print("Wrong usage", file=sys.stderr)
    sys.exit(1)

# Read the page as raw bytes and hand them to lxml undecoded.  The context
# manager closes the input handle instead of leaving it to the garbage
# collector.
with open(sys.argv[1], 'rb') as source:
    content = source.read()

tree = html.fromstring(content)
# Each post is selected as a full-width table with cellspacing="2".
articles = tree.xpath('//table[@cellspacing="2"][@width="100%"]')

# get_postinfo returns None for tables that are not posts; drop those.
allinfo = [info
           for info in (get_postinfo(a) for a in articles)
           if info is not None]

# Closing the output file explicitly guarantees the JSON is flushed to
# disk before the script ends.
with open(sys.argv[2], 'w') as dest:
    json.dump(allinfo, dest, indent=2)