first commit, kinda working

This commit is contained in:
boyska 2016-08-23 00:55:59 +02:00
commit b780cfaabe
3 changed files with 46 additions and 0 deletions

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
news_index.php
*.json

7
Makefile Normal file
View file

@ -0,0 +1,7 @@
# Default goal: build old.json from the downloaded news index page.
# `all` names no real file, so it is declared phony; otherwise a stray file
# called `all` in this directory would make the target look up to date.
.PHONY: all
all: old.json

# Download the news index page. The rule has no prerequisites, so make only
# runs it while news_index.php does not exist yet; -c lets wget continue a
# partially downloaded file instead of starting over.
news_index.php:
	wget -q 'https://old.ondarossa.info/news_index.php' -c

# Regenerate the JSON whenever the downloaded page or the script changes.
# $< expands to the first prerequisite (news_index.php), $@ to the target.
old.json: news_index.php readold.py
	python readold.py $< $@

37
readold.py Normal file
View file

@ -0,0 +1,37 @@
import sys
import json
from pprint import pprint
from lxml import html, etree
def get_postinfo(article):
    """Extract the title, text and MP3 links from one article element.

    ``article`` must support ``xpath('tr')``; its three ``tr`` children are
    read, in order, as the title row, the text row and the links row.

    Returns ``None`` when the element does not have exactly three ``tr``
    children. Otherwise returns a dict with:

    * ``title`` -- stripped text content of the first row,
    * ``text``  -- stripped text content of the second row,
    * ``urls``  -- the ``href`` values of the ``a`` elements found in the
      third row that are absolute http(s) URLs ending in ``.mp3``. The
      original spelling of each URL is preserved.
    """
    subelems = article.xpath('tr')
    if len(subelems) != 3:
        return None
    title_row, text_row, links_row = subelems

    title = title_row.text_content().strip()
    # Only the plain text is kept; serialising the row with etree.tostring()
    # would be the way to keep the markup instead.
    text = text_row.text_content().strip()

    urls = []
    for anchor in links_row.xpath('.//a'):
        url = anchor.get('href')
        if url is None:
            # Anchor without an href attribute (e.g. a named anchor).
            continue
        # Compare on a lowercased copy: both the URL scheme and the file
        # extension are matched case-insensitively, so "HTTP://.../A.MP3"
        # is accepted just like "http://.../a.mp3".
        lowered = url.lower()
        if lowered.startswith(('http://', 'https://')) \
                and lowered.endswith('.mp3'):
            urls.append(url)
    return dict(title=title, text=text, urls=urls)
def main(argv):
    """Convert a downloaded news page into a JSON list of posts.

    ``argv`` is the full argument vector: ``argv[1]`` is the path of the HTML
    page to read and ``argv[2]`` the path of the JSON file to write.

    Returns the process exit status: ``1`` when the argument count is wrong
    (after printing a message to stderr), ``0`` on success.
    """
    if len(argv) != 3:
        print("Wrong usage", file=sys.stderr)
        return 1
    src_path, dst_path = argv[1], argv[2]

    # The page is read in binary mode and handed to lxml as bytes
    # (presumably so that the parser deals with the page encoding itself).
    with open(src_path, 'rb') as src:
        content = src.read()

    tree = html.fromstring(content)
    # Every table with these exact attributes is treated as a candidate
    # article; get_postinfo() returns None for the ones that are not.
    articles = tree.xpath('//table[@cellspacing="2"][@width="100%"]')
    allinfo = []
    for article in articles:
        info = get_postinfo(article)
        if info is not None:
            allinfo.append(info)

    # Close the output explicitly so the JSON is fully flushed to disk
    # before the process exits.
    with open(dst_path, 'w') as dst:
        json.dump(allinfo, dst)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))