From b780cfaabec99c0e511debd4645359465a0de9f2 Mon Sep 17 00:00:00 2001
From: boyska
Date: Tue, 23 Aug 2016 00:55:59 +0200
Subject: [PATCH] first commit, kinda working

---
Notes (this commentary area is ignored when the patch is applied):

  What the patch adds
    * .gitignore  - ignores the downloaded news_index.php and any *.json.
    * Makefile    - fetches news_index.php with wget and builds old.json
                    by running readold.py on it.
    * readold.py  - parses the HTML file named by its first argument with
                    lxml, selects every table with cellspacing="2" and
                    width="100%", and for each table that has exactly three
                    `tr` children records the title (text of row 1), the
                    text (text of row 2) and the links in row 3 that start
                    with "http" and end in ".mp3" (case-insensitive).
                    The resulting list is dumped as JSON to the file named
                    by the second argument.

  Review remarks on readold.py (content left as committed)
    * Both open() calls are never closed explicitly; a `with` block would
      guarantee the JSON output is flushed and closed.
    * `assert type(content) is bytes` is removed under `python -O`, so it
      should not be relied on as validation.
    * `pprint` and `etree` are only referenced from commented-out lines.

 .gitignore |  2 ++
 Makefile   |  7 +++++++
 readold.py | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 readold.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a33430f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+news_index.php
+*.json
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e0d9648
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,7 @@
+all: old.json
+
+news_index.php:
+	wget -q 'https://old.ondarossa.info/news_index.php' -c
+
+old.json: news_index.php readold.py
+	python readold.py $< $@
diff --git a/readold.py b/readold.py
new file mode 100644
index 0000000..eb65811
--- /dev/null
+++ b/readold.py
@@ -0,0 +1,37 @@
+import sys
+import json
+from pprint import pprint
+
+from lxml import html, etree
+
+
+def get_postinfo(article):
+    subelems = article.xpath('tr')
+    if len(subelems) != 3:
+        return None
+    title = subelems[0].text_content().strip()
+    # text = etree.tostring(subelems[1])
+    text = subelems[1].text_content().strip()
+    urls = [e.get('href') for e in subelems[2].xpath('.//a')]
+    urls = [url for url in urls
+            if url is not None
+            and url.startswith('http')
+            and url.lower().endswith('.mp3')]
+    return dict(title=title, text=text, urls=urls)
+
+
+if len(sys.argv) != 3:
+    print("Wrong usage", file=sys.stderr)
+    sys.exit(1)
+content = open(sys.argv[1], 'rb').read()
+assert type(content) is bytes
+tree = html.fromstring(content)
+articles = tree.xpath('//table[@cellspacing="2"][@width="100%"]')
+allinfo = []
+for a in articles:
+    info = get_postinfo(a)
+    if info is not None:
+        allinfo.append(info)
+        # pprint(info)
+
+json.dump(allinfo, open(sys.argv[2], 'w'))