first commit, kinda working
This commit is contained in:
commit
b780cfaabe
3 changed files with 46 additions and 0 deletions
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
news_index.php
|
||||
*.json
|
7
Makefile
Normal file
7
Makefile
Normal file
|
@ -0,0 +1,7 @@
|
|||
# Default goal: build the JSON dump of the old site's news index.
# Declared phony so a stray file named "all" can never mask the goal.
.PHONY: all
all: old.json

# Download the news index page; -c lets wget continue a partial download.
news_index.php:
	wget -q 'https://old.ondarossa.info/news_index.php' -c

# Convert the downloaded page to JSON; rebuilt when the page or the
# converter script changes. $< is the page, $@ is the JSON output.
old.json: news_index.php readold.py
	python readold.py $< $@
|
37
readold.py
Normal file
37
readold.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
import sys
|
||||
import json
|
||||
from pprint import pprint
|
||||
|
||||
from lxml import html, etree
|
||||
|
||||
|
||||
def get_postinfo(article):
    """Extract the title, body text and MP3 links from one article element.

    The element's direct ``tr`` children must number exactly three; they are
    read, in order, as the title row, the text row and the links row. Any
    other row count means the element is not treated as a post.

    Returns:
        A dict with keys ``title`` and ``text`` (whitespace-stripped text
        content of the first two rows) and ``urls`` (hrefs found in the third
        row that start with ``http`` and end, case-insensitively, with
        ``.mp3``), or ``None`` when the row count is not three.
    """
    rows = article.xpath('tr')
    if len(rows) != 3:
        return None
    title_row, text_row, links_row = rows

    post = {
        'title': title_row.text_content().strip(),
        # Only the plain text of the row is kept, not its markup.
        'text': text_row.text_content().strip(),
        'urls': [],
    }
    for anchor in links_row.xpath('.//a'):
        href = anchor.get('href')
        if href is None:
            # Anchor without an href attribute: nothing to collect.
            continue
        if not href.startswith('http'):
            continue
        if href.lower().endswith('.mp3'):
            post['urls'].append(href)
    return post
|
||||
|
||||
|
||||
# Script body: parse the HTML file named by argv[1] and write the posts found
# in it, as a JSON list, to the path named by argv[2].
if len(sys.argv) != 3:
    print("Wrong usage", file=sys.stderr)
    sys.exit(1)

# Read the page as raw bytes and hand them to lxml undecoded. The context
# manager closes the input handle as soon as the read is done.
with open(sys.argv[1], 'rb') as src:
    content = src.read()

tree = html.fromstring(content)
# Candidate article containers are the full-width tables with cellspacing=2.
articles = tree.xpath('//table[@cellspacing="2"][@width="100%"]')

allinfo = []
for a in articles:
    info = get_postinfo(a)
    # get_postinfo returns None for tables that are not posts; skip those.
    if info is not None:
        allinfo.append(info)

# Close (and therefore flush) the output explicitly instead of relying on
# garbage collection of an anonymous file object.
with open(sys.argv[2], 'w') as dst:
    json.dump(allinfo, dst)
|
Loading…
Reference in a new issue