Extract date and more URLs

This commit is contained in:
boyska 2016-08-23 10:06:28 +02:00
parent 1471906054
commit 19432b884b

View file

@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import sys
import json
from datetime import datetime
from pprint import pprint
from lxml import html, etree
@@ -13,12 +14,25 @@ def get_postinfo(article):
title = subelems[0].text_content().strip()
# text = etree.tostring(subelems[1])
text = subelems[1].text_content().strip()
urls = [e.get('href') for e in subelems[2].xpath('.//a')]
try:
date = datetime.strptime(text.split('\n')[0].strip(),
'%b %d, %Y')
except ValueError:
date = None
else:
date = date.timestamp()
urls = [e.get('href')
for cont in (subelems[1], subelems[2])
for e in cont.xpath('.//a')
]
urls = [url for url in urls
if url is not None
and url.startswith('http')
and url.lower().endswith('.mp3')]
return dict(title=title, text=text, urls=urls)
return dict(title=title, text=text, urls=urls,
date=date)
if len(sys.argv) != 3:
@@ -35,4 +49,4 @@ for a in articles:
allinfo.append(info)
# pprint(info)
json.dump(allinfo, open(sys.argv[2], 'w'))
json.dump(allinfo, open(sys.argv[2], 'w'), indent=2)