Extract date and more URLs

This commit is contained in:
boyska 2016-08-23 10:06:28 +02:00
parent 1471906054
commit 19432b884b

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import sys import sys
import json import json
from datetime import datetime
from pprint import pprint from pprint import pprint
from lxml import html, etree from lxml import html, etree
@ -13,12 +14,25 @@ def get_postinfo(article):
title = subelems[0].text_content().strip() title = subelems[0].text_content().strip()
# text = etree.tostring(subelems[1]) # text = etree.tostring(subelems[1])
text = subelems[1].text_content().strip() text = subelems[1].text_content().strip()
urls = [e.get('href') for e in subelems[2].xpath('.//a')] try:
date = datetime.strptime(text.split('\n')[0].strip(),
'%b %d, %Y')
except ValueError:
date = None
else:
date = date.timestamp()
urls = [e.get('href')
for cont in (subelems[1], subelems[2])
for e in cont.xpath('.//a')
]
urls = [url for url in urls urls = [url for url in urls
if url is not None if url is not None
and url.startswith('http') and url.startswith('http')
and url.lower().endswith('.mp3')] and url.lower().endswith('.mp3')]
return dict(title=title, text=text, urls=urls)
return dict(title=title, text=text, urls=urls,
date=date)
if len(sys.argv) != 3: if len(sys.argv) != 3:
@ -35,4 +49,4 @@ for a in articles:
allinfo.append(info) allinfo.append(info)
# pprint(info) # pprint(info)
json.dump(allinfo, open(sys.argv[2], 'w')) json.dump(allinfo, open(sys.argv[2], 'w'), indent=2)