Extract date and more URLs
This commit is contained in:
parent
1471906054
commit
19432b884b
1 changed file with 17 additions and 3 deletions
20
readold.py
20
readold.py
|
@ -1,6 +1,7 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
|
from datetime import datetime
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
|
||||||
from lxml import html, etree
|
from lxml import html, etree
|
||||||
|
@ -13,12 +14,25 @@ def get_postinfo(article):
|
||||||
title = subelems[0].text_content().strip()
|
title = subelems[0].text_content().strip()
|
||||||
# text = etree.tostring(subelems[1])
|
# text = etree.tostring(subelems[1])
|
||||||
text = subelems[1].text_content().strip()
|
text = subelems[1].text_content().strip()
|
||||||
urls = [e.get('href') for e in subelems[2].xpath('.//a')]
|
try:
|
||||||
|
date = datetime.strptime(text.split('\n')[0].strip(),
|
||||||
|
'%b %d, %Y')
|
||||||
|
except ValueError:
|
||||||
|
date = None
|
||||||
|
else:
|
||||||
|
date = date.timestamp()
|
||||||
|
|
||||||
|
urls = [e.get('href')
|
||||||
|
for cont in (subelems[1], subelems[2])
|
||||||
|
for e in cont.xpath('.//a')
|
||||||
|
]
|
||||||
urls = [url for url in urls
|
urls = [url for url in urls
|
||||||
if url is not None
|
if url is not None
|
||||||
and url.startswith('http')
|
and url.startswith('http')
|
||||||
and url.lower().endswith('.mp3')]
|
and url.lower().endswith('.mp3')]
|
||||||
return dict(title=title, text=text, urls=urls)
|
|
||||||
|
return dict(title=title, text=text, urls=urls,
|
||||||
|
date=date)
|
||||||
|
|
||||||
|
|
||||||
if len(sys.argv) != 3:
|
if len(sys.argv) != 3:
|
||||||
|
@ -35,4 +49,4 @@ for a in articles:
|
||||||
allinfo.append(info)
|
allinfo.append(info)
|
||||||
# pprint(info)
|
# pprint(info)
|
||||||
|
|
||||||
json.dump(allinfo, open(sys.argv[2], 'w'))
|
json.dump(allinfo, open(sys.argv[2], 'w'), indent=2)
|
||||||
|
|
Loading…
Reference in a new issue