Extract date and more URLs

2016-08-23 10:06:28 +02:00 · 2016-08-23 10:06:28 +02:00 · 19432b884b
commit 19432b884b
parent 1471906054
1 changed file with 17 additions and 3 deletions
--- a/readold.py
+++ b/readold.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import sys
 import json
 from datetime import datetime
 from pprint import pprint
 from lxml import html, etree
@@ -13,12 +14,25 @@ def get_postinfo(article):
    title = subelems[0].text_content().strip()
    # text = etree.tostring(subelems[1])
    text = subelems[1].text_content().strip()
-    urls = [e.get('href') for e in subelems[2].xpath('.//a')]
+    try:
        date = datetime.strptime(text.split('\n')[0].strip(),
                                 '%b %d, %Y')
    except ValueError:
        date = None
    else:
        date = date.timestamp()
    urls = [e.get('href')
            for cont in (subelems[1], subelems[2])
            for e in cont.xpath('.//a')
            ]
    urls = [url for url in urls
            if url is not None
            and url.startswith('http')
            and url.lower().endswith('.mp3')]
-    return dict(title=title, text=text, urls=urls)
+
    return dict(title=title, text=text, urls=urls,
                date=date)
 if len(sys.argv) != 3:
@@ -35,4 +49,4 @@ for a in articles:
        allinfo.append(info)
        # pprint(info)
-json.dump(allinfo, open(sys.argv[2], 'w'))
+json.dump(allinfo, open(sys.argv[2], 'w'), indent=2)