From 19432b884b44e59378e6e09e712476c600b9f6ec Mon Sep 17 00:00:00 2001 From: boyska Date: Tue, 23 Aug 2016 10:06:28 +0200 Subject: [PATCH] Extract date and more URLs --- readold.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/readold.py b/readold.py index 200cc64..a41aa1d 100755 --- a/readold.py +++ b/readold.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import sys import json +from datetime import datetime from pprint import pprint from lxml import html, etree @@ -13,12 +14,25 @@ def get_postinfo(article): title = subelems[0].text_content().strip() # text = etree.tostring(subelems[1]) text = subelems[1].text_content().strip() - urls = [e.get('href') for e in subelems[2].xpath('.//a')] + try: + date = datetime.strptime(text.split('\n')[0].strip(), + '%b %d, %Y') + except ValueError: + date = None + else: + date = date.timestamp() + + urls = [e.get('href') + for cont in (subelems[1], subelems[2]) + for e in cont.xpath('.//a') + ] urls = [url for url in urls if url is not None and url.startswith('http') and url.lower().endswith('.mp3')] - return dict(title=title, text=text, urls=urls) + + return dict(title=title, text=text, urls=urls, + date=date) if len(sys.argv) != 3: @@ -35,4 +49,4 @@ for a in articles: allinfo.append(info) # pprint(info) -json.dump(allinfo, open(sys.argv[2], 'w')) +json.dump(allinfo, open(sys.argv[2], 'w'), indent=2)