From 19432b884b44e59378e6e09e712476c600b9f6ec Mon Sep 17 00:00:00 2001
From: boyska <piuttosto@logorroici.org>
Date: Tue, 23 Aug 2016 10:06:28 +0200
Subject: [PATCH] Extract date and more URLs

---
 readold.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/readold.py b/readold.py
index 200cc64..a41aa1d 100755
--- a/readold.py
+++ b/readold.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import sys
 import json
+from datetime import datetime
 from pprint import pprint
 
 from lxml import html, etree
@@ -13,12 +14,25 @@ def get_postinfo(article):
     title = subelems[0].text_content().strip()
     # text = etree.tostring(subelems[1])
     text = subelems[1].text_content().strip()
-    urls = [e.get('href') for e in subelems[2].xpath('.//a')]
+    try:
+        date = datetime.strptime(text.split('\n')[0].strip(),
+                                 '%b %d, %Y')
+    except ValueError:
+        date = None
+    else:
+        date = date.timestamp()
+
+    urls = [e.get('href')
+            for cont in (subelems[1], subelems[2])
+            for e in cont.xpath('.//a')
+            ]
     urls = [url for url in urls
             if url is not None
             and url.startswith('http')
             and url.lower().endswith('.mp3')]
-    return dict(title=title, text=text, urls=urls)
+
+    return dict(title=title, text=text, urls=urls,
+                date=date)
 
 
 if len(sys.argv) != 3:
@@ -35,4 +49,4 @@ for a in articles:
         allinfo.append(info)
         # pprint(info)
 
-json.dump(allinfo, open(sys.argv[2], 'w'))
+json.dump(allinfo, open(sys.argv[2], 'w'), indent=2)