|
@@ -0,0 +1,37 @@
|
|
|
+import sys
|
|
|
+import json
|
|
|
+from pprint import pprint
|
|
|
+
|
|
|
+from lxml import html, etree
|
|
|
+
|
|
|
+
|
|
|
def get_postinfo(article):
    """Extract title, body text and MP3 links from one post element.

    Args:
        article: An element exposing ``xpath``; presumably an lxml HTML
            ``<table>`` element for a single post (confirm against the
            caller). It must have exactly three ``tr`` children: title row,
            text row and links row, in that order.

    Returns:
        A dict with keys ``title`` (str), ``text`` (str) and ``urls``
        (list of str), or ``None`` when the element does not have exactly
        three ``tr`` children.
    """
    rows = article.xpath('tr')
    if len(rows) != 3:
        # Not shaped like a post; let the caller skip it.
        return None
    title_row, text_row, links_row = rows

    title = title_row.text_content().strip()
    text = text_row.text_content().strip()

    urls = []
    for anchor in links_row.xpath('.//a'):
        href = anchor.get('href')
        if href is None:
            # <a> without an href attribute (e.g. a named anchor).
            continue
        # HTML permits whitespace around the attribute value, and both the
        # URL scheme and the file extension are matched case-insensitively.
        href = href.strip()
        lowered = href.lower()
        if lowered.startswith('http') and lowered.endswith('.mp3'):
            urls.append(href)

    return dict(title=title, text=text, urls=urls)
|
|
|
+
|
|
|
+
|
|
|
# Script entry: parse the HTML file named by argv[1], collect the info of
# every post table in it, and write the result as JSON to argv[2].
if len(sys.argv) != 3:
    print("Wrong usage", file=sys.stderr)
    sys.exit(1)

# Read raw bytes (binary mode always yields ``bytes``) and hand them to the
# parser undecoded; the ``with`` block closes the handle deterministically.
with open(sys.argv[1], 'rb') as infile:
    content = infile.read()

tree = html.fromstring(content)

# Candidate post containers: tables with cellspacing="2" and width="100%".
articles = tree.xpath('//table[@cellspacing="2"][@width="100%"]')

# get_postinfo returns None for tables that are not posts; drop those.
allinfo = [
    info
    for info in (get_postinfo(article) for article in articles)
    if info is not None
]

# Close (and therefore flush) the output file explicitly instead of relying
# on garbage collection of an anonymous file object.
with open(sys.argv[2], 'w') as outfile:
    json.dump(allinfo, outfile)
|