Browse Source

first commit, kinda working

boyska 7 years ago
commit
b780cfaabe
3 changed files with 46 additions and 0 deletions
  1. 2 0
      .gitignore
  2. 7 0
      Makefile
  3. 37 0
      readold.py

+ 2 - 0
.gitignore

@@ -0,0 +1,2 @@
+news_index.php
+*.json

+ 7 - 0
Makefile

@@ -0,0 +1,7 @@
+# `all` names no file on disk; declare it phony so a stray file called
+# "all" can never make the default goal look up to date.
+.PHONY: all
+# If a recipe fails, delete its (possibly half-written) target. Otherwise a
+# truncated download or a partially written JSON file would be treated as
+# up to date on the next run.
+.DELETE_ON_ERROR:
+
+all: old.json
+
+# Fetch the legacy news index page; only runs when the file is missing.
+news_index.php:
+	wget -q 'https://old.ondarossa.info/news_index.php' -c
+
+# Convert the downloaded page ($<) into the JSON dump ($@).
+old.json: news_index.php readold.py
+	python readold.py $< $@

+ 37 - 0
readold.py

@@ -0,0 +1,37 @@
+import sys
+import json
+from pprint import pprint
+
+from lxml import html, etree
+
+
+def get_postinfo(article):
+    subelems = article.xpath('tr')
+    if len(subelems) != 3:
+        return None
+    title = subelems[0].text_content().strip()
+    # text = etree.tostring(subelems[1])
+    text = subelems[1].text_content().strip()
+    urls = [e.get('href') for e in subelems[2].xpath('.//a')]
+    urls = [url for url in urls
+            if url is not None
+            and url.startswith('http')
+            and url.lower().endswith('.mp3')]
+    return dict(title=title, text=text, urls=urls)
+
+
+if len(sys.argv) != 3:
+    print("Wrong usage", file=sys.stderr)
+    sys.exit(1)
+content = open(sys.argv[1], 'rb').read()
+assert type(content) is bytes
+tree = html.fromstring(content)
+articles = tree.xpath('//table[@cellspacing="2"][@width="100%"]')
+allinfo = []
+for a in articles:
+    info = get_postinfo(a)
+    if info is not None:
+        allinfo.append(info)
+        # pprint(info)
+
+json.dump(allinfo, open(sys.argv[2], 'w'))