introduce draft of XPath selector

2018-01-21 14:20:27 +01:00 · 2018-01-21 14:20:27 +01:00 · 32421905a2
commit 32421905a2
parent 208e028569
1 changed files with 20 additions and 1 deletions
--- a/diffido.py
+++ b/diffido.py
@ -14,6 +14,8 @@ import datetime
 import requests
 import subprocess
 import multiprocessing
+from lxml import etree
+from xml.etree import ElementTree

 from tornado.ioloop import IOLoop
 from apscheduler.triggers.cron import CronTrigger
@ -70,6 +72,20 @@ def get_schedule(id_, addID=True):
    return data


+def select_xpath(content, xpath):
+    """Return the inner markup of the nodes matching `xpath`, or `content` if none match."""
+    # The default lxml parser is a strict XML parser; malformed markup raises in etree.parse.
+    elems = etree.parse(io.StringIO(content)).xpath(xpath)
+    if not elems:
+        return content
+    selected_content = []
+    for elem in elems:
+        # elem.text is None when a node has no leading text; joining None raises TypeError.
+        children = [ElementTree.tostring(e).decode('utf8', 'replace') for e in elem]
+        selected_content.append((elem.text or '') + ''.join(children))
+    return ''.join(selected_content)
+
+
 def run_job(id_=None, *args, **kwargs):
    schedule = get_schedule(id_, addID=False)
    url = schedule.get('url')
@ -78,8 +94,11 @@ def run_job(id_=None, *args, **kwargs):
    logger.debug('Running job id:%s title:%s url: %s' % (id_, schedule.get('title', ''), url))
    req = requests.get(url, allow_redirects=True, timeout=(30.10, 240))
    content = req.text
+    xpath = schedule.get('xpath')
+    if xpath:
+        content = select_xpath(content, xpath)
    req_path = urllib.parse.urlparse(req.url).path
-    base_name = os.path.basename(req_path) or 'index'
+    base_name = os.path.basename(req_path) or 'index.html'
    def _commit(id_, filename, content, queue):
        os.chdir('storage/%s' % id_)
        current_lines = 0