more robust XPath extractor

2018-03-22 14:03:18 +01:00 · 2018-03-22 14:03:18 +01:00 · 6e6803fcff
commit 6e6803fcff
parent e7d1ed6faa
1 changed files with 11 additions and 3 deletions
--- a/diffido.py
+++ b/diffido.py
@@ -149,9 +149,17 @@ def select_xpath(content, xpath):
        return content
    selected_content = []
    for elem in elems:
-        selected_content.append(''.join([elem.text] +
-                                        [ElementTree.tostring(e, method='html').decode('utf-8', 'replace')
-                                         for e in elem.getchildren()]))
+        pieces = []
+        if elem.text:
+            pieces.append(elem.text)
+        for sub_el in elem.getchildren():
+            try:
+                sub_el_text = ElementTree.tostring(sub_el, method='html').decode('utf-8', 'replace')
+            except:
+                continue
+            if sub_el_text:
+                pieces.append(sub_el_text)
+        selected_content.append(''.join(pieces))
    content = ''.join(selected_content).strip()
    return content