Browse Source

more robust XPath extractor

Davide Alberani 6 years ago
parent
commit
6e6803fcff
1 changed file with 11 additions and 3 deletions
  1. 11 3
      diffido.py

+ 11 - 3
diffido.py

@@ -149,9 +149,17 @@ def select_xpath(content, xpath):
         return content
     selected_content = []
     for elem in elems:
-        selected_content.append(''.join([elem.text] +
-                                        [ElementTree.tostring(e, method='html').decode('utf-8', 'replace')
-                                         for e in elem.getchildren()]))
+        pieces = []
+        if elem.text:
+            pieces.append(elem.text)
+        for sub_el in elem.getchildren():
+            try:
+                sub_el_text = ElementTree.tostring(sub_el, method='html').decode('utf-8', 'replace')
+            except:
+                continue
+            if sub_el_text:
+                pieces.append(sub_el_text)
+        selected_content.append(''.join(pieces))
     content = ''.join(selected_content).strip()
     return content