From 6e6803fcff63b542caceef807aa76fc7c57351d3 Mon Sep 17 00:00:00 2001 From: Davide Alberani Date: Thu, 22 Mar 2018 14:03:18 +0100 Subject: [PATCH] more robust XPath extractor --- diffido.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/diffido.py b/diffido.py index 5c6cafa..bd47379 100755 --- a/diffido.py +++ b/diffido.py @@ -149,9 +149,17 @@ def select_xpath(content, xpath): return content selected_content = [] for elem in elems: - selected_content.append(''.join([elem.text] + - [ElementTree.tostring(e, method='html').decode('utf-8', 'replace') - for e in elem.getchildren()])) + pieces = [] + if elem.text: + pieces.append(elem.text) + for sub_el in elem.getchildren(): + try: + sub_el_text = ElementTree.tostring(sub_el, method='html').decode('utf-8', 'replace') + except: + continue + if sub_el_text: + pieces.append(sub_el_text) + selected_content.append(''.join(pieces)) content = ''.join(selected_content).strip() return content