more robust XPath extractor

This commit is contained in:
Davide Alberani 2018-03-22 14:03:18 +01:00
parent e7d1ed6faa
commit 6e6803fcff

View file

@@ -149,9 +149,17 @@ def select_xpath(content, xpath):
return content
selected_content = []
for elem in elems:
selected_content.append(''.join([elem.text] +
[ElementTree.tostring(e, method='html').decode('utf-8', 'replace')
for e in elem.getchildren()]))
pieces = []
if elem.text:
pieces.append(elem.text)
for sub_el in elem.getchildren():
try:
sub_el_text = ElementTree.tostring(sub_el, method='html').decode('utf-8', 'replace')
except:
continue
if sub_el_text:
pieces.append(sub_el_text)
selected_content.append(''.join(pieces))
content = ''.join(selected_content).strip()
return content