more robust XPath extractor
This commit is contained in:
parent
e7d1ed6faa
commit
6e6803fcff
1 changed file with 11 additions and 3 deletions
14
diffido.py
14
diffido.py
|
@@ -149,9 +149,17 @@ def select_xpath(content, xpath):
|
||||||
return content
|
return content
|
||||||
selected_content = []
|
selected_content = []
|
||||||
for elem in elems:
|
for elem in elems:
|
||||||
selected_content.append(''.join([elem.text] +
|
pieces = []
|
||||||
[ElementTree.tostring(e, method='html').decode('utf-8', 'replace')
|
if elem.text:
|
||||||
for e in elem.getchildren()]))
|
pieces.append(elem.text)
|
||||||
|
for sub_el in elem.getchildren():
|
||||||
|
try:
|
||||||
|
sub_el_text = ElementTree.tostring(sub_el, method='html').decode('utf-8', 'replace')
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
if sub_el_text:
|
||||||
|
pieces.append(sub_el_text)
|
||||||
|
selected_content.append(''.join(pieces))
|
||||||
content = ''.join(selected_content).strip()
|
content = ''.join(selected_content).strip()
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue