more robust XPath extractor
This commit is contained in:
parent
e7d1ed6faa
commit
6e6803fcff
1 changed file with 11 additions and 3 deletions
14
diffido.py
14
diffido.py
|
@@ -149,9 +149,17 @@ def select_xpath(content, xpath):
|
|||
return content
|
||||
selected_content = []
|
||||
for elem in elems:
|
||||
selected_content.append(''.join([elem.text] +
|
||||
[ElementTree.tostring(e, method='html').decode('utf-8', 'replace')
|
||||
for e in elem.getchildren()]))
|
||||
pieces = []
|
||||
if elem.text:
|
||||
pieces.append(elem.text)
|
||||
for sub_el in elem.getchildren():
|
||||
try:
|
||||
sub_el_text = ElementTree.tostring(sub_el, method='html').decode('utf-8', 'replace')
|
||||
except:
|
||||
continue
|
||||
if sub_el_text:
|
||||
pieces.append(sub_el_text)
|
||||
selected_content.append(''.join(pieces))
|
||||
content = ''.join(selected_content).strip()
|
||||
return content
|
||||
|
||||
|
|
Loading…
Reference in a new issue