use HTML parser
This commit is contained in:
parent
a85e0eccae
commit
16146a14d0
1 changed file with 5 additions and 6 deletions
|
@@ -17,7 +17,6 @@ limitations under the License.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import io
|
|
||||||
import json
|
import json
|
||||||
import pytz
|
import pytz
|
||||||
import shutil
|
import shutil
|
||||||
|
@@ -144,16 +143,16 @@ def select_xpath(content, xpath):
|
||||||
:type xpath: str
|
:type xpath: str
|
||||||
:returns: the selected document
|
:returns: the selected document
|
||||||
:rtype: str"""
|
:rtype: str"""
|
||||||
fd = io.StringIO(content)
|
tree = etree.HTML(content)
|
||||||
tree = etree.parse(fd)
|
|
||||||
elems = tree.xpath(xpath)
|
elems = tree.xpath(xpath)
|
||||||
if not elems:
|
if not elems:
|
||||||
return content
|
return content
|
||||||
selected_content = []
|
selected_content = []
|
||||||
for elem in elems:
|
for elem in elems:
|
||||||
selected_content.append(''.join([elem.text] + [ElementTree.tostring(e).decode('utf-8', 'replace')
|
selected_content.append(''.join([elem.text] +
|
||||||
|
[ElementTree.tostring(e, method='html').decode('utf-8', 'replace')
|
||||||
for e in elem.getchildren()]))
|
for e in elem.getchildren()]))
|
||||||
content = ''.join(selected_content)
|
content = ''.join(selected_content).strip()
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue