use HTML parser

This commit is contained in:
Diffido 2018-01-28 11:00:21 +01:00
parent a85e0eccae
commit 16146a14d0

View file

@@ -17,7 +17,6 @@ limitations under the License.
import os import os
import re import re
import io
import json import json
import pytz import pytz
import shutil import shutil
@@ -144,16 +143,16 @@ def select_xpath(content, xpath):
:type xpath: str :type xpath: str
:returns: the selected document :returns: the selected document
:rtype: str""" :rtype: str"""
fd = io.StringIO(content) tree = etree.HTML(content)
tree = etree.parse(fd)
elems = tree.xpath(xpath) elems = tree.xpath(xpath)
if not elems: if not elems:
return content return content
selected_content = [] selected_content = []
for elem in elems: for elem in elems:
selected_content.append(''.join([elem.text] + [ElementTree.tostring(e).decode('utf-8', 'replace') selected_content.append(''.join([elem.text] +
[ElementTree.tostring(e, method='html').decode('utf-8', 'replace')
for e in elem.getchildren()])) for e in elem.getchildren()]))
content = ''.join(selected_content) content = ''.join(selected_content).strip()
return content return content