From 16146a14d06837230ca13e7396aa90731fdbfc0f Mon Sep 17 00:00:00 2001 From: Diffido Date: Sun, 28 Jan 2018 11:00:21 +0100 Subject: [PATCH] use HTML parser --- diffido.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/diffido.py b/diffido.py index ab63b8c..5c6cafa 100755 --- a/diffido.py +++ b/diffido.py @@ -17,7 +17,6 @@ limitations under the License. import os import re -import io import json import pytz import shutil @@ -144,16 +143,16 @@ def select_xpath(content, xpath): :type xpath: str :returns: the selected document :rtype: str""" - fd = io.StringIO(content) - tree = etree.parse(fd) + tree = etree.HTML(content) elems = tree.xpath(xpath) if not elems: return content selected_content = [] for elem in elems: - selected_content.append(''.join([elem.text] + [ElementTree.tostring(e).decode('utf-8', 'replace') - for e in elem.getchildren()])) - content = ''.join(selected_content) + selected_content.append(''.join([elem.text] + + [ElementTree.tostring(e, method='html').decode('utf-8', 'replace') + for e in elem.getchildren()])) + content = ''.join(selected_content).strip() return content