From 16146a14d06837230ca13e7396aa90731fdbfc0f Mon Sep 17 00:00:00 2001
From: Diffido <diffido@localhost>
Date: Sun, 28 Jan 2018 11:00:21 +0100
Subject: [PATCH] use HTML parser

---
 diffido.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/diffido.py b/diffido.py
index ab63b8c..5c6cafa 100755
--- a/diffido.py
+++ b/diffido.py
@@ -17,7 +17,6 @@ limitations under the License.
 
 import os
 import re
-import io
 import json
 import pytz
 import shutil
@@ -144,16 +143,16 @@ def select_xpath(content, xpath):
     :type xpath: str
     :returns: the selected document
     :rtype: str"""
-    fd = io.StringIO(content)
-    tree = etree.parse(fd)
+    tree = etree.HTML(content)
     elems = tree.xpath(xpath)
     if not elems:
         return content
     selected_content = []
     for elem in elems:
-        selected_content.append(''.join([elem.text] + [ElementTree.tostring(e).decode('utf-8', 'replace')
-                                                    for e in elem.getchildren()]))
-    content = ''.join(selected_content)
+        selected_content.append(''.join([elem.text] +
+                                        [ElementTree.tostring(e, method='html').decode('utf-8', 'replace')
+                                         for e in elem.getchildren()]))
+    content = ''.join(selected_content).strip()
     return content