safe_html: fix handling of broken html on Python 2.7

Also simplify sample HTML in test_safeHTML_conversion

safe_html: fix handling of broken html on Python 2.7
Also simplify sample HTML in test_safeHTML_conversion
44e0b153 · Julien Muchembled · ce667c12 · 44e0b153 · 44e0b153
Commit 44e0b153 authored Jan 25, 2013 by Julien Muchembled
Showing with 17 additions and 10 deletions

product/ERP5OOo/tests/testDms.py product/ERP5OOo/tests/testDms.py +10 -10

product/PortalTransforms/transforms/safe_html.py product/PortalTransforms/transforms/safe_html.py +7 -0

No files found.
--- a/product/ERP5OOo/tests/testDms.py
+++ b/product/ERP5OOo/tests/testDms.py
@@ -1650,7 +1650,7 @@ class TestDocument(TestDocumentMixin):
    self.assertFalse(web_page.hasConversion(format=format))

    # test with not well-formed html document
-    html_content = """
+    html_content = r"""
    <HTML dir=3Dltr><HEAD>=0A=
 <META http-equiv=3DContent-Type content=3D"text/html; charset=3Dunicode">=0A=
 <META content=3D"DIRTYHTML 6.00.2900.2722" name=3DGENERATOR></HEAD>=0A=
@@ -1663,19 +1663,19 @@ class TestDocument(TestDocumentMixin):
 <DIV>&nbsp;</DIV>=0A=
 <DIV>&nbsp;</DIV>=0A=
 <br>=
-<!DOCTYPE html PUBLIC \\\"-//W3C//DTD XHTML 1.0 Transitional//EN\\\\=
-" \\\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\\\">=
-=0A<html xmlns=3D\\\"http://www.w3.org/1999/xhtml\\\">=0A<head>=0A<m=
-eta http-equiv=3D\\\"Content-Type\\\" content=3D\\\"text/html; c=
-harset=3Diso-8859-1\\\" />=0A<style type=3D\\\"text/css\\\">=0A<=
+<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\\=
+" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">=
+=0A<html xmlns=3D\"http://www.w3.org/1999/xhtml\">=0A<head>=0A<m=
+eta http-equiv=3D\"Content-Type\" content=3D\"text/html; c=
+harset=3Diso-8859-1\" />=0A<style type=3D\"text/css\">=0A<=
 !--=0A.style1 {font-size: 8px}=0A.style2 {font-family: Arial, Helvetica, san=
 s-serif}=0A.style3 {font-size: 8px; font-family: Arial, Helvetica, sans-seri=
-f; }=0A-->=0A</style>=0A</head>=0A=0A<body>=0A<div>=0A  <p><span class=3D\\=
-\\"style1\\\"><span class=3D\\\"style2\\\"><strong>I'm inside very broken HTML code</strong><br />=0A    ERP5<br />=0A
+f; }=0A-->=0A</style>=0A</head>=0A=0A<body>=0A<div>=0A  <p><span class=3D\=
+\"style1\"><span class=3D\"style2\"><strong>I'm inside very broken HTML code</strong><br />=0A    ERP5<br />=0A
 ERP5
 <br />=0A    =
-</span></span></p>=0A  <p class=3D\\\"sty=
-le3\\\">ERP5:<br />=0A   </p>=0A  <p class=3D\\\"style3\\\"><strong>ERP5</strong>=
+</span></span></p>=0A  <p class=3D\"sty=
+le3\">ERP5:<br />=0A   </p>=0A  <p class=3D\"style3\"><strong>ERP5</strong>=

 <br />=0A    ERP5</p>=0A</di=
 v>=0A</body>=0A</html>=0A

--- a/product/PortalTransforms/transforms/safe_html.py
+++ b/product/PortalTransforms/transforms/safe_html.py
@@ -283,6 +283,13 @@ class StrippingParser(HTMLParser):
    def getResult(self):
        return ''.join(self.result)

+    def feed(self, html):
+      # BBB: Python 2.7's HTMLParser is more tolerant of broken HTML than 2.6's.
+      #      For now, stay strict like Python 2.6: leftover rawdata is an error.
+      HTMLParser.feed(self, html)
+      if self.rawdata:  # input still buffered after feeding -- presumably unparsed
+        raise HTMLParseError("unknown error", self.getpos())
+
 def scrubHTML(html, valid=VALID_TAGS, nasty=NASTY_TAGS,
              remove_javascript=True, raise_error=True,
              default_encoding=None):