Commit 44e0b153 by Julien Muchembled

safe_html: fix handling of broken html on Python 2.7

Also simplify sample HTML in test_safeHTML_conversion
1 parent ce667c12
...@@ -1650,7 +1650,7 @@ class TestDocument(TestDocumentMixin): ...@@ -1650,7 +1650,7 @@ class TestDocument(TestDocumentMixin):
self.assertFalse(web_page.hasConversion(format=format)) self.assertFalse(web_page.hasConversion(format=format))
# test with not well-formed html document # test with not well-formed html document
html_content = """ html_content = r"""
<HTML dir=3Dltr><HEAD>=0A= <HTML dir=3Dltr><HEAD>=0A=
<META http-equiv=3DContent-Type content=3D"text/html; charset=3Dunicode">=0A= <META http-equiv=3DContent-Type content=3D"text/html; charset=3Dunicode">=0A=
<META content=3D"DIRTYHTML 6.00.2900.2722" name=3DGENERATOR></HEAD>=0A= <META content=3D"DIRTYHTML 6.00.2900.2722" name=3DGENERATOR></HEAD>=0A=
...@@ -1663,19 +1663,19 @@ class TestDocument(TestDocumentMixin): ...@@ -1663,19 +1663,19 @@ class TestDocument(TestDocumentMixin):
<DIV>&nbsp;</DIV>=0A= <DIV>&nbsp;</DIV>=0A=
<DIV>&nbsp;</DIV>=0A= <DIV>&nbsp;</DIV>=0A=
<br>= <br>=
<!DOCTYPE html PUBLIC \\\"-//W3C//DTD XHTML 1.0 Transitional//EN\\\\= <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\\=
" \\\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\\\">= " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">=
=0A<html xmlns=3D\\\"http://www.w3.org/1999/xhtml\\\">=0A<head>=0A<m= =0A<html xmlns=3D\"http://www.w3.org/1999/xhtml\">=0A<head>=0A<m=
eta http-equiv=3D\\\"Content-Type\\\" content=3D\\\"text/html; c= eta http-equiv=3D\"Content-Type\" content=3D\"text/html; c=
harset=3Diso-8859-1\\\" />=0A<style type=3D\\\"text/css\\\">=0A<= harset=3Diso-8859-1\" />=0A<style type=3D\"text/css\">=0A<=
!--=0A.style1 {font-size: 8px}=0A.style2 {font-family: Arial, Helvetica, san= !--=0A.style1 {font-size: 8px}=0A.style2 {font-family: Arial, Helvetica, san=
s-serif}=0A.style3 {font-size: 8px; font-family: Arial, Helvetica, sans-seri= s-serif}=0A.style3 {font-size: 8px; font-family: Arial, Helvetica, sans-seri=
f; }=0A-->=0A</style>=0A</head>=0A=0A<body>=0A<div>=0A <p><span class=3D\\= f; }=0A-->=0A</style>=0A</head>=0A=0A<body>=0A<div>=0A <p><span class=3D\=
\\"style1\\\"><span class=3D\\\"style2\\\"><strong>I'm inside very broken HTML code</strong><br />=0A ERP5<br />=0A \"style1\"><span class=3D\"style2\"><strong>I'm inside very broken HTML code</strong><br />=0A ERP5<br />=0A
ERP5 ERP5
<br />=0A = <br />=0A =
</span></span></p>=0A <p class=3D\\\"sty= </span></span></p>=0A <p class=3D\"sty=
le3\\\">ERP5:<br />=0A </p>=0A <p class=3D\\\"style3\\\"><strong>ERP5</strong>= le3\">ERP5:<br />=0A </p>=0A <p class=3D\"style3\"><strong>ERP5</strong>=
<br />=0A ERP5</p>=0A</di= <br />=0A ERP5</p>=0A</di=
v>=0A</body>=0A</html>=0A v>=0A</body>=0A</html>=0A
......
...@@ -283,6 +283,13 @@ class StrippingParser(HTMLParser): ...@@ -283,6 +283,13 @@ class StrippingParser(HTMLParser):
def getResult(self): def getResult(self):
return ''.join(self.result) return ''.join(self.result)
def feed(self, html):
    """Feed ``html`` to the base parser and reject any unparsed remainder.

    BBB: Python 2.7 is more tolerant of broken HTML than Python 2.6.
    For the moment, stay strict so that the behaviour matches Python 2.6:
    anything still held in ``self.rawdata`` once the base ``feed`` has
    returned is reported as a parse error at the current position.
    """
    HTMLParser.feed(self, html)
    leftover = self.rawdata
    if not leftover:
        # Everything was consumed: nothing to complain about.
        return
    raise HTMLParseError("unknown error", self.getpos())
def scrubHTML(html, valid=VALID_TAGS, nasty=NASTY_TAGS, def scrubHTML(html, valid=VALID_TAGS, nasty=NASTY_TAGS,
remove_javascript=True, raise_error=True, remove_javascript=True, raise_error=True,
default_encoding=None): default_encoding=None):
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!