Better support for flawed html in safe_html

Check that declared encoding is supported by python before using it.

Better support for flawed html in safe_html
Check that declared encoding is supported by python before using it.
8f4c7667 · Nicolas Delaby · 4269d424 · 8f4c7667 · 8f4c7667
Commit 8f4c7667 authored Sep 12, 2011 by Nicolas Delaby
Hide whitespace changes
Inline Side-by-side

Showing with 29 additions and 1 deletion

product/ERP5OOo/tests/testDms.py product/ERP5OOo/tests/testDms.py +20 -0

product/PortalTransforms/transforms/safe_html.py product/PortalTransforms/transforms/safe_html.py +9 -1

No files found.
--- a/product/ERP5OOo/tests/testDms.py
+++ b/product/ERP5OOo/tests/testDms.py
@@ -1726,6 +1726,26 @@ document.write('<sc'+'ript type="text/javascript" src="http://somosite.bg/utb.ph
      expectedFailure(self.fail)(
        'Even BeautifulSoup is not able to parse such HTML')
+  def test_safeHTML_unknown_codec(self):
+    """Some HTML documents declare codecs that are unknown to Python.
+    """
+    web_page_portal_type = 'Web Page'
+    module = self.portal.getDefaultModule(web_page_portal_type)
+    web_page = module.newContent(portal_type=web_page_portal_type)
+    html_content = """
+    <html>
+      <head>
+        <meta http-equiv="Content-Type" content="text/html; charset=unicode" />
+        <title>BLa</title>
+      </head>
+      <body><p> blablabla</p></body>
+    </html>"""
+    web_page.edit(text_content=html_content)
+    safe_html = web_page.convert('html')[1]
+    self.assertTrue('unicode' not in safe_html)
+    self.assertTrue('utf-8' in safe_html)
  def test_parallel_conversion(self):
    """Check that conversion engine is able to fill in
    cache without overwrite previous conversion

--- a/product/PortalTransforms/transforms/safe_html.py
+++ b/product/PortalTransforms/transforms/safe_html.py
@@ -3,6 +3,7 @@ from zLOG import ERROR
 from HTMLParser import HTMLParser, HTMLParseError
 import re
 from cgi import escape
+import codecs
 from Products.PortalTransforms.interfaces import ITransform
 from zope.interface import implements
@@ -224,7 +225,14 @@ class StrippingParser(HTMLParser):
                     self.default_encoding and self.default_encoding not in v:
                        match = charset_parser.search(v)
                        if match is not None:
-                            self.original_charset = match.group('charset')
+                            charset = match.group('charset')
+                            try:
+                                codecs.lookup(charset)
+                            except LookupError:
+                                # If a codec is not known to Python, it is better
+                                # to prevent its use.
+                                charset = None
+                            self.original_charset = charset
                        v = charset_parser.sub(
                            CharsetReplacer(self.default_encoding), v)
                    self.result.append(' %s="%s"' % (k, escape(v, True)))