Commit 8f4c7667 authored by Nicolas Delaby

Better support for flawed html in safe_html

Check that declared encoding is supported by python before using it.
parent 4269d424
...@@ -1726,6 +1726,26 @@ document.write('<sc'+'ript type="text/javascript" src="http://somosite.bg/utb.ph ...@@ -1726,6 +1726,26 @@ document.write('<sc'+'ript type="text/javascript" src="http://somosite.bg/utb.ph
expectedFailure(self.fail)( expectedFailure(self.fail)(
'Even BeautifulSoup is not able to parse such HTML') 'Even BeautifulSoup is not able to parse such HTML')
def test_safeHTML_unknown_codec(self):
    """Convert a Web Page whose meta tag declares ``charset=unicode``.

    Some html documents declare unknown codecs. The html conversion of
    such a page must no longer mention ``unicode`` and must mention
    ``utf-8``.
    """
    portal_type = 'Web Page'
    container = self.portal.getDefaultModule(portal_type)
    document = container.newContent(portal_type=portal_type)
    # The meta tag below carries the charset declaration under test.
    flawed_html = """
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=unicode" />
<title>BLa</title>
</head>
<body><p> blablabla</p></body>
</html>"""
    document.edit(text_content=flawed_html)
    # convert() returns a sequence; the converted html is its second item.
    converted = document.convert('html')
    cleaned_html = converted[1]
    self.assertTrue('unicode' not in cleaned_html)
    self.assertTrue('utf-8' in cleaned_html)
def test_parallel_conversion(self): def test_parallel_conversion(self):
"""Check that conversion engine is able to fill in """Check that conversion engine is able to fill in
cache without overwrite previous conversion cache without overwrite previous conversion
......
...@@ -3,6 +3,7 @@ from zLOG import ERROR ...@@ -3,6 +3,7 @@ from zLOG import ERROR
from HTMLParser import HTMLParser, HTMLParseError from HTMLParser import HTMLParser, HTMLParseError
import re import re
from cgi import escape from cgi import escape
import codecs
from Products.PortalTransforms.interfaces import ITransform from Products.PortalTransforms.interfaces import ITransform
from zope.interface import implements from zope.interface import implements
...@@ -224,7 +225,14 @@ class StrippingParser(HTMLParser): ...@@ -224,7 +225,14 @@ class StrippingParser(HTMLParser):
self.default_encoding and self.default_encoding not in v: self.default_encoding and self.default_encoding not in v:
match = charset_parser.search(v) match = charset_parser.search(v)
if match is not None: if match is not None:
self.original_charset = match.group('charset') charset = match.group('charset')
try:
codecs.lookup(charset)
except LookupError:
# If a codec is not known by python, it is better
# to prevent its usage
charset = None
self.original_charset = charset
v = charset_parser.sub( v = charset_parser.sub(
CharsetReplacer(self.default_encoding), v) CharsetReplacer(self.default_encoding), v)
self.result.append(' %s="%s"' % (k, escape(v, True))) self.result.append(' %s="%s"' % (k, escape(v, True)))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment