Commit e4b0603f authored by Nicolas Delaby

Do not trust specified encoding

This patch always attempts the conversion using the encoding declared in the document, even when that encoding is utf-8, in order to check whether the declared codec is actually valid for the content.
parent f6caaf1b
......@@ -333,17 +333,17 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
message = 'Conversion to base format succeeds'
if re_match is not None:
charset = re_match.group('charset')
if charset.lower() != 'utf-8':
try:
# Use encoding in html document
text_content = text_content.decode(charset).encode('utf-8')
except (UnicodeDecodeError, LookupError):
# Encoding read from document is wrong
text_content, message = guessCharsetAndConvert(self,
text_content, content_type)
else:
message = 'Conversion to base format with charset %r succeeds'\
% charset
try:
# Use encoding in html document
text_content = text_content.decode(charset).encode('utf-8')
except (UnicodeDecodeError, LookupError):
# Encoding read from document is wrong
text_content, message = guessCharsetAndConvert(self,
text_content, content_type)
else:
message = 'Conversion to base format with charset %r succeeds'\
% charset
if charset.lower() != 'utf-8':
charset = 'utf-8' # Override charset if convertion succeeds
# change charset value in html_document as well
def subCharset(matchobj):
......
......@@ -1704,6 +1704,11 @@ document.write('<sc'+'ript type="text/javascript" src="http://somosite.bg/utb.ph
self.assertTrue('AZERTYY' not in safe_html)
self.assertTrue('#FFAA44' in safe_html)
filename = 'broken_html.html'
file_object = makeFileUpload(filename)
web_page.edit(file=file_object)
converted = web_page.convert('html')[1]
def test_safeHTML_impossible_conversion(self):
"""Some html are not parsable.
"""
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment