Do not trust specified encoding

This patch always attempts the conversion with the encoding declared in the document, even when that encoding is UTF-8, in order to check whether the declared codec is actually valid.

Do not trust specified encoding
This patch always attempts the conversion with the encoding declared in the document, even when that encoding is UTF-8, in order to check whether the declared codec is actually valid.
e4b0603f · Nicolas Delaby · f6caaf1b · e4b0603f · e4b0603f · e4b0603f
Commit e4b0603f authored Sep 12, 2011 by Nicolas Delaby
3 changed files
--- a/product/ERP5/Document/TextDocument.py
+++ b/product/ERP5/Document/TextDocument.py
@@ -333,17 +333,17 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
        message = 'Conversion to base format succeeds'
        if re_match is not None:
          charset = re_match.group('charset')
-          if charset.lower() != 'utf-8':
-            try:
-              # Use encoding in html document
-              text_content = text_content.decode(charset).encode('utf-8')
-            except (UnicodeDecodeError, LookupError):
-              # Encoding read from document is wrong
-              text_content, message = guessCharsetAndConvert(self,
-                                                    text_content, content_type)
-            else:
-              message = 'Conversion to base format with charset %r succeeds'\
-                                                                      % charset
+          try:
+            # Use encoding in html document
+            text_content = text_content.decode(charset).encode('utf-8')
+          except (UnicodeDecodeError, LookupError):
+            # Encoding read from document is wrong
+            text_content, message = guessCharsetAndConvert(self,
+                                                  text_content, content_type)
+          else:
+            message = 'Conversion to base format with charset %r succeeds'\
+                                                                    % charset
+            if charset.lower() != 'utf-8':
              charset = 'utf-8' # Override charset if convertion succeeds
              # change charset value in html_document as well
              def subCharset(matchobj):

--- a/product/ERP5OOo/tests/testDms.py
+++ b/product/ERP5OOo/tests/testDms.py
@@ -1704,6 +1704,11 @@ document.write('<sc'+'ript type="text/javascript" src="http://somosite.bg/utb.ph
    self.assertTrue('AZERTYY' not in safe_html)
    self.assertTrue('#FFAA44' in safe_html)

+    filename = 'broken_html.html'
+    file_object = makeFileUpload(filename)
+    web_page.edit(file=file_object)
+    converted = web_page.convert('html')[1]
+
  def test_safeHTML_impossible_conversion(self):
    """Some html are not parsable.
    """

--- a/product/ERP5OOo/tests/test_document/broken_html.html
+++ b/product/ERP5OOo/tests/test_document/broken_html.html