TextContent base_data bytes

c24c3923 · Jérome Perrin · f1f4137c · c24c3923 · c24c3923
Commit c24c3923 authored Feb 07, 2024 by Jérome Perrin
2 changed files
--- a/product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.Document.py
+++ b/product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.Document.py
@@ -410,7 +410,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixin,
  body_parser = re.compile(r'<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
  title_parser = re.compile(r'<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
  base_parser = re.compile(r'<base[^>]*href=[\'"](.*?)[\'"][^>]*>', re.IGNORECASE + re.DOTALL)
-  charset_parser = re.compile(r'(?P<keyword>charset="?)(?P<charset>[a-z0-9\-]+)', re.IGNORECASE)
+  charset_parser = re.compile(br'(?P<keyword>charset="?)(?P<charset>[a-z0-9\-]+)', re.IGNORECASE)
  # Declarative security
  security = ClassSecurityInfo()

--- a/product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.TextDocument.py
+++ b/product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.TextDocument.py
@@ -186,6 +186,8 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
                                       file=BytesIO(),
                                       filename=self.getId(),
                                       temp_object=1)
+          if not isinstance(result, bytes):
+            result = result.encode('utf-8')
          temp_image._setData(result)
          _, result = temp_image.convert(**kw)
@@ -227,7 +229,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
  def setBaseData(self, value):
    """Store base_data into text_content
    """
-    self._setTextContent(value)
+    self._setTextContent(value.decode('utf-8'))
  security.declareProtected(Permissions.ModifyPortalContent, '_setBaseData')
  _setBaseData = setBaseData
@@ -253,9 +255,9 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
    """
    self._checkConversionFormatPermission(None)
    if default is _MARKER:
-      return self.getTextContent()
+      return self.getTextContent().encode('utf-8')
    else:
-      return self.getTextContent(default=default)
+      return self.getTextContent(default=default).encode('utf-8')
  security.declareProtected(Permissions.AccessContentsInformation, 'hasBaseData')
  def hasBaseData(self):
@@ -290,9 +292,12 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
  def _convertToBaseFormat(self):
    """Conversion to base format for TextDocument consist
-    to convert file content into utf-8
+    to convert file content into utf-8.
+    If the data embeds charset information, this information is updated
+    to the new (utf-8) charset. This supports XML and HTML.
    """
    def guessCharsetAndConvert(document, text_content, content_type):
+      # type: (TextDocument, bytes, str) -> Tuple[bytes, str]
      """
      return encoded content_type and message if encoding
      is not utf-8
@@ -322,37 +327,32 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
      return text_content, message
    content_type = self.getContentType() or DEFAULT_CONTENT_TYPE
-    text_content = self.getData() # TODO: don't we need to convert to bytes here ? what if it is PData ?
+    data = bytes(self.getData())
    if content_type.endswith('xml'):
      try:
-        tree = etree.fromstring(text_content)
+        tree = etree.fromstring(data)
-        text_content = etree.tostring(tree, encoding='utf-8', xml_declaration=True)
+        base_data = etree.tostring(tree, encoding='utf-8', xml_declaration=True)
        content_type = 'application/xml'
        message = 'Conversion to base format succeeds'
      except etree.XMLSyntaxError: # pylint: disable=catching-non-exception
        message = 'Conversion to base format without codec fails'
    elif content_type == 'text/html':
-      re_match = self.charset_parser.search(
+      re_match = self.charset_parser.search(data)
-        # we don't really care about decoding errors for searching this
-        # regexp
-        text_content.decode('ascii', 'replace') if six.PY3 else text_content)
      message = 'Conversion to base format succeeds'
      if re_match is not None:
-        charset = re_match.group('charset')
+        charset = re_match.group('charset').decode('ascii')
        try:
          # Use encoding in html document
-          text_content = text_content.decode(charset)
+          data = data.decode(charset).encode('utf-8')
-          if six.PY2:
-            text_content = text_content.encode('utf-8')
        except (UnicodeDecodeError, LookupError):
          # Encoding read from document is wrong
-          text_content, message = guessCharsetAndConvert(self,
+          base_data, message = guessCharsetAndConvert(self,
-                                                text_content, content_type)
+                                                data, content_type)
        else:
          message = 'Conversion to base format with charset %r succeeds'\
                                                                  % charset
          if charset.lower() != 'utf-8':
-            charset = 'utf-8' # Override charset if convertion succeeds
+            charset = 'utf-8'  # Override charset if conversion succeeds
            # change charset value in html_document as well
            def subCharset(matchobj):
              keyword = matchobj.group('keyword')
@@ -362,24 +362,21 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
                return matchobj.group(0)
              elif keyword:
                # if keyword is present, replace charset just after
-                return keyword + 'utf-8'
+                return keyword + b'utf-8'
-            text_content = self.charset_parser.sub(subCharset, text_content)
+            base_data = self.charset_parser.sub(subCharset, data)
      else:
-        text_content, message = guessCharsetAndConvert(self,
+        base_data, message = guessCharsetAndConvert(self, data, content_type)
-                                                  text_content, content_type)
    else:
      # generaly text/plain
      try:
        # if succeeds, not need to change encoding
        # it's already utf-8
-        text_content.decode('utf-8')
+        data.decode('utf-8')
      except (UnicodeDecodeError, LookupError):
-        text_content, message = guessCharsetAndConvert(self,
+        base_data, message = guessCharsetAndConvert(self, data, content_type)
-                                                  text_content, content_type)
      else:
        message = 'Conversion to base format succeeds'
-    # TODO(zope4py3): rethink this, shouldn't we store bytes in base data ?
+    self._setBaseData(base_data)
-    self._setBaseData(text_content)
    self._setBaseContentType(content_type)
    return message