core: text document / mail message str vs bytes WIP

dc2bc32c · Jérome Perrin · c9364d13 · dc2bc32c · dc2bc32c · dc2bc32c
Commit dc2bc32c authored Jan 26, 2024 by Jérome Perrin
3 changed files
--- a/product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.TextDocument.py
+++ b/product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.TextDocument.py
@@ -115,7 +115,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
        text = Template(text).substitute(unicode_mapping)

      # If the original was a str, convert it back to str.
-      if is_str:
+      if six.PY2 and is_str:
        text = text.encode('utf-8')

    return text
@@ -188,7 +188,10 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
        self.setConversion(result, original_mime_type, **kw)
      else:
        mime_type, result = self.getConversion(**kw)
-      if substitute and format in VALID_TEXT_FORMAT_LIST:
+      if format in VALID_TEXT_FORMAT_LIST:
+        if six.PY3 and isinstance(result, bytes):
+          result = result.decode()
+        if substitute:
          # only textual content can be sustituted
          if substitution_method_parameter_dict is None:
            substitution_method_parameter_dict = {}
@@ -375,21 +378,27 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
    return message

  security.declareProtected(Permissions.AccessContentsInformation, 'getTextContent')
-  def getTextContent(self, default=_MARKER):
+  def getTextContent(self, default=_MARKER, encoding=None):
    """Overriden method to check
-    permission to access content in raw format
+    permission to access content in raw format; bytes are decoded to text.
    """
+    self._checkConversionFormatPermission(None)
+    text_content = (  # only forward `default` when the caller supplied one
+      self._baseGetTextContent() if default is _MARKER
+      else self._baseGetTextContent(default))
+    if isinstance(text_content, bytes):
      # XXX Zope4py3: should this return str ??
      # We probably have "legacy" documents where `text_content` is a python2
      # str encoded as something else than utf-8.
      # Maybe we should introduce a new text_content_encoding property and
      # expose API to getRawTextContent (as bytes) and getTextContent would return
      # the decoded string.
-    self._checkConversionFormatPermission(None)
-    if default is _MARKER:
-      return self._baseGetTextContent()
-    else:
-      return self._baseGetTextContent(default)
+      # XXX what about _convertToBaseFormat/guessCharsetAndConvert ???
+      try:
+        text_content = text_content.decode('utf-8')  # NOTE: `encoding` is not consulted yet
+      except UnicodeDecodeError:
+        text_content = text_content.decode('latin1')  # latin1 maps every byte, cannot fail
+    return text_content

  # Backward compatibility for replacement of text_format by content_type
  security.declareProtected(Permissions.AccessContentsInformation, 'getTextFormat')
@@ -424,9 +433,11 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
    """
    if not self.hasData():
      if default is _MARKER:
-        return self.getTextContent()
+        data = self._baseGetTextContent()
      else:
-        return self.getTextContent(default)
+        data = self._baseGetTextContent(default)
+      if not isinstance(data, bytes):
+        return data.encode('utf-8')
    else:
      if default is _MARKER:
        return File.getData(self)

--- a/product/ERP5/bootstrap/erp5_core/MixinTemplateItem/portal_components/mixin.erp5.DownloadableMixin.py
+++ b/product/ERP5/bootstrap/erp5_core/MixinTemplateItem/portal_components/mixin.erp5.DownloadableMixin.py
@@ -149,6 +149,7 @@ class DownloadableMixin:
    RESPONSE.setHeader('Content-Length', len(data))
    if output_format in VALID_TEXT_FORMAT_LIST:
      RESPONSE.setHeader('Content-Type', '%s; charset=utf-8' % mime)
+      data = data.encode('utf-8')
    else:
      RESPONSE.setHeader('Content-Type', mime)
    if inline is _MARKER:

--- a/product/ERP5/bootstrap/erp5_core/MixinTemplateItem/portal_components/mixin.erp5.MailMessageMixin.py
+++ b/product/ERP5/bootstrap/erp5_core/MixinTemplateItem/portal_components/mixin.erp5.MailMessageMixin.py
@@ -29,7 +29,7 @@
 from AccessControl import ClassSecurityInfo
 from Products.ERP5Type.Globals import InitializeClass
 from Products.ERP5Type import Permissions
-from Products.ERP5Type.Utils import guessEncodingFromText
+from Products.ERP5Type.Utils import guessEncodingFromText # TODO: guessEncodingFromBytes
 from zLOG import LOG, INFO

 from email.header import decode_header, HeaderParseError
@@ -42,7 +42,7 @@ filename_regexp = 'name="([^"]*)"'
 def testCharsetAndConvert(text_content, content_type, encoding):
  try:
    if encoding is not None:
-      text_content = text_content.decode(encoding).encode('utf-8')
+      text_content = text_content.decode(encoding)
    else:
      if six.PY2:
        text_content = text_content.decode().encode('utf-8')
@@ -50,8 +50,9 @@ def testCharsetAndConvert(text_content, content_type, encoding):
    encoding = guessEncodingFromText(text_content, content_type)
    if encoding is not None:
      try:
-        text_content = text_content.decode(encoding).encode('utf-8')
+        text_content = text_content.decode(encoding)
      except (UnicodeDecodeError, LookupError):
+        # TODO: errors= repr ?
        text_content = repr(text_content)[1:-1]
    else:
      text_content = repr(text_content)[1:-1]
@@ -113,9 +114,6 @@ class MailMessageMixin:
    """
    Returns the content information from the header information.
    This is used by the metadata discovery system.
-
-    Header information is converted in UTF-8 since this is the standard
-    way of representing strings in ERP5.
    """
    result = {}
    for (name, value) in self._getMessage().items():