Commit c24c3923 authored by Jérome Perrin

TextContent base_data bytes

parent f1f4137c
...@@ -410,7 +410,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixin, ...@@ -410,7 +410,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixin,
body_parser = re.compile(r'<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL) body_parser = re.compile(r'<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
title_parser = re.compile(r'<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL) title_parser = re.compile(r'<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
base_parser = re.compile(r'<base[^>]*href=[\'"](.*?)[\'"][^>]*>', re.IGNORECASE + re.DOTALL) base_parser = re.compile(r'<base[^>]*href=[\'"](.*?)[\'"][^>]*>', re.IGNORECASE + re.DOTALL)
charset_parser = re.compile(r'(?P<keyword>charset="?)(?P<charset>[a-z0-9\-]+)', re.IGNORECASE) charset_parser = re.compile(br'(?P<keyword>charset="?)(?P<charset>[a-z0-9\-]+)', re.IGNORECASE)
# Declarative security # Declarative security
security = ClassSecurityInfo() security = ClassSecurityInfo()
......
...@@ -186,6 +186,8 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent ...@@ -186,6 +186,8 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
file=BytesIO(), file=BytesIO(),
filename=self.getId(), filename=self.getId(),
temp_object=1) temp_object=1)
if not isinstance(result, bytes):
result = result.encode('utf-8')
temp_image._setData(result) temp_image._setData(result)
_, result = temp_image.convert(**kw) _, result = temp_image.convert(**kw)
...@@ -227,7 +229,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent ...@@ -227,7 +229,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
def setBaseData(self, value): def setBaseData(self, value):
"""Store base_data into text_content """Store base_data into text_content
""" """
self._setTextContent(value) self._setTextContent(value.decode('utf-8'))
security.declareProtected(Permissions.ModifyPortalContent, '_setBaseData') security.declareProtected(Permissions.ModifyPortalContent, '_setBaseData')
_setBaseData = setBaseData _setBaseData = setBaseData
...@@ -253,9 +255,9 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent ...@@ -253,9 +255,9 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
""" """
self._checkConversionFormatPermission(None) self._checkConversionFormatPermission(None)
if default is _MARKER: if default is _MARKER:
return self.getTextContent() return self.getTextContent().encode('utf-8')
else: else:
return self.getTextContent(default=default) return self.getTextContent(default=default).encode('utf-8')
security.declareProtected(Permissions.AccessContentsInformation, 'hasBaseData') security.declareProtected(Permissions.AccessContentsInformation, 'hasBaseData')
def hasBaseData(self): def hasBaseData(self):
...@@ -290,9 +292,12 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent ...@@ -290,9 +292,12 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
def _convertToBaseFormat(self): def _convertToBaseFormat(self):
"""Conversion to base format for TextDocument consists """Conversion to base format for TextDocument consists
of converting file content into utf-8 of converting file content into utf-8.
If the data embeds charset information, this information is updated
to the new (utf-8) charset. This supports XML and HTML.
""" """
def guessCharsetAndConvert(document, text_content, content_type): def guessCharsetAndConvert(document, text_content, content_type):
# type: (TextDocument, bytes, str) -> Tuple[bytes, str]
""" """
return encoded content_type and message if encoding return encoded content_type and message if encoding
is not utf-8 is not utf-8
...@@ -322,37 +327,32 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent ...@@ -322,37 +327,32 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
return text_content, message return text_content, message
content_type = self.getContentType() or DEFAULT_CONTENT_TYPE content_type = self.getContentType() or DEFAULT_CONTENT_TYPE
text_content = self.getData() # TODO: don't we need to convert to bytes here ? what if it is PData ? data = bytes(self.getData())
if content_type.endswith('xml'): if content_type.endswith('xml'):
try: try:
tree = etree.fromstring(text_content) tree = etree.fromstring(data)
text_content = etree.tostring(tree, encoding='utf-8', xml_declaration=True) base_data = etree.tostring(tree, encoding='utf-8', xml_declaration=True)
content_type = 'application/xml' content_type = 'application/xml'
message = 'Conversion to base format succeeds' message = 'Conversion to base format succeeds'
except etree.XMLSyntaxError: # pylint: disable=catching-non-exception except etree.XMLSyntaxError: # pylint: disable=catching-non-exception
message = 'Conversion to base format without codec fails' message = 'Conversion to base format without codec fails'
elif content_type == 'text/html': elif content_type == 'text/html':
re_match = self.charset_parser.search( re_match = self.charset_parser.search(data)
# we don't really care about decoding errors for searching this
# regexp
text_content.decode('ascii', 'replace') if six.PY3 else text_content)
message = 'Conversion to base format succeeds' message = 'Conversion to base format succeeds'
if re_match is not None: if re_match is not None:
charset = re_match.group('charset') charset = re_match.group('charset').decode('ascii')
try: try:
# Use encoding in html document # Use encoding in html document
text_content = text_content.decode(charset) data = data.decode(charset).encode('utf-8')
if six.PY2:
text_content = text_content.encode('utf-8')
except (UnicodeDecodeError, LookupError): except (UnicodeDecodeError, LookupError):
# Encoding read from document is wrong # Encoding read from document is wrong
text_content, message = guessCharsetAndConvert(self, base_data, message = guessCharsetAndConvert(self,
text_content, content_type) data, content_type)
else: else:
message = 'Conversion to base format with charset %r succeeds'\ message = 'Conversion to base format with charset %r succeeds'\
% charset % charset
if charset.lower() != 'utf-8': if charset.lower() != 'utf-8':
charset = 'utf-8' # Override charset if conversion succeeds charset = 'utf-8' # Override charset if conversion succeeds
# change charset value in html_document as well # change charset value in html_document as well
def subCharset(matchobj): def subCharset(matchobj):
keyword = matchobj.group('keyword') keyword = matchobj.group('keyword')
...@@ -362,24 +362,21 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent ...@@ -362,24 +362,21 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
return matchobj.group(0) return matchobj.group(0)
elif keyword: elif keyword:
# if keyword is present, replace charset just after # if keyword is present, replace charset just after
return keyword + 'utf-8' return keyword + b'utf-8'
text_content = self.charset_parser.sub(subCharset, text_content) base_data = self.charset_parser.sub(subCharset, data)
else: else:
text_content, message = guessCharsetAndConvert(self, base_data, message = guessCharsetAndConvert(self, data, content_type)
text_content, content_type)
else: else:
# generally text/plain # generally text/plain
try: try:
# if this succeeds, no need to change encoding # if this succeeds, no need to change encoding
# it's already utf-8 # it's already utf-8
text_content.decode('utf-8') data.decode('utf-8')
except (UnicodeDecodeError, LookupError): except (UnicodeDecodeError, LookupError):
text_content, message = guessCharsetAndConvert(self, base_data, message = guessCharsetAndConvert(self, data, content_type)
text_content, content_type)
else: else:
message = 'Conversion to base format succeeds' message = 'Conversion to base format succeeds'
# TODO(zope4py3): rethink this, shouldn't we store bytes in base data ? self._setBaseData(base_data)
self._setBaseData(text_content)
self._setBaseContentType(content_type) self._setBaseContentType(content_type)
return message return message
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment