Commit c24c3923 authored by Jérome Perrin's avatar Jérome Perrin

TextContent base_data bytes

parent f1f4137c
......@@ -410,7 +410,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixin,
body_parser = re.compile(r'<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
title_parser = re.compile(r'<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
base_parser = re.compile(r'<base[^>]*href=[\'"](.*?)[\'"][^>]*>', re.IGNORECASE + re.DOTALL)
charset_parser = re.compile(r'(?P<keyword>charset="?)(?P<charset>[a-z0-9\-]+)', re.IGNORECASE)
charset_parser = re.compile(br'(?P<keyword>charset="?)(?P<charset>[a-z0-9\-]+)', re.IGNORECASE)
# Declarative security
security = ClassSecurityInfo()
......
......@@ -186,6 +186,8 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
file=BytesIO(),
filename=self.getId(),
temp_object=1)
if not isinstance(result, bytes):
result = result.encode('utf-8')
temp_image._setData(result)
_, result = temp_image.convert(**kw)
......@@ -227,7 +229,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
def setBaseData(self, value):
"""Store base_data into text_content
"""
self._setTextContent(value)
self._setTextContent(value.decode('utf-8'))
security.declareProtected(Permissions.ModifyPortalContent, '_setBaseData')
_setBaseData = setBaseData
......@@ -253,9 +255,9 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
"""
self._checkConversionFormatPermission(None)
if default is _MARKER:
return self.getTextContent()
return self.getTextContent().encode('utf-8')
else:
return self.getTextContent(default=default)
return self.getTextContent(default=default).encode('utf-8')
security.declareProtected(Permissions.AccessContentsInformation, 'hasBaseData')
def hasBaseData(self):
......@@ -290,9 +292,12 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
def _convertToBaseFormat(self):
"""Conversion to base format for TextDocument consist
to convert file content into utf-8
to convert file content into utf-8.
If the data embeds charset information, this information is updated
to the new (utf-8) charset. This supports XML and HTML.
"""
def guessCharsetAndConvert(document, text_content, content_type):
# type: (TextDocument, bytes, str) -> Tuple[bytes, str]
"""
return encoded content_type and message if encoding
is not utf-8
......@@ -322,37 +327,32 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
return text_content, message
content_type = self.getContentType() or DEFAULT_CONTENT_TYPE
text_content = self.getData() # TODO: don't we need to convert to bytes here ? what if it is PData ?
data = bytes(self.getData())
if content_type.endswith('xml'):
try:
tree = etree.fromstring(text_content)
text_content = etree.tostring(tree, encoding='utf-8', xml_declaration=True)
tree = etree.fromstring(data)
base_data = etree.tostring(tree, encoding='utf-8', xml_declaration=True)
content_type = 'application/xml'
message = 'Conversion to base format succeeds'
except etree.XMLSyntaxError: # pylint: disable=catching-non-exception
message = 'Conversion to base format without codec fails'
elif content_type == 'text/html':
re_match = self.charset_parser.search(
# we don't really care about decoding errors for searching this
# regexp
text_content.decode('ascii', 'replace') if six.PY3 else text_content)
re_match = self.charset_parser.search(data)
message = 'Conversion to base format succeeds'
if re_match is not None:
charset = re_match.group('charset')
charset = re_match.group('charset').decode('ascii')
try:
# Use encoding in html document
text_content = text_content.decode(charset)
if six.PY2:
text_content = text_content.encode('utf-8')
data = data.decode(charset).encode('utf-8')
except (UnicodeDecodeError, LookupError):
# Encoding read from document is wrong
text_content, message = guessCharsetAndConvert(self,
text_content, content_type)
base_data, message = guessCharsetAndConvert(self,
data, content_type)
else:
message = 'Conversion to base format with charset %r succeeds'\
% charset
if charset.lower() != 'utf-8':
charset = 'utf-8' # Override charset if convertion succeeds
charset = 'utf-8' # Override charset if convertion succeeds
# change charset value in html_document as well
def subCharset(matchobj):
keyword = matchobj.group('keyword')
......@@ -362,24 +362,21 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
return matchobj.group(0)
elif keyword:
# if keyword is present, replace charset just after
return keyword + 'utf-8'
text_content = self.charset_parser.sub(subCharset, text_content)
return keyword + b'utf-8'
base_data = self.charset_parser.sub(subCharset, data)
else:
text_content, message = guessCharsetAndConvert(self,
text_content, content_type)
base_data, message = guessCharsetAndConvert(self, data, content_type)
else:
# generaly text/plain
try:
# if succeeds, not need to change encoding
# it's already utf-8
text_content.decode('utf-8')
data.decode('utf-8')
except (UnicodeDecodeError, LookupError):
text_content, message = guessCharsetAndConvert(self,
text_content, content_type)
base_data, message = guessCharsetAndConvert(self, data, content_type)
else:
message = 'Conversion to base format succeeds'
# TODO(zope4py3): rethink this, shouldn't we store bytes in base data ?
self._setBaseData(text_content)
self._setBaseData(base_data)
self._setBaseContentType(content_type)
return message
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment