Commit bdb0dd19 authored by Romain Courteaud

Improve email message text part detection.

For multipart/alternative messages, try to return the part whose content type matches the text format defined in the preferences.
For multipart/mixed messages, return the first text part.
parent f2b2c8d5
...@@ -165,6 +165,37 @@ class EmailDocument(TextDocument): ...@@ -165,6 +165,37 @@ class EmailDocument(TextDocument):
self._v_message = result self._v_message = result
return result return result
def _getMessageTextPart(self):
    """
    Return the main text part of the message data, or None if the
    message contains no part whose main content type is 'text'.

    Based on rfc: http://tools.ietf.org/html/rfc2046#section-5.1.4

    The MIME tree is visited depth-first, in document order, so the
    text body nested inside the first container is found before any
    later sibling (for example a text attachment that follows a
    multipart/related or multipart/alternative body).

    Inside a multipart/alternative container, the subparts whose
    content type equals the preferred text format are examined before
    the other alternatives; the remaining alternatives keep their
    original relative order and are still examined before any part
    that follows the container.
    """
    # Parts still to examine; index 0 is always the next one to look at.
    part_list = [self._getMessage()]
    while part_list:
        part = part_list.pop(0)
        if part.is_multipart():
            subpart_list = list(part.get_payload())
            if part.get_content_subtype() == 'alternative':
                # Try to get the favourite text format defined on preference
                # ('text/html' is passed as the argument of the getter,
                # presumably the fallback when no preference is set).
                preference_tool = self.getPortalObject().portal_preferences
                preferred_content_type = \
                    preference_tool.getPreferredTextFormat('text/html')
                # list.sort is stable: False (preferred type) sorts before
                # True, and ties keep their document order.
                subpart_list.sort(
                    key=lambda subpart:
                        subpart.get_content_type() != preferred_content_type)
            # Put the children at the FRONT of the queue so they are
            # examined before the siblings of their container.
            part_list[0:0] = subpart_list
        elif part.get_content_maintype() == 'text':
            return part
    # No text part was found
    return None
security.declareProtected(Permissions.AccessContentsInformation, security.declareProtected(Permissions.AccessContentsInformation,
'isSupportBaseDataConversion') 'isSupportBaseDataConversion')
def isSupportBaseDataConversion(self): def isSupportBaseDataConversion(self):
...@@ -409,12 +440,6 @@ class EmailDocument(TextDocument): ...@@ -409,12 +440,6 @@ class EmailDocument(TextDocument):
""" """
Returns the content of the email as text. This is useful Returns the content of the email as text. This is useful
to display the content of an email. to display the content of an email.
According to rfc, (http://tools.ietf.org/html/rfc2046#section-5.1.4)
getTextContent should return html part of multipart/alternative couple
If multipart/mixed, the html part is an attachement. So return the
main content (text/plain).
TODO: add support for legacy objects
""" """
self._checkConversionFormatPermission(None) self._checkConversionFormatPermission(None)
if not self.hasFile() or self._baseGetTextContent() is not None: if not self.hasFile() or self._baseGetTextContent() is not None:
...@@ -425,49 +450,39 @@ class EmailDocument(TextDocument): ...@@ -425,49 +450,39 @@ class EmailDocument(TextDocument):
else: else:
return self._baseGetTextContent(default) return self._baseGetTextContent(default)
# find from mail message else:
text_result = None part = self._getMessageTextPart()
html_result = None if part is None:
is_alternative = False text_result = ""
for part in self._getMessage().walk(): else:
if part.is_multipart():
if part.get_content_type() == 'multipart/alternative':
is_alternative = True
else:
is_alternative = False
elif part.get_content_type() == 'text/plain' and not is_alternative:
part_encoding = part.get_content_charset() part_encoding = part.get_content_charset()
message_text = part.get_payload(decode=1) message_text = part.get_payload(decode=1)
if part_encoding != 'utf-8': if part.get_content_type() == 'text/html':
try: mime, text_result = self.convert(format='html',
if part_encoding is not None: text_content=message_text,
text_result = message_text.decode(part_encoding).encode('utf-8') charset=part_encoding)
else:
text_result = message_text.decode().encode('utf-8')
except (UnicodeDecodeError, LookupError), error_message:
LOG('EmailDocument.getTextContent', INFO,
'Failed to decode %s TEXT message of %s with error: %s' %
(part_encoding, self.getPath(), error_message))
codec = guessEncodingFromText(message_text,
content_type=part.get_content_type())
if codec is not None:
try:
text_result = message_text.decode(codec).encode('utf-8')
except (UnicodeDecodeError, LookupError):
text_result = repr(message_text)
else:
text_result = repr(message_text)
else: else:
text_result = message_text if part_encoding != 'utf-8':
break try:
elif part.get_content_type() == 'text/html' and is_alternative: if part_encoding is not None:
part_encoding = part.get_content_charset() text_result = message_text.decode(part_encoding).encode('utf-8')
part_html = part.get_payload(decode=1) else:
# Invoke Document class HTML stripper text_result = message_text.decode().encode('utf-8')
mime, text_result = self.convert(format='html', except (UnicodeDecodeError, LookupError), error_message:
text_content=part_html, LOG('EmailDocument.getTextContent', INFO,
charset=part_encoding) 'Failed to decode %s TEXT message of %s with error: %s' %
break (part_encoding, self.getPath(), error_message))
codec = guessEncodingFromText(message_text,
content_type=part.get_content_type())
if codec is not None:
try:
text_result = message_text.decode(codec).encode('utf-8')
except (UnicodeDecodeError, LookupError):
text_result = repr(message_text)
else:
text_result = repr(message_text)
else:
text_result = message_text
if default is _MARKER: if default is _MARKER:
return text_result return text_result
...@@ -486,17 +501,12 @@ class EmailDocument(TextDocument): ...@@ -486,17 +501,12 @@ class EmailDocument(TextDocument):
return TextDocument.getContentType(self) return TextDocument.getContentType(self)
else: else:
return TextDocument.getContentType(self, default) return TextDocument.getContentType(self, default)
is_alternative = False else:
for part in self._getMessage().walk(): part = self._getMessageTextPart()
if part.is_multipart(): if part is None:
if part.get_content_type() == 'multipart/alternative': return 'text/plain'
is_alternative = True else:
else: return part.get_content_type()
is_alternative = False
elif part.get_content_type() == 'text/html' and is_alternative:
return 'text/html'
return 'text/plain'
email_parser = re.compile('[ ;,<>\'"]*([^<> ;,\'"]+?\@[^<> ;,\'"]+)[ ;,<>\'"]*',re.IGNORECASE) email_parser = re.compile('[ ;,<>\'"]*([^<> ;,\'"]+?\@[^<> ;,\'"]+)[ ;,<>\'"]*',re.IGNORECASE)
security.declareProtected(Permissions.AccessContentsInformation, 'getContentURLList') security.declareProtected(Permissions.AccessContentsInformation, 'getContentURLList')
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment