Commit 71ec9b7d authored by Jean-Paul Smets

Move generic HTML processing to where it belongs (ie. conversion handling...

Move generic HTML processing to where it belongs (i.e. the conversion-handling superclass for now, a mixin some day)

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@25565 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 7679b889
......@@ -1278,14 +1278,25 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna
A private method which can be reused by subclasses
to strip HTML content
"""
def _guessEncoding(self, string):
    """
    Guess the character encoding of a raw (encoded) string.

    Some email clients indicate a wrong encoding, so this method
    tries to guess which encoding is really used.

    string -- the raw data whose encoding should be guessed

    Returns the encoding name reported by chardet, or None when the
    input is empty, chardet is not installed, or chardet reports no
    encoding.
    """
    if not string:
        # Nothing to analyse: skip detection instead of handing chardet
        # an empty or None value.
        return None
    try:
        # chardet is imported lazily and treated as optional: when it
        # is missing, guessing is simply unavailable.
        import chardet
    except ImportError:
        return None
    return chardet.detect(string).get('encoding')
body_list = re.findall(self.body_parser, str(html))
if len(body_list):
stripped_html = body_list[0]
else:
stripped_html = html
# find charset and convert to utf-8
charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient is datastream
# instance but hard to do better
charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient if this
# is datastream instance but hard to do better
if charset and not charset_list:
# Use the optional parameter if we cannot find the encoding in the HTML
charset_list = [charset]
......@@ -1297,6 +1308,7 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna
return str(stripped_html)
return stripped_html
security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
def getContentInformation(self):
"""
......
......@@ -40,6 +40,8 @@ from Products.ERP5.Document.File import File
from Products.ERP5.Document.Document import ConversionError
from Products.ERP5.Tool.NotificationTool import buildEmailMessage
from zLOG import LOG, INFO
try:
from Products.MimetypesRegistry.common import MimeTypeException
except ImportError:
......@@ -300,7 +302,9 @@ class EmailDocument(File, TextDocument):
text_result = message_text.decode(part_encoding).encode('utf-8')
else:
text_result = message_text.decode().encode('utf-8')
except (UnicodeDecodeError, LookupError):
except (UnicodeDecodeError, LookupError), error_message:
LOG('EmailDocument.getTextContent', INFO,
'Failed to decode %s TEXT message with error: %s' % (part_encoding, error_message))
codec = self._guessEncoding(message_text)
if codec is not None:
try:
......@@ -313,24 +317,12 @@ class EmailDocument(File, TextDocument):
text_result = message_text
elif part.get_content_type() == 'text/html' and not html_result and not part.is_multipart():
part_encoding = part.get_content_charset()
message_text = part.get_payload(decode=1)
if part_encoding != 'utf-8':
try:
if part_encoding is not None:
text_result = message_text.decode(part_encoding).encode('utf-8')
else:
text_result = message_text.decode().encode('utf-8')
except (UnicodeDecodeError, LookupError):
codec = self._guessEncoding(message_text)
if codec is not None:
try:
text_result = message_text.decode(codec).encode('utf-8')
except (UnicodeDecodeError, LookupError):
text_result = repr(message_text)
else:
text_result = repr(message_text)
else:
text_result = message_text
part_html = part.get_payload(decode=1)
# Invoke Document class HTML stripper
html_result = self._stripHTML(part_html, charset=part_encoding)
if html_result:
# Give priority to HTML
text_result = html_result
if default is _MARKER:
return text_result
return text_result or default
......@@ -399,6 +391,8 @@ class EmailDocument(File, TextDocument):
"""
For FCKEditor Compatibility, we should remove DTD,
blank lines and some tags in html document
XXX - The purpose of this processing is unclear and needs review
"""
if html_text is None:
html_text = self.getTextContent()
......@@ -626,17 +620,6 @@ class EmailDocument(File, TextDocument):
"""
self.MailHost.send(message)
def _guessEncoding(self, string):
    """
    Guess the character encoding of a raw (encoded) string.

    Some email clients indicate a wrong encoding, so this method
    tries to guess which encoding is really used.

    string -- the raw data whose encoding should be guessed

    Returns the encoding name reported by chardet, or None when the
    input is empty, chardet is not installed, or chardet reports no
    encoding.
    """
    if not string:
        # Nothing to analyse: skip detection instead of handing chardet
        # an empty or None value.
        return None
    try:
        # chardet is imported lazily and treated as optional: when it
        # is missing, guessing is simply unavailable.
        import chardet
    except ImportError:
        return None
    return chardet.detect(string).get('encoding')
## Compatibility layer
#from Products.ERP5Type import Document
#Document.MailMessage = EmailDocument
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment