Commit 71ec9b7d authored by Jean-Paul Smets

Move generic HTML processing to where it belongs (ie. conversion handling...

Move generic HTML processing to where it belongs (i.e. the conversion handling superclass for now, a mixin some day)

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@25565 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 7679b889
...@@ -1278,14 +1278,25 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna ...@@ -1278,14 +1278,25 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna
A private method which can be reused by subclasses A private method which can be reused by subclasses
to strip HTML content to strip HTML content
""" """
def _guessEncoding(self, string):
    """
    Guess the character encoding of a raw, still encoded string.

    Some email clients indicate a wrong encoding, so this method tries
    to guess which encoding is really used.

    string -- the raw data to inspect

    Returns the encoding name reported by chardet, or None when the
    input is empty, when chardet is not installed, or when chardet
    reports no encoding.
    """
    if not string:
        # Nothing to analyse; this also avoids handing None to chardet.
        return None
    try:
        import chardet
    except ImportError:
        # chardet is optional: without it no guess can be made.
        return None
    return chardet.detect(string).get('encoding')
body_list = re.findall(self.body_parser, str(html)) body_list = re.findall(self.body_parser, str(html))
if len(body_list): if len(body_list):
stripped_html = body_list[0] stripped_html = body_list[0]
else: else:
stripped_html = html stripped_html = html
# find charset and convert to utf-8 # find charset and convert to utf-8
charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient is datastream charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient if this
# instance but hard to do better # is datastream instance but hard to do better
if charset and not charset_list: if charset and not charset_list:
# Use optional parameter is we can not find encoding in HTML # Use optional parameter is we can not find encoding in HTML
charset_list = [charset] charset_list = [charset]
...@@ -1297,6 +1308,7 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna ...@@ -1297,6 +1308,7 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna
return str(stripped_html) return str(stripped_html)
return stripped_html return stripped_html
security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation') security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
def getContentInformation(self): def getContentInformation(self):
""" """
......
...@@ -40,6 +40,8 @@ from Products.ERP5.Document.File import File ...@@ -40,6 +40,8 @@ from Products.ERP5.Document.File import File
from Products.ERP5.Document.Document import ConversionError from Products.ERP5.Document.Document import ConversionError
from Products.ERP5.Tool.NotificationTool import buildEmailMessage from Products.ERP5.Tool.NotificationTool import buildEmailMessage
from zLOG import LOG, INFO
try: try:
from Products.MimetypesRegistry.common import MimeTypeException from Products.MimetypesRegistry.common import MimeTypeException
except ImportError: except ImportError:
...@@ -300,7 +302,9 @@ class EmailDocument(File, TextDocument): ...@@ -300,7 +302,9 @@ class EmailDocument(File, TextDocument):
text_result = message_text.decode(part_encoding).encode('utf-8') text_result = message_text.decode(part_encoding).encode('utf-8')
else: else:
text_result = message_text.decode().encode('utf-8') text_result = message_text.decode().encode('utf-8')
except (UnicodeDecodeError, LookupError): except (UnicodeDecodeError, LookupError), error_message:
LOG('EmailDocument.getTextContent', INFO,
'Failed to decode %s TEXT message with error: %s' % (part_encoding, error_message))
codec = self._guessEncoding(message_text) codec = self._guessEncoding(message_text)
if codec is not None: if codec is not None:
try: try:
...@@ -313,24 +317,12 @@ class EmailDocument(File, TextDocument): ...@@ -313,24 +317,12 @@ class EmailDocument(File, TextDocument):
text_result = message_text text_result = message_text
elif part.get_content_type() == 'text/html' and not html_result and not part.is_multipart(): elif part.get_content_type() == 'text/html' and not html_result and not part.is_multipart():
part_encoding = part.get_content_charset() part_encoding = part.get_content_charset()
message_text = part.get_payload(decode=1) part_html = part.get_payload(decode=1)
if part_encoding != 'utf-8': # Invoke Document class HTML stripper
try: html_result = self._stripHTML(part_html, charset=part_encoding)
if part_encoding is not None: if html_result:
text_result = message_text.decode(part_encoding).encode('utf-8') # Give priority to HTML
else: text_result = html_result
text_result = message_text.decode().encode('utf-8')
except (UnicodeDecodeError, LookupError):
codec = self._guessEncoding(message_text)
if codec is not None:
try:
text_result = message_text.decode(codec).encode('utf-8')
except (UnicodeDecodeError, LookupError):
text_result = repr(message_text)
else:
text_result = repr(message_text)
else:
text_result = message_text
if default is _MARKER: if default is _MARKER:
return text_result return text_result
return text_result or default return text_result or default
...@@ -399,6 +391,8 @@ class EmailDocument(File, TextDocument): ...@@ -399,6 +391,8 @@ class EmailDocument(File, TextDocument):
""" """
For FCKEditor Compatibility, we should remove DTD, For FCKEditor Compatibility, we should remove DTD,
blank lines and some tags in html document blank lines and some tags in html document
XXX - What is this SHIT !!!!!!!!!!!!!!!!!!!!!!!!!!
""" """
if html_text is None: if html_text is None:
html_text = self.getTextContent() html_text = self.getTextContent()
...@@ -626,17 +620,6 @@ class EmailDocument(File, TextDocument): ...@@ -626,17 +620,6 @@ class EmailDocument(File, TextDocument):
""" """
self.MailHost.send(message) self.MailHost.send(message)
def _guessEncoding(self, string):
    """
    Guess the character encoding of a raw, still encoded string.

    Some email clients indicate a wrong encoding, so this method tries
    to guess which encoding is really used.

    string -- the raw data to inspect

    Returns the encoding name reported by chardet, or None when the
    input is empty, when chardet is not installed, or when chardet
    reports no encoding.
    """
    if not string:
        # Nothing to analyse; this also avoids handing None to chardet.
        return None
    try:
        import chardet
    except ImportError:
        # chardet is optional: without it no guess can be made.
        return None
    return chardet.detect(string).get('encoding')
## Compatibility layer ## Compatibility layer
#from Products.ERP5Type import Document #from Products.ERP5Type import Document
#Document.MailMessage = EmailDocument #Document.MailMessage = EmailDocument
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment