From f5d12302e5649ac48bbb0998536c366cdcd880dd Mon Sep 17 00:00:00 2001
From: Nicolas Delaby <nicolas@nexedi.com>
Date: Fri, 5 Dec 2008 18:11:16 +0000
Subject: [PATCH] Use chardet to guess which encoding is used when the declared
 encoding is missing or wrong

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@24814 20353a03-c40f-0410-a6d1-a30d3c3de9de
---
 product/ERP5/Document/EmailDocument.py | 61 ++++++++++++++++----------
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/product/ERP5/Document/EmailDocument.py b/product/ERP5/Document/EmailDocument.py
index 5fd708a54a..98253e15ce 100644
--- a/product/ERP5/Document/EmailDocument.py
+++ b/product/ERP5/Document/EmailDocument.py
@@ -128,12 +128,17 @@ class EmailDocument(File, TextDocument):
     result = {}
     for (name, value) in self._getMessage().items():
       for text, encoding in decode_header(value):
-        if encoding is not None:
-          try:
+        try:
+          if encoding is not None:
             text = text.decode(encoding).encode('utf-8')
-          except UnicodeDecodeError:
-            encoding = self._guessEncoding(text)
+          else:
+            text = text.decode().encode('utf-8')
+        except UnicodeDecodeError:
+          encoding = self._guessEncoding(text)
+          if encoding is not None:
             text = text.decode(encoding).encode('utf-8')
+          else:
+            text = repr(text)
         if name in result:
           result[name] = '%s %s' % (result[name], text)
         else:
@@ -145,7 +150,6 @@ class EmailDocument(File, TextDocument):
     """
     Returns a list of dictionnaries for every attachment. Each dictionnary
     represents the metadata of the attachment.
-    
     **kw - support for listbox (TODO: improve it)
     """
     result = []
@@ -233,6 +237,7 @@ class EmailDocument(File, TextDocument):
         return self._baseGetTitle()
       else:
         return self._baseGetTitle(default)
+    message = self._getMessage()
     subject = self.getContentInformation().get('Subject', '')
     # Remove all newlines
     if '\r' in subject:
@@ -288,23 +293,38 @@ class EmailDocument(File, TextDocument):
     for part in self._getMessage().walk():
       if part.get_content_type() == 'text/plain' and not text_result and not part.is_multipart():
         part_encoding = part.get_content_charset()
-        if part_encoding not in (None, 'utf-8',):
+        message_text = part.get_payload(decode=1)
+        if part_encoding != 'utf-8':
           try:
-            text_result = part.get_payload(decode=1).decode(part_encoding).encode('utf-8')
+            if part_encoding is not None:
+              text_result = message_text.decode(part_encoding).encode('utf-8')
+            else:
+              text_result = message_text.decode().encode('utf-8')
           except (UnicodeDecodeError, LookupError):
-            text_result = part.get_payload(decode=1)
+            codec = self._guessEncoding(message_text)
+            if codec is not None:
+              text_result = message_text.decode(codec).encode('utf-8')
+            else:
+              text_result = repr(message_text)
         else:
-          text_result = part.get_payload(decode=1)
+          text_result = message_text
       elif part.get_content_type() == 'text/html' and not html_result and not part.is_multipart():
         part_encoding = part.get_content_charset()
-        if part_encoding not in (None, 'utf-8',):
+        message_text = part.get_payload(decode=1)
+        if part_encoding != 'utf-8':
           try:
-            text_result = part.get_payload(decode=1).\
-                          decode(part_encoding).encode('utf-8')
+            if part_encoding is not None:
+              text_result = message_text.decode(part_encoding).encode('utf-8')
+            else:
+              text_result = message_text.decode().encode('utf-8')
           except (UnicodeDecodeError, LookupError):
-            text_result = part.get_payload(decode=1)
+            codec = self._guessEncoding(message_text)
+            if codec is not None:
+              text_result = message_text.decode(codec).encode('utf-8')
+            else:
+              text_result = repr(message_text)
         else:
-          text_result = part.get_payload(decode=1)
+          text_result = message_text
     if default is _MARKER:
       return text_result
     return text_result or default
@@ -605,14 +625,11 @@ class EmailDocument(File, TextDocument):
     Some Email Clients indicate wrong encoding
     This method try to guess which encoding is used.
     """
-    from encodings.aliases import aliases
-    codec_list = set(aliases.values())
-    for codec in codec_list:
-      try:
-        string.decode(codec)
-      except (UnicodeDecodeError, IOError):
-        continue
-      return codec
+    try:
+      import chardet
+    except ImportError:
+      return None
+    return chardet.detect(string).get('encoding', None)
 
 ## Compatibility layer
 #from Products.ERP5Type import Document
-- 
2.30.9