Commit 6257ef4b authored by Nicolas Delaby

Extend the _guessEncoding method for when chardet does not detect

an acceptable encoding (chardet is reliable for HTML content only):
fall back to the file command (only available on the linux2 platform)
to detect the encoding used for text/plain content.



git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@35217 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent dda491be
......@@ -1172,7 +1172,8 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
"""
return self._stripHTML(self._asHTML(**kw))
def _guessEncoding(self, string):
security.declarePrivate('_guessEncoding')
def _guessEncoding(self, string, mime='text/html'):
"""
Try to guess the encoding for this string.
Returns None if no encoding can be guessed.
......@@ -1180,8 +1181,24 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
try:
import chardet
except ImportError:
return None
chardet = None
if chardet is not None and (mime == 'text/html'\
or os.sys.platform != 'linux2'):
# chardet works fine on html document and its platform independent
return chardet.detect(string).get('encoding', None)
else:
# file command provide better result
# for text/plain documents
# store the content into tempfile
file_descriptor, path = tempfile.mkstemp()
file_object = os.fdopen(file_descriptor, 'w')
file_object.write(string)
file_object.close()
# run file command against tempfile to and read encoded
command_result = Popen(['file', '-b', '--mime-encoding', path],
stdout=PIPE).communicate()[0]
# return detected encoding
return command_result.strip()
def _stripHTML(self, html, charset=None):
"""
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment