diff --git a/product/ERP5/Document/Document.py b/product/ERP5/Document/Document.py
index e79580c1909b8869c29b5184c0e48c145ac232bb..028498e4ef20346bf9314d7a6e56f00c5ecfb8d6 100644
--- a/product/ERP5/Document/Document.py
+++ b/product/ERP5/Document/Document.py
@@ -490,7 +490,7 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
   href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
   body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
   title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
-  charset_parser = re.compile('charset="?([a-z0-9\-]+)', re.IGNORECASE)
+  charset_parser = re.compile('(?P<keyword>charset="?)(?P<charset>[a-z0-9\-]+)', re.IGNORECASE)
 
   # Declarative security
   security = ClassSecurityInfo()
diff --git a/product/ERP5/Document/TextDocument.py b/product/ERP5/Document/TextDocument.py
index 614bb9d7e4cc89163c813d831120ffa062fcdddd..c5bb1f7961670984d202fe11b14fe1d1582e4d48 100644
--- a/product/ERP5/Document/TextDocument.py
+++ b/product/ERP5/Document/TextDocument.py
@@ -230,9 +230,11 @@ class TextDocument(Document, TextContent):
       mime_type = 'text/x-html-safe'
       if charset is None:
         # find charset
-        charset_list = self.charset_parser.findall(text_content)
-        if charset_list:
-          charset = charset_list[0]
+        # search() returns None when the content declares no charset;
+        # leave charset as None in that case instead of raising.
+        charset_match = self.charset_parser.search(text_content)
+        if charset_match is not None:
+          charset = charset_match.group('charset')
       if charset and charset not in ('utf-8', 'UTF-8'):
         try:
           text_content = text_content.decode(charset).encode('utf-8')
@@ -241,7 +243,13 @@ class TextDocument(Document, TextContent):
         else:
           charset = 'utf-8' # Override charset if convertion succeeds
           # change charset value in html_document as well
-          self.charset_parser.sub('utf-8', text_content)
+          def subCharset(matchobj):
+            # charset_parser cannot match without its 'keyword' group, so
+            # keep the keyword as found and only swap the charset name.
+            return matchobj.group('keyword') + 'utf-8'
+          # sub() returns a new string: keep it, otherwise the rewritten
+          # declaration is lost.
+          text_content = self.charset_parser.sub(subCharset, text_content)
       result = portal_transforms.convertToData(mime_type, text_content,
                                                object=self, context=self,
                                                filename=filename,