From 8fa68dcad160cac7cbf5df670e38661608d0e263 Mon Sep 17 00:00:00 2001 From: Nicolas Delaby <nicolas@nexedi.com> Date: Thu, 7 Jan 2010 13:34:14 +0000 Subject: [PATCH] Recover broken HTML documents, especially regarding the encoding used. reviewed by Kazuhiko git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@31627 20353a03-c40f-0410-a6d1-a30d3c3de9de --- product/ERP5OOo/transforms/html_to_odt.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/product/ERP5OOo/transforms/html_to_odt.py b/product/ERP5OOo/transforms/html_to_odt.py index 302e47264b..337a8c138e 100644 --- a/product/ERP5OOo/transforms/html_to_odt.py +++ b/product/ERP5OOo/transforms/html_to_odt.py @@ -1,7 +1,12 @@ +# -*- coding: utf-8 -*- from Products.PortalTransforms.interfaces import itransform from zope.interface import implements from oood_commandtransform import OOOdCommandTransform, OOoDocumentDataStream from zLOG import LOG +from lxml import etree, html +from lxml.etree import Element, SubElement + +html_parser = etree.HTMLParser(remove_blank_text=True, encoding='utf-8') class HTMLToOdt: """Transforms HTML to odt by using oood""" @@ -25,6 +30,19 @@ class HTMLToOdt: raise AttributeError(attr) def convert(self, orig, data, cache=None, filename=None, context=None, **kwargs): + # Try to recover broken HTML documents, especially regarding the encoding used + html_node = etree.XML(orig, parser=html_parser) + html_tree = html_node.getroottree() + head = html_tree.find('head') + if head is None: + # This part of the code is supposed to be useless + # lxml.html.tostring function with include_meta_content_type + # parameter set to True, should do the same thing. But it does not. + head = Element('head') + html_node.insert(0, head) + SubElement(head, 'meta', **{'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'}) + orig = html.tostring(html_tree, encoding='utf-8') + doc = OOOdCommandTransform(context, filename, orig, self.inputs[0]) doc.convert() odt = doc.convertTo('odt') -- 2.30.9