Recover broken HTML documents, especially with regard to the encoding used. Reviewed by Kazuhiko.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@31627 20353a03-c40f-0410-a6d1-a30d3c3de9de

Recover broken HTML documents, especially with regard to the encoding used. Reviewed by Kazuhiko.
git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@31627 20353a03-c40f-0410-a6d1-a30d3c3de9de
8fa68dca · Nicolas Delaby · 4c3154f9 · 8fa68dca
Commit 8fa68dca authored Jan 07, 2010 by Nicolas Delaby
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 0 deletions

product/ERP5OOo/transforms/html_to_odt.py product/ERP5OOo/transforms/html_to_odt.py +18 -0

No files found.
--- a/product/ERP5OOo/transforms/html_to_odt.py
+++ b/product/ERP5OOo/transforms/html_to_odt.py
+# -*- coding: utf-8 -*-
 from Products.PortalTransforms.interfaces import itransform
 from zope.interface import implements
 from oood_commandtransform import OOOdCommandTransform, OOoDocumentDataStream
 from zLOG import LOG
+from lxml import etree, html
+from lxml.etree import Element, SubElement
+
+html_parser = etree.HTMLParser(remove_blank_text=True, encoding='utf-8')

 class HTMLToOdt:
  """Transforms HTML to odt by using oood"""
@@ -25,6 +30,19 @@ class HTMLToOdt:
    raise AttributeError(attr)

  def convert(self, orig, data, cache=None, filename=None, context=None, **kwargs):
+    # Try to recover broken HTML documents, especially with regard to the encoding used
+    html_node = etree.XML(orig, parser=html_parser)
+    html_tree = html_node.getroottree()
+    head = html_tree.find('head')
+    if head is None:
+      # This code should not be necessary: calling lxml.html.tostring
+      # with the include_meta_content_type parameter set to True
+      # is supposed to do the same thing, but it does not.
+      head = Element('head')
+      html_node.insert(0, head)
+      SubElement(head, 'meta', **{'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'})
+    orig = html.tostring(html_tree, encoding='utf-8')
+
    doc = OOOdCommandTransform(context, filename, orig, self.inputs[0])
    doc.convert()
    odt = doc.convertTo('odt')