• Nicolas Delaby's avatar
    Conversion from html to odt was working only by chance. · 11f80857
    Nicolas Delaby authored
    because convertToBaseFormat was not able to import html.
    Now OOoDocument is able to successfully run convertToBaseFormat on HTML content (or any other format),
    thanks to the content_type parameter, which is now passed to the conversion tool.
    The previous implementation stored the HTML content in
    base_data instead of data, but convertToBaseFormat was never called.
    
    * The method convert on oood_commandtransform was useless.
    * Call convertToBaseFormat once temp_document is created
    
    
    
    
    git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@39049 20353a03-c40f-0410-a6d1-a30d3c3de9de
    11f80857
html_to_odt.py 1.86 KB
# -*- coding: utf-8 -*-
from Products.PortalTransforms.interfaces import itransform
from zope.interface import implements
from oood_commandtransform import OOOdCommandTransform, OOoDocumentDataStream
from zLOG import LOG
from lxml import etree, html
from lxml.etree import Element, SubElement

# Module-level lxml HTML parser used by HTMLToOdt.convert: input is decoded
# as UTF-8 and blank text nodes are dropped (remove_blank_text=True).
html_parser = etree.HTMLParser(
  encoding='utf-8',
  remove_blank_text=True,
)

class HTMLToOdt:
  """Portal transform turning HTML input into an OpenDocument text file.

  The conversion itself is delegated to OOOdCommandTransform.
  """

  implements(itransform)

  __name__ = 'html_to_odt'
  inputs = ('text/html',)
  output = 'application/vnd.oasis.opendocument.text'

  tranform_engine = OOOdCommandTransform.__module__

  def name(self):
    """Return the name of this transform (its __name__ attribute)."""
    return self.__name__

  def __getattr__(self, attr):
    """Fallback lookup, only reached when normal attribute lookup fails.

    'inputs' and 'output' are read from self.config; any other name
    raises AttributeError.
    """
    if attr in ('inputs', 'output'):
      return self.config[attr]
    raise AttributeError(attr)

  def convert(self, orig, data, cache=None, filename=None, context=None, **kwargs):
    """Convert the HTML source `orig` to odt and return the holder of the result.

    `orig` is parsed with the module-level html_parser and serialised again
    as UTF-8; a <head> holding a Content-Type meta tag is added when the
    parsed document has no <head>. The serialised markup is then given to
    OOOdCommandTransform together with `context`, `filename` and the first
    entry of self.inputs, and converted with convertTo('odt').

    When `cache` is given, the odt data is stored on it with setData() and
    `cache` is returned; otherwise a new OOoDocumentDataStream receives the
    data and is returned. `data` and the extra keyword arguments are not
    used by this method.
    """
    # Parsing and re-serialising is meant to recover broken HTML documents,
    # especially regarding the encoding used.
    root = etree.XML(orig, parser=html_parser)
    tree = root.getroottree()
    if tree.find('head') is None:
      # lxml.html.tostring called with include_meta_content_type=True is
      # supposed to make this branch useless, but it does not do the same
      # thing, so the <head> and its meta declaration are built by hand.
      new_head = Element('head')
      root.insert(0, new_head)
      meta_attributes = {'http-equiv': 'Content-Type',
                         'content': 'text/html; charset=utf-8'}
      SubElement(new_head, 'meta', **meta_attributes)
    normalised_html = html.tostring(tree, encoding='utf-8')

    document = OOOdCommandTransform(context, filename, normalised_html,
                                    self.inputs[0])
    odt_data = document.convertTo('odt')

    # Reuse the caller's cache when there is one, else create a fresh stream.
    if cache is None:
      cache = OOoDocumentDataStream()
    cache.setData(odt_data)
    return cache

def register():
  """Build and return a new HTMLToOdt transform instance."""
  transform = HTMLToOdt()
  return transform