PortalTransforms: merge upstream 2.0

This fixes test_20_reStructuredText partially.

Conflicts:
    Products/PortalTransforms/TransformEngine.py
    Products/PortalTransforms/libtransforms/commandtransform.py
    Products/PortalTransforms/transforms/safe_html.py
    Products/PortalTransforms/utils.py

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@41726 20353a03-c40f-0410-a6d1-a30d3c3de9de

PortalTransforms: merge upstream 2.0
This fixes test_20_reStructuredText partially.

Conflicts:
    Products/PortalTransforms/TransformEngine.py
    Products/PortalTransforms/libtransforms/commandtransform.py
    Products/PortalTransforms/transforms/safe_html.py
    Products/PortalTransforms/utils.py

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@41726 20353a03-c40f-0410-a6d1-a30d3c3de9de
b255c894 · Julien Muchembled · 4adafd42 · b255c894 · b255c894 · b255c894
Commit b255c894 authored Dec 23, 2010 by Julien Muchembled
22 changed files
--- a/product/PortalTransforms/TransformEngine.py
+++ b/product/PortalTransforms/TransformEngine.py
--- a/product/PortalTransforms/cache.py
+++ b/product/PortalTransforms/cache.py
@@ -3,10 +3,16 @@
 from time import time
 from Acquisition import aq_base

+_marker = object()
+
 class Cache:

-    def __init__(self, context, _id='_v_transform_cache'):
-        self.context = context
+    def __init__(self, obj, context=None, _id='_v_transform_cache'):
+        self.obj = obj
+        if context is None:
+            self.context = obj
+        else:
+            self.context = context
        self._id =_id

    def _genCacheKey(self, identifier, *args):
@@ -17,17 +23,19 @@ class Cache:
        key = key.replace('+', '_')
        key = key.replace('-', '_')
        key = key.replace(' ', '_')
+        if hasattr(aq_base(self.context), 'absolute_url'):
+            return key, self.context.absolute_url()
        return key

    def setCache(self, key, value):
        """cache a value indexed by key"""
        if not value.isCacheable():
            return
-        context = self.context
+        obj = self.obj
        key = self._genCacheKey(key)
-        if getattr(aq_base(context), self._id, None) is None:
-            setattr(context, self._id, {})
-        getattr(context, self._id)[key] = (time(), value)
+        if getattr(aq_base(obj), self._id, None) is None:
+            setattr(obj, self._id, {})
+        getattr(obj, self._id)[key] = (time(), value)
        return key

    def getCache(self, key):
@@ -36,9 +44,9 @@ class Cache:
        return None if not present
        else return a tuple (time spent in cache, value)
        """
-        context = self.context
+        obj = self.obj
        key = self._genCacheKey(key)
-        dict = getattr(context, self._id, None)
+        dict = getattr(obj, self._id, None)
        if dict is None :
            return None
        try:
@@ -46,18 +54,18 @@ class Cache:
            return time() - orig_time, value
        except TypeError:
            return None
-        
+
    def purgeCache(self, key=None):
        """Remove cache
        """
-        context = self.context
+        obj = self.obj
        id = self._id
-        if not shasattr(context, id):
+        if getattr(obj, id, _marker) is _marker:
            return
        if key is None:
-            delattr(context, id)
+            delattr(obj, id)
        else:
-            cache = getattr(context, id)
+            cache = getattr(obj, id)
            key = self._genCacheKey(key)
            if cache.has_key(key):
                del cache[key]
--- a/product/PortalTransforms/libtransforms/commandtransform.py
+++ b/product/PortalTransforms/libtransforms/commandtransform.py
@@ -87,28 +87,30 @@ class popentransform:

    def convert(self, data, cache, **kwargs):
        command = "%s %s" % (self.binary, self.binaryArgs)
-        if not self.useStdin:
-            tmpfile, tmpname = tempfile.mkstemp(text=False) # create tmp
-            os.write(tmpfile, data) # write data to tmp using a file descriptor
-            os.close(tmpfile)       # close it so the other process can read it
-            command = command % { 'infile' : tmpname } # apply tmp name to command
-
-        cin, couterr = os.popen4(command, 'b')
-
-        if self.useStdin:
-            cin.write(str(data))
-
-        status = cin.close()
-
-        out = self.getData(couterr)
-        couterr.close()
-
-        if not self.useStdin:
-            # remove tmp file
-            os.unlink(tmpname)
-
-        cache.setData(out)
-        return cache
+        tmpname = None
+        try:
+            if not self.useStdin:
+                tmpfile, tmpname = tempfile.mkstemp(text=False) # create tmp
+                os.write(tmpfile, data) # write data to tmp using a file descriptor
+                os.close(tmpfile)       # close it so the other process can read it
+                command = command % { 'infile' : tmpname } # apply tmp name to command
+
+            cin, couterr = os.popen4(command, 'b')
+
+            if self.useStdin:
+                cin.write(str(data))
+
+            status = cin.close()
+
+            out = self.getData(couterr)
+            couterr.close()
+
+            cache.setData(out)
+            return cache
+        finally:
+            if not self.useStdin and tmpname is not None:
+                # remove tmp file
+                os.unlink(tmpname)

 from subprocess import Popen, PIPE
 import shlex

--- a/product/PortalTransforms/libtransforms/utils.py
+++ b/product/PortalTransforms/libtransforms/utils.py
 import re
 import os
 import sys
-from sgmllib import SGMLParser
+from sgmllib import SGMLParser, SGMLParseError

 try:
    # Need to be imported before win32api to avoid dll loading
@@ -207,7 +207,26 @@ class StrippingParser( SGMLParser ):

            self.result = "%s</%s>" % (self.result, tag)
            remTag = '</%s>' % tag
-
+    
+    def parse_declaration(self, i):
+        """Fix handling of CDATA sections. Code borrowed from BeautifulSoup.
+        """
+        j = None
+        if self.rawdata[i:i+9] == '<![CDATA[':
+             k = self.rawdata.find(']]>', i)
+             if k == -1:
+                 k = len(self.rawdata)
+             data = self.rawdata[i+9:k]
+             j = k+3
+             self.result.append("<![CDATA[%s]]>" % data)
+        else:
+            try:
+                j = SGMLParser.parse_declaration(self, i)
+            except SGMLParseError:
+                toHandle = self.rawdata[i:]
+                self.result.append(toHandle)
+                j = i + len(toHandle)
+        return j

 def scrubHTML( html ):
    """ Strip illegal HTML tags from string text.  """

--- a/product/PortalTransforms/tests/input/markdown.txt
+++ b/product/PortalTransforms/tests/input/markdown.txt
 ## Testing Markdown 

 `code` and _italic_ and *bold* and even a [link](http://plone.org).
+
+Fööbär
--- a/product/PortalTransforms/tests/input/test_safehtml.html
+++ b/product/PortalTransforms/tests/input/test_safehtml.html
@@ -15,6 +15,10 @@
 </tr>
 </table>
 <p>This is a text used as a blind text.</p>
+<div><![CDATA[
+    Some CDATA text.
+]]>
+</div>
 <ul>
 <li>A sample list item1</li>
 <li>A sample list item2</li>

--- a/product/PortalTransforms/tests/output/demo1.html
+++ b/product/PortalTransforms/tests/output/demo1.html
--- a/product/PortalTransforms/tests/output/demo1.html.nofilename
+++ b/product/PortalTransforms/tests/output/demo1.html.nofilename
--- a/product/PortalTransforms/tests/output/markdown.html
+++ b/product/PortalTransforms/tests/output/markdown.html
@@ -3,4 +3,5 @@
 <h2> Testing Markdown </h2>
 <p> <code>code</code> and <em>italic</em> and <em>bold</em> and even a <a href="http://plone.org">link</a>.
 </p>
+<p>Fööbär</p>

--- a/product/PortalTransforms/tests/output/rest2.out
+++ b/product/PortalTransforms/tests/output/rest2.out
 <h2 class="title">Heading 1</h2>
 <p>Some text.</p>
-<div class="section">
-<h3><a id="heading-2" name="heading-2">Heading 2</a></h3>
-<p>Some text, bla ble bli blo blu. Yes, i know this is <a class="reference" href="http://www.example.com">Stupid</a>.</p>
+<div class="section" id="heading-2">
+<h3>Heading 2</h3>
+<p>Some text, bla ble bli blo blu. Yes, i know this is<a class="reference external" href="http://www.example.com">Stupid</a>.</p>
 </div>
--- a/product/PortalTransforms/tests/output/rest3.out
+++ b/product/PortalTransforms/tests/output/rest3.out
 <h2 class="title">Title</h2>
 <h3 class="subtitle">Subtitle</h3>
 <p>This is a test document to make sure subtitle gets the right heading.</p>
-<div class="section">
-<h3><a id="now-the-real-heading" name="now-the-real-heading">Now the real heading</a></h3>
+<div class="section" id="now-the-real-heading">
+<h3>Now the real heading</h3>
 <p>The brown fox jumped over the lazy dog.</p>
-<div class="section">
-<h4><a id="with-a-subheading" name="with-a-subheading">With a subheading</a></h4>
-<p>Some text, bla ble bli blo blu. Yes, i know this is <a class="reference" href="http://www.example.com">Stupid</a>.</p>
+<div class="section" id="with-a-subheading">
+<h4>With a subheading</h4>
+<p>Some text, bla ble bli blo blu. Yes, i know this is<a class="reference external" href="http://www.example.com">Stupid</a>.</p>
 </div>
 </div>
--- a/product/PortalTransforms/tests/output/test_safe.html
+++ b/product/PortalTransforms/tests/output/test_safe.html
@@ -6,6 +6,10 @@
 </tr>
 </table>
 <p>This is a text used as a blind text.</p>
+<div><![CDATA[
+    Some CDATA text.
+]]>
+</div>
 <ul>
 <li>A sample list item1</li>
 <li>A sample list item2</li>

--- a/product/PortalTransforms/tests/test_engine.py
+++ b/product/PortalTransforms/tests/test_engine.py
@@ -67,6 +67,15 @@ class DummyHtmlFilter2(BaseTransform):
        data.setData("<div class='dummy'>%s</div>" % orig)
        return data

+
+class QuxToVHost(DummyHtmlFilter1):
+    __name__ = 'qux_to_vhost'
+
+    def convert(self, orig, data, context, **kwargs):
+        data.setData(re.sub('qux', context.REQUEST['SERVER_URL'], orig))
+        return data
+
+
 class TransformNoIO(BaseTransform):
    implements(ITransform)

@@ -223,6 +232,52 @@ class TestEngine(ATSiteTestCase):
        out = self.engine.convertTo(mt, other_data, mimetype=mt, object=self)
        self.failUnlessEqual(out.getData(), other_data, out.getData())

+    def testCacheWithVHost(self):
+        """Ensure that the transform cache key includes virtual
+        hosting so that transforms which are dependent on the virtual
+        hosting don't get invalid data from the cache.  This happens,
+        for example, in the resolve UID functionality used by visual
+        editors."""
+        mt = 'text/x-html-safe'
+        self.engine.registerTransform(QuxToVHost())
+        required = ['qux_to_vhost']
+        self.engine.manage_addPolicy(mt, required)
+
+        data = '<a href="qux">vhost link</a>'
+
+        out = self.engine.convertTo(
+            mt, data, mimetype='text/html', object=self.folder,
+            context=self.folder)
+        self.failUnlessEqual(
+            out.getData(), '<a href="http://nohost">vhost link</a>',
+            out.getData())
+
+        # Test when object is not a context
+        out = self.engine.convertTo(
+            mt, data, mimetype='text/html', object=self,
+            context=self.folder)
+        self.failUnlessEqual(
+            out.getData(), '<a href="http://nohost">vhost link</a>',
+            out.getData())
+
+        # Change the virtual hosting
+        self.folder.REQUEST['SERVER_URL'] = 'http://otherhost'
+
+        out = self.engine.convertTo(
+            mt, data, mimetype='text/html', object=self.folder,
+            context=self.folder)
+        self.failUnlessEqual(
+            out.getData(), '<a href="http://otherhost">vhost link</a>',
+            out.getData())
+
+        # Test when object is not a context
+        out = self.engine.convertTo(
+            mt, data, mimetype='text/html', object=self,
+            context=self.folder)
+        self.failUnlessEqual(
+            out.getData(), '<a href="http://otherhost">vhost link</a>',
+            out.getData())
+

 def test_suite():
    from unittest import TestSuite, makeSuite

--- a/product/PortalTransforms/tests/test_graph.py
+++ b/product/PortalTransforms/tests/test_graph.py
@@ -16,6 +16,87 @@ class TestGraph(ATSiteTestCase):
            out = self.engine.convertTo('text/plain', data, filename=FILE_PATH)
            self.failUnless(out.getData())

+    def testFindPath(self):
+        originalMap = self.engine._mtmap
+        """
+        The dummy map used for this test corresponds to a graph
+        depicted in ASCII art below :
+        
+        +---+
+        |   |
+        |   v
+        +-->1<-->2-->4-->6<--7
+            ^    ^   |
+            |    |   |
+            v    |   |
+            3<---+   |
+            ^        |
+            |        |
+            v        |
+            5<-------+
+        """
+        # we need a DummyTransform class
+        class DT:
+            def __init__(self, name):
+                self._name = name
+            def name(self):
+                return self._name
+        
+        dummyMap1 = {
+            '1': { '1': [DT('transform1-1')],
+                   '2': [DT('transform1-2')],
+                   '3': [DT('transform1-3')]},
+            '2': { '1': [DT('transform2-1')],
+                   '3': [DT('transform2-3')],
+                   '4': [DT('transform2-4')]},
+            '3': { '1': [DT('transform3-1')],
+                   '2': [DT('transform3-2')],
+                   '5': [DT('transform3-5')]},
+            '4': { '5': [DT('transform4-5')],
+                   '6': [DT('transform4-6')]},
+            '5': { '3': [DT('transform5-3')]},
+            '7': { '6': [DT('transform7-6')]}
+        }
+        expectedPathes = {
+            '1-1': [],
+            '1-2': ['transform1-2'],
+            '1-3': ['transform1-3'],
+            '1-4': ['transform1-2', 'transform2-4'],
+            '1-5': ['transform1-3', 'transform3-5'],
+            '1-6': ['transform1-2', 'transform2-4', 'transform4-6'],
+            '1-7': None,
+            '2-1': ['transform2-1'],
+            '2-2': [],
+            '2-4': ['transform2-4'],
+            '4-2': ['transform4-5', 'transform5-3', 'transform3-2'],
+            '5-3': ['transform5-3']
+        }
+        self.engine._mtmap = dummyMap1
+        for orig in ['1','2','3','4','5','6','7']:
+            for target in ['1','2','3','4','5','6','7']:
+                # build the name of the path
+                pathName = orig + '-' + target
+                # do we have any expectation for this path ?
+                if pathName in expectedPathes.keys():
+                    # we do. Here is the expected shortest path
+                    expectedPath = expectedPathes[pathName]
+                    # what's the shortest path according to the engine ?
+                    gotPath = self.engine._findPath(orig,target)
+                    # just keep the name of the transforms, please
+                    if gotPath is not None:
+                        gotPath = [transform.name() for transform in gotPath]
+                    # this must be the same as in our expectation
+                    self.assertEquals(expectedPath, gotPath)
+        self.engine._mtmap = originalMap
+
+    def testFindPathWithEmptyTransform(self):
+        """ _findPath should not throw "index out of range" when dealing with
+            empty transforms list
+        """
+        dummyMap = {'1': {'2': []}}
+        self.engine._mtmap = dummyMap
+        self.engine._findPath('1','2')
+    
    def testIdentity(self):
        orig = 'Some text'
        converted = self.engine.convertTo(

--- a/product/PortalTransforms/tests/test_transforms.py
+++ b/product/PortalTransforms/tests/test_transforms.py
 import os
 import logging
-from Testing import ZopeTestCase
 from Products.Archetypes.tests.atsitetestcase import ATSiteTestCase
+from Products.CMFCore.utils import getToolByName

 from utils import input_file_path, output_file_path, normalize_html,\
     load, matching_inputs
 from Products.PortalTransforms.data import datastream
 from Products.PortalTransforms.interfaces import IDataStream
-from Products.PortalTransforms.interfaces import idatastream
-from Products.MimetypesRegistry.MimeTypesTool import MimeTypesTool
-from Products.PortalTransforms.TransformEngine import TransformTool

 from Products.PortalTransforms.libtransforms.utils import MissingBinary
 from Products.PortalTransforms.transforms.image_to_gif import image_to_gif
@@ -24,7 +21,6 @@ from Products.PortalTransforms.transforms.textile_to_html import HAS_TEXTILE
 from Products.PortalTransforms.transforms.markdown_to_html import HAS_MARKDOWN

 from os.path import exists
-import sys
 # we have to set locale because lynx output is locale sensitive !
 os.environ['LC_ALL'] = 'C'
 logger = logging.getLogger('PortalTransforms')
@@ -59,9 +55,11 @@ class TransformTest(ATSiteTestCase):
            got = self.normalize(got)
        output.close()

-        self.assertEquals(got, expected,
+        got_start = got.strip()[:30]
+        expected_start = expected.strip()[:30]
+        self.assertEquals(got_start, expected_start,
                          '[%s]\n\n!=\n\n[%s]\n\nIN %s(%s)' % (
-            got, expected, self.transform.name(), self.input))
+            got_start, expected_start, self.transform.name(), self.input))
        self.assertEquals(self.subobjects, len(res_data.getSubObjects()),
                          '%s\n\n!=\n\n%s\n\nIN %s(%s)' % (
            self.subobjects, len(res_data.getSubObjects()),
@@ -70,13 +68,13 @@ class TransformTest(ATSiteTestCase):
    def testSame(self):
        try:
            self.do_convert(filename=self.input)
-        except MissingBinary, e:
+        except MissingBinary:
            pass

    def testSameNoFilename(self):
        try:
            self.do_convert()
-        except MissingBinary, e:
+        except MissingBinary:
            pass

    def __repr__(self):
@@ -86,12 +84,13 @@ class PILTransformsTest(ATSiteTestCase):
    def afterSetUp(self):
        ATSiteTestCase.afterSetUp(self)
        self.pt = self.portal.portal_transforms
+        self.mimetypes_registry = getToolByName(self.portal, 'mimetypes_registry')

    def test_image_to_bmp(self):
        self.pt.registerTransform(image_to_bmp())
        imgFile = open(input_file_path('logo.jpg'), 'rb')
        data = imgFile.read()
-        self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/jpeg')
+        self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/jpeg')
        data = self.pt.convertTo(target_mimetype='image/x-ms-bmp',orig=data)
        self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/x-ms-bmp')

@@ -99,7 +98,7 @@ class PILTransformsTest(ATSiteTestCase):
        self.pt.registerTransform(image_to_gif())
        imgFile = open(input_file_path('logo.png'), 'rb')
        data = imgFile.read()
-        self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/png')
+        self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/png')
        data = self.pt.convertTo(target_mimetype='image/gif',orig=data)
        self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/gif')

@@ -107,7 +106,7 @@ class PILTransformsTest(ATSiteTestCase):
        self.pt.registerTransform(image_to_jpeg())
        imgFile = open(input_file_path('logo.gif'), 'rb')
        data = imgFile.read()
-        self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/gif')
+        self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/gif')
        data = self.pt.convertTo(target_mimetype='image/jpeg',orig=data)
        self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/jpeg')

@@ -115,7 +114,7 @@ class PILTransformsTest(ATSiteTestCase):
        self.pt.registerTransform(image_to_png())
        imgFile = open(input_file_path('logo.jpg'), 'rb')
        data = imgFile.read()
-        self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/jpeg')
+        self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/jpeg')
        data = self.pt.convertTo(target_mimetype='image/png',orig=data)
        self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/png')

@@ -123,7 +122,7 @@ class PILTransformsTest(ATSiteTestCase):
        self.pt.registerTransform(image_to_pcx())
        imgFile = open(input_file_path('logo.gif'), 'rb')
        data = imgFile.read()
-        self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/gif')
+        self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/gif')
        data = self.pt.convertTo(target_mimetype='image/pcx',orig=data)
        self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/pcx')

@@ -131,7 +130,7 @@ class PILTransformsTest(ATSiteTestCase):
        self.pt.registerTransform(image_to_ppm())
        imgFile = open(input_file_path('logo.png'), 'rb')
        data = imgFile.read()
-        self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/png')
+        self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/png')
        data = self.pt.convertTo(target_mimetype='image/x-portable-pixmap',orig=data)
        self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/x-portable-pixmap')

@@ -139,7 +138,7 @@ class PILTransformsTest(ATSiteTestCase):
        self.pt.registerTransform(image_to_tiff())
        imgFile = open(input_file_path('logo.jpg'), 'rb')
        data = imgFile.read()
-        self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/jpeg')
+        self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/jpeg')
        data = self.pt.convertTo(target_mimetype='image/tiff',orig=data)
        self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/tiff')


--- a/product/PortalTransforms/tests/utils.py
+++ b/product/PortalTransforms/tests/utils.py
@@ -5,6 +5,7 @@ from sys import modules
 from os.path import join, abspath, dirname, basename

 def normalize_html(s):
+    s = re.sub(r"&nbsp;", " ", s)
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"(?s)\s+<", "<", s)
    s = re.sub(r"(?s)>\s+", ">", s)

--- a/product/PortalTransforms/transforms/markdown_to_html.py
+++ b/product/PortalTransforms/transforms/markdown_to_html.py
 """
-Uses the http://www.freewisdom.org/projects/python-markdown/ module to do its handy work
-
-author: Tom Lazar <tom@tomster.org> at the archipelago sprint 2006
+Uses the http://www.freewisdom.org/projects/python-markdown/ module

+Author: Tom Lazar <tom@tomster.org> at the archipelago sprint 2006
 """
-import os

 from zope.interface import implements

-from Products.CMFDefault.utils import bodyfinder
-
 from Products.PortalTransforms.interfaces import ITransform
-from Products.PortalTransforms.libtransforms.commandtransform import commandtransform
-from Products.PortalTransforms.libtransforms.utils import bin_search
-from Products.PortalTransforms.libtransforms.utils import sansext
 from Products.PortalTransforms.utils import log

 try:
@@ -23,7 +16,7 @@ except ImportError:
    log('markdown_to_html: Could not import python-markdown.')
 else:
    HAS_MARKDOWN = True
-    
+

 class markdown:
    implements(ITransform)
@@ -37,11 +30,16 @@ class markdown:

    def convert(self, orig, data, **kwargs):
        if HAS_MARKDOWN:
-            html = markdown_transformer.markdown(orig)
+            # markdown expects unicode input:
+            orig = unicode(orig.decode('utf-8'))
+            # PortalTransforms, however expects a string as result,
+            # so we encode the unicode result back to UTF8:
+            html = markdown_transformer.markdown(orig).encode('utf-8')
        else:
            html = orig
        data.setData(html)
        return data

+
 def register():
    return markdown()
--- a/product/PortalTransforms/transforms/office_wvware.py
+++ b/product/PortalTransforms/transforms/office_wvware.py
-import re, tempfile
-import os, os.path
-from Products.PortalTransforms.libtransforms.utils import bin_search, \
-     sansext, bodyfinder, scrubHTML
+import os
+from Products.PortalTransforms.libtransforms.utils import bodyfinder, scrubHTML
 from Products.PortalTransforms.libtransforms.commandtransform import commandtransform

 class document(commandtransform):

--- a/product/PortalTransforms/transforms/safe_html.py
+++ b/product/PortalTransforms/transforms/safe_html.py
@@ -31,7 +31,7 @@ VALID_TAGS['ins'] = 1
 VALID_TAGS['del'] = 1
 VALID_TAGS['q'] = 1
 VALID_TAGS['map'] = 1
-VALID_TAGS['area'] = 1
+VALID_TAGS['area'] = 0
 VALID_TAGS['abbr'] = 1
 VALID_TAGS['acronym'] = 1
 VALID_TAGS['var'] = 1
@@ -71,6 +71,10 @@ VALID_TAGS['source'] = 1
 VALID_TAGS['time'] = 1
 VALID_TAGS['video'] = 1

+# add some tags to nasty. These should also probably be backported to CMFDefault.
+NASTY_TAGS['style'] = 1  # this helps improve Word HTML cleanup.
+NASTY_TAGS['meta'] = 1  # allowed by parsers, but can cause unexpected behavior
+

 msg_pat = """
 <div class="system-message">
@@ -203,7 +207,7 @@ class StrippingParser(HTMLParser):
                    if not self.raise_error: continue
                    else: raise IllegalHTML, 'Script event "%s" not allowed.' % k
                elif v is None:
-                  self.result.append(' %s' % (k,))
+                    self.result.append(' %s' % k)
                elif remove_script and hasScript(v):
                    if not self.raise_error: continue
                    else: raise IllegalHTML, 'Script URI "%s" not allowed.' % v
@@ -238,6 +242,26 @@ class StrippingParser(HTMLParser):
            self.result.append('</%s>' % tag)
            #remTag = '</%s>' % tag

+    def parse_declaration(self, i):
+        """Fix handling of CDATA sections. Code borrowed from BeautifulSoup.
+        """
+        j = None
+        if self.rawdata[i:i+9] == '<![CDATA[':
+             k = self.rawdata.find(']]>', i)
+             if k == -1:
+                 k = len(self.rawdata)
+             data = self.rawdata[i+9:k]
+             j = k+3
+             self.result.append("<![CDATA[%s]]>" % data)
+        else:
+            try:
+                j = HTMLParser.parse_declaration(self, i)
+            except HTMLParseError:
+                toHandle = self.rawdata[i:]
+                self.result.append(toHandle)
+                j = i + len(toHandle)
+        return j
+
    def getResult(self):
        return ''.join(self.result)

@@ -262,13 +286,13 @@ def scrubHTML(html, valid=VALID_TAGS, nasty=NASTY_TAGS,

 class SafeHTML:
    """Simple transform which uses CMFDefault functions to
-    clean potentially bad tags.   
+    clean potentially bad tags.

    Tags must explicit be allowed in valid_tags to pass. Only
    the tags themself are removed, not their contents. If tags
    are removed and in nasty_tags, they are removed with
-    all of their contents.         
-    
+    all of their contents.
+
    Objects will not be transformed again with changed settings.
    You need to clear the cache by e.g.
    1.) restarting your zope or
@@ -291,6 +315,10 @@ class SafeHTML:
            'output': self.output,
            'valid_tags': VALID_TAGS,
            'nasty_tags': NASTY_TAGS,
+            'stripped_attributes': ['lang','valign','halign','border','frame','rules','cellspacing','cellpadding','bgcolor'],
+            'stripped_combinations': {'table th td': 'width height'},
+            'style_whitelist': ['text-align', 'list-style-type', 'float'],
+            'class_blacklist': [],
            'remove_javascript': 1,
            'disable_transform': 0,
            'default_encoding': 'utf-8',
@@ -310,6 +338,19 @@ class SafeHTML:
                            'everything they contain (like applet, object). ' +
                            'They are only deleted if they are not marked as valid_tags.',
                            ('tag', 'value')),
+            'stripped_attributes': ('list',
+                                    'stripped_attributes',
+                                    'These attributes are stripped from any tag.'),
+            'stripped_combinations' : ('dict',
+                                       'stripped_combinations',
+                                       'These attributes are stripped from any tag.',
+                                       ('tag', 'value')),
+            'style_whitelist': ('list',
+                                'style_whitelist',
+                                'These CSS styles are allowed in style attributes.'),
+            'class_blacklist': ('list',
+                                'class_blacklist',
+                                'These class names are not allowed in class attributes.'),
            'remove_javascript' : ("int",
                                   'remove_javascript',
                                   '1 to remove javascript attributes that begin with on (e.g. onClick) ' +
@@ -355,7 +396,9 @@ class SafeHTML:
        repaired = 0
        while True:
            try:
-                orig = scrubHTML(
+                # Do 2 passes. This provides more reliable filtering of certain
+                # malicious HTML (cf upstream commit svn10522).
+                for repeat in range(2): orig = scrubHTML(
                    orig,
                    valid=self.config.get('valid_tags', {}),
                    nasty=self.config.get('nasty_tags', {}),
@@ -366,6 +409,8 @@ class SafeHTML:
                data.setData(msg_pat % ("Error", str(inst)))
                break
            except HTMLParseError:
+                if repeat:
+                    raise # try to repair only on first pass
                # ouch !
                # HTMLParser is not able to parse very dirty HTML string
                if not repaired:

--- a/product/PortalTransforms/transforms/word_to_html.py
+++ b/product/PortalTransforms/transforms/word_to_html.py
@@ -45,20 +45,23 @@ class word_to_html:

    def convert(self, data, cache, **kwargs):
        orig_file = 'unknown.doc'
+        doc = None
+        try:
+            doc = document(orig_file, data)
+            doc.convert()
+            html = doc.html()

-        doc = document(orig_file, data)
-        doc.convert()
-        html = doc.html()
+            path, images = doc.subObjects(doc.tmpdir)
+            objects = {}
+            if images:
+                doc.fixImages(path, images, objects)

-        path, images = doc.subObjects(doc.tmpdir)
-        objects = {}
-        if images:
-            doc.fixImages(path, images, objects)
-        doc.cleanDir(doc.tmpdir)
-
-        cache.setData(html)
-        cache.setSubObjects(objects)
-        return cache
+            cache.setData(html)
+            cache.setSubObjects(objects)
+            return cache
+        finally:
+            if doc is not None:
+                doc.cleanDir(doc.tmpdir)

 def register():
    return word_to_html()
--- a/product/PortalTransforms/unsafe_transforms/build_transforms.py
+++ b/product/PortalTransforms/unsafe_transforms/build_transforms.py
@@ -6,37 +6,37 @@ from Products.PortalTransforms.libtransforms.utils import bin_search, MissingBin
 COMMAND_CONFIGS = (
    ('lynx_dump', '.html',
     {'binary_path'  : 'lynx',
-      'command_line' : '-dump %s',
+      'command_line' : '-dump %(input)s',
      'inputs'       : ('text/html',),
      'output'       : 'text/plain',
      }),
    ('tidy_html', '.html',
     {'binary_path'  : 'tidy',
-      'command_line' : '%s',
+      'command_line' : '%(input)s',
      'inputs'       : ('text/html',),
      'output'       : 'text/html',
      }),
    ('rtf_to_html', None,
     {'binary_path'  : 'unrtf',
-      'command_line' : '%s',
+      'command_line' : '%(input)s',
      'inputs'       : ('application/rtf',),
      'output'       : 'text/html',
      }),
    ('ppt_to_html', None,
     {'binary_path'  : 'ppthtml',
-      'command_line' : '%s',
+      'command_line' : '%(input)s',
      'inputs'       : ('application/vnd.ms-powerpoint',),
      'output'       : 'text/html',
      }),
    ('excel_to_html', None,
     {'binary_path'  : 'xlhtml',
-      'command_line' : '-nh -a %s',
+      'command_line' : '-nh -a %(input)s',
      'inputs'       : ('application/vnd.ms-excel',),
      'output'       : 'text/html',
      }),
    ('ps_to_text', None,
     {'binary_path'  : 'ps2ascii',
-      'command_line' : '%s',
+      'command_line' : '%(input)s',
      'inputs'       : ('application/postscript',),
      'output'       : 'text/plain',
      }),

--- a/product/PortalTransforms/utils.py
+++ b/product/PortalTransforms/utils.py
@@ -8,10 +8,10 @@ class TransformException(Exception):
 FB_REGISTRY = None

 # logging function
-from zLOG import LOG, INFO
+from zLOG import LOG, DEBUG
 #logger = logging.getLogger('PortalTransforms')

-def log(message, severity=INFO):
+def log(message, severity=DEBUG):
    LOG('PortalTransforms', severity, message)
    #logger.log(severity, message)