Commit e4e12259 authored by Jérome Perrin

ERP5OOo: don't fail exporting documents with control character

If for some reason an ERP5 document has control characters in its title or
description and is exported as ODS/ODT, the export fails with an lxml etree
error like this:

    Exception:
      Module Products.CMFActivity.ActivityTool, line 356, in __call__
          result = method(*self.args, **self.kw)
      Module Products.ERP5Type.patches.PythonScript, line 179, in __call__
          return self._orig_bindAndExec(args, kw, None)
      Module Shared.DC.Scripts.Bindings, line 359, in _bindAndExec
          return self._exec(bound_data, args, kw)
      Module Products.PythonScripts.PythonScript, line 344, in _exec
          result = f(*args, **kw)
      Module script, line 15, in Base_renderSimpleView
      - <PythonScript at /erp5/Base_renderSimpleView used for /erp5/sale_packing_list_module>
      - Line 15
          report_data = getattr(context, deferred_style_dialog_method)(**params)
      Module AccessControl.ZopeGuards, line 369, in guarded_apply
          return builtin_guarded_apply(func, args, kws)
      Module AccessControl.ZopeGuards, line 391, in builtin_guarded_apply
          return func(*arglist, **argdict)
      Module Products.ERP5Form.Form, line 705, in __call__
          return pt.pt_render(extra_context=extra_context)
      Module Products.ERP5OOo.OOoTemplate, line 484, in pt_render
          extra_context, request)
      Module Products.ERP5OOo.OOoTemplate, line 422, in renderIncludes
          xml_doc = etree.XML(text)
      Module lxml.etree, line 3192, in lxml.etree.XML (src/lxml/lxml.etree.c:78763)
      Module lxml.etree, line 1848, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:118341)
      Module lxml.etree, line 1736, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:117021)
      Module lxml.etree, line 1102, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:111265)
      Module lxml.etree, line 595, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:105109)
      Module lxml.etree, line 706, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:106817)
      Module lxml.etree, line 635, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:105671)
    XMLSyntaxError: PCDATA invalid Char value 20, line 228761, column 21 (line 228761)

This is because XML does not accept the full range of characters: a few
characters, such as most ASCII control characters, are not allowed.

To prevent these errors, we replace these invalid characters with the Unicode
replacement character (�, U+FFFD) before manipulating the XML with lxml.
parent 0c5f6dc7
Pipeline #10560 failed with stage
......@@ -54,7 +54,7 @@ except ImportError:
SUPPORTS_WEBDAV_LOCKS = 0
from Products.ERP5.Document.Document import ConversionError
from Products.Formulator.Widget import convert_to_xml_compatible_string
from lxml import etree
from lxml.etree import Element
......@@ -95,11 +95,12 @@ def add_and_edit(self, id, REQUEST):
u = "%s/%s" % (u, quote(id))
REQUEST.RESPONSE.redirect(u+'/manage_main')
class OOoTemplateStringIO(FasterStringIO):
def write(self, s):
if type(s) == unicode:
s = s.encode('utf-8')
FasterStringIO.write(self, s)
return FasterStringIO.write(
self,
convert_to_xml_compatible_string(s).encode('utf-8'))
from Products.PageTemplates.Expressions import ZopeContext, createZopeEngine
......@@ -476,8 +477,7 @@ class OOoTemplate(ZopePageTemplate):
# And render page template
doc_xml = ZopePageTemplate.pt_render(self, source=source,
extra_context=extra_context)
if isinstance(doc_xml, unicode):
doc_xml = doc_xml.encode('utf-8')
doc_xml = convert_to_xml_compatible_string(doc_xml).encode('utf-8')
# Replace the includes
(doc_xml,attachments_dict) = self.renderIncludes(here, doc_xml,
......
......@@ -276,6 +276,18 @@ class TestOOoStyle(ERP5TypeTestCase, ZopeTestCase.Functional):
self.assertEqual('attachment', content_disposition.split(';')[0])
self._validate(response.getBody())
def test_control_character_encoding(self):
    # XML does not allow certain control characters (\x14 is one of them);
    # store one in the first name, render the view and run the usual checks
    # on the response.
    person = self.portal.person_module.pers
    person.setFirstName('This character: \x14 is not allowed in XML')
    view_url = '/%s/person_module/pers/Person_view' % self.portal.getId()
    response = self.publish(view_url, basic=self.auth)
    self.assertEqual(HTTP_OK, response.getStatus())
    content_type = response.getHeader('content-type')
    self.assertTrue(content_type.startswith(self.content_type), content_type)
    disposition_kind = response.getHeader('content-disposition').split(';')[0]
    self.assertEqual('attachment', disposition_kind)
    # The rendered body is handed to the test case's own validator.
    self._validate(response.getBody())
def test_form_view_category(self):
self.portal.person_module.pers.setGender('male')
response = self.publish('/%s/person_module/pers/Person_view'
......
......@@ -11,6 +11,7 @@ from lxml import etree
from lxml.etree import Element, SubElement
from lxml.builder import ElementMaker
import re
import sys
DRAW_URI = 'urn:oasis:names:tc:opendocument:xmlns:drawing:1.0'
TEXT_URI = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
......@@ -28,6 +29,35 @@ NSMAP = {
EForm = ElementMaker(namespace=FORM_URI, nsmap=NSMAP)
# Characters accepted by the XML 1.0 ``Char`` production, written as the
# inside of a regular expression character class:
#   Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
#            [#x10000-#x10FFFF]
# (any Unicode character, excluding the surrogate blocks, FFFE, and FFFF)
# See http://www.w3.org/TR/REC-xml/#NT-Char and the example from
# https://bugs.python.org/issue5166#msg95689
_XML_CHAR_CLASS = u'\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD'
if sys.maxunicode > 0x10000:
    # Only builds whose strings hold full code points can express the astral
    # range in a character class. On narrow builds the range is left out, so
    # the surrogate code units used there for astral characters are replaced
    # like any other disallowed character.
    _XML_CHAR_CLASS += u'\U00010000-\U0010FFFF'

# Compiled once at import time: the pattern never changes between calls.
_replace_non_xml_chars = re.compile(u'[^%s]' % _XML_CHAR_CLASS, re.U).sub


def convert_to_xml_compatible_string(value):
    """Convert value to an XML 1.0 compatible unicode string.

    This helper makes sure the value is compatible with this requirement of
    lxml:
      All strings must be XML compatible: Unicode or ASCII, no NULL bytes

    Every character outside the XML 1.0 ``Char`` production (most control
    characters, surrogates, U+FFFE and U+FFFF) is replaced by the Unicode
    replacement character U+FFFD.

    value -- a unicode string, or a byte string which is decoded as UTF-8.
             Any false value (None, empty string, ...) gives an empty string.

    Returns a unicode string.
    Raises UnicodeDecodeError if a byte string is not valid UTF-8.
    """
    if not value:
        return u''
    if isinstance(value, bytes):
        value = value.decode('utf-8')
    return _replace_non_xml_chars(u'\uFFFD', value)
RE_OOO_ESCAPE = re.compile(r'([\n\t])?([^\n\t]*)')
class OOoEscaper:
"""Replacement function to use inside re.sub expression.
......@@ -40,13 +70,13 @@ class OOoEscaper:
def __call__(self, match_object):
match_value = match_object.group(1)
if match_value is None:
self.parent_node.text = match_object.group(2)
self.parent_node.text = convert_to_xml_compatible_string(match_object.group(2))
elif match_value == '\n':
line_break = SubElement(self.parent_node, '{%s}%s' % (TEXT_URI, 'line-break'))
line_break.tail = match_object.group(2)
line_break.tail = convert_to_xml_compatible_string(match_object.group(2))
elif match_value == '\t':
line_break = SubElement(self.parent_node, '{%s}%s' % (TEXT_URI, 'tab'))
line_break.tail = match_object.group(2)
line_break.tail = convert_to_xml_compatible_string(match_object.group(2))
def convertToString(value):
if not isinstance(value, (str, unicode)):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment