Always output safe HTML content.

* _safeHTML is removed * The stripping is done inside the convert method * Conversion Cache is handled correctly git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@34360 20353a03-c40f-0410-a6d1-a30d3c3de9de

Always output safe HTML content.
* _safeHTML is removed * The stripping is done inside the convert method * Conversion Cache is handled correctly git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@34360 20353a03-c40f-0410-a6d1-a30d3c3de9de
e4b0e224 · Nicolas Delaby · 7d81afef · e4b0e224 · e4b0e224
Commit e4b0e224 authored Apr 08, 2010 by Nicolas Delaby
Hide whitespace changes
Inline Side-by-side

Showing with 35 additions and 70 deletions

product/ERP5/Document/Document.py product/ERP5/Document/Document.py +4 -60

product/ERP5/Document/TextDocument.py product/ERP5/Document/TextDocument.py +31 -10

No files found.
--- a/product/ERP5/Document/Document.py
+++ b/product/ERP5/Document/Document.py
@@ -490,7 +490,6 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
  href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
  body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
  title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
-  base_parser = re.compile('<base[^>]*href=[\'"](.*?)[\'"][^>]*>', re.IGNORECASE + re.DOTALL)
  charset_parser = re.compile('charset="?([a-z0-9\-]+)', re.IGNORECASE)

  # Declarative security
@@ -1151,14 +1150,9 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
    """
    if not self.hasBaseData():
      raise ConversionError('This document has not been processed yet.')
-    try:
-      # FIXME: no substitution may occur in this case.
-      mime, data = self.getConversion(format='base-html')
-      return data
-    except KeyError:
-      kw['format'] = 'html'
-      mime, html = self.convert(**kw)
-      return html
+    kw['format'] = 'html'
+    mime, html = self.convert(**kw)
+    return html

  security.declareProtected(Permissions.View, 'asStrippedHTML')
  def asStrippedHTML(self, **kw):
@@ -1167,16 +1161,7 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
      (without html and body tags, etc.) which can be used to inline
      a preview of the document.
    """
-    if not self.hasBaseData():
-      return ''
-    try:
-      # FIXME: no substitution may occur in this case.
-      mime, data = self.getConversion(format='stripped-html')
-      return data
-    except KeyError:
-      kw['format'] = 'html'
-      mime, html = self.convert(**kw)
-      return self._stripHTML(str(html))
+    return self._stripHTML(self._asHTML(**kw))

  def _guessEncoding(self, string):
    """
@@ -1199,49 +1184,8 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
      stripped_html = body_list[0]
    else:
      stripped_html = html
-    # find charset and convert to utf-8
-    charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient if this
-                                         # is datastream instance but hard to do better
-    if charset and not charset_list:
-      # Use optional parameter is we can not find encoding in HTML
-      charset_list = [charset]
-    if charset_list and charset_list[0] not in ('utf-8', 'UTF-8'):
-      try:
-        stripped_html = unicode(str(stripped_html),
-                                charset_list[0]).encode('utf-8')
-      except (UnicodeDecodeError, LookupError):
-        return str(stripped_html)
    return stripped_html

-  def _safeHTML(self, html, format='text/x-html-safe', charset=None):
-    """
-      A private method to strip HTML content in safe mode,
-      w/o emmbed javascript, forms and any external plugins imports.
-      This should be used when we do not trust the user (Anonymous)
-      who push data into database.
-      - html: content to strip
-      - format: destination format
-      - charset: charset used to encode string. Take precedence
-      on charset values found in html string
-    """
-    portal = self.getPortalObject()
-    if charset is None:
-      # find charset
-      charset_list = self.charset_parser.findall(html)
-      if charset_list:
-        charset = charset_list[0]
-    if charset and charset not in ('utf-8', 'UTF-8'):
-      try:
-        safe_html_string = html.decode(charset).encode('utf-8')
-      except (UnicodeDecodeError, LookupError):
-        pass
-      else:
-        charset = 'utf-8' # Override charset if convertion succeeds
-    transform_tool = getToolByName(portal, 'portal_transforms')
-    safe_html_string = transform_tool.convertToData(format, html,
-                                                    encoding=charset)
-    return safe_html_string
-
  security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
  def getContentInformation(self):
    """

--- a/product/ERP5/Document/TextDocument.py
+++ b/product/ERP5/Document/TextDocument.py
@@ -202,7 +202,8 @@ class TextDocument(Document, TextContent):
                                         **substitution_method_parameter_dict)

    security.declareProtected(Permissions.AccessContentsInformation, 'convert')
-    def convert(self, format, substitution_method_parameter_dict=None, safe_substitute=True, **kw):
+    def convert(self, format, substitution_method_parameter_dict=None,
+                safe_substitute=True, charset=None, text_content=None, **kw):
      """
        Convert text using portal_transforms or oood
      """
@@ -212,35 +213,55 @@ class TextDocument(Document, TextContent):
      if format == 'raw':
        return 'text/plain', self.getTextContent()
      portal = self.getPortalObject()
-      mime_type = getToolByName(portal, 'mimetypes_registry').lookupExtension('name.%s' % format)
-      mime_type = str(mime_type)
+      mime_type = getToolByName(portal, 'mimetypes_registry').\
+                                            lookupExtension('name.%s' % format)
+      original_mime_type = mime_type = str(mime_type)
      src_mimetype = self.getTextFormat(DEFAULT_TEXT_FORMAT)
      if not src_mimetype.startswith('text/'):
        src_mimetype = 'text/%s' % src_mimetype
-      # check if document has set text_content and convert if necessary
-      text_content = self.getTextContent()
+      if text_content is None:
+        # check if document has set text_content and convert if necessary
+        text_content = self.getTextContent()
      if text_content:
        if not self.hasConversion(format=format):
          portal_transforms = getToolByName(portal, 'portal_transforms')
          filename = self.getSourceReference(self.getTitleOrId())
+          if mime_type == 'text/html':
+            mime_type = 'text/x-html-safe'
+            if charset is None:
+              # find charset
+              charset_list = self.charset_parser.findall(text_content)
+              if charset_list:
+                charset = charset_list[0]
+            if charset and charset not in ('utf-8', 'UTF-8'):
+              try:
+                text_content = text_content.decode(charset).encode('utf-8')
+              except (UnicodeDecodeError, LookupError):
+                pass
+              else:
+                charset = 'utf-8' # Override charset if convertion succeeds
+                # change charset value in html_document as well
+                self.charset_parser.sub('utf-8', text_content)
          result = portal_transforms.convertToData(mime_type, text_content,
                                                   object=self, context=self,
                                                   filename=filename,
-                                                   mimetype=src_mimetype)
+                                                   mimetype=src_mimetype,
+                                                   encoding=charset)
          if result is None:
            raise ConversionError('TextDocument conversion error. '
-                                  'portal_transforms failed to convert to %s: %r' % (mime_type, self))
-          self.setConversion(result, mime_type, format=format)
+                                  'portal_transforms failed to convert'\
+                                  'to %s: %r' % (mime_type, self))
+          self.setConversion(result, original_mime_type, format=format)
        else:
          mime_type, result = self.getConversion(format=format)
        if substitution_method_parameter_dict is None:
          substitution_method_parameter_dict = {}
        result = self._substituteTextContent(result, safe_substitute=safe_substitute,
                                             **substitution_method_parameter_dict)
-        return mime_type, result
+        return original_mime_type, result
      else:
        # text_content is not set, return empty string instead of None
-        return mime_type, ''
+        return original_mime_type, ''

    def __call__(self):
      _setCacheHeaders(_ViewEmulator().__of__(self), {})