From b4494de863e5a715bdce1158a063c0a0eb8d11cd Mon Sep 17 00:00:00 2001
From: Nicolas Delaby <nicolas@nexedi.com>
Date: Fri, 5 Mar 2010 11:04:35 +0000
Subject: [PATCH] Implement asSafeHTML output for documents:   - It aims to
 strip html documents and remove   non safe contents like embedded javascript,
 forms,   import of external multimedia contents, ...   - useful to display
 HTML attachments of ingested events   - Use portal_transforms as conversion
 engine (and its transform safe_html). reviewed by Kazuhiko

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@33438 20353a03-c40f-0410-a6d1-a30d3c3de9de
---
 product/ERP5/Document/Document.py           | 47 +++++++++++++++++++++
 product/ERP5/Document/EmailDocument.py      |  2 +-
 product/ERP5/interfaces/html_convertable.py | 12 +++++-
 3 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/product/ERP5/Document/Document.py b/product/ERP5/Document/Document.py
index 1e42501837..ea218f8f55 100644
--- a/product/ERP5/Document/Document.py
+++ b/product/ERP5/Document/Document.py
@@ -1204,6 +1204,25 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
       mime, html = self.convert(**kw)
       return self._stripHTML(str(html))
 
+  security.declareProtected(Permissions.View, 'asSafeHTML')
+  def asSafeHTML(self, **kw):
+    """
+      Return this document as sanitised HTML, without embedded
+      javascript, forms or external plugin imports.
+    """
+    if not self.hasBaseData():
+      return ''
+    safe_format = 'text/x-html-safe'
+    try:
+      unused_mime, cached_data = self.getConversion(format=safe_format)
+    except KeyError:
+      # Nothing stored yet: convert to HTML, sanitise it and store it.
+      mime, html = self.convert(**dict(kw, format='html'))
+      safe_html = self._safeHTML(str(html), format=safe_format)
+      self.setConversion(safe_html, mime=mime, format=safe_format)
+      return safe_html
+    return cached_data
+
   def _guessEncoding(self, string):
     """
       Try to guess the encoding for this string.
@@ -1239,6 +1258,34 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
         return str(stripped_html)
     return stripped_html
 
+  def _safeHTML(self, html, format='text/x-html-safe', charset=None):
+    """
+      Private helper returning html sanitised by portal_transforms
+      (no embedded javascript, forms or external plugin imports).
+      Use it for content pushed by untrusted (Anonymous) users.
+      - html: byte string holding the markup to sanitise
+      - format: destination format handed to portal_transforms
+      - charset: encoding of html; takes precedence over any charset
+      declared inside the markup itself
+      The markup is re-encoded to utf-8 whenever charset allows it.
+    """
+    portal = self.getPortalObject()
+    if charset is None:
+      # No explicit charset: fall back on the first one declared in html.
+      charset_list = self.charset_parser.findall(html)
+      if charset_list:
+        charset = charset_list[0]
+    if charset and charset not in ('utf-8', 'UTF-8'):
+      try:
+        # Rebind html itself: the transform must receive the utf-8 bytes
+        # that match the charset announced to it below.
+        html = html.decode(charset).encode('utf-8')
+      except (UnicodeDecodeError, LookupError):
+        pass # Best effort: keep the original bytes and charset.
+      else:
+        charset = 'utf-8' # Override charset if conversion succeeds
+    transform_tool = getToolByName(portal, 'portal_transforms')
+    return transform_tool.convertToData(format, html, encoding=charset)
 
   security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
   def getContentInformation(self):
diff --git a/product/ERP5/Document/EmailDocument.py b/product/ERP5/Document/EmailDocument.py
index 5321fe026c..5883865f2e 100644
--- a/product/ERP5/Document/EmailDocument.py
+++ b/product/ERP5/Document/EmailDocument.py
@@ -452,7 +452,7 @@ class EmailDocument(File, TextDocument):
         part_encoding = part.get_content_charset()
         part_html = part.get_payload(decode=1)
         # Invoke Document class HTML stripper
-        html_result = self._stripHTML(part_html, charset=part_encoding)
+        html_result = self._safeHTML(part_html, charset=part_encoding)
     if html_result:
       # Give priority to HTML
       text_result = html_result
diff --git a/product/ERP5/interfaces/html_convertable.py b/product/ERP5/interfaces/html_convertable.py
index d674162f4e..b4e8cb2173 100644
--- a/product/ERP5/interfaces/html_convertable.py
+++ b/product/ERP5/interfaces/html_convertable.py
@@ -53,4 +53,14 @@ class IHtmlConvertable(Interface):
  
     kw -- optional parameters which can be passed to the
           conversion engine
-    """
\ No newline at end of file
+    """
+
+  def asSafeHTML(**kw):
+    """
+    Converts the current document to HTML and removes embedded
+    javascript, forms and any external plugin imports.
+
+    kw -- optional parameters which can be passed to the
+          conversion engine
+    """
+
-- 
2.30.9