From b4494de863e5a715bdce1158a063c0a0eb8d11cd Mon Sep 17 00:00:00 2001 From: Nicolas Delaby <nicolas@nexedi.com> Date: Fri, 5 Mar 2010 11:04:35 +0000 Subject: [PATCH] Implement asSafeHTML output for documents: - It aims to strip html documents and remove non-safe contents like embedded javascript, forms, imports of external multimedia contents, ... - useful to display HTML attachments of ingested events - Use portal_transforms as conversion engine (and its transform safe_html). reviewed by Kazuhiko git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@33438 20353a03-c40f-0410-a6d1-a30d3c3de9de --- product/ERP5/Document/Document.py | 47 +++++++++++++++++++++ product/ERP5/Document/EmailDocument.py | 2 +- product/ERP5/interfaces/html_convertable.py | 12 +++++- 3 files changed, 59 insertions(+), 2 deletions(-) diff --git a/product/ERP5/Document/Document.py b/product/ERP5/Document/Document.py index 1e42501837..ea218f8f55 100644 --- a/product/ERP5/Document/Document.py +++ b/product/ERP5/Document/Document.py @@ -1204,6 +1204,25 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S mime, html = self.convert(**kw) return self._stripHTML(str(html)) + security.declareProtected(Permissions.View, 'asSafeHTML') + def asSafeHTML(self, **kw): + """ + Converts the current document to HTML, strips it and removes + embedded javascript, forms and any external plugin imports. + """ + format = 'text/x-html-safe' + if not self.hasBaseData(): + return '' + try: + mime, data = self.getConversion(format=format) + return data + except KeyError: + kw['format'] = 'html' + mime, html = self.convert(**kw) + safe_html = self._safeHTML(str(html), format=format) + self.setConversion(safe_html, mime=mime, format=format) + return safe_html + def _guessEncoding(self, string): """ Try to guess the encoding for this string. 
@@ -1239,6 +1258,34 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S return str(stripped_html) return stripped_html + def _safeHTML(self, html, format='text/x-html-safe', charset=None): + """ + A private method to strip HTML content in safe mode, + w/o embedded javascript, forms and any external plugin imports. + This should be used when we do not trust the user (Anonymous) + who pushes data into database. + - html: content to strip + - format: destination format + - charset: charset used to encode string. Takes precedence + on charset values found in html string + """ + portal = self.getPortalObject() + if charset is None: + # find charset + charset_list = self.charset_parser.findall(html) + if charset_list: + charset = charset_list[0] + if charset and charset not in ('utf-8', 'UTF-8'): + try: + html = html.decode(charset).encode('utf-8') + except (UnicodeDecodeError, LookupError): + pass + else: + charset = 'utf-8' # html was re-encoded above, so declare it as utf-8 + transform_tool = getToolByName(portal, 'portal_transforms') + safe_html_string = transform_tool.convertToData(format, html, + encoding=charset) + return safe_html_string security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation') def getContentInformation(self): diff --git a/product/ERP5/Document/EmailDocument.py b/product/ERP5/Document/EmailDocument.py index 5321fe026c..5883865f2e 100644 --- a/product/ERP5/Document/EmailDocument.py +++ b/product/ERP5/Document/EmailDocument.py @@ -452,7 +452,7 @@ class EmailDocument(File, TextDocument): part_encoding = part.get_content_charset() part_html = part.get_payload(decode=1) # Invoke Document class HTML stripper - html_result = self._stripHTML(part_html, charset=part_encoding) + html_result = self._safeHTML(part_html, charset=part_encoding) if html_result: # Give priority to HTML text_result = html_result diff --git a/product/ERP5/interfaces/html_convertable.py 
b/product/ERP5/interfaces/html_convertable.py index d674162f4e..b4e8cb2173 100644 --- a/product/ERP5/interfaces/html_convertable.py +++ b/product/ERP5/interfaces/html_convertable.py @@ -53,4 +53,14 @@ class IHtmlConvertable(Interface): kw -- optional parameters which can be passed to the conversion engine - """ \ No newline at end of file + """ + + def asSafeHTML(**kw): + """ + Converts the current document to HTML, and removes + embedded javascript, forms and any external plugin imports. + + kw -- optional parameters which can be passed to the + conversion engine + """ + -- 2.30.9