Commit e4b0e224 authored by Nicolas Delaby

Always output safe HTML content.

  * _safeHTML is removed
  * The stripping is done inside the convert method
  * The conversion cache is handled correctly


git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@34360 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 7d81afef
......@@ -490,7 +490,6 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
base_parser = re.compile('<base[^>]*href=[\'"](.*?)[\'"][^>]*>', re.IGNORECASE + re.DOTALL)
charset_parser = re.compile('charset="?([a-z0-9\-]+)', re.IGNORECASE)
# Declarative security
......@@ -1151,11 +1150,6 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
"""
if not self.hasBaseData():
raise ConversionError('This document has not been processed yet.')
try:
# FIXME: no substitution may occur in this case.
mime, data = self.getConversion(format='base-html')
return data
except KeyError:
kw['format'] = 'html'
mime, html = self.convert(**kw)
return html
......@@ -1167,16 +1161,7 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
(without html and body tags, etc.) which can be used to inline
a preview of the document.
"""
if not self.hasBaseData():
return ''
try:
# FIXME: no substitution may occur in this case.
mime, data = self.getConversion(format='stripped-html')
return data
except KeyError:
kw['format'] = 'html'
mime, html = self.convert(**kw)
return self._stripHTML(str(html))
return self._stripHTML(self._asHTML(**kw))
def _guessEncoding(self, string):
"""
......@@ -1199,49 +1184,8 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
stripped_html = body_list[0]
else:
stripped_html = html
# find charset and convert to utf-8
charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient if this
# is datastream instance but hard to do better
if charset and not charset_list:
# Use optional parameter is we can not find encoding in HTML
charset_list = [charset]
if charset_list and charset_list[0] not in ('utf-8', 'UTF-8'):
try:
stripped_html = unicode(str(stripped_html),
charset_list[0]).encode('utf-8')
except (UnicodeDecodeError, LookupError):
return str(stripped_html)
return stripped_html
def _safeHTML(self, html, format='text/x-html-safe', charset=None):
"""
A private method to strip HTML content in safe mode,
w/o emmbed javascript, forms and any external plugins imports.
This should be used when we do not trust the user (Anonymous)
who push data into database.
- html: content to strip
- format: destination format
- charset: charset used to encode string. Take precedence
on charset values found in html string
"""
portal = self.getPortalObject()
if charset is None:
# find charset
charset_list = self.charset_parser.findall(html)
if charset_list:
charset = charset_list[0]
if charset and charset not in ('utf-8', 'UTF-8'):
try:
safe_html_string = html.decode(charset).encode('utf-8')
except (UnicodeDecodeError, LookupError):
pass
else:
charset = 'utf-8' # Override charset if convertion succeeds
transform_tool = getToolByName(portal, 'portal_transforms')
safe_html_string = transform_tool.convertToData(format, html,
encoding=charset)
return safe_html_string
security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
def getContentInformation(self):
"""
......
......@@ -202,7 +202,8 @@ class TextDocument(Document, TextContent):
**substitution_method_parameter_dict)
security.declareProtected(Permissions.AccessContentsInformation, 'convert')
def convert(self, format, substitution_method_parameter_dict=None, safe_substitute=True, **kw):
def convert(self, format, substitution_method_parameter_dict=None,
safe_substitute=True, charset=None, text_content=None, **kw):
"""
Convert text using portal_transforms or oood
"""
......@@ -212,35 +213,55 @@ class TextDocument(Document, TextContent):
if format == 'raw':
return 'text/plain', self.getTextContent()
portal = self.getPortalObject()
mime_type = getToolByName(portal, 'mimetypes_registry').lookupExtension('name.%s' % format)
mime_type = str(mime_type)
mime_type = getToolByName(portal, 'mimetypes_registry').\
lookupExtension('name.%s' % format)
original_mime_type = mime_type = str(mime_type)
src_mimetype = self.getTextFormat(DEFAULT_TEXT_FORMAT)
if not src_mimetype.startswith('text/'):
src_mimetype = 'text/%s' % src_mimetype
if text_content is None:
# check if document has set text_content and convert if necessary
text_content = self.getTextContent()
if text_content:
if not self.hasConversion(format=format):
portal_transforms = getToolByName(portal, 'portal_transforms')
filename = self.getSourceReference(self.getTitleOrId())
if mime_type == 'text/html':
mime_type = 'text/x-html-safe'
if charset is None:
# find charset
charset_list = self.charset_parser.findall(text_content)
if charset_list:
charset = charset_list[0]
if charset and charset not in ('utf-8', 'UTF-8'):
try:
text_content = text_content.decode(charset).encode('utf-8')
except (UnicodeDecodeError, LookupError):
pass
else:
charset = 'utf-8' # Override charset if convertion succeeds
# change charset value in html_document as well
self.charset_parser.sub('utf-8', text_content)
result = portal_transforms.convertToData(mime_type, text_content,
object=self, context=self,
filename=filename,
mimetype=src_mimetype)
mimetype=src_mimetype,
encoding=charset)
if result is None:
raise ConversionError('TextDocument conversion error. '
'portal_transforms failed to convert to %s: %r' % (mime_type, self))
self.setConversion(result, mime_type, format=format)
'portal_transforms failed to convert'\
'to %s: %r' % (mime_type, self))
self.setConversion(result, original_mime_type, format=format)
else:
mime_type, result = self.getConversion(format=format)
if substitution_method_parameter_dict is None:
substitution_method_parameter_dict = {}
result = self._substituteTextContent(result, safe_substitute=safe_substitute,
**substitution_method_parameter_dict)
return mime_type, result
return original_mime_type, result
else:
# text_content is not set, return empty string instead of None
return mime_type, ''
return original_mime_type, ''
def __call__(self):
_setCacheHeaders(_ViewEmulator().__of__(self), {})
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment