Commit 281104a2 authored by Nicolas Delaby

Add a new option which enables conversion of the document into a new codec (default is utf-8)

and replace its charset declaration inside html conversion


git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@35262 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent ba025824
...@@ -15,6 +15,7 @@ from Products.PortalTransforms.utils import safeToInt ...@@ -15,6 +15,7 @@ from Products.PortalTransforms.utils import safeToInt
from lxml import etree from lxml import etree
from lxml.etree import HTMLParser as LHTMLParser from lxml.etree import HTMLParser as LHTMLParser
from lxml.html import tostring
# tag mapping: tag -> short or long tag # tag mapping: tag -> short or long tag
VALID_TAGS = VALID_TAGS.copy() VALID_TAGS = VALID_TAGS.copy()
...@@ -71,6 +72,19 @@ def decode_htmlentity(m): ...@@ -71,6 +72,19 @@ def decode_htmlentity(m):
except ValueError: except ValueError:
return entity_value return entity_value
# Matches a ``charset=...`` declaration (value optionally double-quoted)
# anchored at the end of the string; the value is captured as ``charset``.
charset_parser = re.compile('charset="?(?P<charset>[^"]*)"?$')


class CharsetReplacer:
    """Replacement callable for ``charset_parser.sub``.

    Rewrites the charset value of a matched ``charset=...`` declaration
    so that it names ``encoding``, leaving the rest of the matched text
    (the ``charset=`` keyword and any quotes) untouched.
    """

    def __init__(self, encoding):
        # Name of the encoding that must appear in the declaration.
        self.encoding = encoding

    def __call__(self, match):
        """Return the replacement text for ``match``.

        Returns an empty string when ``match`` is None, the matched text
        unchanged when it already declares ``self.encoding``, and otherwise
        the matched text with only the captured charset value replaced.
        """
        if match is None:
            return ''
        declaration = match.group(0)
        if match.group('charset') == self.encoding:
            return declaration
        # Splice by position instead of str.replace(): an empty charset
        # value, or one that also occurs inside the word "charset", would
        # otherwise corrupt the whole declaration.
        offset = match.start(0)
        value_start = match.start('charset') - offset
        value_end = match.end('charset') - offset
        return declaration[:value_start] + self.encoding + declaration[value_end:]
class StrippingParser(HTMLParser): class StrippingParser(HTMLParser):
"""Pass only allowed tags; raise exception for known-bad. """Pass only allowed tags; raise exception for known-bad.
...@@ -80,7 +94,8 @@ class StrippingParser(HTMLParser): ...@@ -80,7 +94,8 @@ class StrippingParser(HTMLParser):
from htmlentitydefs import entitydefs # replace entitydefs from sgmllib from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
def __init__(self, valid, nasty, remove_javascript, raise_error): def __init__(self, valid, nasty, remove_javascript, raise_error,
default_encoding):
HTMLParser.__init__( self ) HTMLParser.__init__( self )
self.result = [] self.result = []
self.valid = valid self.valid = valid
...@@ -88,6 +103,8 @@ class StrippingParser(HTMLParser): ...@@ -88,6 +103,8 @@ class StrippingParser(HTMLParser):
self.remove_javascript = remove_javascript self.remove_javascript = remove_javascript
self.raise_error = raise_error self.raise_error = raise_error
self.suppress = False self.suppress = False
self.default_encoding = default_encoding
self.original_charset = None
def handle_data(self, data): def handle_data(self, data):
if self.suppress: return if self.suppress: return
...@@ -117,14 +134,12 @@ class StrippingParser(HTMLParser): ...@@ -117,14 +134,12 @@ class StrippingParser(HTMLParser):
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
""" Delete all tags except for legal ones. """ Delete all tags except for legal ones.
""" """
if self.suppress: return if self.suppress: return
if self.valid.has_key(tag): if self.valid.has_key(tag):
self.result.append('<' + tag) self.result.append('<' + tag)
remove_script = getattr(self,'remove_javascript',True) remove_script = getattr(self,'remove_javascript',True)
for k, v in attrs: for k, v in attrs:
if remove_script and k.strip().lower().startswith('on'): if remove_script and k.strip().lower().startswith('on'):
if not self.raise_error: continue if not self.raise_error: continue
...@@ -134,6 +149,14 @@ class StrippingParser(HTMLParser): ...@@ -134,6 +149,14 @@ class StrippingParser(HTMLParser):
elif remove_script and hasScript(v): elif remove_script and hasScript(v):
if not self.raise_error: continue if not self.raise_error: continue
else: raise IllegalHTML, 'Script URI "%s" not allowed.' % v else: raise IllegalHTML, 'Script URI "%s" not allowed.' % v
elif k.lower() == 'content' and self.default_encoding and\
self.default_encoding not in v:
match = charset_parser.search(v)
if match is not None:
self.original_charset = match.group('charset')
self.result.append(' %s="%s"' % (k,
charset_parser.sub(CharsetReplacer(self.default_encoding), v)
,))
else: else:
self.result.append(' %s="%s"' % (k, v)) self.result.append(' %s="%s"' % (k, v))
...@@ -162,15 +185,22 @@ class StrippingParser(HTMLParser): ...@@ -162,15 +185,22 @@ class StrippingParser(HTMLParser):
return ''.join(self.result) return ''.join(self.result)
def scrubHTML(html, valid=VALID_TAGS, nasty=NASTY_TAGS, def scrubHTML(html, valid=VALID_TAGS, nasty=NASTY_TAGS,
remove_javascript=True, raise_error=True): remove_javascript=True, raise_error=True,
default_encoding=None):
""" Strip illegal HTML tags from string text. """ Strip illegal HTML tags from string text.
""" """
parser = StrippingParser(valid=valid, nasty=nasty, parser = StrippingParser(valid=valid, nasty=nasty,
remove_javascript=remove_javascript, remove_javascript=remove_javascript,
raise_error=raise_error) raise_error=raise_error,
default_encoding=default_encoding)
parser.feed(html) parser.feed(html)
parser.close() parser.close()
if parser.original_charset:
result = parser.getResult().decode(parser.original_charset)\
.encode(default_encoding)
return result
return parser.getResult() return parser.getResult()
class SafeHTML: class SafeHTML:
...@@ -206,6 +236,7 @@ class SafeHTML: ...@@ -206,6 +236,7 @@ class SafeHTML:
'nasty_tags': NASTY_TAGS, 'nasty_tags': NASTY_TAGS,
'remove_javascript': 1, 'remove_javascript': 1,
'disable_transform': 0, 'disable_transform': 0,
'default_encoding': 'utf-8',
} }
self.config_metadata = { self.config_metadata = {
...@@ -230,7 +261,11 @@ class SafeHTML: ...@@ -230,7 +261,11 @@ class SafeHTML:
'This does not effect <script> tags. 0 to leave the attributes.'), 'This does not effect <script> tags. 0 to leave the attributes.'),
'disable_transform' : ("int", 'disable_transform' : ("int",
'disable_transform', 'disable_transform',
'If 1, nothing is done.') 'If 1, nothing is done.'),
'default_encoding': ('string',
'default_encoding',
'Encoding used for html string.'\
' If encoding is different, the string will be converted' ),
} }
self.config.update(kwargs) self.config.update(kwargs)
...@@ -269,7 +304,8 @@ class SafeHTML: ...@@ -269,7 +304,8 @@ class SafeHTML:
valid=self.config.get('valid_tags', {}), valid=self.config.get('valid_tags', {}),
nasty=self.config.get('nasty_tags', {}), nasty=self.config.get('nasty_tags', {}),
remove_javascript=self.config.get('remove_javascript', True), remove_javascript=self.config.get('remove_javascript', True),
raise_error=False) raise_error=False,
default_encoding=self.config.get('default_encoding', 'utf-8'))
except IllegalHTML, inst: except IllegalHTML, inst:
data.setData(msg_pat % ("Error", str(inst))) data.setData(msg_pat % ("Error", str(inst)))
break break
...@@ -292,7 +328,8 @@ class SafeHTML: ...@@ -292,7 +328,8 @@ class SafeHTML:
lparser = LHTMLParser(recover=True, lparser = LHTMLParser(recover=True,
remove_comments=True) remove_comments=True)
repaired_html_tree = etree.HTML(orig, parser=lparser) repaired_html_tree = etree.HTML(orig, parser=lparser)
html_string = etree.tostring(repaired_html_tree) html_string = tostring(repaired_html_tree,
include_meta_content_type=True)
# avoid breaking now. # avoid breaking now.
# continue into the loop with repaired html # continue into the loop with repaired html
else: else:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment