Commit f6a8f1d5 authored by Julien Muchembled's avatar Julien Muchembled

safe_html: fix repairing with BeautifulSoup (+ some refactoring)

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@41722 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 4b5b7cb2
...@@ -205,15 +205,14 @@ class StrippingParser(HTMLParser): ...@@ -205,15 +205,14 @@ class StrippingParser(HTMLParser):
elif remove_script and hasScript(v): elif remove_script and hasScript(v):
if not self.raise_error: continue if not self.raise_error: continue
else: raise IllegalHTML, 'Script URI "%s" not allowed.' % v else: raise IllegalHTML, 'Script URI "%s" not allowed.' % v
elif tag.lower() == 'meta' and k.lower() == 'content' and\
self.default_encoding and self.default_encoding not in v:
match = charset_parser.search(v)
if match is not None:
self.original_charset = match.group('charset')
self.result.append(' %s="%s"' % (k,
charset_parser.sub(CharsetReplacer(self.default_encoding), v)
,))
else: else:
if tag.lower() == 'meta' and k.lower() == 'content' and \
self.default_encoding and self.default_encoding not in v:
match = charset_parser.search(v)
if match is not None:
self.original_charset = match.group('charset')
v = charset_parser.sub(
CharsetReplacer(self.default_encoding), v)
self.result.append(' %s="%s"' % (k, v)) self.result.append(' %s="%s"' % (k, v))
#UNUSED endTag = '</%s>' % tag #UNUSED endTag = '</%s>' % tag
...@@ -351,13 +350,11 @@ class SafeHTML: ...@@ -351,13 +350,11 @@ class SafeHTML:
data.setData(orig) data.setData(orig)
return data return data
html_string = orig repaired = 0
already_repaired = False
one_more_bullet_with_beautifulsoup = soupfromstring is not None
while True: while True:
try: try:
safe = scrubHTML( orig = scrubHTML(
html_string, orig,
valid=self.config.get('valid_tags', {}), valid=self.config.get('valid_tags', {}),
nasty=self.config.get('nasty_tags', {}), nasty=self.config.get('nasty_tags', {}),
remove_javascript=self.config.get('remove_javascript', True), remove_javascript=self.config.get('remove_javascript', True),
...@@ -368,42 +365,38 @@ class SafeHTML: ...@@ -368,42 +365,38 @@ class SafeHTML:
break break
except HTMLParseError: except HTMLParseError:
# ouch ! # ouch !
# HTMLParser is not able to parse very dirty HTML string, # HTMLParser is not able to parse very dirty HTML string
# try to repair any broken html with help of lxml if not repaired:
if already_repaired and not one_more_bullet_with_beautifulsoup: # try to repair any broken html with help of lxml
# Neither lxml nor BeautifulSoup performs miracles encoding = kwargs.get('encoding')
# so give up! # recover parameter is equal to True by default
raise # in lxml API. I pass the argument to improve readability
elif already_repaired and one_more_bullet_with_beautifulsoup: # of the above code.
# Can BeautifulSoup perform miracles? try:
one_more_bullet_with_beautifulsoup = False lparser = LHTMLParser(encoding=encoding, recover=True,
# This function can raise the exception HTMLParseError. remove_comments=True)
# So consider this parsing as last chance except LookupError:
# to get parsable html. # Provided encoding is not known by parser so discard it
repaired_html_tree = soupfromstring(html_string) # Provided encoding is not known by parser, so discard it
html_string = tostring(repaired_html_tree, remove_comments=True)
include_meta_content_type=True, repaired_html_tree = etree.HTML(orig, parser=lparser)
method='xml') elif repaired > (soupfromstring is not None):
already_repaired = True # Neither lxml nor BeautifulSoup worked so give up !
encoding = kwargs.get('encoding') raise
# recover parameter is equal to True by default else:
# in lxml API. I pass the argument to improve readability # Can BeautifulSoup perform miracles ?
# of the above code. # So consider this parsing as last chance
try: # So consider this parsing as last chance
lparser = LHTMLParser(encoding=encoding, recover=True, # to get parsable html.
remove_comments=True) repaired_html_tree = soupfromstring(orig)
except LookupError: orig = tostring(repaired_html_tree,
# Provided encoding is not known by parser, so discard it include_meta_content_type=True,
lparser = LHTMLParser(recover=True, method='xml')
remove_comments=True) repaired += 1
repaired_html_tree = etree.HTML(orig, parser=lparser)
html_string = tostring(repaired_html_tree,
include_meta_content_type=True,
method='xml')
# avoid breaking now. # avoid breaking now.
# continue into the loop with repaired html # continue into the loop with repaired html
else: else:
data.setData(safe) data.setData(orig)
break break
return data return data
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment