Commit a38c59c9 authored by Nicolas Delaby

Make safe_html transforms more robust against dirty HTML documents.

  - In case of failure of HTMLParser, lxml takes over the broken HTML
  and recovers it. The repaired result is then fed back to HTMLParser,
  only once.
 


git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@33407 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 112180a6
 # -*- coding: utf-8 -*-
 import logging
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError
 import re
 from cgi import escape
 from zope.interface import implements
@@ -14,6 +14,9 @@ from Products.CMFDefault.utils import VALID_TAGS
 from Products.CMFDefault.utils import NASTY_TAGS
 from Products.PortalTransforms.utils import safeToInt
+from lxml import etree
+from lxml.etree import HTMLParser as LHTMLParser
 # tag mapping: tag -> short or long tag
 VALID_TAGS = VALID_TAGS.copy()
 NASTY_TAGS = NASTY_TAGS.copy()
@@ -256,17 +259,42 @@ class SafeHTML:
             data.setData(orig)
             return data
+        html_string = orig
+        already_repaired = False
+        while True:
             try:
                 safe = scrubHTML(
-                    bodyfinder(orig),
+                    bodyfinder(html_string),
                     valid=self.config.get('valid_tags', {}),
                     nasty=self.config.get('nasty_tags', {}),
                     remove_javascript=self.config.get('remove_javascript', True),
                     raise_error=False)
             except IllegalHTML, inst:
                 data.setData(msg_pat % ("Error", str(inst)))
+                break
+            except HTMLParseError:
+                # ouch!
+                # HTMLParser is not able to parse very dirty HTML strings,
+                # so try to repair the broken HTML with the help of lxml.
+                if already_repaired:
+                    raise
+                already_repaired = True
+                encoding = kwargs.get('encoding')
+                # The recover parameter defaults to True in the lxml API;
+                # it is passed explicitly here to improve the readability
+                # of the code below.
+                try:
+                    lparser = LHTMLParser(encoding=encoding, recover=True)
+                except LookupError:
+                    # The provided encoding is not known by the parser, so discard it.
+                    lparser = LHTMLParser(recover=True)
+                repaired_html_tree = etree.HTML(orig, parser=lparser)
+                html_string = etree.tostring(repaired_html_tree)
+                # Avoid breaking here;
+                # continue the loop with the repaired HTML.
             else:
                 data.setData(safe)
+                break
         return data

 def register():
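For reference, a minimal standalone sketch of the lxml recovery path this commit introduces, assuming only that lxml is installed. The repair_html helper and the sample input are illustrative and do not appear in the commit; only the etree.HTMLParser / etree.HTML / etree.tostring calls mirror the patched code.

# Hypothetical helper: parse dirty HTML with lxml's recovering parser
# and serialize the repaired tree back to a string.
from lxml import etree

def repair_html(broken_html, encoding=None):
    try:
        # recover=True is the lxml default; it makes the parser tolerate
        # broken markup instead of raising.
        parser = etree.HTMLParser(encoding=encoding, recover=True)
    except LookupError:
        # The requested encoding is unknown to lxml, so let it guess.
        parser = etree.HTMLParser(recover=True)
    tree = etree.HTML(broken_html, parser=parser)
    return etree.tostring(tree)

print(repair_html("<p>unclosed <b>tag"))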