Commit a38c59c9 authored by Nicolas Delaby

Make safe_html transforms more robust against dirty HTML documents.

  - If HTMLParser fails, lxml takes over the broken HTML and
  repairs it. The repaired HTML is then passed back to
  HTMLParser, but only once.
 


git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@33407 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 112180a6
# -*- coding: utf-8 -*-
import logging
from HTMLParser import HTMLParser
from HTMLParser import HTMLParser, HTMLParseError
import re
from cgi import escape
from zope.interface import implements
......@@ -14,6 +14,9 @@ from Products.CMFDefault.utils import VALID_TAGS
from Products.CMFDefault.utils import NASTY_TAGS
from Products.PortalTransforms.utils import safeToInt
from lxml import etree
from lxml.etree import HTMLParser as LHTMLParser
# tag mapping: tag -> short or long tag
VALID_TAGS = VALID_TAGS.copy()
NASTY_TAGS = NASTY_TAGS.copy()
......@@ -256,17 +259,42 @@ class SafeHTML:
data.setData(orig)
return data
try:
safe = scrubHTML(
bodyfinder(orig),
valid=self.config.get('valid_tags', {}),
nasty=self.config.get('nasty_tags', {}),
remove_javascript=self.config.get('remove_javascript', True),
raise_error=False)
except IllegalHTML, inst:
data.setData(msg_pat % ("Error", str(inst)))
else:
data.setData(safe)
html_string = orig
allready_repaired = False
while True:
try:
safe = scrubHTML(
bodyfinder(html_string),
valid=self.config.get('valid_tags', {}),
nasty=self.config.get('nasty_tags', {}),
remove_javascript=self.config.get('remove_javascript', True),
raise_error=False)
except IllegalHTML, inst:
data.setData(msg_pat % ("Error", str(inst)))
break
except HTMLParseError:
# ouch !
# HTMLParser is not able to parse very dirty HTML string,
# try to repair any broken html with help of lxml
if allready_repaired:
raise
allready_repaired = True
encoding = kwargs.get('encoding')
# The recover parameter already defaults to True in the lxml API;
# it is passed explicitly to improve the readability of the
# code below.
try:
lparser = LHTMLParser(encoding=encoding, recover=True)
except LookupError:
# Provided encoding is not known by parser, so discard it
lparser = LHTMLParser(recover=True)
repaired_html_tree = etree.HTML(orig, parser=lparser)
html_string = etree.tostring(repaired_html_tree)
# avoid breaking now.
# continue into the loop with repaired html
else:
data.setData(safe)
break
return data
def register():
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment