Commit a804620f authored by R. David Murray

#1486713: Add a tolerant mode to HTMLParser.

The motivation for adding this option is that the functionality it
provides used to be provided by sgmllib in Python2, and was used by,
for example, BeautifulSoup.  Without this option, the Python3 version
of BeautifulSoup and the many programs that use it are crippled.

The original patch was by 'kxroberto'.  I modified it heavily but kept his
heuristics and test.  I also added additional heuristics to fix #975556,
#1046092, and part of #6191.  This patch should be completely backward
compatible:  the behavior with the default strict=True is unchanged.
parent ad29aca3
...@@ -12,9 +12,13 @@ ...@@ -12,9 +12,13 @@
This module defines a class :class:`HTMLParser` which serves as the basis for This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. class:: HTMLParser() .. class:: HTMLParser(strict=True)
The :class:`HTMLParser` class is instantiated without arguments. Create a parser instance. If *strict* is ``True`` (the default), invalid
HTML results in :exc:`~html.parser.HTMLParseError` exceptions [#]_. If
*strict* is ``False``, the parser uses heuristics to make a best guess at
the intention of any invalid HTML it encounters, similar to the way most
browsers do.
An :class:`HTMLParser` instance is fed HTML data and calls handler functions when tags An :class:`HTMLParser` instance is fed HTML data and calls handler functions when tags
begin and end. The :class:`HTMLParser` class is meant to be overridden by the begin and end. The :class:`HTMLParser` class is meant to be overridden by the
...@@ -191,3 +195,8 @@ As a basic example, below is a very basic HTML parser that uses the ...@@ -191,3 +195,8 @@ As a basic example, below is a very basic HTML parser that uses the
Encountered a html end tag Encountered a html end tag
.. rubric:: Footnotes
.. [#] For backward compatibility reasons *strict* mode does not throw
errors for all non-compliant HTML. That is, some invalid HTML
is tolerated even in *strict* mode.
...@@ -24,10 +24,14 @@ starttagopen = re.compile('<[a-zA-Z]') ...@@ -24,10 +24,14 @@ starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>') piclose = re.compile('>')
commentclose = re.compile(r'--\s*>') commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# Note, the strict one of this pair isn't really strict, but we can't
# make it correctly strict without breaking backward compatibility.
attrfind = re.compile( attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?') r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
attrfind_tolerant = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
locatestarttagend = re.compile(r""" locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name (?:\s+ # whitespace before attribute name
...@@ -42,6 +46,21 @@ locatestarttagend = re.compile(r""" ...@@ -42,6 +46,21 @@ locatestarttagend = re.compile(r"""
)* )*
\s* # trailing whitespace \s* # trailing whitespace
""", re.VERBOSE) """, re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s* # optional whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
(?:\s*,)* # possibly followed by a comma
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>') endendtag = re.compile('>')
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
...@@ -86,9 +105,15 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -86,9 +105,15 @@ class HTMLParser(_markupbase.ParserBase):
CDATA_CONTENT_ELEMENTS = ("script", "style") CDATA_CONTENT_ELEMENTS = ("script", "style")
def __init__(self, strict=True):
"""Initialize and reset this instance.
def __init__(self): If strict is set to True (the default), errors are raised when invalid
"""Initialize and reset this instance.""" HTML is encountered. If set to False, an attempt is instead made to
continue parsing, making "best guesses" about the intended meaning, in
a fashion similar to what browsers typically do.
"""
self.strict = strict
self.reset() self.reset()
def reset(self): def reset(self):
...@@ -160,9 +185,18 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -160,9 +185,18 @@ class HTMLParser(_markupbase.ParserBase):
else: else:
break break
if k < 0: if k < 0:
if end: if not end:
break
if self.strict:
self.error("EOF in middle of construct") self.error("EOF in middle of construct")
break k = rawdata.find('>', i + 1)
if k < 0:
k = rawdata.find('<', i + 1)
if k < 0:
k = i + 1
else:
k += 1
self.handle_data(rawdata[i:k])
i = self.updatepos(i, k) i = self.updatepos(i, k)
elif startswith("&#", i): elif startswith("&#", i):
match = charref.match(rawdata, i) match = charref.match(rawdata, i)
...@@ -193,7 +227,12 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -193,7 +227,12 @@ class HTMLParser(_markupbase.ParserBase):
if match: if match:
# match.group() will contain at least 2 chars # match.group() will contain at least 2 chars
if end and match.group() == rawdata[i:]: if end and match.group() == rawdata[i:]:
self.error("EOF in middle of entity or char ref") if self.strict:
self.error("EOF in middle of entity or char ref")
else:
if k <= i:
k = n
i = self.updatepos(i, i + 1)
# incomplete # incomplete
break break
elif (i + 1) < n: elif (i + 1) < n:
...@@ -240,7 +279,10 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -240,7 +279,10 @@ class HTMLParser(_markupbase.ParserBase):
self.lasttag = tag = rawdata[i+1:k].lower() self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos: while k < endpos:
m = attrfind.match(rawdata, k) if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.search(rawdata, k)
if not m: if not m:
break break
attrname, rest, attrvalue = m.group(1, 2, 3) attrname, rest, attrvalue = m.group(1, 2, 3)
...@@ -262,8 +304,11 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -262,8 +304,11 @@ class HTMLParser(_markupbase.ParserBase):
- self.__starttag_text.rfind("\n") - self.__starttag_text.rfind("\n")
else: else:
offset = offset + len(self.__starttag_text) offset = offset + len(self.__starttag_text)
self.error("junk characters in start tag: %r" if self.strict:
% (rawdata[k:endpos][:20],)) self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'): if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" /> # XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs) self.handle_startendtag(tag, attrs)
...@@ -277,7 +322,10 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -277,7 +322,10 @@ class HTMLParser(_markupbase.ParserBase):
# or -1 if incomplete. # or -1 if incomplete.
def check_for_whole_start_tag(self, i): def check_for_whole_start_tag(self, i):
rawdata = self.rawdata rawdata = self.rawdata
m = locatestarttagend.match(rawdata, i) if self.strict:
m = locatestarttagend.match(rawdata, i)
else:
m = locatestarttagend_tolerant.match(rawdata, i)
if m: if m:
j = m.end() j = m.end()
next = rawdata[j:j+1] next = rawdata[j:j+1]
...@@ -290,8 +338,13 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -290,8 +338,13 @@ class HTMLParser(_markupbase.ParserBase):
# buffer boundary # buffer boundary
return -1 return -1
# else bogus input # else bogus input
self.updatepos(i, j + 1) if self.strict:
self.error("malformed empty start tag") self.updatepos(i, j + 1)
self.error("malformed empty start tag")
if j > i:
return j
else:
return i + 1
if next == "": if next == "":
# end of input # end of input
return -1 return -1
...@@ -300,8 +353,13 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -300,8 +353,13 @@ class HTMLParser(_markupbase.ParserBase):
# end of input in or before attribute value, or we have the # end of input in or before attribute value, or we have the
# '/' from a '/>' ending # '/' from a '/>' ending
return -1 return -1
self.updatepos(i, j) if self.strict:
self.error("malformed start tag") self.updatepos(i, j)
self.error("malformed start tag")
if j > i:
return j
else:
return i + 1
raise AssertionError("we should not get here!") raise AssertionError("we should not get here!")
# Internal -- parse endtag, return end or -1 if incomplete # Internal -- parse endtag, return end or -1 if incomplete
...@@ -314,7 +372,15 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -314,7 +372,15 @@ class HTMLParser(_markupbase.ParserBase):
j = match.end() j = match.end()
match = endtagfind.match(rawdata, i) # </ + tag + > match = endtagfind.match(rawdata, i) # </ + tag + >
if not match: if not match:
self.error("bad end tag: %r" % (rawdata[i:j],)) if self.strict:
self.error("bad end tag: %r" % (rawdata[i:j],))
k = rawdata.find('<', i + 1, j)
if k > i:
j = k
if j <= i:
j = i + 1
self.handle_data(rawdata[i:j])
return j
tag = match.group(1) tag = match.group(1)
self.handle_endtag(tag.lower()) self.handle_endtag(tag.lower())
self.clear_cdata_mode() self.clear_cdata_mode()
...@@ -358,7 +424,8 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -358,7 +424,8 @@ class HTMLParser(_markupbase.ParserBase):
pass pass
def unknown_decl(self, data): def unknown_decl(self, data):
self.error("unknown declaration: %r" % (data,)) if self.strict:
self.error("unknown declaration: %r" % (data,))
# Internal -- helper to remove special character quoting # Internal -- helper to remove special character quoting
entitydefs = None entitydefs = None
......
...@@ -8,10 +8,10 @@ from test import support ...@@ -8,10 +8,10 @@ from test import support
class EventCollector(html.parser.HTMLParser): class EventCollector(html.parser.HTMLParser):
def __init__(self): def __init__(self, *args, **kw):
self.events = [] self.events = []
self.append = self.events.append self.append = self.events.append
html.parser.HTMLParser.__init__(self) html.parser.HTMLParser.__init__(self, *args, **kw)
def get_events(self): def get_events(self):
# Normalize the list of events so that buffer artefacts don't # Normalize the list of events so that buffer artefacts don't
...@@ -72,8 +72,10 @@ class EventCollectorExtra(EventCollector): ...@@ -72,8 +72,10 @@ class EventCollectorExtra(EventCollector):
class TestCaseBase(unittest.TestCase): class TestCaseBase(unittest.TestCase):
def _run_check(self, source, expected_events, collector=EventCollector): def _run_check(self, source, expected_events, collector=None):
parser = collector() if collector is None:
collector = EventCollector()
parser = collector
for s in source: for s in source:
parser.feed(s) parser.feed(s)
parser.close() parser.close()
...@@ -84,7 +86,7 @@ class TestCaseBase(unittest.TestCase): ...@@ -84,7 +86,7 @@ class TestCaseBase(unittest.TestCase):
"\nReceived:\n" + pprint.pformat(events)) "\nReceived:\n" + pprint.pformat(events))
def _run_check_extra(self, source, events): def _run_check_extra(self, source, events):
self._run_check(source, events, EventCollectorExtra) self._run_check(source, events, EventCollectorExtra())
def _parse_error(self, source): def _parse_error(self, source):
def parse(source=source): def parse(source=source):
...@@ -321,8 +323,42 @@ DOCTYPE html [ ...@@ -321,8 +323,42 @@ DOCTYPE html [
]) ])
class HTMLParserTolerantTestCase(TestCaseBase):
def setUp(self):
self.collector = EventCollector(strict=False)
def test_tolerant_parsing(self):
self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'
'<img src="URL><//img></html</html>', [
('data', '<html '),
('starttag', 'html', []),
('data', 'te>>xt'),
('entityref', 'a'),
('data', '<<bc'),
('endtag', 'a'),
('endtag', 'html'),
('data', '\n<img src="URL><//img></html'),
('endtag', 'html')],
collector = self.collector)
def test_comma_between_attributes(self):
self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", '
'method="post">', [
('starttag', 'form',
[('action', '/xxx.php?a=1&b=2&amp'),
('method', 'post')])],
collector = self.collector)
def test_weird_chars_in_unquoted_attribute_values(self):
self._run_check('<form action=bogus|&#()value>', [
('starttag', 'form',
[('action', 'bogus|&#()value')])],
collector = self.collector)
def test_main(): def test_main():
support.run_unittest(HTMLParserTestCase) support.run_unittest(HTMLParserTestCase, HTMLParserTolerantTestCase)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -58,6 +58,9 @@ Core and Builtins ...@@ -58,6 +58,9 @@ Core and Builtins
Library Library
------- -------
- Issue #1486713: HTMLParser now has an optional tolerant mode where it
tries to guess at the correct parsing of invalid HTML.
- Issue #10554: Add context manager support to subprocess.Popen objects. - Issue #10554: Add context manager support to subprocess.Popen objects.
- Issue #8989: email.utils.make_msgid now has a domain parameter that can - Issue #8989: email.utils.make_msgid now has a domain parameter that can
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment