Commit 7db49d12 authored by Ezio Melotti

#15114: the strict mode of HTMLParser and the HTMLParseError exception are...

#15114: the strict mode of HTMLParser and the HTMLParseError exception are deprecated now that the parser is able to parse invalid markup.
parent 0151133e
...@@ -16,13 +16,14 @@ ...@@ -16,13 +16,14 @@
This module defines a class :class:`HTMLParser` which serves as the basis for This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. class:: HTMLParser(strict=True) .. class:: HTMLParser(strict=False)
Create a parser instance. If *strict* is ``True`` (the default), invalid Create a parser instance. If *strict* is ``False`` (the default), the parser
HTML results in :exc:`~html.parser.HTMLParseError` exceptions [#]_. If will accept and parse invalid markup. If *strict* is ``True`` the parser
*strict* is ``False``, the parser uses heuristics to make a best guess at will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when
the intention of any invalid HTML it encounters, similar to the way most it's not able to parse the markup.
browsers do. Using ``strict=False`` is advised. The use of ``strict=True`` is discouraged and the *strict* argument is
deprecated.
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
when start tags, end tags, text, comments, and other markup elements are when start tags, end tags, text, comments, and other markup elements are
...@@ -34,6 +35,10 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. ...@@ -34,6 +35,10 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. versionchanged:: 3.2 *strict* keyword added .. versionchanged:: 3.2 *strict* keyword added
.. deprecated-removed:: 3.3 3.5
The *strict* argument and the strict mode have been deprecated.
The parser is now able to accept and parse invalid markup too.
An exception is defined as well: An exception is defined as well:
...@@ -46,6 +51,10 @@ An exception is defined as well: ...@@ -46,6 +51,10 @@ An exception is defined as well:
detected, and :attr:`offset` is the number of characters into the line at detected, and :attr:`offset` is the number of characters into the line at
which the construct starts. which the construct starts.
.. deprecated-removed:: 3.3 3.5
This exception has been deprecated because it's never raised by the parser
(when the default non-strict mode is used).
Example HTML Parser Application Example HTML Parser Application
------------------------------- -------------------------------
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
import _markupbase import _markupbase
import re import re
import warnings
# Regular expressions used for parsing # Regular expressions used for parsing
...@@ -113,14 +114,16 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -113,14 +114,16 @@ class HTMLParser(_markupbase.ParserBase):
CDATA_CONTENT_ELEMENTS = ("script", "style") CDATA_CONTENT_ELEMENTS = ("script", "style")
def __init__(self, strict=True): def __init__(self, strict=False):
"""Initialize and reset this instance. """Initialize and reset this instance.
If strict is set to True (the default), errors are raised when invalid If strict is set to False (the default) the parser will parse invalid
HTML is encountered. If set to False, an attempt is instead made to markup, otherwise it will raise an error. Note that the strict mode
continue parsing, making "best guesses" about the intended meaning, in is deprecated.
a fashion similar to what browsers typically do.
""" """
if strict:
warnings.warn("The strict mode is deprecated.",
DeprecationWarning, stacklevel=2)
self.strict = strict self.strict = strict
self.reset() self.reset()
...@@ -271,8 +274,8 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -271,8 +274,8 @@ class HTMLParser(_markupbase.ParserBase):
# See also parse_declaration in _markupbase # See also parse_declaration in _markupbase
def parse_html_declaration(self, i): def parse_html_declaration(self, i):
rawdata = self.rawdata rawdata = self.rawdata
if rawdata[i:i+2] != '<!': assert rawdata[i:i+2] == '<!', ('unexpected call to '
self.error('unexpected call to parse_html_declaration()') 'parse_html_declaration()')
if rawdata[i:i+4] == '<!--': if rawdata[i:i+4] == '<!--':
# this case is actually already handled in goahead() # this case is actually already handled in goahead()
return self.parse_comment(i) return self.parse_comment(i)
...@@ -292,8 +295,8 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -292,8 +295,8 @@ class HTMLParser(_markupbase.ParserBase):
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
def parse_bogus_comment(self, i, report=1): def parse_bogus_comment(self, i, report=1):
rawdata = self.rawdata rawdata = self.rawdata
if rawdata[i:i+2] not in ('<!', '</'): assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
self.error('unexpected call to parse_comment()') 'parse_comment()')
pos = rawdata.find('>', i+2) pos = rawdata.find('>', i+2)
if pos == -1: if pos == -1:
return -1 return -1
......
...@@ -102,6 +102,7 @@ class TestCaseBase(unittest.TestCase): ...@@ -102,6 +102,7 @@ class TestCaseBase(unittest.TestCase):
class HTMLParserStrictTestCase(TestCaseBase): class HTMLParserStrictTestCase(TestCaseBase):
def get_collector(self): def get_collector(self):
with support.check_warnings(("", DeprecationWarning), quiet=False):
return EventCollector(strict=True) return EventCollector(strict=True)
def test_processing_instruction_only(self): def test_processing_instruction_only(self):
...@@ -594,6 +595,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): ...@@ -594,6 +595,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
class AttributesStrictTestCase(TestCaseBase): class AttributesStrictTestCase(TestCaseBase):
def get_collector(self): def get_collector(self):
with support.check_warnings(("", DeprecationWarning), quiet=False):
return EventCollector(strict=True) return EventCollector(strict=True)
def test_attr_syntax(self): def test_attr_syntax(self):
......
...@@ -43,6 +43,9 @@ Core and Builtins ...@@ -43,6 +43,9 @@ Core and Builtins
Library Library
------- -------
- Issue #15114: the strict mode of HTMLParser and the HTMLParseError exception
are deprecated now that the parser is able to parse invalid markup.
- Issue #3665: \u and \U escapes are now supported in unicode regular - Issue #3665: \u and \U escapes are now supported in unicode regular
expressions. Patch by Serhiy Storchaka. expressions. Patch by Serhiy Storchaka.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment