Commit 5561ae41 authored by Ezio Melotti

#13960: merge with 3.2.

parents ba8b6f5a e0c489b9
...@@ -184,7 +184,17 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -184,7 +184,17 @@ class HTMLParser(_markupbase.ParserBase):
elif startswith("<?", i): elif startswith("<?", i):
k = self.parse_pi(i) k = self.parse_pi(i)
elif startswith("<!", i): elif startswith("<!", i):
k = self.parse_declaration(i) # this might fail with things like <! not a comment > or
# <! -- space before '--' -->. When strict is True an
# error is raised, when it's False they will be considered
# as bogus comments and parsed (see parse_bogus_comment).
if self.strict:
k = self.parse_declaration(i)
else:
try:
k = self.parse_declaration(i)
except HTMLParseError:
k = self.parse_bogus_comment(i)
elif (i + 1) < n: elif (i + 1) < n:
self.handle_data("<") self.handle_data("<")
k = i + 1 k = i + 1
...@@ -256,6 +266,19 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -256,6 +266,19 @@ class HTMLParser(_markupbase.ParserBase):
i = self.updatepos(i, n) i = self.updatepos(i, n)
self.rawdata = rawdata[i:] self.rawdata = rawdata[i:]
# Internal -- parse bogus comment, return end or -1 if not terminated
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
def parse_bogus_comment(self, i, report=1):
    """Parse a bogus comment that starts at index *i* of ``self.rawdata``.

    Everything between the opening ``<!`` and the next ``>`` is taken as
    the comment text.

    i -- index in ``self.rawdata`` at which ``<!`` is expected.
    report -- when true, the comment text is passed to
              ``self.handle_comment()``.

    Returns the index just past the closing ``>``, or -1 if no ``>`` is
    found after the opening ``<!`` (the construct is not terminated).
    """
    rawdata = self.rawdata
    if rawdata[i:i+2] != '<!':
        # The message names this method; it previously said
        # 'parse_comment()', which pointed at the wrong function.
        self.error('unexpected call to parse_bogus_comment()')
    pos = rawdata.find('>', i+2)
    if pos == -1:
        # No closing '>' in the buffer.
        return -1
    if report:
        # Strip the leading '<!' and the trailing '>' from the reported text.
        self.handle_comment(rawdata[i+2:pos])
    return pos + 1
# Internal -- parse processing instr, return end or -1 if not terminated # Internal -- parse processing instr, return end or -1 if not terminated
def parse_pi(self, i): def parse_pi(self, i):
rawdata = self.rawdata rawdata = self.rawdata
......
...@@ -323,6 +323,23 @@ DOCTYPE html [ ...@@ -323,6 +323,23 @@ DOCTYPE html [
("endtag", element_lower)], ("endtag", element_lower)],
collector=Collector()) collector=Collector())
def test_comments(self):
html = ("<!-- I'm a valid comment -->"
'<!--me too!-->'
'<!------>'
'<!---->'
'<!----I have many hyphens---->'
'<!-- I have a > in the middle -->'
'<!-- and I have -- in the middle! -->')
expected = [('comment', " I'm a valid comment "),
('comment', 'me too!'),
('comment', '--'),
('comment', ''),
('comment', '--I have many hyphens--'),
('comment', ' I have a > in the middle '),
('comment', ' and I have -- in the middle! ')]
self._run_check(html, expected)
def test_condcoms(self): def test_condcoms(self):
html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->' html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->'
'<!--[if IE 8]>condcoms<![endif]-->' '<!--[if IE 8]>condcoms<![endif]-->'
...@@ -426,6 +443,19 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): ...@@ -426,6 +443,19 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
# see #12888 # see #12888
self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050) self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)
def test_broken_comments(self):
html = ('<! not really a comment >'
'<! not a comment either -->'
'<! -- close enough -->'
'<!!! another bogus comment !!!>')
expected = [
('comment', ' not really a comment '),
('comment', ' not a comment either --'),
('comment', ' -- close enough --'),
('comment', '!! another bogus comment !!!'),
]
self._run_check(html, expected)
def test_broken_condcoms(self): def test_broken_condcoms(self):
# these condcoms are missing the '--' after '<!' and before the '>' # these condcoms are missing the '--' after '<!' and before the '>'
html = ('<![if !(IE)]>broken condcom<![endif]>' html = ('<![if !(IE)]>broken condcom<![endif]>'
......
...@@ -466,6 +466,9 @@ Core and Builtins ...@@ -466,6 +466,9 @@ Core and Builtins
Library Library
------- -------
- Issue #13960: HTMLParser is now able to handle broken comments when
strict=False.
- Issue #13921: Undocument and clean up sqlite3.OptimizedUnicode, - Issue #13921: Undocument and clean up sqlite3.OptimizedUnicode,
which is obsolete in Python 3.x. It's now aliased to str for which is obsolete in Python 3.x. It's now aliased to str for
backwards compatibility. backwards compatibility.
...@@ -498,7 +501,7 @@ Library ...@@ -498,7 +501,7 @@ Library
- Issue #10881: Fix test_site failure with OS X framework builds. - Issue #10881: Fix test_site failure with OS X framework builds.
- Issue #964437 Make IDLE help window non-modal. - Issue #964437: Make IDLE help window non-modal.
Patch by Guilherme Polo and Roger Serwy. Patch by Guilherme Polo and Roger Serwy.
- Issue #13734: Add os.fwalk(), a directory walking function yielding file - Issue #13734: Add os.fwalk(), a directory walking function yielding file
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment