Commit 15cb4892 authored by Ezio Melotti

#13358: HTMLParser now calls handle_data only once for each CDATA.

parent 8008f2ab
...@@ -14,7 +14,6 @@ import re ...@@ -14,7 +14,6 @@ import re
# Regular expressions used for parsing # Regular expressions used for parsing
interesting_normal = re.compile('[&<]') interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
incomplete = re.compile('&[a-zA-Z#]') incomplete = re.compile('&[a-zA-Z#]')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
...@@ -149,8 +148,8 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -149,8 +148,8 @@ class HTMLParser(_markupbase.ParserBase):
return self.__starttag_text return self.__starttag_text
def set_cdata_mode(self, elem): def set_cdata_mode(self, elem):
self.interesting = interesting_cdata
self.cdata_elem = elem.lower() self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
def clear_cdata_mode(self): def clear_cdata_mode(self):
self.interesting = interesting_normal self.interesting = interesting_normal
...@@ -168,6 +167,8 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -168,6 +167,8 @@ class HTMLParser(_markupbase.ParserBase):
if match: if match:
j = match.start() j = match.start()
else: else:
if self.cdata_elem:
break
j = n j = n
if i < j: self.handle_data(rawdata[i:j]) if i < j: self.handle_data(rawdata[i:j])
i = self.updatepos(i, j) i = self.updatepos(i, j)
...@@ -250,7 +251,7 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -250,7 +251,7 @@ class HTMLParser(_markupbase.ParserBase):
else: else:
assert 0, "interesting.search() lied" assert 0, "interesting.search() lied"
# end while # end while
if end and i < n: if end and i < n and not self.cdata_elem:
self.handle_data(rawdata[i:n]) self.handle_data(rawdata[i:n])
i = self.updatepos(i, n) i = self.updatepos(i, n)
self.rawdata = rawdata[i:] self.rawdata = rawdata[i:]
......
...@@ -301,7 +301,27 @@ DOCTYPE html [ ...@@ -301,7 +301,27 @@ DOCTYPE html [
("data", content), ("data", content),
("endtag", element_lower)]) ("endtag", element_lower)])
def test_cdata_with_closing_tags(self):
    # Regression test for issue #13358: HTMLParser must call handle_data
    # exactly once for each CDATA section.
    #
    # The stock event collector normalizes events inside get_events, so a
    # subclass that returns the raw, un-normalized event list is used here.
    class RawEventCollector(EventCollector):
        def get_events(self):
            return self.events

    # Markup-looking text that has to come through as one data chunk.
    content = """<!-- not a comment --> &not-an-entity-ref;
<a href="" /> </p><p> <span></span></style>
'</script' + '>'"""

    # Closing-tag names padded with spaces/newlines on either side.
    closing_names = [
        ' script', 'script ', ' script ',
        '\nscript', 'script\n', '\nscript\n',
    ]
    for element in closing_names:
        element_lower = element.lower().strip()
        s = '<script>{content}</{element}>'.format(
            element=element, content=content)
        expected_events = [
            ("starttag", element_lower, []),
            ("data", content),
            ("endtag", element_lower),
        ]
        self._run_check(s, expected_events, collector=RawEventCollector())
class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
......
...@@ -76,6 +76,8 @@ Core and Builtins ...@@ -76,6 +76,8 @@ Core and Builtins
Library Library
------- -------
- Issue #13358: HTMLParser now calls handle_data only once for each CDATA.
- Issue #4147: minidom's toprettyxml no longer adds whitespace around a text - Issue #4147: minidom's toprettyxml no longer adds whitespace around a text
node when it is the only child of an element. Initial patch by Dan node when it is the only child of an element. Initial patch by Dan
Kenigsberg. Kenigsberg.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment