Issue #20288: Fix handling of invalid numeric character references (e.g. "&#bad;") in HTMLParser.

f27b9a74 · Ezio Melotti · a479b750 · f27b9a74 · f27b9a74 · f27b9a74
Commit f27b9a74 authored Feb 01, 2014 by Ezio Melotti
Hide whitespace changes
Inline Side-by-side

Showing 3 changed files with 11 additions and 3 deletions

Lib/html/parser.py Lib/html/parser.py +3 -3

Lib/test/test_htmlparser.py Lib/test/test_htmlparser.py +6 -0

Misc/NEWS Misc/NEWS +2 -0

No files found.
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -228,9 +228,9 @@ class HTMLParser(_markupbase.ParserBase):
                    i = self.updatepos(i, k)
                    continue
                else:
-                    if ";" in rawdata[i:]: #bail by consuming &#
-                        self.handle_data(rawdata[0:2])
-                        i = self.updatepos(i, 2)
+                    if ";" in rawdata[i:]:  # bail by consuming &#
+                        self.handle_data(rawdata[i:i+2])
+                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)

--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -151,6 +151,12 @@ text
            ("data", "&#bad;"),
            ("endtag", "p"),
        ])
+        # add the [] as a workaround to avoid buffering (see #20288)
+        self._run_check(["<div>&#bad;</div>"], [
+            ("starttag", "div", []),
+            ("data", "&#bad;"),
+            ("endtag", "div"),
+        ])

    def test_unclosed_entityref(self):
        self._run_check("&entityref foo", [

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -45,6 +45,8 @@ Core and Builtins
 Library
 -------

+- Issue #20288: fix handling of invalid numeric charrefs in HTMLParser.
+
 - Issue #20424: Python implementation of io.StringIO now supports lone surrogates.

 - Issue #19456: ntpath.join() now joins relative paths correctly when a drive