Commit 2e3607c1 authored by Ezio Melotti

#7311: fix html.parser to accept non-ASCII attribute values.

parent 9b5ac3ef
......@@ -28,7 +28,7 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# make it correctly strict without breaking backward compatibility.
attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
# Lenient attribute matcher.  Group 1 is the attribute name, group 2 the
# optional "= value" part and group 3 the value itself.  A value may be
# single-quoted, double-quoted, or unquoted; the unquoted form here is any
# run of characters other than '>' and whitespace.
attrfind_tolerant = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
......
......@@ -217,6 +217,23 @@ DOCTYPE html [
("starttag", "a", [("href", "mailto:xyz@example.com")]),
])
def test_attr_nonascii(self):
# see issue 7311
# Each call hands _run_check a start tag whose attribute values contain
# non-ASCII characters, together with the expected "starttag" event in
# which those values appear unchanged.  The three calls cover the three
# attribute-value syntaxes: unquoted, single-quoted and double-quoted.

# Unquoted values: an ASCII path and a non-ASCII alt text.
self._run_check("<img src=/foo/bar.png alt=\u4e2d\u6587>", [
("starttag", "img", [("src", "/foo/bar.png"),
("alt", "\u4e2d\u6587")]),
])
# Single-quoted values.
self._run_check("<a title='\u30c6\u30b9\u30c8' "
"href='\u30c6\u30b9\u30c8.html'>", [
("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
("href", "\u30c6\u30b9\u30c8.html")]),
])
# Double-quoted values; same expected event as the single-quoted case.
self._run_check('<a title="\u30c6\u30b9\u30c8" '
'href="\u30c6\u30b9\u30c8.html">', [
("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
("href", "\u30c6\u30b9\u30c8.html")]),
])
def test_attr_entity_replacement(self):
self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
("starttag", "a", [("b", "&><\"'")]),
......
......@@ -49,6 +49,8 @@ Core and Builtins
Library
-------
- Issue #7311: fix html.parser to accept non-ASCII attribute values.
- Issue #11605: email.parser.BytesFeedParser was incorrectly converting multipart
subparts with an 8bit CTE into unicode instead of preserving the bytes.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment