Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
7165d8b9
Commit
7165d8b9
authored
Nov 07, 2013
by
Ezio Melotti
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
#19480: HTMLParser now accepts all valid start-tag names as defined by the HTML5 standard.
parent
d5a2f0b3
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
28 additions
and
13 deletions
+28
-13
Lib/html/parser.py
Lib/html/parser.py
+12
-9
Lib/test/test_htmlparser.py
Lib/test/test_htmlparser.py
+13
-4
Misc/NEWS
Misc/NEWS
+3
-0
No files found.
Lib/html/parser.py
View file @
7165d8b9
...
@@ -23,16 +23,16 @@ charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
...
@@ -23,16 +23,16 @@ charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
starttagopen
=
re
.
compile
(
'<[a-zA-Z]'
)
starttagopen
=
re
.
compile
(
'<[a-zA-Z]'
)
piclose
=
re
.
compile
(
'>'
)
piclose
=
re
.
compile
(
'>'
)
commentclose
=
re
.
compile
(
r'--\
s*>
')
commentclose
=
re
.
compile
(
r'--\
s*>
')
tagfind = re.compile('
([
a
-
zA
-
Z
][
-
.
a
-
zA
-
Z0
-
9
:
_
]
*
)(
?
:
\
s
|/
(
?!
>
))
*
')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('
[
a
-
zA
-
Z
][
^
\
t
\
n
\
r
\
f
/>
\
x00
]
*
')
# Note:
# Note:
# 1) the strict attrfind isn't really strict, but we can't make it
# 1) the strict attrfind isn't really strict, but we can't make it
# correctly strict without breaking backward compatibility;
# correctly strict without breaking backward compatibility;
# 2) if you change attrfind remember to update locatestarttagend too;
# 2) if you change tagfind/attrfind remember to update locatestarttagend too;
# 3) if you change attrfind and/or locatestarttagend the parser will
# 3) if you change tagfind/attrfind and/or locatestarttagend the parser will
# explode, so don't do it.
# explode, so don't do it.
tagfind
=
re
.
compile
(
'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:
\
s|/(?!>))*
'
)
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('
([
a
-
zA
-
Z
][
^
\
t
\
n
\
r
\
f
/>
\
x00
]
*
)(
?
:
\
s
|/
(
?!
>
))
*
')
attrfind = re.compile(
attrfind = re.compile(
r'
\
s
*
([
a
-
zA
-
Z_
][
-
.:
a
-
zA
-
Z_0
-
9
]
*
)(
\
s
*=
\
s
*
'
r'
\
s
*
([
a
-
zA
-
Z_
][
-
.:
a
-
zA
-
Z_0
-
9
]
*
)(
\
s
*=
\
s
*
'
r'
(
\
'[^
\
'
]*
\
'
|"[^"]*"|[^
\
s
"
\
'
=<>`]*))?'
)
r'
(
\
'[^
\
'
]*
\
'
|"[^"]*"|[^
\
s
"
\
'
=<>`]*))?'
)
...
@@ -54,7 +54,7 @@ locatestarttagend = re.compile(r"""
...
@@ -54,7 +54,7 @@ locatestarttagend = re.compile(r"""
\
s* #
trailing whitespace
\
s* #
trailing whitespace
"""
,
re
.
VERBOSE
)
"""
,
re
.
VERBOSE
)
locatestarttagend_tolerant
=
re
.
compile
(
r"""
locatestarttagend_tolerant
=
re
.
compile
(
r"""
<[a-zA-Z][
-.a-zA-Z0-9:_]*
# tag name
<[a-zA-Z][
^\t\n\r\f />\x00]*
# tag name
(?:[\
s/]* # op
tional whitespace before attribute name
(?:[\
s/]* # op
tional whitespace before attribute name
(?:(?<=['"\
s/])[^
\s/>][^\
s/=>]* #
attribute name
(?:(?<=['"\
s/])[^
\s/>][^\
s/=>]* #
attribute name
(?:\
s*=+
\s* # value indicator
(?:\
s*=+
\s* # value indicator
...
@@ -328,7 +328,10 @@ class HTMLParser(_markupbase.ParserBase):
...
@@ -328,7 +328,10 @@ class HTMLParser(_markupbase.ParserBase):
# Now parse the data between i+1 and j into a tag and attrs
# Now parse the data between i+1 and j into a tag and attrs
attrs
=
[]
attrs
=
[]
match
=
tagfind
.
match
(
rawdata
,
i
+
1
)
if
self
.
strict
:
match
=
tagfind
.
match
(
rawdata
,
i
+
1
)
else
:
match
=
tagfind_tolerant
.
match
(
rawdata
,
i
+
1
)
assert
match
,
'unexpected call to parse_starttag()'
assert
match
,
'unexpected call to parse_starttag()'
k
=
match
.
end
()
k
=
match
.
end
()
self
.
lasttag
=
tag
=
match
.
group
(
1
).
lower
()
self
.
lasttag
=
tag
=
match
.
group
(
1
).
lower
()
...
@@ -440,7 +443,7 @@ class HTMLParser(_markupbase.ParserBase):
...
@@ -440,7 +443,7 @@ class HTMLParser(_markupbase.ParserBase):
return
i
+
3
return
i
+
3
else
:
else
:
return
self
.
parse_bogus_comment
(
i
)
return
self
.
parse_bogus_comment
(
i
)
tagname
=
namematch
.
group
().
lower
()
tagname
=
namematch
.
group
(
1
).
lower
()
# consume and ignore other stuff between the name and the >
# consume and ignore other stuff between the name and the >
# Note: this is not 100% correct, since we might have things like
# Note: this is not 100% correct, since we might have things like
# </tag attr=">">, but looking for > after the name should cover
# </tag attr=">">, but looking for > after the name should cover
...
...
Lib/test/test_htmlparser.py
View file @
7165d8b9
...
@@ -229,6 +229,11 @@ text
...
@@ -229,6 +229,11 @@ text
self
.
_parse_error
(
"<a foo='bar"
)
self
.
_parse_error
(
"<a foo='bar"
)
self
.
_parse_error
(
"<a foo='>'"
)
self
.
_parse_error
(
"<a foo='>'"
)
self
.
_parse_error
(
"<a foo='>"
)
self
.
_parse_error
(
"<a foo='>"
)
self
.
_parse_error
(
"<a$>"
)
self
.
_parse_error
(
"<a$b>"
)
self
.
_parse_error
(
"<a$b/>"
)
self
.
_parse_error
(
"<a$b >"
)
self
.
_parse_error
(
"<a$b />"
)
def
test_valid_doctypes
(
self
):
def
test_valid_doctypes
(
self
):
# from http://www.w3.org/QA/2002/04/valid-dtd-list.html
# from http://www.w3.org/QA/2002/04/valid-dtd-list.html
...
@@ -368,8 +373,8 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
...
@@ -368,8 +373,8 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
(
'starttag'
,
'html'
,
[(
'<html'
,
None
)]),
(
'starttag'
,
'html'
,
[(
'<html'
,
None
)]),
(
'data'
,
'te>>xt'
),
(
'data'
,
'te>>xt'
),
(
'entityref'
,
'a'
),
(
'entityref'
,
'a'
),
(
'data'
,
'<
<bc
'
),
(
'data'
,
'<'
),
(
'
endtag'
,
'a'
),
(
'
starttag'
,
'bc<'
,
[(
'a'
,
None
)]
),
(
'endtag'
,
'html'
),
(
'endtag'
,
'html'
),
(
'data'
,
'
\
n
<img src="URL>'
),
(
'data'
,
'
\
n
<img src="URL>'
),
(
'comment'
,
'/img'
),
(
'comment'
,
'/img'
),
...
@@ -380,8 +385,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
...
@@ -380,8 +385,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
self
.
_run_check
(
"</$>"
,
[(
'comment'
,
'$'
)])
self
.
_run_check
(
"</$>"
,
[(
'comment'
,
'$'
)])
self
.
_run_check
(
"</"
,
[(
'data'
,
'</'
)])
self
.
_run_check
(
"</"
,
[(
'data'
,
'</'
)])
self
.
_run_check
(
"</a"
,
[(
'data'
,
'</a'
)])
self
.
_run_check
(
"</a"
,
[(
'data'
,
'</a'
)])
# XXX this might be wrong
self
.
_run_check
(
"<a<a>"
,
[(
'starttag'
,
'a<a'
,
[])])
self
.
_run_check
(
"<a<a>"
,
[(
'data'
,
'<a'
),
(
'starttag'
,
'a'
,
[])])
self
.
_run_check
(
"</a<a>"
,
[(
'endtag'
,
'a<a'
)])
self
.
_run_check
(
"</a<a>"
,
[(
'endtag'
,
'a<a'
)])
self
.
_run_check
(
"<!"
,
[(
'data'
,
'<!'
)])
self
.
_run_check
(
"<!"
,
[(
'data'
,
'<!'
)])
self
.
_run_check
(
"<a"
,
[(
'data'
,
'<a'
)])
self
.
_run_check
(
"<a"
,
[(
'data'
,
'<a'
)])
...
@@ -389,6 +393,11 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
...
@@ -389,6 +393,11 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
self
.
_run_check
(
"<a foo='bar"
,
[(
'data'
,
"<a foo='bar"
)])
self
.
_run_check
(
"<a foo='bar"
,
[(
'data'
,
"<a foo='bar"
)])
self
.
_run_check
(
"<a foo='>'"
,
[(
'data'
,
"<a foo='>'"
)])
self
.
_run_check
(
"<a foo='>'"
,
[(
'data'
,
"<a foo='>'"
)])
self
.
_run_check
(
"<a foo='>"
,
[(
'data'
,
"<a foo='>"
)])
self
.
_run_check
(
"<a foo='>"
,
[(
'data'
,
"<a foo='>"
)])
self
.
_run_check
(
"<a$>"
,
[(
'starttag'
,
'a$'
,
[])])
self
.
_run_check
(
"<a$b>"
,
[(
'starttag'
,
'a$b'
,
[])])
self
.
_run_check
(
"<a$b/>"
,
[(
'startendtag'
,
'a$b'
,
[])])
self
.
_run_check
(
"<a$b >"
,
[(
'starttag'
,
'a$b'
,
[])])
self
.
_run_check
(
"<a$b />"
,
[(
'startendtag'
,
'a$b'
,
[])])
def
test_slashes_in_starttag
(
self
):
def
test_slashes_in_starttag
(
self
):
self
.
_run_check
(
'<a foo="var"/>'
,
[(
'startendtag'
,
'a'
,
[(
'foo'
,
'var'
)])])
self
.
_run_check
(
'<a foo="var"/>'
,
[(
'startendtag'
,
'a'
,
[(
'foo'
,
'var'
)])])
...
...
Misc/NEWS
View file @
7165d8b9
...
@@ -13,6 +13,9 @@ Core and Builtins
...
@@ -13,6 +13,9 @@ Core and Builtins
Library
Library
-------
-------
- Issue #19480: HTMLParser now accepts all valid start-tag names as defined
by the HTML5 standard.
- Issue #6157: Fixed tkinter.Text.debug(). Original patch by Guilherme Polo.
- Issue #6157: Fixed tkinter.Text.debug(). Original patch by Guilherme Polo.
- Issue #6160: The bbox() method of tkinter.Spinbox now returns a tuple of
- Issue #6160: The bbox() method of tkinter.Spinbox now returns a tuple of
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment