Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
d1c7b1af
Commit
d1c7b1af
authored
Feb 13, 2012
by
Ezio Melotti
Browse files
Options
Browse Files
Download
Plain Diff
#13993: merge with 3.2.
parents
3dc74c0a
5211ffe4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
71 additions
and
17 deletions
+71
-17
Lib/html/parser.py
Lib/html/parser.py
+27
-15
Lib/test/test_htmlparser.py
Lib/test/test_htmlparser.py
+41
-2
Misc/NEWS
Misc/NEWS
+3
-0
No files found.
Lib/html/parser.py
View file @
d1c7b1af
...
...
@@ -23,6 +23,9 @@ starttagopen = re.compile('<[a-zA-Z]')
piclose
=
re
.
compile
(
'>'
)
commentclose
=
re
.
compile
(
r'--\
s*>
')
tagfind = re.compile('
[
a
-
zA
-
Z
][
-
.
a
-
zA
-
Z0
-
9
:
_
]
*
')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('
[
a
-
zA
-
Z
][
^
\
t
\
n
\
r
\
f
/>
\
x00
]
*
')
# Note, the strict one of this pair isn't really strict, but we can't
# make it correctly strict without breaking backward compatibility.
attrfind = re.compile(
...
...
@@ -270,7 +273,7 @@ class HTMLParser(_markupbase.ParserBase):
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
def
parse_bogus_comment
(
self
,
i
,
report
=
1
):
rawdata
=
self
.
rawdata
if
rawdata
[
i
:
i
+
2
]
!=
'<!'
:
if
rawdata
[
i
:
i
+
2
]
not
in
(
'<!'
,
'</'
)
:
self
.
error
(
'unexpected call to parse_comment()'
)
pos
=
rawdata
.
find
(
'>'
,
i
+
2
)
if
pos
==
-
1
:
...
...
@@ -398,31 +401,40 @@ class HTMLParser(_markupbase.ParserBase):
match
=
endendtag
.
search
(
rawdata
,
i
+
1
)
# >
if
not
match
:
return
-
1
j
=
match
.
end
()
gtpos
=
match
.
end
()
match
=
endtagfind
.
match
(
rawdata
,
i
)
# </ + tag + >
if
not
match
:
if
self
.
cdata_elem
is
not
None
:
self
.
handle_data
(
rawdata
[
i
:
j
])
return
j
self
.
handle_data
(
rawdata
[
i
:
gtpos
])
return
gtpos
if
self
.
strict
:
self
.
error
(
"bad end tag: %r"
%
(
rawdata
[
i
:
j
],))
k
=
rawdata
.
find
(
'<'
,
i
+
1
,
j
)
if
k
>
i
:
j
=
k
if
j
<=
i
:
j
=
i
+
1
self
.
handle_data
(
rawdata
[
i
:
j
])
return
j
self
.
error
(
"bad end tag: %r"
%
(
rawdata
[
i
:
gtpos
],))
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
namematch
=
tagfind_tolerant
.
match
(
rawdata
,
i
+
2
)
if
not
namematch
:
# w3.org/TR/html5/tokenization.html#end-tag-open-state
if
rawdata
[
i
:
i
+
3
]
==
'</>'
:
return
i
+
3
else
:
return
self
.
parse_bogus_comment
(
i
)
tagname
=
namematch
.
group
().
lower
()
# consume and ignore other stuff between the name and the >
# Note: this is not 100% correct, since we might have things like
# </tag attr=">">, but looking for > after the name should cover
# most of the cases and is much simpler
gtpos
=
rawdata
.
find
(
'>'
,
namematch
.
end
())
self
.
handle_endtag
(
tagname
)
return
gtpos
+
1
elem
=
match
.
group
(
1
).
lower
()
# script or style
if
self
.
cdata_elem
is
not
None
:
if
elem
!=
self
.
cdata_elem
:
self
.
handle_data
(
rawdata
[
i
:
j
])
return
j
self
.
handle_data
(
rawdata
[
i
:
gtpos
])
return
gtpos
self
.
handle_endtag
(
elem
.
lower
())
self
.
clear_cdata_mode
()
return
j
return
gtpos
# Overridable -- finish processing of start+end tag: <tag.../>
def
handle_startendtag
(
self
,
tag
,
attrs
):
...
...
Lib/test/test_htmlparser.py
View file @
d1c7b1af
...
...
@@ -364,8 +364,9 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
(
'data'
,
'<<bc'
),
(
'endtag'
,
'a'
),
(
'endtag'
,
'html'
),
(
'data'
,
'
\
n
<img src="URL><//img></html'
),
(
'endtag'
,
'html'
)])
(
'data'
,
'
\
n
<img src="URL>'
),
(
'comment'
,
'/img'
),
(
'endtag'
,
'html<'
)])
def
test_with_unquoted_attributes
(
self
):
# see #12008
...
...
@@ -403,6 +404,44 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
(
'starttag'
,
'form'
,
[(
'action'
,
'bogus|&#()value'
)])])
def
test_invalid_end_tags
(
self
):
# A collection of broken end tags. <br> is used as separator.
# see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state
# and #13993
html
=
(
'<br></label</p><br></div end tmAd-leaderBoard><br></<h4><br>'
'</li class="unit"><br></li
\
r
\
n
\
t
\
t
\
t
\
t
\
t
\
t
</ul><br></><br>'
)
expected
=
[(
'starttag'
,
'br'
,
[]),
# < is part of the name, / is discarded, p is an attribute
(
'endtag'
,
'label<'
),
(
'starttag'
,
'br'
,
[]),
# text and attributes are discarded
(
'endtag'
,
'div'
),
(
'starttag'
,
'br'
,
[]),
# comment because the first char after </ is not a-zA-Z
(
'comment'
,
'<h4'
),
(
'starttag'
,
'br'
,
[]),
# attributes are discarded
(
'endtag'
,
'li'
),
(
'starttag'
,
'br'
,
[]),
# everything till ul (included) is discarded
(
'endtag'
,
'li'
),
(
'starttag'
,
'br'
,
[]),
# </> is ignored
(
'starttag'
,
'br'
,
[])]
self
.
_run_check
(
html
,
expected
)
def
test_broken_invalid_end_tag
(
self
):
# This is technically wrong (the "> shouldn't be included in the 'data')
# but it is probably not worth fixing (in addition to all the cases of
# the previous test, it would require full attribute parsing).
# see #13993
html
=
'<b>This</b attr=">"> confuses the parser'
expected
=
[(
'starttag'
,
'b'
,
[]),
(
'data'
,
'This'
),
(
'endtag'
,
'b'
),
(
'data'
,
'"> confuses the parser'
)]
self
.
_run_check
(
html
,
expected
)
def
test_correct_detection_of_start_tags
(
self
):
# see #13273
html
=
(
'<div style="" ><b>The <a href="some_url">rain</a> '
...
...
Misc/NEWS
View file @
d1c7b1af
...
...
@@ -466,6 +466,9 @@ Core and Builtins
Library
-------
- Issue #13993: HTMLParser is now able to handle broken end tags when strict=False.
- Issue #13930: lib2to3 now supports writing converted output files to another
  directory tree as well as copying unchanged files and altering the file
  suffix.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment