Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
fa3702dc
Commit
fa3702dc
authored
Feb 10, 2012
by
Ezio Melotti
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
#13960: HTMLParser is now able to handle broken comments when strict=False.
parent
5b14d732
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
58 additions
and
2 deletions
+58
-2
Lib/html/parser.py
Lib/html/parser.py
+24
-1
Lib/test/test_htmlparser.py
Lib/test/test_htmlparser.py
+30
-0
Misc/NEWS
Misc/NEWS
+4
-1
No files found.
Lib/html/parser.py
View file @
fa3702dc
...
@@ -184,7 +184,17 @@ class HTMLParser(_markupbase.ParserBase):
...
@@ -184,7 +184,17 @@ class HTMLParser(_markupbase.ParserBase):
elif startswith("
<
?
", i):
elif startswith("
<
?
", i):
k = self.parse_pi(i)
k = self.parse_pi(i)
elif startswith("
<
!
", i):
elif startswith("
<
!
", i):
k = self.parse_declaration(i)
# this might fail with things like <! not a comment > or
# <! -- space before '--' -->. When strict is True an
# error is raised, when it's False they will be considered
# as bogus comments and parsed (see parse_bogus_comment).
if self.strict:
k = self.parse_declaration(i)
else:
try:
k = self.parse_declaration(i)
except HTMLParseError:
k = self.parse_bogus_comment(i)
elif (i + 1) < n:
elif (i + 1) < n:
self.handle_data("
<
")
self.handle_data("
<
")
k = i + 1
k = i + 1
...
@@ -256,6 +266,19 @@ class HTMLParser(_markupbase.ParserBase):
...
@@ -256,6 +266,19 @@ class HTMLParser(_markupbase.ParserBase):
i
=
self
.
updatepos
(
i
,
n
)
i
=
self
.
updatepos
(
i
,
n
)
self
.
rawdata
=
rawdata
[
i
:]
self
.
rawdata
=
rawdata
[
i
:]
# Internal -- parse bogus comment, return end or -1 if not terminated
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
def parse_bogus_comment(self, i, report=1):
    """Parse a bogus comment that starts at ``self.rawdata[i]``.

    A bogus comment is anything that opens with ``<!`` and runs up to
    the next ``>`` (for example ``<! not a comment >``).

    Parameters:
        i: index into ``self.rawdata`` where the ``<!`` is expected.
        report: when true (the default), the text between ``<!`` and
            ``>`` is passed to ``self.handle_comment()``.

    Returns:
        The index just past the closing ``>``, or -1 when no ``>`` is
        found after the opening ``<!``.
    """
    rawdata = self.rawdata
    if rawdata[i:i+2] != '<!':
        # Sanity check: callers are expected to dispatch here only on
        # '<!'.  The message names this method so the report points at
        # the right place.
        self.error('unexpected call to parse_bogus_comment()')
    pos = rawdata.find('>', i+2)
    if pos == -1:
        # Not terminated in the data seen so far.
        return -1
    if report:
        self.handle_comment(rawdata[i+2:pos])
    return pos + 1
# Internal -- parse processing instr, return end or -1 if not terminated
# Internal -- parse processing instr, return end or -1 if not terminated
def
parse_pi
(
self
,
i
):
def
parse_pi
(
self
,
i
):
rawdata
=
self
.
rawdata
rawdata
=
self
.
rawdata
...
...
Lib/test/test_htmlparser.py
View file @
fa3702dc
...
@@ -323,6 +323,23 @@ DOCTYPE html [
...
@@ -323,6 +323,23 @@ DOCTYPE html [
(
"endtag"
,
element_lower
)],
(
"endtag"
,
element_lower
)],
collector
=
Collector
())
collector
=
Collector
())
def test_comments(self):
    """Feed a run of well-formed comments and check the reported text."""
    # Each row pairs the markup fed to the parser with the comment text
    # expected back from it.
    cases = [
        ("<!-- I'm a valid comment -->", " I'm a valid comment "),
        ('<!--me too!-->', 'me too!'),
        ('<!------>', '--'),
        ('<!---->', ''),
        ('<!----I have many hyphens---->', '--I have many hyphens--'),
        ('<!-- I have a > in the middle -->', ' I have a > in the middle '),
        ('<!-- and I have -- in the middle! -->',
         ' and I have -- in the middle! '),
    ]
    html = ''.join(markup for markup, _ in cases)
    expected = [('comment', text) for _, text in cases]
    self._run_check(html, expected)
def
test_condcoms
(
self
):
def
test_condcoms
(
self
):
html
=
(
'<!--[if IE & !(lte IE 8)]>aren
\
'
t<![endif]-->'
html
=
(
'<!--[if IE & !(lte IE 8)]>aren
\
'
t<![endif]-->'
'<!--[if IE 8]>condcoms<![endif]-->'
'<!--[if IE 8]>condcoms<![endif]-->'
...
@@ -426,6 +443,19 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
...
@@ -426,6 +443,19 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
# see #12888
# see #12888
self
.
assertEqual
(
p
.
unescape
(
'{ '
*
1050
),
'{ '
*
1050
)
self
.
assertEqual
(
p
.
unescape
(
'{ '
*
1050
),
'{ '
*
1050
)
def test_broken_comments(self):
    """Feed malformed '<!' constructs and check they come back as comments."""
    # Each row pairs the broken markup with the comment text expected
    # back from the parser.
    cases = [
        ('<! not really a comment >', ' not really a comment '),
        ('<! not a comment either -->', ' not a comment either --'),
        ('<! -- close enough -->', ' -- close enough --'),
        ('<!!! another bogus comment !!!>', '!! another bogus comment !!!'),
    ]
    html = ''.join(markup for markup, _ in cases)
    expected = [('comment', text) for _, text in cases]
    self._run_check(html, expected)
def
test_broken_condcoms
(
self
):
def
test_broken_condcoms
(
self
):
# these condcoms are missing the '--' after '<!' and before the '>'
# these condcoms are missing the '--' after '<!' and before the '>'
html
=
(
'<![if !(IE)]>broken condcom<![endif]>'
html
=
(
'<![if !(IE)]>broken condcom<![endif]>'
...
...
Misc/NEWS
View file @
fa3702dc
...
@@ -113,6 +113,9 @@ Core and Builtins
...
@@ -113,6 +113,9 @@ Core and Builtins
Library
Library
-------
-------
- Issue #13960: HTMLParser is now able to handle broken comments when
strict=False.
- Issue #9021: Add an introduction to the copy module documentation.
- Issue #9021: Add an introduction to the copy module documentation.
- Issue #6005: Examples in the socket library documentation use sendall, where
- Issue #6005: Examples in the socket library documentation use sendall, where
...
@@ -123,7 +126,7 @@ Library
...
@@ -123,7 +126,7 @@ Library
- Issue #10881: Fix test_site failure with OS X framework builds.
- Issue #10881: Fix test_site failure with OS X framework builds.
- Issue #964437 Make IDLE help window non-modal.
- Issue #964437
:
Make IDLE help window non-modal.
Patch by Guilherme Polo and Roger Serwy.
Patch by Guilherme Polo and Roger Serwy.
- Issue #2945: Make the distutils upload command aware of bdist_rpm products.
- Issue #2945: Make the distutils upload command aware of bdist_rpm products.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment