Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
3163a3b4
Commit
3163a3b4
authored
Mar 30, 2003
by
Martin v. Löwis
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Patch #545300: Support marked sections.
parent
a9656493
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
107 additions
and
16 deletions
+107
-16
Lib/markupbase.py
Lib/markupbase.py
+69
-2
Lib/sgmllib.py
Lib/sgmllib.py
+8
-14
Lib/test/test_htmllib.py
Lib/test/test_htmllib.py
+27
-0
Misc/NEWS
Misc/NEWS
+3
-0
No files found.
Lib/markupbase.py
View file @
3163a3b4
...
...
@@ -4,6 +4,13 @@ import re
# Bound ``match`` for a name token: a letter followed by letters, digits,
# "-", "_" or ".", plus any trailing whitespace.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Bound ``match`` for a single- or double-quoted string literal, plus any
# trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# End of a comment: "--", optional whitespace, ">".
_commentclose = re.compile(r'--\s*>')
# End of a standard marked section: "]", "]", ">" with optional whitespace
# between the three characters.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# End of an MS Office marked section: "]", optional whitespace, ">".
_msmarkedsectionclose = re.compile(r']\s*>')

# All patterns above are already compiled, so the module name is removed
# from this namespace.
del re
...
...
@@ -53,6 +60,13 @@ class ParserBase:
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
# ISO 8879:1986, however, has more complex
# declaration syntax for elements in <!...>, including:
# --comment--
# [marked section]
# name in the following list: ENTITY, DOCTYPE, ELEMENT,
# ATTLIST, NOTATION, SHORTREF, USEMAP,
# LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
rawdata = self.rawdata
j = i + 2
assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
...
...
@@ -60,9 +74,19 @@ class ParserBase:
# Start of comment followed by buffer boundary,
# or just a buffer boundary.
return -1
#
in practice, this should look like: ((name|stringlit) S*)
+ '
>
'
#
A simple, practical version could look like: ((name|stringlit) S*)
+ '
>
'
n = len(rawdata)
decltype, j = self._scan_name(j, i)
if rawdata[j:j+1] == '
--
': #comment
# Locate --.*-- as the body of the comment
return self.parse_comment(i)
elif rawdata[j] == '
[
': #marked section
# Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
# Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
# Note that this is extended by Microsoft Office "Save as Web" function
# to include [if...] and [endif].
return self.parse_marked_section(i)
else: #all other declaration elements
decltype, j = self._scan_name(j, i)
if j < 0:
return j
if decltype == "doctype":
...
...
@@ -87,8 +111,15 @@ class ParserBase:
elif c in self._decl_otherchars:
j = j + 1
elif c == "
[
":
# this could be handled in a separate doctype parser
if decltype == "
doctype
":
j = self._parse_doctype_subset(j + 1, i)
elif decltype in ("
attlist
", "
linktype
", "
link
", "
element
"):
# must tolerate []'d groups in a content model in an element declaration
# also in data attribute specifications of attlist declaration
# also link type declaration subsets in linktype declarations
# also link attribute specification lists in link declarations
self.error("
unsupported
'['
char
in
%
s
declaration
" % decltype)
else:
self.error("
unexpected
'['
char
in
declaration
")
else:
...
...
@@ -98,6 +129,42 @@ class ParserBase:
return j
return -1 # incomplete
# Internal -- parse a marked section
# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
def parse_marked_section(self, i, report=1):
    """Parse the marked section whose opening ``<![`` is at ``rawdata[i]``.

    Two closing delimiters are recognised, chosen by the status keyword
    that follows ``<![``:

    * ``temp``, ``cdata``, ``ignore``, ``include``, ``rcdata`` -- the
      standard ``]]>`` ending;
    * ``if``, ``else``, ``endif`` -- the MS Office ``]>`` ending.

    Any other keyword is reported through ``self.error()``.

    Arguments:
        i: index into ``self.rawdata`` of the opening ``<![``.
        report: when true, the text between ``<![`` and the closing
            delimiter is passed to ``self.unknown_decl()``.

    Returns the index just past the closing delimiter; -1 when no closing
    delimiter is found in the buffered data; or the negative value from
    ``self._scan_name()`` when the keyword could not be scanned.
    """
    rawdata = self.rawdata
    assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
    sectName, j = self._scan_name(i+3, i)
    if j < 0:
        return j
    if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
        # look for standard ]]> ending
        match = _markedsectionclose.search(rawdata, i+3)
    elif sectName in ("if", "else", "endif"):
        # look for MS Office ]> ending
        match = _msmarkedsectionclose.search(rawdata, i+3)
    else:
        # repr() gives the same text as the backquote form but is also
        # valid syntax on interpreters that dropped backquotes.
        self.error('unknown status keyword %s in marked section'
                   % repr(rawdata[i+3:j]))
        # NOTE(review): error() is expected to raise.  If an override
        # returns instead, leave ``match`` defined so the section is
        # treated as unterminated rather than failing on an unbound name.
        match = None
    if not match:
        return -1
    if report:
        j = match.start(0)
        self.unknown_decl(rawdata[i+3:j])
    return match.end(0)
# Internal -- parse comment, return length or -1 if not terminated
def parse_comment(self, i, report=1):
    """Parse the comment whose opening ``<!--`` is at ``rawdata[i]``.

    Calls ``self.error()`` when the text at ``i`` is not ``<!--``.  When
    ``report`` is true the comment body is passed to
    ``self.handle_comment()``.

    Returns the index just past the closing delimiter, or -1 when the
    comment is not terminated in the buffered data.
    """
    text = self.rawdata
    body_start = i + 4
    if text[i:body_start] != '<!--':
        self.error('unexpected call to parse_comment()')
    closing = _commentclose.search(text, body_start)
    if closing is None:
        return -1
    if report:
        self.handle_comment(text[body_start:closing.start(0)])
    return closing.end(0)
# Internal -- scan past the internal subset in a <!DOCTYPE declaration,
# returning the index just past any whitespace following the trailing ']'.
def _parse_doctype_subset(self, i, declstartpos):
...
...
Lib/sgmllib.py
View file @
3163a3b4
...
...
@@ -30,7 +30,6 @@ shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag
=
re
.
compile
(
'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/'
)
piclose
=
re
.
compile
(
'>'
)
endbracket
=
re
.
compile
(
'[<>]'
)
commentclose
=
re
.
compile
(
r'--\
s*>
')
tagfind
=
re
.
compile
(
'[a-zA-Z][-_.a-zA-Z0-9]*'
)
attrfind
=
re
.
compile
(
r'\
s*([
a-zA-Z_][-:.a-zA-Z_0-9]*)(\
s*=
\s*'
...
...
@@ -145,6 +144,10 @@ class SGMLParser(markupbase.ParserBase):
break
continue
if
rawdata
.
startswith
(
"<!--"
,
i
):
# Strictly speaking, a comment is --.*--
# within a declaration tag <!...>.
# This should be removed,
# and comments handled only in parse_declaration.
k
=
self
.
parse_comment
(
i
)
if
k
<
0
:
break
i
=
k
...
...
@@ -202,19 +205,6 @@ class SGMLParser(markupbase.ParserBase):
self
.
rawdata
=
rawdata
[
i
:]
# XXX if end: check for empty stack
# Internal -- parse comment, return length or -1 if not terminated
def parse_comment(self, i, report=1):
    """Parse the comment whose opening ``<!--`` is at ``rawdata[i]``.

    ``self.error()`` is called when the text at ``i`` is not ``<!--``.
    With a true ``report`` the comment body goes to
    ``self.handle_comment()``.

    Returns the index just past the closing delimiter, or -1 if no
    closing delimiter is found.
    """
    rawdata = self.rawdata
    opener = rawdata[i:i+4]
    if opener != '<!--':
        self.error('unexpected call to parse_comment()')
    found = commentclose.search(rawdata, i+4)
    if found:
        if report:
            self.handle_comment(rawdata[i+4:found.start(0)])
        return found.end(0)
    return -1
# Extensions for the DOCTYPE scanner:
_decl_otherchars
=
'='
...
...
@@ -471,6 +461,10 @@ class TestSGMLParser(SGMLParser):
self
.
flush
()
print
'*** unknown char ref: &#'
+
ref
+
';'
def unknown_decl(self, data):
    """Call ``self.flush()`` and then print the declaration text ``data``."""
    self.flush()
    message = '*** unknown decl: [' + data + ']'
    # A single parenthesised operand prints exactly what the bare print
    # statement would.
    print(message)
def
close
(
self
):
SGMLParser
.
close
(
self
)
self
.
flush
()
...
...
Lib/test/test_htmllib.py
View file @
3163a3b4
...
...
@@ -16,6 +16,17 @@ class AnchorCollector(htmllib.HTMLParser):
def anchor_bgn(self, *args):
    # Record the positional arguments of this call as one tuple.
    self.__anchors.append(args)
class DeclCollector(htmllib.HTMLParser):
    """HTMLParser subclass that records every text passed to unknown_decl()."""

    def __init__(self, *positional, **keywords):
        # Create the list before running the base initialiser, which
        # receives all arguments unchanged.
        self.__decls = []
        htmllib.HTMLParser.__init__(self, *positional, **keywords)

    def get_decl_info(self):
        """Return the list of recorded declaration texts."""
        recorded = self.__decls
        return recorded

    def unknown_decl(self, data):
        """Append ``data`` to the recorded declarations."""
        self.__decls.append(data)
class
HTMLParserTestCase
(
unittest
.
TestCase
):
def
test_anchor_collection
(
self
):
...
...
@@ -33,6 +44,22 @@ class HTMLParserTestCase(unittest.TestCase):
(
''
,
'frob'
,
''
),
])
def test_decl_collection(self):
    # See SF patch #545300
    # Feed a document containing MS Office style marked sections and check
    # that both are handed to unknown_decl() in document order.
    parser = DeclCollector(formatter.NullFormatter(), verbose=1)
    parser.feed(
        """<html>
<body>
hallo
<![if !supportEmptyParas]> <![endif]>
</body>
</html>
""")
    parser.close()
    # assertEqual is the canonical spelling; assertEquals is only a
    # deprecated alias of it.
    self.assertEqual(parser.get_decl_info(),
                     ["if !supportEmptyParas",
                      "endif"
                      ])
def
test_main
():
test_support
.
run_unittest
(
HTMLParserTestCase
)
...
...
Misc/NEWS
View file @
3163a3b4
...
...
@@ -67,6 +67,9 @@ Extension modules
Library
-------
- sgmllib now supports SGML marked sections, in particular the
MS Office extensions.
- The urllib module now offers support for the iterator protocol.
SF patch 698520 contributed by Brett Cannon.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment