Commit 2192cb5b authored by Fred Drake

In CDATA mode, make sure entity-reference syntax is not interpreted; entity references are not allowed in that mode.

Do a better job of scanning <!DOCTYPE ...> declarations; based on the
code in
parent b63c5bd0
......@@ -5,7 +5,8 @@
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
# and CDATA (character data -- only end tags are special). RCDATA is
# not supported at all.
import re
......@@ -34,6 +35,9 @@ endbracket = re.compile('[<>]')
special = re.compile('<![^<>]*>')
commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')
declopen = re.compile('<!')
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
......@@ -160,6 +164,10 @@ class SGMLParser:
i = k
elif rawdata[i] == '&':
if self.literal:
i = i+1
match = charref.match(rawdata, i)
if match:
name =
......@@ -210,11 +218,20 @@ class SGMLParser:
# Internal -- parse declaration.
def parse_declaration(self, i):
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
rawdata = self.rawdata
j = i + 2
assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
if rawdata[j:j+1] in ("-", ""):
# Start of comment followed by buffer boundary,
# or just a buffer boundary.
return -1
# in practice, this should look like: ((name|stringlit) S*)+ '>'
n = len(rawdata)
while j < n:
c = rawdata[j:j+1]
c = rawdata[j]
if c == ">":
# end of declaration syntax
......@@ -222,15 +239,16 @@ class SGMLParser:
if c in "\"'":
m = declstringlit.match(rawdata, j)
if not m:
# incomplete or an error?
return -1
return -1 # incomplete
j = m.end()
m = decldata.match(rawdata, j)
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
m = declname.match(rawdata, j)
if not m:
# incomplete or an error?
return -1
return -1 # incomplete
j = m.end()
raise SGMLParseError(
"unexpected char in declaration: %s" % `rawdata[j]`)
# end of buffer between tokens
return -1
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment