Commit a3bae336 authored by Fred Drake

Re-factor the SGMLParser class to use the new markupbase.ParserBase class.

Use a new internal method, error(), consistently to raise parse errors;
the new base class also uses this.
Adjust the parse_comment() method to return the new offset into the buffer
instead of the number of characters scanned; this was the only helper
method that did it this way, so we have better consistency now.  Required
to share the new base class.
This fixes SF bug #448482 and #453706.
parent bfc8fea1
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
# not supported at all. # not supported at all.
import markupbase
import re import re
__all__ = ["SGMLParser"] __all__ = ["SGMLParser"]
...@@ -27,24 +28,14 @@ charref = re.compile('&#([0-9]+)[^0-9]') ...@@ -27,24 +28,14 @@ charref = re.compile('&#([0-9]+)[^0-9]')
starttagopen = re.compile('<[>a-zA-Z]') starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piopen = re.compile('<\?')
piclose = re.compile('>') piclose = re.compile('>')
endtagopen = re.compile('</[<>a-zA-Z]')
endbracket = re.compile('[<>]') endbracket = re.compile('[<>]')
special = re.compile('<![^<>]*>')
commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>') commentclose = re.compile(r'--\s*>')
declopen = re.compile('<!')
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile( attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?') r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')
decldata = re.compile(r'[^>\'\"]+')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
class SGMLParseError(RuntimeError): class SGMLParseError(RuntimeError):
"""Exception raised for all parse errors.""" """Exception raised for all parse errors."""
...@@ -62,7 +53,7 @@ class SGMLParseError(RuntimeError): ...@@ -62,7 +53,7 @@ class SGMLParseError(RuntimeError):
# chunks). Entity references are passed by calling # chunks). Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument. # self.handle_entityref() with the entity reference as argument.
class SGMLParser: class SGMLParser(markupbase.ParserBase):
def __init__(self, verbose=0): def __init__(self, verbose=0):
"""Initialize and reset this instance.""" """Initialize and reset this instance."""
...@@ -76,6 +67,7 @@ class SGMLParser: ...@@ -76,6 +67,7 @@ class SGMLParser:
self.lasttag = '???' self.lasttag = '???'
self.nomoretags = 0 self.nomoretags = 0
self.literal = 0 self.literal = 0
markupbase.ParserBase.reset(self)
def setnomoretags(self): def setnomoretags(self):
"""Enter literal mode (CDATA) till EOF. """Enter literal mode (CDATA) till EOF.
...@@ -106,6 +98,9 @@ class SGMLParser: ...@@ -106,6 +98,9 @@ class SGMLParser:
"""Handle the remaining data.""" """Handle the remaining data."""
self.goahead(1) self.goahead(1)
def error(self, message):
raise SGMLParseError(message)
# Internal -- handle data as far as reasonable. May leave state # Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is # and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker. # true, force handling all data as if followed by EOF marker.
...@@ -119,9 +114,10 @@ class SGMLParser: ...@@ -119,9 +114,10 @@ class SGMLParser:
i = n i = n
break break
match = interesting.search(rawdata, i) match = interesting.search(rawdata, i)
if match: j = match.start(0) if match: j = match.start()
else: j = n else: j = n
if i < j: self.handle_data(rawdata[i:j]) if i < j:
self.handle_data(rawdata[i:j])
i = j i = j
if i == n: break if i == n: break
if rawdata[i] == '<': if rawdata[i] == '<':
...@@ -134,36 +130,31 @@ class SGMLParser: ...@@ -134,36 +130,31 @@ class SGMLParser:
if k < 0: break if k < 0: break
i = k i = k
continue continue
if endtagopen.match(rawdata, i): if rawdata.startswith("</", i):
k = self.parse_endtag(i) k = self.parse_endtag(i)
if k < 0: break if k < 0: break
i = k i = k
self.literal = 0 self.literal = 0
continue continue
if commentopen.match(rawdata, i):
if self.literal: if self.literal:
self.handle_data(rawdata[i]) if n > (i + 1):
self.handle_data("<")
i = i+1 i = i+1
else:
# incomplete
break
continue continue
if rawdata.startswith("<!--", i):
k = self.parse_comment(i) k = self.parse_comment(i)
if k < 0: break if k < 0: break
i = i+k i = k
continue
if piopen.match(rawdata, i):
if self.literal:
self.handle_data(rawdata[i])
i = i+1
continue continue
if rawdata.startswith("<?", i):
k = self.parse_pi(i) k = self.parse_pi(i)
if k < 0: break if k < 0: break
i = i+k i = i+k
continue continue
match = special.match(rawdata, i) if rawdata.startswith("<!", i):
if match:
if self.literal:
self.handle_data(rawdata[i])
i = i+1
continue
# This is some sort of declaration; in "HTML as # This is some sort of declaration; in "HTML as
# deployed," this should only be the document type # deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>"). # declaration ("<!DOCTYPE html...>").
...@@ -191,7 +182,7 @@ class SGMLParser: ...@@ -191,7 +182,7 @@ class SGMLParser:
if rawdata[i-1] != ';': i = i-1 if rawdata[i-1] != ';': i = i-1
continue continue
else: else:
raise SGMLParseError('neither < nor & ??') self.error('neither < nor & ??')
# We get here only if incomplete matches but # We get here only if incomplete matches but
# nothing else # nothing else
match = incomplete.match(rawdata, i) match = incomplete.match(rawdata, i)
...@@ -212,59 +203,26 @@ class SGMLParser: ...@@ -212,59 +203,26 @@ class SGMLParser:
# XXX if end: check for empty stack # XXX if end: check for empty stack
# Internal -- parse comment, return length or -1 if not terminated # Internal -- parse comment, return length or -1 if not terminated
def parse_comment(self, i): def parse_comment(self, i, report=1):
rawdata = self.rawdata rawdata = self.rawdata
if rawdata[i:i+4] != '<!--': if rawdata[i:i+4] != '<!--':
raise SGMLParseError('unexpected call to parse_comment()') self.error('unexpected call to parse_comment()')
match = commentclose.search(rawdata, i+4) match = commentclose.search(rawdata, i+4)
if not match: if not match:
return -1 return -1
if report:
j = match.start(0) j = match.start(0)
self.handle_comment(rawdata[i+4: j]) self.handle_comment(rawdata[i+4: j])
j = match.end(0) return match.end(0)
return j-i
# Internal -- parse declaration. # Extensions for the DOCTYPE scanner:
def parse_declaration(self, i): _decl_otherchars = '='
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
rawdata = self.rawdata
j = i + 2
assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
if rawdata[j:j+1] in ("-", ""):
# Start of comment followed by buffer boundary,
# or just a buffer boundary.
return -1
# in practice, this should look like: ((name|stringlit) S*)+ '>'
n = len(rawdata)
while j < n:
c = rawdata[j]
if c == ">":
# end of declaration syntax
self.handle_decl(rawdata[i+2:j])
return j + 1
if c in "\"'":
m = declstringlit.match(rawdata, j)
if not m:
return -1 # incomplete
j = m.end()
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
m = declname.match(rawdata, j)
if not m:
return -1 # incomplete
j = m.end()
else:
raise SGMLParseError(
"unexpected char in declaration: %s" % `rawdata[j]`)
# end of buffer between tokens
return -1
# Internal -- parse processing instr, return length or -1 if not terminated # Internal -- parse processing instr, return length or -1 if not terminated
def parse_pi(self, i): def parse_pi(self, i):
rawdata = self.rawdata rawdata = self.rawdata
if rawdata[i:i+2] != '<?': if rawdata[i:i+2] != '<?':
raise SGMLParseError('unexpected call to parse_pi()') self.error('unexpected call to parse_pi()')
match = piclose.search(rawdata, i+2) match = piclose.search(rawdata, i+2)
if not match: if not match:
return -1 return -1
...@@ -311,7 +269,7 @@ class SGMLParser: ...@@ -311,7 +269,7 @@ class SGMLParser:
else: else:
match = tagfind.match(rawdata, i+1) match = tagfind.match(rawdata, i+1)
if not match: if not match:
raise SGMLParseError('unexpected call to parse_starttag') self.error('unexpected call to parse_starttag')
k = match.end(0) k = match.end(0)
tag = rawdata[i+1:k].lower() tag = rawdata[i+1:k].lower()
self.lasttag = tag self.lasttag = tag
...@@ -465,6 +423,7 @@ class SGMLParser: ...@@ -465,6 +423,7 @@ class SGMLParser:
def unknown_endtag(self, tag): pass def unknown_endtag(self, tag): pass
def unknown_charref(self, ref): pass def unknown_charref(self, ref): pass
def unknown_entityref(self, ref): pass def unknown_entityref(self, ref): pass
def unknown_decl(self, data): pass
class TestSGMLParser(SGMLParser): class TestSGMLParser(SGMLParser):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment