Commit 0588e8ac authored by Guido van Rossum's avatar Guido van Rossum

First steps towards an HTML parser

parent b38c7192
from nsgmllib import SGMLParser
# Boolean attributes in HTML that may be given in minimized form
# (e.g. <img ismap> rather than <img ismap="">).
# From http://www.w3.org/TR/xhtml1/#guidelines (C.10)
BOOLEAN_HTML_ATTRS = [
    "compact",
    "nowrap",
    "ismap",
    "declare",
    "noshade",
    "checked",
    "disabled",
    "readonly",
    "multiple",
    "selected",
    "noresize",
    "defer",
]
# HTML tags with an empty content model; these are rendered in
# minimized form, e.g. <img />.
# From http://www.w3.org/TR/xhtml1/#dtds
EMPTY_HTML_TAGS = [
    "base",
    "meta",
    "link",
    "hr",
    "br",
    "param",
    "img",
    "area",
    "input",
    "col",
    "basefont",
    "isindex",
    "frame",
]
from TALGenerator import TALGenerator
class HTMLTALParser(SGMLParser):
# External API
def __init__(self, gen=None):
SGMLParser.__init__(self)
if gen is None:
gen = TALGenerator()
self.gen = gen
self.tagstack = []
self.nsstack = []
self.nsdict = {}
def parseFile(self, file):
f = open(file)
data = f.read()
f.close()
self.feed(data)
self.close()
while self.tagstack:
self.finish_endtag(None)
assert self.tagstack == []
assert self.nsstack == []
assert self.nsdict == {}, self.nsdict
def getCode(self):
return self.gen.program, self.gen.macros
# Internal thingies
def scan_xmlns(self, attrs):
nsnew = {}
for key, value in attrs:
if key[:6] == "xmlns:":
nsnew[key[6:]] = value
if nsnew:
self.nsstack.append(self.nsdict)
self.nsdict = self.nsdict.copy()
self.nsdict.update(nsnew)
else:
self.nsstack.append(self.nsdict)
def pop_xmlns(self):
self.nsdict = self.nsstack.pop()
# Overriding SGMLParser methods
def finish_starttag(self, tag, attrs):
self.scan_xmlns(attrs)
print tag, self.nsdict
if tag not in EMPTY_HTML_TAGS:
self.tagstack.append(tag)
else:
self.pop_xmlns()
print "<", tag, self.nsdict
self.gen.emitStartTag(tag, attrs)
def finish_endtag(self, tag):
if tag not in EMPTY_HTML_TAGS:
if not tag:
tag = self.tagstack.pop()
else:
assert tag in self.tagstack
while self.tagstack[-1] != tag:
self.finish_endtag(None)
self.tagstack.pop()
self.pop_xmlns()
print "<", tag, self.nsdict
self.gen.emitEndTag(tag)
def handle_charref(self, name):
self.gen.emit("rawtext", "&#%s;" % name)
def handle_entityref(self, name):
self.gen.emit("rawtext", "&%s;" % name)
def handle_data(self, data):
self.gen.emit("text", data)
def handle_comment(self, data):
self.gen.emit("rawtext", "<!--%s-->" % data)
def handle_pi(self, data):
self.gen.emit("rawtext", "<?%s>" % data)
......@@ -54,7 +54,9 @@ DummyEngine.py simple-minded TALES execution engine
TALInterpreter.py class to interpret intermediate code
TALGenerator.py class to generate intermediate code
XMLParser.py base class to parse XML, avoiding DOM
TALParser.py class to parse TAL into intermediate code
TALParser.py class to parse XML with TAL into intermediate code
HTMLTALParser.py class to parse HTML with TAL into intermediate code
nsgmllib.py modified version of sgmllib.py
driver.py script to demonstrate TAL expansion
timer.py script to time various processing phases
setpath.py hack to set sys.path and import ZODB
......
......@@ -105,20 +105,26 @@ FILE = "test/test1.xml"
def main():
versionTest = 1
macros = 0
html = 0
try:
opts, args = getopt.getopt(sys.argv[1:], "mn")
opts, args = getopt.getopt(sys.argv[1:], "hmnx")
except getopt.error, msg:
sys.stderr.write("%s\n" % str(msg))
sys.stderr.write(
"usage: driver.py [-m] [-n] [file]\n")
"usage: driver.py [-h|-x] [-m] [-n] [file]\n")
sys.stderr.write("-h/-x -- HTML/XML input (default XML)\n")
sys.stderr.write("-m -- macro expansion only\n")
sys.stderr.write("-n -- turn of the Python 1.5.2 test\n")
sys.exit(2)
for o, a in opts:
if o == '-h':
html = 1
if o == '-m':
macros = 1
if o == '-n':
versionTest = 0
if o == '-x':
html = 0
if not versionTest:
if sys.version[:5] != "1.5.2":
sys.stderr.write(
......@@ -128,7 +134,7 @@ def main():
file = args[0]
else:
file = FILE
it = compilefile(file)
it = compilefile(file, html=html)
interpretit(it, tal=(not macros))
def interpretit(it, engine=None, stream=None, tal=1):
......@@ -138,9 +144,13 @@ def interpretit(it, engine=None, stream=None, tal=1):
engine = DummyEngine(macros)
TALInterpreter(program, macros, engine, stream, wrap=0, tal=tal)()
def compilefile(file):
from TALParser import TALParser
p = TALParser()
def compilefile(file, html=0):
    """Parse 'file' and return the result of the parser's getCode().

    file -- name of the input file, handed to the parser's parseFile()
    html -- true selects HTMLTALParser; false (the default) selects
            TALParser

    The parser module is imported only once the choice is known.
    """
    if not html:
        from TALParser import TALParser
        parser = TALParser()
    else:
        from HTMLTALParser import HTMLTALParser
        parser = HTMLTALParser()
    parser.parseFile(file)
    return parser.getCode()
......
"""A parser for SGML, using the derived class as a static DTD."""
# XXX This only supports those SGML features used by HTML.
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
import re
import string
__all__ = ["SGMLParser"]
# Regular expressions used for parsing

# Next character that may start markup or a reference.
interesting = re.compile(r'[&<]')

# A reference or tag that may still be missing its tail.
incomplete = re.compile(
    r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
    r'<([a-zA-Z][^<>]*|'
    r'/([a-zA-Z][^<>]*)?|'
    r'![^<>]*)?')

# Entity and character references, including one terminating character.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#([0-9]+)[^0-9]')

# Start tags, including the SGML short forms <> and <tag/data/.
starttagopen = re.compile(r'<[>a-zA-Z]')
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')

# Processing instructions.
piopen = re.compile(r'<\?')
piclose = re.compile(r'>')

# End tags, and the bracket that terminates any tag.
endtagopen = re.compile(r'</[<>a-zA-Z]')
endbracket = re.compile(r'[<>]')

# Declarations and comments.
special = re.compile(r'<![^<>]*>')
commentopen = re.compile(r'<!--')
commentclose = re.compile('--[' + string.whitespace + ']*>')

# Tag names, and one attribute with an optional value.
tagfind = re.compile(r'[a-zA-Z][-.a-zA-Z0-9]*')
attrfind = re.compile(
    '[' + string.whitespace + ']*'
    + r'([a-zA-Z_][-.:a-zA-Z_0-9]*)'
    + '([' + string.whitespace + ']*=[' + string.whitespace + ']*'
    + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*)'
    + ')?')
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.) The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks). Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
class SGMLParser:
    """Event-driven SGML parser; the derived class acts as a static DTD.

    Text is supplied through feed() and close().  Markup found in it is
    dispatched to start_<tag>, end_<tag> or do_<tag> methods of the
    instance when they exist and to the unknown_*() hooks otherwise;
    character data, references, comments and processing instructions
    go to the corresponding handle_*() methods.
    """

    # Interface -- initialize and reset this instance
    def __init__(self, verbose=0):
        """verbose -- when true, report_unbalanced() prints a report."""
        self.verbose = verbose
        self.reset()

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        """Return the parser to its initial state."""
        # Also forget the text of the last start tag so that
        # get_starttag_text() cannot return data from before the reset.
        self.__starttag_text = None
        self.rawdata = ''       # text fed but not yet processed
        self.stack = []         # open tags that have a start_<tag> method
        self.lasttag = '???'    # last start tag seen; used by <>
        self.nomoretags = 0
        self.literal = 0

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
        """Treat all remaining input as character data."""
        self.nomoretags = self.literal = 1

    # For derived classes only -- enter literal mode (CDATA)
    def setliteral(self, *args):
        """Enter literal mode; goahead() leaves it after an end tag."""
        self.literal = 1

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        """Append 'data' to the buffer and process what is complete."""
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        """Process everything still buffered as if EOF followed."""
        self.goahead(1)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Scan the buffer and dispatch every complete construct.

        Whatever cannot be processed yet (an unterminated tag, comment,
        processing instruction or reference) stays in self.rawdata for
        the next call, unless 'end' is true, in which case it is passed
        to handle_data().
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match: j = match.start(0)
            else: j = n
            if i < j: self.handle_data(rawdata[i:j])
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode '<' is ordinary data.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # parse_starttag() returns an absolute position.
                    k = self.parse_starttag(i)
                    if k < 0: break
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    # parse_endtag() returns an absolute position.
                    k = self.parse_endtag(i)
                    if k < 0: break
                    i = k
                    self.literal = 0
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # parse_comment() returns a length, not a position.
                    k = self.parse_comment(i)
                    if k < 0: break
                    i = i+k
                    continue
                if piopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # parse_pi() returns a length, not a position.
                    k = self.parse_pi(i)
                    if k < 0: break
                    i = i+k
                    continue
                match = special.match(rawdata, i)
                if match:
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # Other <!...> declarations are skipped.
                    i = match.end(0)
                    continue
            elif rawdata[i] == '&':
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumes one terminator character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';': i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';': i = i-1
                    continue
            else:
                raise RuntimeError('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i):
        """Parse the comment starting at rawdata[i].

        Passes the comment text to handle_comment() and returns the
        total length of the comment, or -1 if its end is not yet in the
        buffer.  Raises RuntimeError if rawdata[i:] does not start with
        '<!--'.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise RuntimeError('unexpected call to parse_comment')
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        j = match.start(0)
        self.handle_comment(rawdata[i+4: j])
        j = match.end(0)
        return j-i

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction starting at rawdata[i].

        Passes the text between '<?' and the next '>' to handle_pi()
        and returns the total length, or -1 if no '>' is in the buffer
        yet.  Raises RuntimeError if rawdata[i:] does not start with
        '<?'.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            raise RuntimeError('unexpected call to parse_pi')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    # Class-level default, for instances whose reset() was overridden
    # without calling the base implementation.
    __starttag_text = None

    def get_starttag_text(self):
        """Return the source text recorded for the last start tag.

        The value is None after reset() and while parse_starttag() has
        not yet recorded anything for the tag it is working on.
        """
        return self.__starttag_text

    # Internal -- handle starttag, return length or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag beginning at rawdata[i].

        Calls finish_starttag() (or finish_shorttag() for the
        <tag/data/ form) and returns the position just past the tag, or
        -1 if the tag is not yet complete in the buffer.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                raise RuntimeError('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match: break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Minimized attribute: the name doubles as the value.
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the surrounding quotes.
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # A tag terminated by '<' leaves that character for the next
        # construct; only a closing '>' is consumed.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- parse endtag
    def parse_endtag(self, i):
        """Parse the end tag beginning at rawdata[i].

        Calls finish_endtag() with the stripped, lower-cased tag name
        and returns the position just past the tag, or -1 if the tag is
        not yet complete in the buffer.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        """Report <tag/data/ as start tag, data and end tag."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        """Dispatch a start tag to its handler.

        A start_<tag> method takes precedence: the tag is pushed on
        self.stack and 1 is returned.  Otherwise a do_<tag> method is
        called and 0 returned.  With neither, unknown_starttag() is
        called and -1 returned.
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        """Close 'tag' together with every tag opened after it.

        An empty 'tag' closes only the innermost open tag.  A tag that
        is not on the stack is reported through report_unbalanced()
        when an end_<tag> method exists, and through unknown_endtag()
        otherwise; the stack is left alone in both cases.
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    method = getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Find the innermost (last) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag: found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        """Call the start handler 'method' with the attribute list."""
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        """Call the end handler 'method'."""
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        """Print the stray end tag and the tag stack when verbose."""
        if self.verbose:
            # One parenthesised argument prints the same text whether
            # print is a statement or a function.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    # Example -- handle character reference, no need to override
    def handle_charref(self, name):
        """Pass the character for a numeric reference to handle_data().

        Names that are not decimal integers in the range 0..255 are
        passed to unknown_charref() instead.
        """
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    # Example -- handle entity reference, no need to override
    def handle_entityref(self, name):
        """Pass the replacement text of an entity to handle_data().

        Names missing from self.entitydefs are passed to
        unknown_entityref() instead.
        """
        try:
            replacement = self.entitydefs[name]
        except KeyError:
            self.unknown_entityref(name)
            return
        self.handle_data(replacement)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        """Receive a chunk of character data; ignored by default."""
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        """Receive the text of a comment; ignored by default."""
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        """Receive a processing instruction; ignored by default."""
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
class TestSGMLParser(SGMLParser):
def __init__(self, verbose=0):
self.testdata = ""
SGMLParser.__init__(self, verbose)
def handle_data(self, data):
self.testdata = self.testdata + data
if len(`self.testdata`) >= 70:
self.flush()
def flush(self):
data = self.testdata
if data:
self.testdata = ""
print 'data:', `data`
def handle_comment(self, data):
self.flush()
r = `data`
if len(r) > 68:
r = r[:32] + '...' + r[-32:]
print 'comment:', r
def unknown_starttag(self, tag, attrs):
self.flush()
if not attrs:
print 'start tag: <' + tag + '>'
else:
print 'start tag: <' + tag,
for name, value in attrs:
print name + '=' + '"' + value + '"',
print '>'
def unknown_endtag(self, tag):
self.flush()
print 'end tag: </' + tag + '>'
def unknown_entityref(self, ref):
self.flush()
print '*** unknown entity ref: &' + ref + ';'
def unknown_charref(self, ref):
self.flush()
print '*** unknown char ref: &#' + ref + ';'
def close(self):
SGMLParser.close(self)
self.flush()
def test(args = None):
import sys
if not args:
args = sys.argv[1:]
if args and args[0] == '-s':
args = args[1:]
klass = SGMLParser
else:
klass = TestSGMLParser
if args:
file = args[0]
else:
file = 'test.html'
if file == '-':
f = sys.stdin
else:
try:
f = open(file, 'r')
except IOError, msg:
print file, ":", msg
sys.exit(1)
data = f.read()
if f is not sys.stdin:
f.close()
x = klass()
for c in data:
x.feed(c)
x.close()
if __name__ == '__main__':
test()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment