Commit 0588e8ac authored by Guido van Rossum

First steps towards an HTML parser

parent b38c7192
from nsgmllib import SGMLParser
# Boolean attributes in HTML that may be written in minimized form
# (e.g. <img ismap> rather than <img ismap="">).
# Source: http://www.w3.org/TR/xhtml1/#guidelines (C.10)
BOOLEAN_HTML_ATTRS = [
    "compact",
    "nowrap",
    "ismap",
    "declare",
    "noshade",
    "checked",
    "disabled",
    "readonly",
    "multiple",
    "selected",
    "noresize",
    "defer",
]
# HTML tags with an empty content model; these are rendered in
# minimized form, e.g. <img />.
# Source: http://www.w3.org/TR/xhtml1/#dtds
EMPTY_HTML_TAGS = [
    "base",
    "meta",
    "link",
    "hr",
    "br",
    "param",
    "img",
    "area",
    "input",
    "col",
    "basefont",
    "isindex",
    "frame",
]
from TALGenerator import TALGenerator
class HTMLTALParser(SGMLParser):
    """Parse HTML with TAL markup and drive a TALGenerator with the result.

    The parser keeps a stack of currently open (non-empty) tags and a
    parallel stack of "xmlns:" prefix mappings, so that both can be
    unwound together when a tag is closed.
    """

    # External API

    def __init__(self, gen=None):
        """Set up the parser.

        gen -- generator object that receives the parse events; a fresh
               TALGenerator is created when omitted.
        """
        SGMLParser.__init__(self)
        if gen is None:
            gen = TALGenerator()
        self.gen = gen
        self.tagstack = []   # names of open non-empty tags, innermost last
        self.nsstack = []    # saved nsdict values, one per open tag
        self.nsdict = {}     # namespace prefix -> URI currently in effect

    def parseFile(self, file):
        """Read the file named by `file`, parse it, and close open tags.

        Any tags still open at end of input are closed implicitly.
        """
        f = open(file)
        try:
            data = f.read()
        finally:
            # Release the handle even if the read fails.
            f.close()
        self.feed(data)
        self.close()
        while self.tagstack:
            self.finish_endtag(None)
        # Sanity checks: everything pushed while parsing was popped again.
        assert self.tagstack == []
        assert self.nsstack == []
        assert self.nsdict == {}, self.nsdict

    def getCode(self):
        """Return the generator's (program, macros) pair."""
        return self.gen.program, self.gen.macros

    # Internal thingies

    def scan_xmlns(self, attrs):
        """Push the current namespace mapping and apply new declarations.

        attrs -- sequence of (name, value) pairs; every "xmlns:PREFIX"
                 attribute binds PREFIX to its value.

        The previous mapping is always pushed, so each call must be
        balanced by exactly one pop_xmlns() call.
        """
        nsnew = {}
        for key, value in attrs:
            if key[:6] == "xmlns:":
                nsnew[key[6:]] = value
        self.nsstack.append(self.nsdict)
        if nsnew:
            # Copy before updating so the saved mapping stays untouched.
            self.nsdict = self.nsdict.copy()
            self.nsdict.update(nsnew)

    def pop_xmlns(self):
        """Restore the namespace mapping saved by the matching scan_xmlns()."""
        self.nsdict = self.nsstack.pop()

    # Overriding SGMLParser methods

    def finish_starttag(self, tag, attrs):
        """Record a start tag and pass it on to the generator."""
        self.scan_xmlns(attrs)
        if tag in EMPTY_HTML_TAGS:
            # Empty elements have no content, so their namespace scope
            # ends immediately and they never go on the tag stack.
            self.pop_xmlns()
        else:
            self.tagstack.append(tag)
        self.gen.emitStartTag(tag, attrs)

    def finish_endtag(self, tag):
        """Close `tag` and pass the end tag on to the generator.

        A false `tag` (e.g. None) closes the innermost open tag.  When
        `tag` is open but is not the innermost tag, every tag nested
        inside it is closed first.  End tags of empty elements leave
        the stacks alone, since their start tag never pushed anything.
        """
        if tag not in EMPTY_HTML_TAGS:
            if not tag:
                tag = self.tagstack.pop()
            else:
                # NOTE(review): an end tag without a matching open tag
                # trips this assertion -- confirm that is the intended
                # policy for malformed input.
                assert tag in self.tagstack
                while self.tagstack[-1] != tag:
                    self.finish_endtag(None)
                self.tagstack.pop()
            self.pop_xmlns()
        self.gen.emitEndTag(tag)

    def handle_charref(self, name):
        """Emit the character reference unchanged as raw text."""
        self.gen.emit("rawtext", "&#%s;" % name)

    def handle_entityref(self, name):
        """Emit the entity reference unchanged as raw text."""
        self.gen.emit("rawtext", "&%s;" % name)

    def handle_data(self, data):
        """Emit character data as text."""
        self.gen.emit("text", data)

    def handle_comment(self, data):
        """Emit the comment, re-wrapped in <!-- -->, as raw text."""
        self.gen.emit("rawtext", "<!--%s-->" % data)

    def handle_pi(self, data):
        """Emit the processing instruction, re-wrapped in <? >, as raw text."""
        self.gen.emit("rawtext", "<?%s>" % data)
...@@ -54,7 +54,9 @@ DummyEngine.py simple-minded TALES execution engine ...@@ -54,7 +54,9 @@ DummyEngine.py simple-minded TALES execution engine
TALInterpreter.py class to interpret intermediate code TALInterpreter.py class to interpret intermediate code
TALGenerator.py class to generate intermediate code TALGenerator.py class to generate intermediate code
XMLParser.py base class to parse XML, avoiding DOM XMLParser.py base class to parse XML, avoiding DOM
TALParser.py class to parse TAL into intermediate code TALParser.py class to parse XML with TAL into intermediate code
HTMLTALParser.py class to parse HTML with TAL into intermediate code
nsgmllib.py modified version of sgmllib.py
driver.py script to demonstrate TAL expansion driver.py script to demonstrate TAL expansion
timer.py script to time various processing phases timer.py script to time various processing phases
setpath.py hack to set sys.path and import ZODB setpath.py hack to set sys.path and import ZODB
......
...@@ -105,20 +105,26 @@ FILE = "test/test1.xml" ...@@ -105,20 +105,26 @@ FILE = "test/test1.xml"
def main(): def main():
versionTest = 1 versionTest = 1
macros = 0 macros = 0
html = 0
try: try:
opts, args = getopt.getopt(sys.argv[1:], "mn") opts, args = getopt.getopt(sys.argv[1:], "hmnx")
except getopt.error, msg: except getopt.error, msg:
sys.stderr.write("%s\n" % str(msg)) sys.stderr.write("%s\n" % str(msg))
sys.stderr.write( sys.stderr.write(
"usage: driver.py [-m] [-n] [file]\n") "usage: driver.py [-h|-x] [-m] [-n] [file]\n")
sys.stderr.write("-h/-x -- HTML/XML input (default XML)\n")
sys.stderr.write("-m -- macro expansion only\n") sys.stderr.write("-m -- macro expansion only\n")
sys.stderr.write("-n -- turn of the Python 1.5.2 test\n") sys.stderr.write("-n -- turn of the Python 1.5.2 test\n")
sys.exit(2) sys.exit(2)
for o, a in opts: for o, a in opts:
if o == '-h':
html = 1
if o == '-m': if o == '-m':
macros = 1 macros = 1
if o == '-n': if o == '-n':
versionTest = 0 versionTest = 0
if o == '-x':
html = 0
if not versionTest: if not versionTest:
if sys.version[:5] != "1.5.2": if sys.version[:5] != "1.5.2":
sys.stderr.write( sys.stderr.write(
...@@ -128,7 +134,7 @@ def main(): ...@@ -128,7 +134,7 @@ def main():
file = args[0] file = args[0]
else: else:
file = FILE file = FILE
it = compilefile(file) it = compilefile(file, html=html)
interpretit(it, tal=(not macros)) interpretit(it, tal=(not macros))
def interpretit(it, engine=None, stream=None, tal=1): def interpretit(it, engine=None, stream=None, tal=1):
...@@ -138,9 +144,13 @@ def interpretit(it, engine=None, stream=None, tal=1): ...@@ -138,9 +144,13 @@ def interpretit(it, engine=None, stream=None, tal=1):
engine = DummyEngine(macros) engine = DummyEngine(macros)
TALInterpreter(program, macros, engine, stream, wrap=0, tal=tal)() TALInterpreter(program, macros, engine, stream, wrap=0, tal=tal)()
def compilefile(file): def compilefile(file, html=0):
from TALParser import TALParser if html:
p = TALParser() from HTMLTALParser import HTMLTALParser
p = HTMLTALParser()
else:
from TALParser import TALParser
p = TALParser()
p.parseFile(file) p.parseFile(file)
return p.getCode() return p.getCode()
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment