Lots of adjustments to deal with the document content now being stored
in a fragment rather than the main document object.

Lots of adjustments to deal with the document content now being stored
in a fragment rather than the main document object.
e779d4f0 · Fred Drake · 54fb7fb9 · e779d4f0
Commit e779d4f0 authored May 10, 1999 by Fred Drake
Show whitespace changes
Inline Side-by-side

Showing with 94 additions and 91 deletions

Doc/tools/sgmlconv/docfixer.py Doc/tools/sgmlconv/docfixer.py +94 -91

No files found.
--- a/Doc/tools/sgmlconv/docfixer.py
+++ b/Doc/tools/sgmlconv/docfixer.py
@@ -12,7 +12,10 @@ import re
 import string
 import sys
 import xml.dom.core
-import xml.dom.esis_builder
+
+from xml.dom.core import \
+     ELEMENT, \
+     TEXT


 class ConversionError(Exception):
@@ -32,11 +35,11 @@ else:
 # Workaround to deal with invalid documents (multiple root elements).  This
 # does not indicate a bug in the DOM implementation.
 #
-def get_documentElement(self):
+def get_documentElement(doc):
    docelem = None
-    for n in self._node.children:
-        if n.type == xml.dom.core.ELEMENT:
-            docelem = xml.dom.core.Element(n, self, self)
+    for n in doc.childNodes:
+        if n.nodeType == ELEMENT:
+            docelem = n
    return docelem

 xml.dom.core.Document.get_documentElement = get_documentElement
@@ -46,15 +49,15 @@ xml.dom.core.Document.get_documentElement = get_documentElement
 # accessed from the Document object via .childNodes (no matter how many
 # levels of access are used) will be given an ownerDocument of None.
 #
-def get_childNodes(self):
-    return xml.dom.core.NodeList(self._node.children, self, self)
+def get_childNodes(doc):
+    return xml.dom.core.NodeList(doc._node.children, doc._node)

 xml.dom.core.Document.get_childNodes = get_childNodes


 def get_first_element(doc, gi):
    for n in doc.childNodes:
-        if n.nodeType == xml.dom.core.ELEMENT and n.tagName == gi:
+        if n.nodeType == ELEMENT and n.tagName == gi:
            return n

 def extract_first_element(doc, gi):
@@ -66,10 +69,10 @@ def extract_first_element(doc, gi):

 def find_all_elements(doc, gi):
    nodes = []
-    if doc.nodeType == xml.dom.core.ELEMENT and doc.tagName == gi:
+    if doc.nodeType == ELEMENT and doc.tagName == gi:
        nodes.append(doc)
    for child in doc.childNodes:
-        if child.nodeType == xml.dom.core.ELEMENT:
+        if child.nodeType == ELEMENT:
            if child.tagName == gi:
                nodes.append(child)
            for node in child.getElementsByTagName(gi):
@@ -77,36 +80,36 @@ def find_all_elements(doc, gi):
    return nodes        


-def simplify(doc):
+def simplify(doc, fragment):
    # Try to rationalize the document a bit, since these things are simply
    # not valid SGML/XML documents as they stand, and need a little work.
    documentclass = "document"
    inputs = []
-    node = extract_first_element(doc, "documentclass")
+    node = extract_first_element(fragment, "documentclass")
    if node is not None:
        documentclass = node.getAttribute("classname")
-    node = extract_first_element(doc, "title")
+    node = extract_first_element(fragment, "title")
    if node is not None:
        inputs.append(node)
    # update the name of the root element
-    node = get_first_element(doc, "document")
+    node = get_first_element(fragment, "document")
    if node is not None:
        node._node.name = documentclass
    while 1:
-        node = extract_first_element(doc, "input")
+        node = extract_first_element(fragment, "input")
        if node is None:
            break
        inputs.append(node)
    if inputs:
-        docelem = doc.documentElement
+        docelem = get_documentElement(fragment)
        inputs.reverse()
        for node in inputs:
            text = doc.createTextNode("\n")
            docelem.insertBefore(text, docelem.firstChild)
            docelem.insertBefore(node, text)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
-    while doc.firstChild.nodeType == xml.dom.core.TEXT:
-        doc.removeChild(doc.firstChild)
+    while fragment.firstChild.nodeType == TEXT:
+        fragment.removeChild(fragment.firstChild)


 def cleanup_root_text(doc):
@@ -115,9 +118,9 @@ def cleanup_root_text(doc):
    for n in doc.childNodes:
        prevskip = skip
        skip = 0
-        if n.nodeType == xml.dom.core.TEXT and not prevskip:
+        if n.nodeType == TEXT and not prevskip:
            discards.append(n)
-        elif n.nodeType == xml.dom.core.ELEMENT and n.tagName == "COMMENT":
+        elif n.nodeType == ELEMENT and n.tagName == "COMMENT":
            skip = 1
    for node in discards:
        doc.removeChild(node)
@@ -130,8 +133,8 @@ DESCRIPTOR_ELEMENTS = (
    "datadesc", "datadescni",
    )

-def fixup_descriptors(doc):
-    sections = find_all_elements(doc, "section")
+def fixup_descriptors(doc, fragment):
+    sections = find_all_elements(fragment, "section")
    for section in sections:
        find_and_fix_descriptors(doc, section)

@@ -139,7 +142,7 @@ def fixup_descriptors(doc):
 def find_and_fix_descriptors(doc, container):
    children = container.childNodes
    for child in children:
-        if child.nodeType == xml.dom.core.ELEMENT:
+        if child.nodeType == ELEMENT:
            tagName = child.tagName
            if tagName in DESCRIPTOR_ELEMENTS:
                rewrite_descriptor(doc, child)
@@ -191,7 +194,7 @@ def rewrite_descriptor(doc, descriptor):
    pos = skip_leading_nodes(children, 0)
    if pos < len(children):
        child = children[pos]
-        if child.nodeType == xml.dom.core.ELEMENT and child.tagName == "args":
+        if child.nodeType == ELEMENT and child.tagName == "args":
            # create an <args> in <signature>:
            args = doc.createElement("args")
            argchildren = []
@@ -205,7 +208,7 @@ def rewrite_descriptor(doc, descriptor):
    # 3, 4.
    pos = skip_leading_nodes(children, pos + 1)
    while pos < len(children) \
-          and children[pos].nodeType == xml.dom.core.ELEMENT \
+          and children[pos].nodeType == ELEMENT \
          and children[pos].tagName in (linename, "versionadded"):
        if children[pos].tagName == linename:
            # this is really a supplemental signature, create <signature>
@@ -222,7 +225,7 @@ def rewrite_descriptor(doc, descriptor):
    newchildren.append(description)
    move_children(descriptor, description, pos)
    last = description.childNodes[-1]
-    if last.nodeType == xml.dom.core.TEXT:
+    if last.nodeType == TEXT:
        last.data = string.rstrip(last.data) + "\n  "
    # 6.
    # should have nothing but whitespace and signature lines in <descriptor>;
@@ -259,16 +262,16 @@ def move_children(origin, dest, start=0):
        dest.appendChild(node)


-def handle_appendix(doc):
+def handle_appendix(doc, fragment):
    # must be called after simplfy() if document is multi-rooted to begin with
-    docelem = doc.documentElement
+    docelem = get_documentElement(fragment)
    toplevel = docelem.tagName == "manual" and "chapter" or "section"
    appendices = 0
    nodes = []
    for node in docelem.childNodes:
        if appendices:
            nodes.append(node)
-        elif node.nodeType == xml.dom.core.ELEMENT:
+        elif node.nodeType == ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                appendices = 1
@@ -281,7 +284,7 @@ def handle_appendix(doc):
        back = doc.createElement("back-matter")
        docelem.appendChild(back)
        back.appendChild(doc.createTextNode("\n"))
-        while nodes and nodes[0].nodeType == xml.dom.core.TEXT \
+        while nodes and nodes[0].nodeType == TEXT \
              and not string.strip(nodes[0].data):
            del nodes[0]
        map(back.appendChild, nodes)
@@ -307,28 +310,28 @@ def fixup_trailing_whitespace(doc, wsmap):
    while queue:
        node = queue[0]
        del queue[0]
-        if node.nodeType == xml.dom.core.ELEMENT \
+        if node.nodeType == ELEMENT \
           and wsmap.has_key(node.tagName):
            ws = wsmap[node.tagName]
            children = node.childNodes
            children.reverse()
-            if children[0].nodeType == xml.dom.core.TEXT:
+            if children[0].nodeType == TEXT:
                data = string.rstrip(children[0].data) + ws
                children[0].data = data
            children.reverse()
            # hack to get the title in place:
            if node.tagName == "title" \
-               and node.parentNode.firstChild.nodeType == xml.dom.core.ELEMENT:
+               and node.parentNode.firstChild.nodeType == ELEMENT:
                node.parentNode.insertBefore(doc.createText("\n  "),
                                             node.parentNode.firstChild)
        for child in node.childNodes:
-            if child.nodeType == xml.dom.core.ELEMENT:
+            if child.nodeType == ELEMENT:
                queue.append(child)


 def normalize(doc):
    for node in doc.childNodes:
-        if node.nodeType == xml.dom.core.ELEMENT:
+        if node.nodeType == ELEMENT:
            node.normalize()


@@ -339,7 +342,7 @@ def cleanup_trailing_parens(doc, element_names):
    rewrite_element = d.has_key
    queue = []
    for node in doc.childNodes:
-        if node.nodeType == xml.dom.core.ELEMENT:
+        if node.nodeType == ELEMENT:
            queue.append(node)
    while queue:
        node = queue[0]
@@ -347,13 +350,13 @@ def cleanup_trailing_parens(doc, element_names):
        if rewrite_element(node.tagName):
            children = node.childNodes
            if len(children) == 1 \
-               and children[0].nodeType == xml.dom.core.TEXT:
+               and children[0].nodeType == TEXT:
                data = children[0].data
                if data[-2:] == "()":
                    children[0].data = data[:-2]
        else:
            for child in node.childNodes:
-                if child.nodeType == xml.dom.core.ELEMENT:
+                if child.nodeType == ELEMENT:
                    queue.append(child)


@@ -366,13 +369,13 @@ def contents_match(left, right):
        nodeType = l.nodeType
        if nodeType != r.nodeType:
            return 0
-        if nodeType == xml.dom.core.ELEMENT:
+        if nodeType == ELEMENT:
            if l.tagName != r.tagName:
                return 0
            # should check attributes, but that's not a problem here
            if not contents_match(l, r):
                return 0
-        elif nodeType == xml.dom.core.TEXT:
+        elif nodeType == TEXT:
            if l.data != r.data:
                return 0
        else:
@@ -388,7 +391,7 @@ def create_module_info(doc, section):
        return
    node._node.name = "synopsis"
    lastchild = node.childNodes[-1]
-    if lastchild.nodeType == xml.dom.core.TEXT \
+    if lastchild.nodeType == TEXT \
       and lastchild.data[-1:] == ".":
        lastchild.data = lastchild.data[:-1]
    modauthor = extract_first_element(section, "moduleauthor")
@@ -423,7 +426,7 @@ def create_module_info(doc, section):
        if title:
            children = title.childNodes
            if len(children) >= 2 \
-               and children[0].nodeType == xml.dom.core.ELEMENT \
+               and children[0].nodeType == ELEMENT \
               and children[0].tagName == "module" \
               and children[0].childNodes[0].data == name:
                # this is it; morph the <title> into <short-synopsis>
@@ -431,7 +434,7 @@ def create_module_info(doc, section):
                if first_data.data[:4] == " ---":
                    first_data.data = string.lstrip(first_data.data[4:])
                title._node.name = "short-synopsis"
-                if children[-1].nodeType == xml.dom.core.TEXT \
+                if children[-1].nodeType == TEXT \
                   and children[-1].data[-1:] == ".":
                    children[-1].data = children[-1].data[:-1]
                section.removeChild(title)
@@ -470,10 +473,10 @@ def create_module_info(doc, section):
        children = section.childNodes
        for i in range(len(children)):
            node = children[i]
-            if node.nodeType == xml.dom.core.ELEMENT \
+            if node.nodeType == ELEMENT \
               and node.tagName == "moduleinfo":
                nextnode = children[i+1]
-                if nextnode.nodeType == xml.dom.core.TEXT:
+                if nextnode.nodeType == TEXT:
                    data = nextnode.data
                    if len(string.lstrip(data)) < (len(data) - 4):
                        nextnode.data = "\n\n\n" + string.lstrip(data)
@@ -487,7 +490,7 @@ def cleanup_synopses(doc):
 def remap_element_names(root, name_map):
    queue = []
    for child in root.childNodes:
-        if child.nodeType == xml.dom.core.ELEMENT:
+        if child.nodeType == ELEMENT:
            queue.append(child)
    while queue:
        node = queue.pop()
@@ -498,13 +501,13 @@ def remap_element_names(root, name_map):
            for attr, value in attrs.items():
                node.setAttribute(attr, value)
        for child in node.childNodes:
-            if child.nodeType == xml.dom.core.ELEMENT:
+            if child.nodeType == ELEMENT:
                queue.append(child)


-def fixup_table_structures(doc):
+def fixup_table_structures(doc, fragment):
    # must be done after remap_element_names(), or the tables won't be found
-    for table in find_all_elements(doc, "table"):
+    for table in find_all_elements(fragment, "table"):
        fixup_table(doc, table)


@@ -522,7 +525,7 @@ def fixup_table(doc, table):
    last_was_hline = 0
    children = table.childNodes
    for child in children:
-        if child.nodeType == xml.dom.core.ELEMENT:
+        if child.nodeType == ELEMENT:
            tagName = child.tagName
            if tagName == "hline" and prev_row is not None:
                prev_row.setAttribute("rowsep", "1")
@@ -535,12 +538,12 @@ def fixup_table(doc, table):
    while children:
        child = children[0]
        nodeType = child.nodeType
-        if nodeType == xml.dom.core.TEXT:
+        if nodeType == TEXT:
            if string.strip(child.data):
                raise ConversionError("unexpected free data in table")
            table.removeChild(child)
            continue
-        if nodeType == xml.dom.core.ELEMENT:
+        if nodeType == ELEMENT:
            if child.tagName != "hline":
                raise ConversionError(
                    "unexpected <%s> in table" % child.tagName)
@@ -572,7 +575,7 @@ def fixup_row(doc, row):
 def move_elements_by_name(doc, source, dest, name, sep=None):
    nodes = []
    for child in source.childNodes:
-        if child.nodeType == xml.dom.core.ELEMENT and child.tagName == name:
+        if child.nodeType == ELEMENT and child.tagName == name:
            nodes.append(child)
    for node in nodes:
        source.removeChild(node)
@@ -606,13 +609,13 @@ PARA_LEVEL_PRECEEDERS = (
    )


-def fixup_paras(doc):
-    for child in doc.childNodes:
-        if child.nodeType == xml.dom.core.ELEMENT \
+def fixup_paras(doc, fragment):
+    for child in fragment.childNodes:
+        if child.nodeType == ELEMENT \
           and child.tagName in RECURSE_INTO_PARA_CONTAINERS:
            #
            fixup_paras_helper(doc, child)
-    descriptions = find_all_elements(doc, "description")
+    descriptions = find_all_elements(fragment, "description")
    for description in descriptions:
        fixup_paras_helper(doc, description)

@@ -628,7 +631,7 @@ def fixup_paras_helper(doc, container, depth=0):
        #
        # Either paragraph material or something to recurse into:
        #
-        if (children[start].nodeType == xml.dom.core.ELEMENT) \
+        if (children[start].nodeType == ELEMENT) \
           and (children[start].tagName in RECURSE_INTO_PARA_CONTAINERS):
            fixup_paras_helper(doc, children[start])
            start = skip_leading_nodes(children, start + 1)
@@ -653,11 +656,11 @@ def build_para(doc, parent, start, i):
        after = j + 1
        child = children[j]
        nodeType = child.nodeType
-        if nodeType == xml.dom.core.ELEMENT:
+        if nodeType == ELEMENT:
            if child.tagName in BREAK_ELEMENTS:
                after = j
                break
-        elif nodeType == xml.dom.core.TEXT:
+        elif nodeType == TEXT:
            pos = string.find(child.data, "\n\n")
            if pos == 0:
                after = j
@@ -670,7 +673,7 @@ def build_para(doc, parent, start, i):
    if (start + 1) > after:
        raise ConversionError(
            "build_para() could not identify content to turn into a paragraph")
-    if children[after - 1].nodeType == xml.dom.core.TEXT:
+    if children[after - 1].nodeType == TEXT:
        # we may need to split off trailing white space:
        child = children[after - 1]
        data = child.data
@@ -707,7 +710,7 @@ def skip_leading_nodes(children, start):
        # skip over leading comments and whitespace:
        child = children[start]
        nodeType = child.nodeType
-        if nodeType == xml.dom.core.TEXT:
+        if nodeType == TEXT:
            data = child.data
            shortened = string.lstrip(data)
            if shortened:
@@ -717,7 +720,7 @@ def skip_leading_nodes(children, start):
                    return start + 1
                return start
            # all whitespace, just skip
-        elif nodeType == xml.dom.core.ELEMENT:
+        elif nodeType == ELEMENT:
            tagName = child.tagName
            if tagName in RECURSE_INTO_PARA_CONTAINERS:
                return start
@@ -727,15 +730,15 @@ def skip_leading_nodes(children, start):
    return start


-def fixup_rfc_references(doc):
-    for rfcnode in find_all_elements(doc, "rfc"):
+def fixup_rfc_references(doc, fragment):
+    for rfcnode in find_all_elements(fragment, "rfc"):
        rfcnode.appendChild(doc.createTextNode(
            "RFC " + rfcnode.getAttribute("num")))


-def fixup_signatures(doc):
-    for child in doc.childNodes:
-        if child.nodeType == xml.dom.core.ELEMENT:
+def fixup_signatures(doc, fragment):
+    for child in fragment.childNodes:
+        if child.nodeType == ELEMENT:
            args = child.getElementsByTagName("args")
            for arg in args:
                fixup_args(doc, arg)
@@ -748,7 +751,7 @@ def fixup_signatures(doc):

 def fixup_args(doc, arglist):
    for child in arglist.childNodes:
-        if child.nodeType == xml.dom.core.ELEMENT \
+        if child.nodeType == ELEMENT \
           and child.tagName == "optional":
            # found it; fix and return
            arglist.insertBefore(doc.createTextNode("["), child)
@@ -762,8 +765,8 @@ def fixup_args(doc, arglist):
            return fixup_args(doc, arglist)


-def fixup_sectionauthors(doc):
-    for sectauth in find_all_elements(doc, "sectionauthor"):
+def fixup_sectionauthors(doc, fragment):
+    for sectauth in find_all_elements(fragment, "sectionauthor"):
        section = sectauth.parentNode
        section.removeChild(sectauth)
        sectauth._node.name = "author"
@@ -772,7 +775,7 @@ def fixup_sectionauthors(doc):
        sectauth.removeAttribute("name")
        after = section.childNodes[2]
        title = section.childNodes[1]
-        if title.nodeType == xml.dom.core.ELEMENT and title.tagName != "title":
+        if title.nodeType == ELEMENT and title.tagName != "title":
            after = section.childNodes[0]
        section.insertBefore(doc.createTextNode("\n  "), after)
        section.insertBefore(sectauth, after)
@@ -781,10 +784,9 @@ def fixup_sectionauthors(doc):
 def fixup_verbatims(doc):
    for verbatim in find_all_elements(doc, "verbatim"):
        child = verbatim.childNodes[0]
-        if child.nodeType == xml.dom.core.TEXT \
+        if child.nodeType == TEXT \
           and string.lstrip(child.data)[:3] == ">>>":
-            verbatim._node.name = "interpreter-session"
-            #verbatim.setAttribute("interactive", "interactive")
+            verbatim._node.name = "interactive-session"


 _token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
@@ -792,7 +794,7 @@ _token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
 def write_esis(doc, ofp, knownempty):
    for node in doc.childNodes:
        nodeType = node.nodeType
-        if nodeType == xml.dom.core.ELEMENT:
+        if nodeType == ELEMENT:
            gi = node.tagName
            if knownempty(gi):
                if node.hasChildNodes():
@@ -808,7 +810,7 @@ def write_esis(doc, ofp, knownempty):
            ofp.write("(%s\n" % gi)
            write_esis(node, ofp, knownempty)
            ofp.write(")%s\n" % gi)
-        elif nodeType == xml.dom.core.TEXT:
+        elif nodeType == TEXT:
            ofp.write("-%s\n" % esistools.encode(node.data))
        else:
            raise RuntimeError, "unsupported node type: %s" % nodeType
@@ -818,10 +820,11 @@ def convert(ifp, ofp):
    p = esistools.ExtendedEsisBuilder()
    p.feed(ifp.read())
    doc = p.document
-    normalize(doc)
-    simplify(doc)
-    handle_labels(doc)
-    handle_appendix(doc)
+    fragment = p.fragment
+    normalize(fragment)
+    simplify(doc, fragment)
+    handle_labels(fragment)
+    handle_appendix(doc, fragment)
    fixup_trailing_whitespace(doc, {
        "abstract": "\n",
        "title": "",
@@ -835,12 +838,12 @@ def convert(ifp, ofp):
    cleanup_root_text(doc)
    cleanup_trailing_parens(doc, ["function", "method", "cfunction"])
    cleanup_synopses(doc)
-    fixup_descriptors(doc)
-    fixup_verbatims(doc)
-    normalize(doc)
-    fixup_paras(doc)
-    fixup_sectionauthors(doc)
-    remap_element_names(doc, {
+    fixup_descriptors(doc, fragment)
+    fixup_verbatims(fragment)
+    normalize(fragment)
+    fixup_paras(doc, fragment)
+    fixup_sectionauthors(doc, fragment)
+    remap_element_names(fragment, {
        "tableii": ("table", {"cols": "2"}),
        "tableiii": ("table", {"cols": "3"}),
        "tableiv": ("table", {"cols": "4"}),
@@ -849,9 +852,9 @@ def convert(ifp, ofp):
        "lineiv": ("row", {}),
        "refmodule": ("module", {"link": "link"}),
        })
-    fixup_table_structures(doc)
-    fixup_rfc_references(doc)
-    fixup_signatures(doc)
+    fixup_table_structures(doc, fragment)
+    fixup_rfc_references(doc, fragment)
+    fixup_signatures(doc, fragment)
    #
    d = {}
    for gi in p.get_empties():
@@ -861,7 +864,7 @@ def convert(ifp, ofp):
    knownempty = d.has_key
    #
    try:
-        write_esis(doc, ofp, knownempty)
+        write_esis(fragment, ofp, knownempty)
    except IOError, (err, msg):
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other errors