Add "suspicious" builder which finds leftover markup in the HTML files.

Patch by Gabriel Genellina.

Add "suspicious" builder which finds leftover markup in the HTML files.
Patch by Gabriel Genellina.
700cf28f · Georg Brandl · 775aa4a0 · 700cf28f · 700cf28f · 700cf28f
Commit 700cf28f authored Jan 04, 2009 by Georg Brandl
5 changed files
--- a/Doc/Makefile
+++ b/Doc/Makefile
@@ -24,6 +24,7 @@ help:
 	@echo "  text      to make plain text files"
 	@echo "  changes   to make an overview over all changed/added/deprecated items"
 	@echo "  linkcheck to check all external links for integrity"
+	@echo "  suspicious to check for suspicious markup in output text"
 	@echo "  coverage  to check documentation coverage for library and C API"
 	@echo "  dist      to create a \"dist\" directory with archived docs for download"

@@ -84,6 +85,11 @@ linkcheck: build
 	@echo "Link check complete; look for any errors in the above output " \
 	      "or in build/$(BUILDER)/output.txt"

+# Run the "suspicious" builder; its findings are logged to suspicious.csv
+# in the builder's output directory.
+suspicious: BUILDER = suspicious
+suspicious: build
+	@echo "Suspicious check complete; look for any errors in the above output " \
+	      "or in build/$(BUILDER)/suspicious.csv"
+
 coverage: BUILDER = coverage
 coverage: build
 	@echo "Coverage finished; see c.txt and python.txt in build/coverage"

--- a/Doc/make.bat
+++ b/Doc/make.bat
@@ -8,28 +8,35 @@ if "%HTMLHELP%" EQU "" set HTMLHELP=%ProgramFiles%\HTML Help Workshop\hhc.exe
 if "%1" EQU "" goto help
 if "%1" EQU "html" goto build
 if "%1" EQU "htmlhelp" goto build
-if "%1" EQU "web" goto build
-if "%1" EQU "webrun" goto webrun
+if "%1" EQU "latex" goto build
+if "%1" EQU "text" goto build
+if "%1" EQU "suspicious" goto build
+if "%1" EQU "linkcheck" goto build
+if "%1" EQU "changes" goto build
 if "%1" EQU "checkout" goto checkout
 if "%1" EQU "update" goto update

 :help
+set this=%~n0
 echo HELP
 echo.
-echo builddoc checkout
-echo builddoc update
-echo builddoc html
-echo builddoc htmlhelp
-echo builddoc web
-echo builddoc webrun
+echo %this% checkout
+echo %this% update
+echo %this% html
+echo %this% htmlhelp
+echo %this% latex
+echo %this% text
+echo %this% suspicious
+echo %this% linkcheck
+echo %this% changes
 echo.
 goto end

 :checkout
 svn co %SVNROOT%/doctools/trunk/sphinx tools/sphinx
-svn co %SVNROOT%/external/docutils-0.4/docutils tools/docutils
-svn co %SVNROOT%/external/Jinja-1.1/jinja tools/jinja
-svn co %SVNROOT%/external/Pygments-0.9/pygments tools/pygments
+svn co %SVNROOT%/external/docutils-0.5/docutils tools/docutils
+svn co %SVNROOT%/external/Jinja-1.2/jinja tools/jinja
+svn co %SVNROOT%/external/Pygments-0.11.1/pygments tools/pygments
 goto end

 :update
@@ -43,7 +50,7 @@ goto end
 if not exist build mkdir build
 if not exist build\%1 mkdir build\%1
 if not exist build\doctrees mkdir build\doctrees
-cmd /C %PYTHON% tools\sphinx-build.py -b%1 -dbuild\doctrees . build\%1
+cmd /C %PYTHON% tools\sphinx-build.py -b%1 -dbuild\doctrees . build\%*
 if "%1" EQU "htmlhelp" "%HTMLHELP%" build\htmlhelp\pydoc.hhp
 goto end


--- a/Doc/tools/sphinxext/pyspecific.py
+++ b/Doc/tools/sphinxext/pyspecific.py
@@ -92,6 +92,9 @@ class PydocTopicsBuilder(Builder):
        finally:
            f.close()

+# Support for checking for suspicious markup
+
+import suspicious

 # Support for documenting Opcodes

@@ -116,5 +119,6 @@ def parse_opcode_signature(env, sig, signode):
 def setup(app):
    app.add_role('issue', issue_role)
    app.add_builder(PydocTopicsBuilder)
+    app.add_builder(suspicious.CheckSuspiciousMarkupBuilder)
    app.add_description_unit('opcode', 'opcode', '%s (opcode)',
                             parse_opcode_signature)
--- a/Doc/tools/sphinxext/susp-ignored.csv
+++ b/Doc/tools/sphinxext/susp-ignored.csv
+c-api/arg,,:ref,"PyArg_ParseTuple(args, ""O|O:ref"", &object, &callback)"
+c-api/list,,:high,list[low:high]
+c-api/list,,:high,list[low:high] = itemlist
+c-api/sequence,,:i2,o[i1:i2]
+c-api/sequence,,:i2,o[i1:i2] = v
+c-api/sequence,,:i2,del o[i1:i2]
+c-api/unicode,,:end,str[start:end]
+distutils/apiref,,:action,http://pypi.python.org/pypi?:action=list_classifiers
+distutils/setupscript,,::,
+extending/embedding,,:numargs,"if(!PyArg_ParseTuple(args, "":numargs""))"
+extending/extending,,:set,"if (PyArg_ParseTuple(args, ""O:set_callback"", &temp)) {"
+extending/extending,,:myfunction,"PyArg_ParseTuple(args, ""D:myfunction"", &c);"
+extending/newtypes,,:call,"if (!PyArg_ParseTuple(args, ""sss:call"", &arg1, &arg2, &arg3)) {"
+extending/windows,,:initspam,/export:initspam
+howto/cporting,,:add,"if (!PyArg_ParseTuple(args, ""ii:add_ints"", &one, &two))"
+howto/cporting,,:encode,"if (!PyArg_ParseTuple(args, ""O:encode_object"", &myobj))"
+howto/cporting,,:say,"if (!PyArg_ParseTuple(args, ""U:say_hello"", &name))"
+howto/curses,,:black,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
+howto/curses,,:blue,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
+howto/curses,,:cyan,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
+howto/curses,,:green,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
+howto/curses,,:magenta,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
+howto/curses,,:red,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
+howto/curses,,:white,"7:white."
+howto/curses,,:yellow,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
+howto/regex,,::,
+howto/regex,,:foo,(?:foo)
+howto/urllib2,,:example,"for example ""joe@password:example.com"""
+howto/webservers,,.. image:,.. image:: http.png
+library/audioop,,:ipos,"# factor = audioop.findfactor(in_test[ipos*2:ipos*2+len(out_test)],"
+library/datetime,,:MM,
+library/datetime,,:SS,
+library/decimal,,:optional,"trailneg:optional trailing minus indicator"
+library/difflib,,:ahi,a[alo:ahi]
+library/difflib,,:bhi,b[blo:bhi]
+library/difflib,,:i2,
+library/difflib,,:j2,
+library/difflib,,:i1,
+library/dis,,:TOS,
+library/dis,,`,TOS = `TOS`
+library/doctest,,`,``factorial`` from the ``example`` module:
+library/doctest,,`,The ``example`` module
+library/doctest,,`,Using ``factorial``
+library/functions,,:step,a[start:stop:step]
+library/functions,,:stop,"a[start:stop, i]"
+library/functions,,:stop,a[start:stop:step]
+library/hotshot,,:lineno,"ncalls  tottime  percall  cumtime  percall filename:lineno(function)"
+library/httplib,,:port,host:port
+library/imaplib,,:MM,"""DD-Mmm-YYYY HH:MM:SS +HHMM"""
+library/imaplib,,:SS,"""DD-Mmm-YYYY HH:MM:SS +HHMM"""
+library/linecache,,:sys,"sys:x:3:3:sys:/dev:/bin/sh"
+library/logging,,:And,
+library/logging,,:package1,
+library/logging,,:package2,
+library/logging,,:root,
+library/logging,,:This,
+library/logging,,:port,host:port
+library/mmap,,:i2,obj[i1:i2]
+library/multiprocessing,,:queue,">>> QueueManager.register('get_queue', callable=lambda:queue)"
+library/multiprocessing,,`,">>> l._callmethod('__getitem__', (20,))     # equiv to `l[20]`"
+library/multiprocessing,,`,">>> l._callmethod('__getslice__', (2, 7))   # equiv to `l[2:7]`"
+library/multiprocessing,,`,# `BaseManager`.
+library/multiprocessing,,`,# `Pool.imap()` (which will save on the amount of code needed anyway).
+library/multiprocessing,,`,# A test file for the `multiprocessing` package
+library/multiprocessing,,`,# A test of `multiprocessing.Pool` class
+library/multiprocessing,,`,# Add more tasks using `put()`
+library/multiprocessing,,`,# create server for a `HostManager` object
+library/multiprocessing,,`,# Depends on `multiprocessing` package -- tested with `processing-0.60`
+library/multiprocessing,,`,# in the original order then consider using `Pool.map()` or
+library/multiprocessing,,`,# Not sure if we should synchronize access to `socket.accept()` method by
+library/multiprocessing,,`,# object.  (We import `multiprocessing.reduction` to enable this pickling.)
+library/multiprocessing,,`,# register the Foo class; make `f()` and `g()` accessible via proxy
+library/multiprocessing,,`,# register the Foo class; make `g()` and `_h()` accessible via proxy
+library/multiprocessing,,`,# register the generator function baz; use `GeneratorProxy` to make proxies
+library/multiprocessing,,`,`Cluster` is a subclass of `SyncManager` so it allows creation of
+library/multiprocessing,,`,`hostname` gives the name of the host.  If hostname is not
+library/multiprocessing,,`,`slots` is used to specify the number of slots for processes on
+library/optparse,,:len,"del parser.rargs[:len(value)]"
+library/os.path,,:foo,c:foo
+library/parser,,`,"""Make a function that raises an argument to the exponent `exp`."""
+library/posix,,`,"CFLAGS=""`getconf LFS_CFLAGS`"" OPT=""-g -O2 $CFLAGS"""
+library/profile,,:lineno,ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+library/profile,,:lineno,filename:lineno(function)
+library/pyexpat,,:elem1,<py:elem1 />
+library/pyexpat,,:py,"xmlns:py = ""http://www.python.org/ns/"">"
+library/repr,,`,"return `obj`"
+library/smtplib,,:port,"as well as a regular host:port server."
+library/socket,,::,'5aef:2b::8'
+library/sqlite3,,:memory,
+library/sqlite3,,:age,"select name_last, age from people where name_last=:who and age=:age"
+library/sqlite3,,:who,"select name_last, age from people where name_last=:who and age=:age"
+library/ssl,,:My,"Organization Name (eg, company) [Internet Widgits Pty Ltd]:My Organization, Inc."
+library/ssl,,:My,"Organizational Unit Name (eg, section) []:My Group"
+library/ssl,,:myserver,"Common Name (eg, YOUR name) []:myserver.mygroup.myorganization.com"
+library/ssl,,:MyState,State or Province Name (full name) [Some-State]:MyState
+library/ssl,,:ops,Email Address []:ops@myserver.mygroup.myorganization.com
+library/ssl,,:Some,"Locality Name (eg, city) []:Some City"
+library/ssl,,:US,Country Name (2 letter code) [AU]:US
+library/stdtypes,,:len,s[len(s):len(s)]
+library/stdtypes,,:len,s[len(s):len(s)]
+library/string,,:end,s[start:end]
+library/string,,:end,s[start:end]
+library/subprocess,,`,"output=`mycmd myarg`"
+library/subprocess,,`,"output=`dmesg | grep hda`"
+library/tarfile,,:compression,filemode[:compression]
+library/tarfile,,:gz,
+library/tarfile,,:bz2,
+library/time,,:mm,
+library/time,,:ss,
+library/turtle,,::,Example::
+library/urllib,,:port,:port
+library/urllib2,,:password,"""joe:password@python.org"""
+library/uuid,,:uuid,urn:uuid:12345678-1234-5678-1234-567812345678
+library/xmlrpclib,,:pass,http://user:pass@host:port/path
+library/xmlrpclib,,:pass,user:pass
+library/xmlrpclib,,:port,http://user:pass@host:port/path
+license,,`,THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+license,,:zooko,mailto:zooko@zooko.com
+license,,`,THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+reference/datamodel,,:step,a[i:j:step]
+reference/datamodel,,:max,
+reference/expressions,,:index,x[index:index]
+reference/expressions,,:datum,{key:datum...}
+reference/expressions,,`,`expressions...`
+reference/grammar,,:output,#diagram:output
+reference/grammar,,:rules,#diagram:rules
+reference/grammar,,:token,#diagram:token
+reference/grammar,,`,'`' testlist1 '`'
+reference/lexical_analysis,,:fileencoding,# vim:fileencoding=<encoding-name>
+reference/lexical_analysis,,`,",       :       .       `       =       ;"
+tutorial/datastructures,,:value,key:value pairs within the braces adds initial key:value pairs
+tutorial/datastructures,,:value,It is also possible to delete a key:value
+tutorial/stdlib2,,:start,"fields = struct.unpack('<IIIHH', data[start:start+16])"
+tutorial/stdlib2,,:start,extra = data[start:start+extra_size]
+tutorial/stdlib2,,:start,filename = data[start:start+filenamesize]
+tutorial/stdlib2,,:config,"logging.warning('Warning:config file %s not found', 'server.conf')"
+tutorial/stdlib2,,:config,WARNING:root:Warning:config file server.conf not found
+tutorial/stdlib2,,:Critical,CRITICAL:root:Critical error -- shutting down
+tutorial/stdlib2,,:Error,ERROR:root:Error occurred
+tutorial/stdlib2,,:root,CRITICAL:root:Critical error -- shutting down
+tutorial/stdlib2,,:root,ERROR:root:Error occurred
+tutorial/stdlib2,,:root,WARNING:root:Warning:config file server.conf not found
+tutorial/stdlib2,,:Warning,WARNING:root:Warning:config file server.conf not found
+using/cmdline,,:line,file:line: category: message
+using/cmdline,,:category,action:message:category:module:line
+using/cmdline,,:line,action:message:category:module:line
+using/cmdline,,:message,action:message:category:module:line
+using/cmdline,,:module,action:message:category:module:line
+using/cmdline,,:errorhandler,:errorhandler
+using/windows,162,`,`` this fixes syntax highlighting errors in some editors due to the \\\\ hackery
+using/windows,170,`,``
+whatsnew/2.0,418,:len,
+whatsnew/2.3,,::,
+whatsnew/2.3,,:config,
+whatsnew/2.3,,:Critical,
+whatsnew/2.3,,:Error,
+whatsnew/2.3,,:Problem,
+whatsnew/2.3,,:root,
+whatsnew/2.3,,:Warning,
+whatsnew/2.4,,::,
+whatsnew/2.4,,:System,
+whatsnew/2.5,,:memory,:memory:
+whatsnew/2.5,,:step,[start:stop:step]
+whatsnew/2.5,,:stop,[start:stop:step]
--- a/Doc/tools/sphinxext/suspicious.py
+++ b/Doc/tools/sphinxext/suspicious.py
+"""
+Try to detect suspicious constructs, resembling markup
+that has leaked into the final output.
+
+Suspicious lines are reported in a comma-separated file,
+``suspicious.csv``, located in the output directory.
+
+The file is utf-8 encoded, and each line contains four fields:
+
+ * document name (normalized)
+ * line number in the source document
+ * problematic text
+ * complete line showing the problematic text in context
+
+It is common to find many false positives. To avoid reporting them
+again and again, they may be added to the ``susp-ignored.csv`` file
+(located in the same directory as this module). The file has the same
+format as ``suspicious.csv`` with a few differences:
+
+  - each line defines a rule; if the rule matches, the issue
+    is ignored.
+  - line number may be empty (that is, nothing between the
+    commas: ",,"). In this case, line numbers are ignored (the
+    rule matches anywhere in the file).
+  - the last field does not have to be a complete line; some
+    surrounding text (never more than a line) is enough for
+    context.
+
+Rules are processed sequentially. A rule matches when:
+
+ * document names are the same
+ * problematic texts are the same
+ * line numbers are close to each other (5 lines up or down), unless
+   the rule leaves the line number empty
+ * the rule text is completely contained in the source line
+
+The simplest way to create the susp-ignored.csv file is by copying
+undesired entries from suspicious.csv (possibly trimming the last
+field.)
+
+Copyright 2009 Gabriel A. Genellina
+
+"""
+
+import os, sys
+import csv
+import re
+from docutils import nodes
+from sphinx.builder import Builder
+
+# Matcher for markup-like fragments that should not survive into the output.
+# The name is bound to the compiled pattern's ``finditer`` method, so
+# ``detect_all(text)`` returns an iterator of match objects, one for each
+# suspicious fragment found in ``text``.
+detect_all = re.compile(ur'''
+    ::(?=[^=])|            # two :: (but NOT ::=)
+    :[a-zA-Z][a-zA-Z0-9]+| # :foo
+    `|                     # ` (seldom used by itself)
+    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
+    ''', re.UNICODE | re.VERBOSE).finditer
+
+class Rule:
+    def __init__(self, docname, lineno, issue, line):
+        """A rule for ignoring issues.
+
+        docname -- document to which this rule applies
+        lineno  -- line number in the original source, or None for
+                   "don't care"
+        issue   -- the markup fragment that triggered this rule
+        line    -- text of the container element (single line only)
+        """
+        self.docname = docname # document to which this rule applies
+        self.lineno = lineno   # line number in the original source;
+                               # this rule matches only near that.
+                               # None -> don't care
+        self.issue = issue     # the markup fragment that triggered this rule
+        self.line = line       # text of the container element (single line only)
+
+
+class CheckSuspiciousMarkupBuilder(Builder):
+    """
+    Checks for possibly invalid markup that may leak into the output
+    """
+    name = 'suspicious'
+
+    def init(self):
+        # create output file
+        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
+        open(self.log_file_name, 'w').close()
+        # load database of previously ignored issues
+        self.load_rules(os.path.join(os.path.dirname(__file__), 'susp-ignored.csv'))
+
+    def get_outdated_docs(self):
+        return self.env.found_docs
+
+    def get_target_uri(self, docname, typ=None):
+        return ''
+
+    def prepare_writing(self, docnames):
+        ### PYTHON PROJECT SPECIFIC ###
+        for name in set(docnames):
+            if name.split('/', 1)[0] == 'documenting':
+                docnames.remove(name)
+        ### PYTHON PROJECT SPECIFIC ###
+
+    def write_doc(self, docname, doctree):
+        self.any_issue = False # set when any issue is encountered in this document
+        self.docname = docname
+        visitor = SuspiciousVisitor(doctree, self)
+        doctree.walk(visitor)
+
+    def finish(self):
+        return
+
+    def check_issue(self, line, lineno, issue):
+        if not self.is_ignored(line, lineno, issue):
+            self.report_issue(line, lineno, issue)
+
+    def is_ignored(self, line, lineno, issue):
+        """Determine whether this issue should be ignored.
+        """
+        docname = self.docname
+        for rule in self.rules:
+            if rule.docname != docname: continue
+            if rule.issue != issue: continue
+            # Both lines must match *exactly*. This is rather strict,
+            # and probably should be improved.
+            # Doing fuzzy matches with levenshtein distance could work,
+            # but that means bringing other libraries...
+            # Ok, relax that requirement: just check if the rule fragment
+            # is contained in the document line
+            if rule.line not in line: continue
+            # Check both line numbers. If they're "near"
+            # this rule matches. (lineno=None means "don't care")
+            if (rule.lineno is not None) and \
+                abs(rule.lineno - lineno) > 5: continue
+            # if it came this far, the rule matched
+            return True
+        return False
+
+    def report_issue(self, text, lineno, issue):
+        if not self.any_issue: self.info()
+        self.any_issue = True
+        self.write_log_entry(lineno, issue, text)
+        self.warn('[%s:%d] "%s" found in "%-.120s"' % (
+                self.docname.encode(sys.getdefaultencoding(),'replace'),
+                lineno,
+                issue.encode(sys.getdefaultencoding(),'replace'),
+                text.strip().encode(sys.getdefaultencoding(),'replace')))
+        self.app.statuscode = 1
+
+    def write_log_entry(self, lineno, issue, text):
+        f = open(self.log_file_name, 'ab')
+        writer = csv.writer(f)
+        writer.writerow([self.docname.encode('utf-8'),
+                lineno,
+                issue.encode('utf-8'),
+                text.strip().encode('utf-8')])
+        del writer
+        f.close()
+
+    def load_rules(self, filename):
+        """Load database of previously ignored issues.
+
+        A csv file, with exactly the same format as suspicious.csv
+        Fields: document name (normalized), line number, issue, surrounding text
+        """
+        self.info("loading ignore rules... ", nonl=1)
+        self.rules = rules = []
+        try: f = open(filename, 'rb')
+        except IOError: return
+        for i, row in enumerate(csv.reader(f)):
+            if len(row) != 4:
+                raise ValueError, "wrong format in %s, line %d: %s" % (filename, i+1, row)
+            docname, lineno, issue, text = row
+            docname = docname.decode('utf-8')
+            if lineno: lineno = int(lineno)
+            else: lineno = None
+            issue = issue.decode('utf-8')
+            text = text.decode('utf-8')
+            rule = Rule(docname, lineno, issue, text)
+            rules.append(rule)
+        f.close()
+        self.info('done, %d rules loaded' % len(self.rules))
+
+
+def get_lineno(node):
+    "Obtain line number information for a node"
+    lineno = None
+    while lineno is None and node:
+        node = node.parent
+        lineno = node.line
+    return lineno
+
+
+def extract_line(text, index):
+    """text may be a multiline string; extract
+    only the line containing the given character index.
+
+    >>> extract_line("abc\ndefgh\ni", 6)
+    >>> 'defgh'
+    >>> for i in (0, 2, 3, 4, 10):
+    ...   print extract_line("abc\ndefgh\ni", i)
+    abc
+    abc
+    abc
+    defgh
+    defgh
+    i
+    """
+    p = text.rfind('\n', 0, index) + 1
+    q = text.find('\n', index)
+    if q<0: q = len(text)
+    return text[p:q]
+
+
+class SuspiciousVisitor(nodes.GenericNodeVisitor):
+
+    lastlineno = 0
+
+    def __init__(self, document, builder):
+        nodes.GenericNodeVisitor.__init__(self, document)
+        self.builder = builder
+
+    def default_visit(self, node):
+        if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
+            text = node.astext()
+            # lineno seems to go backwards sometimes (?)
+            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
+            seen = set() # don't report the same issue more than only once per line
+            for match in detect_all(text):
+                #import pdb; pdb.set_trace()
+                issue = match.group()
+                line = extract_line(text, match.start())
+                if (issue, line) not in seen:
+                    self.builder.check_issue(line, lineno, issue)
+                    seen.add((issue, line))
+
+    unknown_visit = default_visit
+
+    def visit_document(self, node):
+        self.lastlineno = 0
+
+    def visit_comment(self, node):
+        # ignore comments -- too much false positives.
+        # (although doing this could miss some errors;
+        # there were two sections "commented-out" by mistake
+        # in the Python docs that would not be catched)
+        raise nodes.SkipNode