Commit 700cf28f authored by Georg Brandl's avatar Georg Brandl

Add "suspicious" builder which finds leftover markup in the HTML files.

Patch by Gabriel Genellina.
parent 775aa4a0
......@@ -24,6 +24,7 @@ help:
@echo " text to make plain text files"
@echo " changes to make an overview over all changed/added/deprecated items"
@echo " linkcheck to check all external links for integrity"
@echo " suspicious to check for suspicious markup in output text"
@echo " coverage to check documentation coverage for library and C API"
@echo " dist to create a \"dist\" directory with archived docs for download"
......@@ -84,6 +85,11 @@ linkcheck: build
@echo "Link check complete; look for any errors in the above output " \
"or in build/$(BUILDER)/output.txt"
suspicious: BUILDER = suspicious
suspicious: build
@echo "Suspicious check complete; look for any errors in the above output " \
"or in build/$(BUILDER)/suspicious.txt"
coverage: BUILDER = coverage
coverage: build
@echo "Coverage finished; see c.txt and python.txt in build/coverage"
......
......@@ -8,28 +8,35 @@ if "%HTMLHELP%" EQU "" set HTMLHELP=%ProgramFiles%\HTML Help Workshop\hhc.exe
if "%1" EQU "" goto help
if "%1" EQU "html" goto build
if "%1" EQU "htmlhelp" goto build
if "%1" EQU "web" goto build
if "%1" EQU "webrun" goto webrun
if "%1" EQU "latex" goto build
if "%1" EQU "text" goto build
if "%1" EQU "suspicious" goto build
if "%1" EQU "linkcheck" goto build
if "%1" EQU "changes" goto build
if "%1" EQU "checkout" goto checkout
if "%1" EQU "update" goto update
:help
set this=%~n0
echo HELP
echo.
echo builddoc checkout
echo builddoc update
echo builddoc html
echo builddoc htmlhelp
echo builddoc web
echo builddoc webrun
echo %this% checkout
echo %this% update
echo %this% html
echo %this% htmlhelp
echo %this% latex
echo %this% text
echo %this% suspicious
echo %this% linkcheck
echo %this% changes
echo.
goto end
:checkout
svn co %SVNROOT%/doctools/trunk/sphinx tools/sphinx
svn co %SVNROOT%/external/docutils-0.4/docutils tools/docutils
svn co %SVNROOT%/external/Jinja-1.1/jinja tools/jinja
svn co %SVNROOT%/external/Pygments-0.9/pygments tools/pygments
svn co %SVNROOT%/external/docutils-0.5/docutils tools/docutils
svn co %SVNROOT%/external/Jinja-1.2/jinja tools/jinja
svn co %SVNROOT%/external/Pygments-0.11.1/pygments tools/pygments
goto end
:update
......@@ -43,7 +50,7 @@ goto end
if not exist build mkdir build
if not exist build\%1 mkdir build\%1
if not exist build\doctrees mkdir build\doctrees
cmd /C %PYTHON% tools\sphinx-build.py -b%1 -dbuild\doctrees . build\%1
cmd /C %PYTHON% tools\sphinx-build.py -b%1 -dbuild\doctrees . build\%*
if "%1" EQU "htmlhelp" "%HTMLHELP%" build\htmlhelp\pydoc.hhp
goto end
......
......@@ -92,6 +92,9 @@ class PydocTopicsBuilder(Builder):
finally:
f.close()
# Support for checking for suspicious markup
import suspicious
# Support for documenting Opcodes
......@@ -116,5 +119,6 @@ def parse_opcode_signature(env, sig, signode):
def setup(app):
app.add_role('issue', issue_role)
app.add_builder(PydocTopicsBuilder)
app.add_builder(suspicious.CheckSuspiciousMarkupBuilder)
app.add_description_unit('opcode', 'opcode', '%s (opcode)',
parse_opcode_signature)
c-api/arg,,:ref,"PyArg_ParseTuple(args, ""O|O:ref"", &object, &callback)"
c-api/list,,:high,list[low:high]
c-api/list,,:high,list[low:high] = itemlist
c-api/sequence,,:i2,o[i1:i2]
c-api/sequence,,:i2,o[i1:i2] = v
c-api/sequence,,:i2,del o[i1:i2]
c-api/unicode,,:end,str[start:end]
distutils/apiref,,:action,http://pypi.python.org/pypi?:action=list_classifiers
distutils/setupscript,,::,
extending/embedding,,:numargs,"if(!PyArg_ParseTuple(args, "":numargs""))"
extending/extending,,:set,"if (PyArg_ParseTuple(args, ""O:set_callback"", &temp)) {"
extending/extending,,:myfunction,"PyArg_ParseTuple(args, ""D:myfunction"", &c);"
extending/newtypes,,:call,"if (!PyArg_ParseTuple(args, ""sss:call"", &arg1, &arg2, &arg3)) {"
extending/windows,,:initspam,/export:initspam
howto/cporting,,:add,"if (!PyArg_ParseTuple(args, ""ii:add_ints"", &one, &two))"
howto/cporting,,:encode,"if (!PyArg_ParseTuple(args, ""O:encode_object"", &myobj))"
howto/cporting,,:say,"if (!PyArg_ParseTuple(args, ""U:say_hello"", &name))"
howto/curses,,:black,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
howto/curses,,:blue,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
howto/curses,,:cyan,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
howto/curses,,:green,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
howto/curses,,:magenta,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
howto/curses,,:red,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
howto/curses,,:white,"7:white."
howto/curses,,:yellow,"They are: 0:black, 1:red, 2:green, 3:yellow, 4:blue, 5:magenta, 6:cyan, and"
howto/regex,,::,
howto/regex,,:foo,(?:foo)
howto/urllib2,,:example,"for example ""joe@password:example.com"""
howto/webservers,,.. image:,.. image:: http.png
library/audioop,,:ipos,"# factor = audioop.findfactor(in_test[ipos*2:ipos*2+len(out_test)],"
library/datetime,,:MM,
library/datetime,,:SS,
library/decimal,,:optional,"trailneg:optional trailing minus indicator"
library/difflib,,:ahi,a[alo:ahi]
library/difflib,,:bhi,b[blo:bhi]
library/difflib,,:i2,
library/difflib,,:j2,
library/difflib,,:i1,
library/dis,,:TOS,
library/dis,,`,TOS = `TOS`
library/doctest,,`,``factorial`` from the ``example`` module:
library/doctest,,`,The ``example`` module
library/doctest,,`,Using ``factorial``
library/functions,,:step,a[start:stop:step]
library/functions,,:stop,"a[start:stop, i]"
library/functions,,:stop,a[start:stop:step]
library/hotshot,,:lineno,"ncalls tottime percall cumtime percall filename:lineno(function)"
library/httplib,,:port,host:port
library/imaplib,,:MM,"""DD-Mmm-YYYY HH:MM:SS +HHMM"""
library/imaplib,,:SS,"""DD-Mmm-YYYY HH:MM:SS +HHMM"""
library/linecache,,:sys,"sys:x:3:3:sys:/dev:/bin/sh"
library/logging,,:And,
library/logging,,:package1,
library/logging,,:package2,
library/logging,,:root,
library/logging,,:This,
library/logging,,:port,host:port
library/mmap,,:i2,obj[i1:i2]
library/multiprocessing,,:queue,">>> QueueManager.register('get_queue', callable=lambda:queue)"
library/multiprocessing,,`,">>> l._callmethod('__getitem__', (20,)) # equiv to `l[20]`"
library/multiprocessing,,`,">>> l._callmethod('__getslice__', (2, 7)) # equiv to `l[2:7]`"
library/multiprocessing,,`,# `BaseManager`.
library/multiprocessing,,`,# `Pool.imap()` (which will save on the amount of code needed anyway).
library/multiprocessing,,`,# A test file for the `multiprocessing` package
library/multiprocessing,,`,# A test of `multiprocessing.Pool` class
library/multiprocessing,,`,# Add more tasks using `put()`
library/multiprocessing,,`,# create server for a `HostManager` object
library/multiprocessing,,`,# Depends on `multiprocessing` package -- tested with `processing-0.60`
library/multiprocessing,,`,# in the original order then consider using `Pool.map()` or
library/multiprocessing,,`,# Not sure if we should synchronize access to `socket.accept()` method by
library/multiprocessing,,`,# object. (We import `multiprocessing.reduction` to enable this pickling.)
library/multiprocessing,,`,# register the Foo class; make `f()` and `g()` accessible via proxy
library/multiprocessing,,`,# register the Foo class; make `g()` and `_h()` accessible via proxy
library/multiprocessing,,`,# register the generator function baz; use `GeneratorProxy` to make proxies
library/multiprocessing,,`,`Cluster` is a subclass of `SyncManager` so it allows creation of
library/multiprocessing,,`,`hostname` gives the name of the host. If hostname is not
library/multiprocessing,,`,`slots` is used to specify the number of slots for processes on
library/optparse,,:len,"del parser.rargs[:len(value)]"
library/os.path,,:foo,c:foo
library/parser,,`,"""Make a function that raises an argument to the exponent `exp`."""
library/posix,,`,"CFLAGS=""`getconf LFS_CFLAGS`"" OPT=""-g -O2 $CFLAGS"""
library/profile,,:lineno,ncalls tottime percall cumtime percall filename:lineno(function)
library/profile,,:lineno,filename:lineno(function)
library/pyexpat,,:elem1,<py:elem1 />
library/pyexpat,,:py,"xmlns:py = ""http://www.python.org/ns/"">"
library/repr,,`,"return `obj`"
library/smtplib,,:port,"as well as a regular host:port server."
library/socket,,::,'5aef:2b::8'
library/sqlite3,,:memory,
library/sqlite3,,:age,"select name_last, age from people where name_last=:who and age=:age"
library/sqlite3,,:who,"select name_last, age from people where name_last=:who and age=:age"
library/ssl,,:My,"Organization Name (eg, company) [Internet Widgits Pty Ltd]:My Organization, Inc."
library/ssl,,:My,"Organizational Unit Name (eg, section) []:My Group"
library/ssl,,:myserver,"Common Name (eg, YOUR name) []:myserver.mygroup.myorganization.com"
library/ssl,,:MyState,State or Province Name (full name) [Some-State]:MyState
library/ssl,,:ops,Email Address []:ops@myserver.mygroup.myorganization.com
library/ssl,,:Some,"Locality Name (eg, city) []:Some City"
library/ssl,,:US,Country Name (2 letter code) [AU]:US
library/stdtypes,,:len,s[len(s):len(s)]
library/stdtypes,,:len,s[len(s):len(s)]
library/string,,:end,s[start:end]
library/string,,:end,s[start:end]
library/subprocess,,`,"output=`mycmd myarg`"
library/subprocess,,`,"output=`dmesg | grep hda`"
library/tarfile,,:compression,filemode[:compression]
library/tarfile,,:gz,
library/tarfile,,:bz2,
library/time,,:mm,
library/time,,:ss,
library/turtle,,::,Example::
library/urllib,,:port,:port
library/urllib2,,:password,"""joe:password@python.org"""
library/uuid,,:uuid,urn:uuid:12345678-1234-5678-1234-567812345678
library/xmlrpclib,,:pass,http://user:pass@host:port/path
library/xmlrpclib,,:pass,user:pass
library/xmlrpclib,,:port,http://user:pass@host:port/path
license,,`,THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
license,,:zooko,mailto:zooko@zooko.com
license,,`,THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
reference/datamodel,,:step,a[i:j:step]
reference/datamodel,,:max,
reference/expressions,,:index,x[index:index]
reference/expressions,,:datum,{key:datum...}
reference/expressions,,`,`expressions...`
reference/grammar,,:output,#diagram:output
reference/grammar,,:rules,#diagram:rules
reference/grammar,,:token,#diagram:token
reference/grammar,,`,'`' testlist1 '`'
reference/lexical_analysis,,:fileencoding,# vim:fileencoding=<encoding-name>
reference/lexical_analysis,,`,", : . ` = ;"
tutorial/datastructures,,:value,key:value pairs within the braces adds initial key:value pairs
tutorial/datastructures,,:value,It is also possible to delete a key:value
tutorial/stdlib2,,:start,"fields = struct.unpack('<IIIHH', data[start:start+16])"
tutorial/stdlib2,,:start,extra = data[start:start+extra_size]
tutorial/stdlib2,,:start,filename = data[start:start+filenamesize]
tutorial/stdlib2,,:config,"logging.warning('Warning:config file %s not found', 'server.conf')"
tutorial/stdlib2,,:config,WARNING:root:Warning:config file server.conf not found
tutorial/stdlib2,,:Critical,CRITICAL:root:Critical error -- shutting down
tutorial/stdlib2,,:Error,ERROR:root:Error occurred
tutorial/stdlib2,,:root,CRITICAL:root:Critical error -- shutting down
tutorial/stdlib2,,:root,ERROR:root:Error occurred
tutorial/stdlib2,,:root,WARNING:root:Warning:config file server.conf not found
tutorial/stdlib2,,:Warning,WARNING:root:Warning:config file server.conf not found
using/cmdline,,:line,file:line: category: message
using/cmdline,,:category,action:message:category:module:line
using/cmdline,,:line,action:message:category:module:line
using/cmdline,,:message,action:message:category:module:line
using/cmdline,,:module,action:message:category:module:line
using/cmdline,,:errorhandler,:errorhandler
using/windows,162,`,`` this fixes syntax highlighting errors in some editors due to the \\\\ hackery
using/windows,170,`,``
whatsnew/2.0,418,:len,
whatsnew/2.3,,::,
whatsnew/2.3,,:config,
whatsnew/2.3,,:Critical,
whatsnew/2.3,,:Error,
whatsnew/2.3,,:Problem,
whatsnew/2.3,,:root,
whatsnew/2.3,,:Warning,
whatsnew/2.4,,::,
whatsnew/2.4,,:System,
whatsnew/2.5,,:memory,:memory:
whatsnew/2.5,,:step,[start:stop:step]
whatsnew/2.5,,:stop,[start:stop:step]
"""
Try to detect suspicious constructs, resembling markup
that has leaked into the final output.
Suspicious lines are reported in a comma-separated-file,
``suspicious.csv``, located in the output directory.
The file is utf-8 encoded, and each line contains four fields:
* document name (normalized)
* line number in the source document
* problematic text
* complete line showing the problematic text in context
It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``ignored.csv`` file
(located in the configuration directory). The file has the same
format as ``suspicious.csv`` with a few differences:
- each line defines a rule; if the rule matches, the issue
is ignored.
- line number may be empty (that is, nothing between the
commas: ",,"). In this case, line numbers are ignored (the
rule matches anywhere in the file).
- the last field does not have to be a complete line; some
surrounding text (never more than a line) is enough for
context.
Rules are processed sequentially. A rule matches when:
* document names are the same
* problematic texts are the same
* line numbers are close to each other (5 lines up or down)
* the rule text is completely contained into the source line
The simplest way to create the ignored.csv file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field.)
Copyright 2009 Gabriel A. Genellina
"""
import os, sys
import csv
import re
from docutils import nodes
from sphinx.builder import Builder
detect_all = re.compile(ur'''
::(?=[^=])| # two :: (but NOT ::=)
:[a-zA-Z][a-zA-Z0-9]+| # :foo
`| # ` (seldom used by itself)
(?<!\.)\.\.[ \t]*\w+: # .. foo: (but NOT ... else:)
''', re.UNICODE | re.VERBOSE).finditer
class Rule:
def __init__(self, docname, lineno, issue, line):
"A rule for ignoring issues"
self.docname = docname # document to which this rule applies
self.lineno = lineno # line number in the original source;
# this rule matches only near that.
# None -> don't care
self.issue = issue # the markup fragment that triggered this rule
self.line = line # text of the container element (single line only)
class CheckSuspiciousMarkupBuilder(Builder):
"""
Checks for possibly invalid markup that may leak into the output
"""
name = 'suspicious'
def init(self):
# create output file
self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
open(self.log_file_name, 'w').close()
# load database of previously ignored issues
self.load_rules(os.path.join(os.path.dirname(__file__), 'susp-ignored.csv'))
def get_outdated_docs(self):
return self.env.found_docs
def get_target_uri(self, docname, typ=None):
return ''
def prepare_writing(self, docnames):
### PYTHON PROJECT SPECIFIC ###
for name in set(docnames):
if name.split('/', 1)[0] == 'documenting':
docnames.remove(name)
### PYTHON PROJECT SPECIFIC ###
def write_doc(self, docname, doctree):
self.any_issue = False # set when any issue is encountered in this document
self.docname = docname
visitor = SuspiciousVisitor(doctree, self)
doctree.walk(visitor)
def finish(self):
return
def check_issue(self, line, lineno, issue):
if not self.is_ignored(line, lineno, issue):
self.report_issue(line, lineno, issue)
def is_ignored(self, line, lineno, issue):
"""Determine whether this issue should be ignored.
"""
docname = self.docname
for rule in self.rules:
if rule.docname != docname: continue
if rule.issue != issue: continue
# Both lines must match *exactly*. This is rather strict,
# and probably should be improved.
# Doing fuzzy matches with levenshtein distance could work,
# but that means bringing other libraries...
# Ok, relax that requirement: just check if the rule fragment
# is contained in the document line
if rule.line not in line: continue
# Check both line numbers. If they're "near"
# this rule matches. (lineno=None means "don't care")
if (rule.lineno is not None) and \
abs(rule.lineno - lineno) > 5: continue
# if it came this far, the rule matched
return True
return False
def report_issue(self, text, lineno, issue):
if not self.any_issue: self.info()
self.any_issue = True
self.write_log_entry(lineno, issue, text)
self.warn('[%s:%d] "%s" found in "%-.120s"' % (
self.docname.encode(sys.getdefaultencoding(),'replace'),
lineno,
issue.encode(sys.getdefaultencoding(),'replace'),
text.strip().encode(sys.getdefaultencoding(),'replace')))
self.app.statuscode = 1
def write_log_entry(self, lineno, issue, text):
f = open(self.log_file_name, 'ab')
writer = csv.writer(f)
writer.writerow([self.docname.encode('utf-8'),
lineno,
issue.encode('utf-8'),
text.strip().encode('utf-8')])
del writer
f.close()
def load_rules(self, filename):
"""Load database of previously ignored issues.
A csv file, with exactly the same format as suspicious.csv
Fields: document name (normalized), line number, issue, surrounding text
"""
self.info("loading ignore rules... ", nonl=1)
self.rules = rules = []
try: f = open(filename, 'rb')
except IOError: return
for i, row in enumerate(csv.reader(f)):
if len(row) != 4:
raise ValueError, "wrong format in %s, line %d: %s" % (filename, i+1, row)
docname, lineno, issue, text = row
docname = docname.decode('utf-8')
if lineno: lineno = int(lineno)
else: lineno = None
issue = issue.decode('utf-8')
text = text.decode('utf-8')
rule = Rule(docname, lineno, issue, text)
rules.append(rule)
f.close()
self.info('done, %d rules loaded' % len(self.rules))
def get_lineno(node):
"Obtain line number information for a node"
lineno = None
while lineno is None and node:
node = node.parent
lineno = node.line
return lineno
def extract_line(text, index):
"""text may be a multiline string; extract
only the line containing the given character index.
>>> extract_line("abc\ndefgh\ni", 6)
>>> 'defgh'
>>> for i in (0, 2, 3, 4, 10):
... print extract_line("abc\ndefgh\ni", i)
abc
abc
abc
defgh
defgh
i
"""
p = text.rfind('\n', 0, index) + 1
q = text.find('\n', index)
if q<0: q = len(text)
return text[p:q]
class SuspiciousVisitor(nodes.GenericNodeVisitor):
lastlineno = 0
def __init__(self, document, builder):
nodes.GenericNodeVisitor.__init__(self, document)
self.builder = builder
def default_visit(self, node):
if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
text = node.astext()
# lineno seems to go backwards sometimes (?)
self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
seen = set() # don't report the same issue more than only once per line
for match in detect_all(text):
#import pdb; pdb.set_trace()
issue = match.group()
line = extract_line(text, match.start())
if (issue, line) not in seen:
self.builder.check_issue(line, lineno, issue)
seen.add((issue, line))
unknown_visit = default_visit
def visit_document(self, node):
self.lastlineno = 0
def visit_comment(self, node):
# ignore comments -- too much false positives.
# (although doing this could miss some errors;
# there were two sections "commented-out" by mistake
# in the Python docs that would not be catched)
raise nodes.SkipNode
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment