Commit e5605ba3 authored by Guido van Rossum

Many misc changes.

- Faster HTML parser derived from sgmllib's SGMLParser (Fred Gansevles).

- All manipulations of todo, done, ext, bad are done via methods, so a
derived class can override them (a sketch of such a subclass follows
this commit header).  Also moved the 'done' marking to dopage(), so
run() is much simpler.

- Added a method status() which returns a string containing the
summary counts; added a "total" count.

- Dropped the guessing of the file type before opening the document -- we
still need to check those links for validity!

- Added a subroutine to close a connection which first slurps up the
remaining data when it's an ftp URL -- apparently closing an ftp
connection without reading till the end makes it hang.

- Added -n option to skip running (only useful with -R).

- The Checker object now has an instance variable, changed, which is
set to 1 whenever its state changes.  This flag is not pickled (see
the pickling sketch below).
parent 941f70c3
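
The first bullet's extension point, sketched: since all todo/done/ext/bad
bookkeeping now goes through methods, a derived class can hook any of them.
A minimal sketch (LoggingChecker is hypothetical, not part of this commit):

    class LoggingChecker(Checker):
        # Hypothetical subclass: report each link as it is queued,
        # then defer to the base class for the actual bookkeeping.
        def newtodolink(self, url, origin):
            print "queueing", url, "(found on %s)" % origin[0]
            Checker.newtodolink(self, url, origin)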
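
The "not pickled" attributes rely on the standard __getstate__/__setstate__
pickle protocol (the diff below shows the updated __getstate__).  A
simplified sketch of the pattern, with made-up attribute names, assuming
transient state is rebuilt when a checkpoint is loaded:

    import pickle

    class CheckerSketch:
        def __init__(self):
            self.todo = {}
            self.done = {}
            self.changed = 0                # transient; never pickled
        def __getstate__(self):
            # Only durable state goes into the checkpoint file.
            return (self.todo, self.done)
        def __setstate__(self, state):
            (self.todo, self.done) = state
            self.changed = 0                # freshly loaded, nothing to save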
@@ -59,12 +59,13 @@ by the robots.txt file are reported as external URLs.
 skipped.  The size limit can be set with the -m option.
 
 - Before fetching a page, it guesses its type based on its extension.
-If it is a known extension and the type is not text/http, the page is
-not fetched.  This is a huge optimization but occasionally it means
-links can be missed.  The mimetypes.py module (also in this directory)
-has a built-in table mapping most currently known suffixes, and in
-addition attempts to read the mime.types configuration files in the
-default locations of Netscape and the NCSA HTTP daemon.
+If it is a known extension and the type is not text/html, the page is
+not fetched.  This is a huge optimization but occasionally it means
+links can be missed, and such links aren't checked for validity
+(XXX!).  The mimetypes.py module (also in this directory) has a
+built-in table mapping most currently known suffixes, and in addition
+attempts to read the mime.types configuration files in the default
+locations of Netscape and the NCSA HTTP daemon.
 
 - It only follows links indicated by <A> tags.  It doesn't follow
 links in <FORM> or <IMG> or whatever other tags might contain
@@ -83,6 +84,7 @@ Options:
 -R        -- restart from checkpoint file
 -d file   -- checkpoint filename (default %(DUMPFILE)s)
 -m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
+-n        -- reports only, no checking (use with -R)
 -q        -- quiet operation (also suppresses external links report)
 -r number -- number of links processed per round (default %(ROUNDSIZE)d)
 -v        -- verbose operation; repeating -v will increase verbosity
@@ -95,7 +97,10 @@ rooturl   -- URL to start checking
 """
 
 
-__version__ = "0.2"
+# ' Emacs bait
+
+__version__ = "0.3"
+
 
 
 import sys
@@ -108,8 +113,7 @@ import pickle
 import urllib
 import urlparse
-import htmllib
-import formatter
+import sgmllib
 import mimetypes
 import robotparser
 
 
@@ -134,9 +138,10 @@ def main():
     dumpfile = DUMPFILE
     restart = 0
     checkext = 0
+    norun = 0
 
     try:
-        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:vx')
+        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:vx')
     except getopt.error, msg:
         sys.stdout = sys.stderr
         print msg
...@@ -148,6 +153,8 @@ def main(): ...@@ -148,6 +153,8 @@ def main():
dumpfile = a dumpfile = a
if o == '-m': if o == '-m':
maxpage = string.atoi(a) maxpage = string.atoi(a)
if o == '-n':
norun = 1
if o == '-q': if o == '-q':
verbose = 0 verbose = 0
if o == '-r': if o == '-r':
@@ -157,7 +164,7 @@ def main():
         if o == '-x':
             checkext = 1
 
-    if verbose:
+    if verbose > 0:
         print AGENTNAME, "version", __version__
 
     if restart:
@@ -177,32 +184,33 @@ def main():
     for arg in args:
         c.addroot(arg)
-    if not c.todo:
-        needsave = 0
-    else:
-        needsave = 1
-    try:
-        c.run()
-    except KeyboardInterrupt:
-        if verbose > 0:
-            print "[run interrupted]"
+    if not norun:
+        try:
+            c.run()
+        except KeyboardInterrupt:
+            if verbose > 0:
+                print "[run interrupted]"
 
     try:
         c.report(checkext)
     except KeyboardInterrupt:
         if verbose > 0:
             print "[report interrupted]"
 
-    if not needsave:
+    if not c.changed:
         if verbose > 0:
             print
             print "No need to save checkpoint"
-    elif dumpfile:
+    elif not dumpfile:
+        if verbose > 0:
+            print "No dumpfile, won't save checkpoint"
+    else:
         if verbose > 0:
             print
             print "Saving checkpoint to %s ..." % dumpfile
         newfile = dumpfile + ".new"
         f = open(newfile, "wb")
         pickle.dump(c, f)
+        f.flush()
         f.close()
         try:
             os.unlink(dumpfile)
@@ -226,9 +234,11 @@ class Checker:
         self.done = {}
         self.ext = {}
         self.bad = {}
-        self.urlopener = MyURLopener()
         self.round = 0
+        # The following are not pickled:
         self.robots = {}
+        self.urlopener = MyURLopener()
+        self.changed = 0
 
     def __getstate__(self):
         return (self.roots, self.todo, self.done,
@@ -243,15 +253,15 @@ class Checker:
     def addroot(self, root):
         if root not in self.roots:
             self.roots.append(root)
-            self.todo[root] = []
             self.addrobot(root)
+            self.newintlink(root, ("<root>", root))
 
     def addrobot(self, root):
         url = urlparse.urljoin(root, "/robots.txt")
         self.robots[root] = rp = robotparser.RobotFileParser()
         if verbose > 2:
             print "Parsing", url
-            rp.debug = 1
+            rp.debug = verbose > 3
         rp.set_url(url)
         try:
             rp.read()
@@ -264,24 +274,23 @@ class Checker:
         self.round = self.round + 1
         if verbose > 0:
             print
-            print "Round", self.round,
-            print "(%d to do, %d done, %d external, %d bad)" % (
-                len(self.todo), len(self.done),
-                len(self.ext), len(self.bad))
+            print "Round", self.round, self.status()
             print
         urls = self.todo.keys()[:roundsize]
         for url in urls:
             self.dopage(url)
-            self.done[url] = self.todo[url]
-            del self.todo[url]
+
+    def status(self):
+        return "(%d total, %d to do, %d done, %d external, %d bad)" % (
+            len(self.todo)+len(self.done),
+            len(self.todo), len(self.done),
+            len(self.ext), len(self.bad))
 
     def report(self, checkext=0):
         print
         if not self.todo: print "Final",
         else: print "Interim",
-        print "Report (%d to do, %d done, %d external, %d bad)" % (
-            len(self.todo), len(self.done),
-            len(self.ext), len(self.bad))
+        print "Report", self.status()
         if verbose > 0 or checkext:
             self.report_extrefs(checkext)
         # Report errors last because the output may get truncated
@@ -313,12 +322,14 @@ class Checker:
             if verbose > 2: print "Checking", url, "..."
             try:
                 f = self.urlopener.open(url)
-                f.close()
+                safeclose(f)
                 if verbose > 3: print "OK"
+                if self.bad.has_key(url):
+                    self.setgood(url)
             except IOError, msg:
                 msg = sanitize(msg)
                 if verbose > 0: print "Error", msg
-                self.bad[url] = msg
+                self.setbad(url, msg)
 
     def report_errors(self):
         if not self.bad:
@@ -366,36 +377,51 @@ class Checker:
         else:
             print "Page  ", url
         page = self.getpage(url)
-        if not page:
-            return
-        for info in page.getlinkinfos():
-            link, rawlink = info
-            origin = url, rawlink
-            if not self.inroots(link):
-                try:
-                    self.ext[link].append(origin)
-                    if verbose > 3:
-                        print "  New ext link", link,
-                        if link != rawlink: print "(%s)" % rawlink,
-                        print
-                except KeyError:
-                    if verbose > 3:
-                        print "  Seen ext link", link,
-                        if link != rawlink: print "(%s)" % rawlink,
-                        print
-                    self.ext[link] = [origin]
-            elif self.done.has_key(link):
-                if verbose > 3:
-                    print "  Done link", link
-                self.done[link].append(origin)
-            elif self.todo.has_key(link):
-                if verbose > 3:
-                    print "  Seen todo link", link
-                self.todo[link].append(origin)
-            else:
-                if verbose > 3:
-                    print "  New todo link", link
-                self.todo[link] = [origin]
+        if page:
+            for info in page.getlinkinfos():
+                link, rawlink = info
+                origin = url, rawlink
+                if not self.inroots(link):
+                    self.newextlink(link, origin)
+                else:
+                    self.newintlink(link, origin)
+        self.markdone(url)
+
+    def newextlink(self, url, origin):
+        try:
+            self.ext[url].append(origin)
+            if verbose > 3:
+                print "  New ext link", url
+        except KeyError:
+            self.ext[url] = [origin]
+            if verbose > 3:
+                print "  Seen ext link", url
+
+    def newintlink(self, url, origin):
+        if self.done.has_key(url):
+            self.newdonelink(url, origin)
+        else:
+            self.newtodolink(url, origin)
+
+    def newdonelink(self, url, origin):
+        self.done[url].append(origin)
+        if verbose > 3:
+            print "  Done link", url
+
+    def newtodolink(self, url, origin):
+        if self.todo.has_key(url):
+            self.todo[url].append(origin)
+            if verbose > 3:
+                print "  Seen todo link", url
+        else:
+            self.todo[url] = [origin]
+            if verbose > 3:
+                print "  New todo link", url
+
+    def markdone(self, url):
+        self.done[url] = self.todo[url]
+        del self.todo[url]
+        self.changed = 1
 
     def inroots(self, url):
         for root in self.roots:
@@ -404,15 +430,6 @@ class Checker:
         return 0
 
     def getpage(self, url):
-        ctype, encoding = mimetypes.guess_type(url)
-        if encoding:
-            if verbose > 2:
-                print "  Won't bother, URL suggests encoding %s" % `encoding`
-            return None
-        if ctype and ctype != 'text/html':
-            if verbose > 2:
-                print "  Won't bother, URL suggests mime type %s" % `ctype`
-            return None
         try:
             f = self.urlopener.open(url)
         except IOError, msg:
@@ -421,26 +438,43 @@ class Checker:
                 print "Error ", msg
                 if verbose > 0:
                     show(" HREF ", url, " from", self.todo[url])
-            self.bad[url] = msg
+            self.setbad(url, msg)
             return None
 
         nurl = f.geturl()
        info = f.info()
         if info.has_key('content-type'):
             ctype = string.lower(info['content-type'])
+        else:
+            ctype = None
         if nurl != url:
             if verbose > 1:
                 print " Redirected to", nurl
         if not ctype:
             ctype, encoding = mimetypes.guess_type(nurl)
         if ctype != 'text/html':
-            f.close()
-            if verbose > 2:
+            safeclose(f)
+            if verbose > 1:
                 print " Not HTML, mime type", ctype
             return None
         text = f.read()
         f.close()
         return Page(text, nurl)
 
+    def setgood(self, url):
+        if self.bad.has_key(url):
+            del self.bad[url]
+            self.changed = 1
+            if verbose > 0:
+                print "(Clear previously seen error)"
+
+    def setbad(self, url, msg):
+        if self.bad.has_key(url) and self.bad[url] == msg:
+            if verbose > 0:
+                print "(Seen this error before)"
+            return
+        self.bad[url] = msg
+        self.changed = 1
 
 
 class Page:
@@ -457,7 +491,7 @@ class Page:
             return []
         if verbose > 2:
             print " Parsing", self.url, "(%d bytes)" % size
-        parser = MyHTMLParser(formatter.NullFormatter())
+        parser = MyHTMLParser()
         parser.feed(self.text)
         parser.close()
         rawlinks = parser.getlinks()
@@ -519,28 +553,32 @@ class MyURLopener(urllib.FancyURLopener):
         return urllib.FancyURLopener.open_file(self, path)
 
 
-class MyHTMLParser(htmllib.HTMLParser):
+class MyHTMLParser(sgmllib.SGMLParser):
 
-    def __init__(*args):
-        self = args[0]
+    def __init__(self):
         self.base = None
-        self.links = []
-        apply(htmllib.HTMLParser.__init__, args)
+        self.links = {}
+        sgmllib.SGMLParser.__init__(self)
 
     def start_a(self, attributes):
         for name, value in attributes:
-            if name == 'href' and value and value not in self.links:
-                self.links.append(string.strip(value))
+            if name == 'href':
+                if value: value = string.strip(value)
+                if value: self.links[value] = None
+                return # match only first href
 
     def do_base(self, attributes):
         for name, value in attributes:
-            if name == 'href' and value:
-                if verbose > 1:
-                    print "  Base", value
-                self.base = value
+            if name == 'href':
+                if value: value = string.strip(value)
+                if value:
+                    if verbose > 1:
+                        print "  Base", value
+                    self.base = value
+                return # match only first href
 
     def getlinks(self):
-        return self.links
+        return self.links.keys()
 
     def getbase(self):
         return self.base
@@ -569,5 +607,14 @@ def sanitize(msg):
     return msg
 
 
+def safeclose(f):
+    url = f.geturl()
+    if url[:4] == 'ftp:' or url[:7] == 'file://':
+        # Apparently ftp connections don't like to be closed
+        # prematurely...
+        text = f.read()
+    f.close()
+
+
 if __name__ == '__main__':
     main()
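
A hypothetical session showing how the new -n option combines with
checkpoints (the file name and URL are made up): run a full check once,
then regenerate the reports from the pickle without re-fetching anything:

    python webchecker.py -d check.pickle http://www.python.org/
    python webchecker.py -R -n -d check.pickle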