Commit e5605ba3 authored by Guido van Rossum

Many misc changes.

- Faster HTML parser derived from sgmllib's SGMLParser (Fred Gansevles).
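
  A condensed sketch of the new parser, from the diff below (do_base()
  and getbase() omitted here): it subclasses sgmllib.SGMLParser
  directly, no formatter needed, and collects hrefs in a dictionary so
  duplicates are weeded out for free:

      class MyHTMLParser(sgmllib.SGMLParser):

          def __init__(self):
              self.base = None
              self.links = {}          # href -> None; the dict acts as a set
              sgmllib.SGMLParser.__init__(self)

          def start_a(self, attributes):
              # Record the first href attribute of each <A> tag.
              for name, value in attributes:
                  if name == 'href':
                      if value: value = string.strip(value)
                      if value: self.links[value] = None
                      return

          def getlinks(self):
              return self.links.keys()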

- All manipulations of todo, done, ext, bad are done via methods, so a
derived class can override them.  Also moved the 'done' marking to
dopage(), so run() is much simpler.
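
  For reference, two of the new hook methods, excerpted from the diff
  below:

      def newintlink(self, url, origin):
          # Route an internal link to the done or todo bookkeeping.
          if self.done.has_key(url):
              self.newdonelink(url, origin)
          else:
              self.newtodolink(url, origin)

      def markdone(self, url):
          # Move a URL from todo to done (now called from dopage()).
          self.done[url] = self.todo[url]
          del self.todo[url]
          self.changed = 1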

- Added a method status() which returns a string containing the
summary counts; added a "total" count.
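
  The new method, copied from the diff below; "total" is todo plus
  done:

      def status(self):
          return "(%d total, %d to do, %d done, %d external, %d bad)" % (
              len(self.todo)+len(self.done),
              len(self.todo), len(self.done),
              len(self.ext), len(self.bad))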

- Dropped the guessing of the file type before opening the document --
we still need to check those links for validity!
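
  The code removed from getpage() (visible further down in the diff)
  was this pre-fetch guess:

      ctype, encoding = mimetypes.guess_type(url)
      if encoding:
          if verbose > 2:
              print " Won't bother, URL suggests encoding %s" % `encoding`
          return None
      if ctype and ctype != 'text/html':
          if verbose > 2:
              print " Won't bother, URL suggests mime type %s" % `ctype`
          return None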

- Added a helper, safeclose(), which first slurps up the remaining
data when it's an ftp URL before closing the connection -- apparently
closing an ftp connection without reading till the end makes it hang.
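
  The helper, as added at the bottom of the script:

      def safeclose(f):
          url = f.geturl()
          if url[:4] == 'ftp:' or url[:7] == 'file://':
              # Apparently ftp connections don't like to be closed
              # prematurely...
              text = f.read()
          f.close()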

- Added -n option to skip running (only useful with -R).
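
  A hypothetical invocation (the checkpoint filename is only an
  example): reprint the reports from a saved checkpoint without
  checking any further links:

      python webchecker.py -R -d webchecker.pickle -n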

- The Checker object now has an instance variable, changed, which is
set to 1 whenever its state is modified.  This variable is not
pickled.
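
  Condensed from the diff below: __getstate__() leaves the flag out of
  the pickle, main() consults it before saving a checkpoint, and every
  mutating method sets it, e.g.:

      def setbad(self, url, msg):
          if self.bad.has_key(url) and self.bad[url] == msg:
              if verbose > 0:
                  print "(Seen this error before)"
              return
          self.bad[url] = msg
          self.changed = 1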
parent 941f70c3
@@ -59,12 +59,13 @@ by the robots.txt file are reported as external URLs.
skipped. The size limit can be set with the -m option.
- Before fetching a page, it guesses its type based on its extension.
If it is a known extension and the type is not text/http, the page is
If it is a known extension and the type is not text/html, the page is
not fetched. This is a huge optimization but occasionally it means
links can be missed. The mimetypes.py module (also in this directory)
has a built-in table mapping most currently known suffixes, and in
addition attempts to read the mime.types configuration files in the
default locations of Netscape and the NCSA HTTP daemon.
links can be missed, and such links aren't checked for validity
(XXX!). The mimetypes.py module (also in this directory) has a
built-in table mapping most currently known suffixes, and in addition
attempts to read the mime.types configuration files in the default
locations of Netscape and the NCSA HTTP daemon.
- It only follows links indicated by <A> tags. It doesn't follow
links in <FORM> or <IMG> or whatever other tags might contain
@@ -83,6 +84,7 @@ Options:
-R -- restart from checkpoint file
-d file -- checkpoint filename (default %(DUMPFILE)s)
-m bytes -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n -- reports only, no checking (use with -R)
-q -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-v -- verbose operation; repeating -v will increase verbosity
@@ -95,7 +97,10 @@ rooturl -- URL to start checking
"""
__version__ = "0.2"
# ' Emacs bait
__version__ = "0.3"
import sys
@@ -108,8 +113,7 @@ import pickle
import urllib
import urlparse
import htmllib
import formatter
import sgmllib
import mimetypes
import robotparser
@@ -134,9 +138,10 @@ def main():
dumpfile = DUMPFILE
restart = 0
checkext = 0
norun = 0
try:
opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:vx')
opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:vx')
except getopt.error, msg:
sys.stdout = sys.stderr
print msg
@@ -148,6 +153,8 @@ def main():
dumpfile = a
if o == '-m':
maxpage = string.atoi(a)
if o == '-n':
norun = 1
if o == '-q':
verbose = 0
if o == '-r':
@@ -157,7 +164,7 @@ def main():
if o == '-x':
checkext = 1
if verbose:
if verbose > 0:
print AGENTNAME, "version", __version__
if restart:
@@ -177,32 +184,33 @@ def main():
for arg in args:
c.addroot(arg)
if not c.todo:
needsave = 0
else:
needsave = 1
try:
c.run()
except KeyboardInterrupt:
if verbose > 0:
print "[run interrupted]"
if not norun:
try:
c.run()
except KeyboardInterrupt:
if verbose > 0:
print "[run interrupted]"
try:
c.report(checkext)
except KeyboardInterrupt:
if verbose > 0:
print "[report interrupted]"
if not needsave:
if not c.changed:
if verbose > 0:
print
print "No need to save checkpoint"
elif dumpfile:
elif not dumpfile:
if verbose > 0:
print "No dumpfile, won't save checkpoint"
else:
if verbose > 0:
print
print "Saving checkpoint to %s ..." % dumpfile
newfile = dumpfile + ".new"
f = open(newfile, "wb")
pickle.dump(c, f)
f.flush()
f.close()
try:
os.unlink(dumpfile)
@@ -226,9 +234,11 @@ class Checker:
self.done = {}
self.ext = {}
self.bad = {}
self.urlopener = MyURLopener()
self.round = 0
# The following are not pickled:
self.robots = {}
self.urlopener = MyURLopener()
self.changed = 0
def __getstate__(self):
return (self.roots, self.todo, self.done,
@@ -243,15 +253,15 @@ class Checker:
def addroot(self, root):
if root not in self.roots:
self.roots.append(root)
self.todo[root] = []
self.addrobot(root)
self.newintlink(root, ("<root>", root))
def addrobot(self, root):
url = urlparse.urljoin(root, "/robots.txt")
self.robots[root] = rp = robotparser.RobotFileParser()
if verbose > 2:
print "Parsing", url
rp.debug = 1
rp.debug = verbose > 3
rp.set_url(url)
try:
rp.read()
@@ -264,24 +274,23 @@ class Checker:
self.round = self.round + 1
if verbose > 0:
print
print "Round", self.round,
print "(%d to do, %d done, %d external, %d bad)" % (
len(self.todo), len(self.done),
len(self.ext), len(self.bad))
print
print "Round", self.round, self.status()
print
urls = self.todo.keys()[:roundsize]
for url in urls:
self.dopage(url)
self.done[url] = self.todo[url]
del self.todo[url]
def status(self):
return "(%d total, %d to do, %d done, %d external, %d bad)" % (
len(self.todo)+len(self.done),
len(self.todo), len(self.done),
len(self.ext), len(self.bad))
def report(self, checkext=0):
print
if not self.todo: print "Final",
else: print "Interim",
print "Report (%d to do, %d done, %d external, %d bad)" % (
len(self.todo), len(self.done),
len(self.ext), len(self.bad))
print "Report", self.status()
if verbose > 0 or checkext:
self.report_extrefs(checkext)
# Report errors last because the output may get truncated
@@ -313,12 +322,14 @@ class Checker:
if verbose > 2: print "Checking", url, "..."
try:
f = self.urlopener.open(url)
f.close()
safeclose(f)
if verbose > 3: print "OK"
if self.bad.has_key(url):
self.setgood(url)
except IOError, msg:
msg = sanitize(msg)
if verbose > 0: print "Error", msg
self.bad[url] = msg
self.setbad(url, msg)
def report_errors(self):
if not self.bad:
@@ -366,36 +377,51 @@ class Checker:
else:
print "Page ", url
page = self.getpage(url)
if not page:
return
for info in page.getlinkinfos():
link, rawlink = info
origin = url, rawlink
if not self.inroots(link):
try:
self.ext[link].append(origin)
if verbose > 3:
print " New ext link", link,
if link != rawlink: print "(%s)" % rawlink,
print
except KeyError:
if verbose > 3:
print " Seen ext link", link,
if link != rawlink: print "(%s)" % rawlink,
print
self.ext[link] = [origin]
elif self.done.has_key(link):
if verbose > 3:
print " Done link", link
self.done[link].append(origin)
elif self.todo.has_key(link):
if verbose > 3:
print " Seen todo link", link
self.todo[link].append(origin)
else:
if verbose > 3:
print " New todo link", link
self.todo[link] = [origin]
if page:
for info in page.getlinkinfos():
link, rawlink = info
origin = url, rawlink
if not self.inroots(link):
self.newextlink(link, origin)
else:
self.newintlink(link, origin)
self.markdone(url)
def newextlink(self, url, origin):
try:
self.ext[url].append(origin)
if verbose > 3:
print " New ext link", url
except KeyError:
self.ext[url] = [origin]
if verbose > 3:
print " Seen ext link", url
def newintlink(self, url, origin):
if self.done.has_key(url):
self.newdonelink(url, origin)
else:
self.newtodolink(url, origin)
def newdonelink(self, url, origin):
self.done[url].append(origin)
if verbose > 3:
print " Done link", url
def newtodolink(self, url, origin):
if self.todo.has_key(url):
self.todo[url].append(origin)
if verbose > 3:
print " Seen todo link", url
else:
self.todo[url] = [origin]
if verbose > 3:
print " New todo link", url
def markdone(self, url):
self.done[url] = self.todo[url]
del self.todo[url]
self.changed = 1
def inroots(self, url):
for root in self.roots:
@@ -404,15 +430,6 @@ class Checker:
return 0
def getpage(self, url):
ctype, encoding = mimetypes.guess_type(url)
if encoding:
if verbose > 2:
print " Won't bother, URL suggests encoding %s" % `encoding`
return None
if ctype and ctype != 'text/html':
if verbose > 2:
print " Won't bother, URL suggests mime type %s" % `ctype`
return None
try:
f = self.urlopener.open(url)
except IOError, msg:
@@ -421,26 +438,43 @@ class Checker:
print "Error ", msg
if verbose > 0:
show(" HREF ", url, " from", self.todo[url])
self.bad[url] = msg
self.setbad(url, msg)
return None
nurl = f.geturl()
info = f.info()
if info.has_key('content-type'):
ctype = string.lower(info['content-type'])
else:
ctype = None
if nurl != url:
if verbose > 1:
print " Redirected to", nurl
if not ctype:
ctype, encoding = mimetypes.guess_type(nurl)
if not ctype:
ctype, encoding = mimetypes.guess_type(nurl)
if ctype != 'text/html':
f.close()
if verbose > 2:
print " Not HTML, mime type", ctype
safeclose(f)
if verbose > 1:
print " Not HTML, mime type", ctype
return None
text = f.read()
f.close()
return Page(text, nurl)
def setgood(self, url):
if self.bad.has_key(url):
del self.bad[url]
self.changed = 1
if verbose > 0:
print "(Clear previously seen error)"
def setbad(self, url, msg):
if self.bad.has_key(url) and self.bad[url] == msg:
if verbose > 0:
print "(Seen this error before)"
return
self.bad[url] = msg
self.changed = 1
class Page:
@@ -457,7 +491,7 @@ class Page:
return []
if verbose > 2:
print " Parsing", self.url, "(%d bytes)" % size
parser = MyHTMLParser(formatter.NullFormatter())
parser = MyHTMLParser()
parser.feed(self.text)
parser.close()
rawlinks = parser.getlinks()
@@ -519,28 +553,32 @@ class MyURLopener(urllib.FancyURLopener):
return urllib.FancyURLopener.open_file(self, path)
class MyHTMLParser(htmllib.HTMLParser):
class MyHTMLParser(sgmllib.SGMLParser):
def __init__(*args):
self = args[0]
def __init__(self):
self.base = None
self.links = []
apply(htmllib.HTMLParser.__init__, args)
self.links = {}
sgmllib.SGMLParser.__init__ (self)
def start_a(self, attributes):
for name, value in attributes:
if name == 'href' and value and value not in self.links:
self.links.append(string.strip(value))
if name == 'href':
if value: value = string.strip(value)
if value: self.links[value] = None
return # match only first href
def do_base(self, attributes):
for name, value in attributes:
if name == 'href' and value:
if verbose > 1:
print " Base", value
self.base = value
if name == 'href':
if value: value = string.strip(value)
if value:
if verbose > 1:
print " Base", value
self.base = value
return # match only first href
def getlinks(self):
return self.links
return self.links.keys()
def getbase(self):
return self.base
@@ -569,5 +607,14 @@ def sanitize(msg):
return msg
def safeclose(f):
url = f.geturl()
if url[:4] == 'ftp:' or url[:7] == 'file://':
# Apparently ftp connections don't like to be closed
# prematurely...
text = f.read()
f.close()
if __name__ == '__main__':
main()