Commit 0b7d4621 authored by Guido van Rossum

Added -x option to check external links. Slooooow!

parent 8a2df4d8
...@@ -70,9 +70,11 @@ default locations of Netscape and the NCSA HTTP daemon. ...@@ -70,9 +70,11 @@ default locations of Netscape and the NCSA HTTP daemon.
links in <FORM> or <IMG> or whatever other tags might contain links in <FORM> or <IMG> or whatever other tags might contain
hyperlinks. It does honor the <BASE> tag. hyperlinks. It does honor the <BASE> tag.
- It could be argued that it should also check external links for - Checking external links is not done by default; use -x to enable
validity. This is true, but it is more error-prone. I think I will this feature. This is done because checking external links usually
make this an option in the future. takes a lot of time. When enabled, this check is executed during the
report generation phase (so -x is ignored when -q is specified). Even
when -x is enabled, only ``http:'' URLs are checked.
Usage: webchecker.py [option] ... [rooturl] ... Usage: webchecker.py [option] ... [rooturl] ...
...@@ -85,6 +87,7 @@ Options: ...@@ -85,6 +87,7 @@ Options:
-q -- quiet operation (also suppresses external links report) -q -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d) -r number -- number of links processed per round (default %(ROUNDSIZE)d)
-v -- verbose operation; repeating -v will increase verbosity -v -- verbose operation; repeating -v will increase verbosity
-x -- check external links (during report phase)
Arguments: Arguments:
...@@ -131,9 +134,10 @@ def main(): ...@@ -131,9 +134,10 @@ def main():
global verbose, maxpage, roundsize global verbose, maxpage, roundsize
dumpfile = DUMPFILE dumpfile = DUMPFILE
restart = 0 restart = 0
checkext = 0
try: try:
opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:v') opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:vx')
except getopt.error, msg: except getopt.error, msg:
sys.stdout = sys.stderr sys.stdout = sys.stderr
print msg print msg
...@@ -151,6 +155,8 @@ def main(): ...@@ -151,6 +155,8 @@ def main():
roundsize = string.atoi(a) roundsize = string.atoi(a)
if o == '-v': if o == '-v':
verbose = verbose + 1 verbose = verbose + 1
if o == '-x':
checkext = 1
if verbose: if verbose:
print AGENTNAME, "version", __version__ print AGENTNAME, "version", __version__
...@@ -180,8 +186,12 @@ def main(): ...@@ -180,8 +186,12 @@ def main():
c.run() c.run()
except KeyboardInterrupt: except KeyboardInterrupt:
if verbose > 0: if verbose > 0:
print "[interrupted]" print "[run interrupted]"
c.report() try:
c.report(checkext)
except KeyboardInterrupt:
if verbose > 0:
print "[report interrupted]"
if not needsave: if not needsave:
if verbose > 0: if verbose > 0:
print print
...@@ -266,7 +276,7 @@ class Checker: ...@@ -266,7 +276,7 @@ class Checker:
self.done[url] = self.todo[url] self.done[url] = self.todo[url]
del self.todo[url] del self.todo[url]
def report(self): def report(self, checkext=0):
print print
if not self.todo: print "Final", if not self.todo: print "Final",
else: print "Interim", else: print "Interim",
...@@ -274,22 +284,34 @@ class Checker: ...@@ -274,22 +284,34 @@ class Checker:
len(self.todo), len(self.done), len(self.todo), len(self.done),
len(self.ext), len(self.bad)) len(self.ext), len(self.bad))
if verbose > 0: if verbose > 0:
self.report_extrefs() self.report_extrefs(checkext)
# Report errors last because the output may get truncated # Report errors last because the output may get truncated
self.report_errors() self.report_errors()
def report_extrefs(self): def report_extrefs(self, checkext=0):
if not self.ext: if not self.ext:
print print
print "No external URLs" print "No external URLs"
return return
print print
print "External URLs:" if checkext:
print "External URLs (checking validity):"
else:
print "External URLs (not checked):"
print print
urls = self.ext.keys() urls = self.ext.keys()
urls.sort() urls.sort()
for url in urls: for url in urls:
show("HREF ", url, " from", self.ext[url]) show("HREF ", url, " from", self.ext[url])
if not checkext:
continue
if verbose > 2: print "Checking", url, "..."
try:
f = self.urlopener.open(url)
f.close()
if verbose > 3: print "OK"
except IOError, msg:
print "Error:", msg
def report_errors(self): def report_errors(self):
if not self.bad: if not self.bad:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment