Commit a95f8d67 authored by Guido van Rossum

Set proper User-agent header (Python-webchecker/<version>).

When -x is combined with -q, still do the checking, but don't print
the errors in this phase -- they are reported by report_errors().
parent 4a834570
...@@ -73,8 +73,7 @@ hyperlinks. It does honor the <BASE> tag. ...@@ -73,8 +73,7 @@ hyperlinks. It does honor the <BASE> tag.
- Checking external links is not done by default; use -x to enable - Checking external links is not done by default; use -x to enable
this feature. This is done because checking external links usually this feature. This is done because checking external links usually
takes a lot of time. When enabled, this check is executed during the takes a lot of time. When enabled, this check is executed during the
report generation phase (so -x is ignored when -q is specified). Even report generation phase (even when the report is silent).
when -x is enabled, only ``http:'' URLs are checked.
Usage: webchecker.py [option] ... [rooturl] ... Usage: webchecker.py [option] ... [rooturl] ...
...@@ -96,7 +95,7 @@ rooturl -- URL to start checking ...@@ -96,7 +95,7 @@ rooturl -- URL to start checking
""" """
__version__ = "0.1" __version__ = "0.2"
import sys import sys
...@@ -283,26 +282,29 @@ class Checker: ...@@ -283,26 +282,29 @@ class Checker:
print "Report (%d to do, %d done, %d external, %d bad)" % ( print "Report (%d to do, %d done, %d external, %d bad)" % (
len(self.todo), len(self.done), len(self.todo), len(self.done),
len(self.ext), len(self.bad)) len(self.ext), len(self.bad))
if verbose > 0: if verbose > 0 or checkext:
self.report_extrefs(checkext) self.report_extrefs(checkext)
# Report errors last because the output may get truncated # Report errors last because the output may get truncated
self.report_errors() self.report_errors()
def report_extrefs(self, checkext=0): def report_extrefs(self, checkext=0):
if not self.ext: if not self.ext:
print if verbose > 0:
print "No external URLs" print
print "No external URLs"
return return
print if verbose > 0:
if checkext: print
print "External URLs (checking validity):" if checkext:
else: print "External URLs (checking validity):"
print "External URLs (not checked):" else:
print print "External URLs (not checked):"
print
urls = self.ext.keys() urls = self.ext.keys()
urls.sort() urls.sort()
for url in urls: for url in urls:
show("HREF ", url, " from", self.ext[url]) if verbose > 0:
show("HREF ", url, " from", self.ext[url])
if not checkext: if not checkext:
continue continue
if url[:7] == 'mailto:': if url[:7] == 'mailto:':
...@@ -315,7 +317,7 @@ class Checker: ...@@ -315,7 +317,7 @@ class Checker:
if verbose > 3: print "OK" if verbose > 3: print "OK"
except IOError, msg: except IOError, msg:
msg = sanitize(msg) msg = sanitize(msg)
print "Error", msg if verbose > 0: print "Error", msg
self.bad[url] = msg self.bad[url] = msg
def report_errors(self): def report_errors(self):
...@@ -488,6 +490,11 @@ class MyURLopener(urllib.FancyURLopener): ...@@ -488,6 +490,11 @@ class MyURLopener(urllib.FancyURLopener):
http_error_default = urllib.URLopener.http_error_default http_error_default = urllib.URLopener.http_error_default
def __init__(*args):
self = args[0]
apply(urllib.FancyURLopener.__init__, args)
self.addheaders = [('User-agent', 'Python-webchecker/%s' % __version__)]
def open_file(self, url): def open_file(self, url):
path = urllib.url2pathname(urllib.unquote(url)) path = urllib.url2pathname(urllib.unquote(url))
if path[-1] != os.sep: if path[-1] != os.sep:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment