Commit 6e44ef95 authored by Guido van Rossum

Added robots.txt support, using Skip Montanaro's parser.

Fixed occasional inclusion of unpicklable objects (Message in errors).
Changed indent of a few messages.
parent 6b22409b
@@ -50,8 +50,13 @@ overwritten, but all work done in the current run is lost.
Miscellaneous:
- Webchecker honors the "robots.txt" convention. Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker". URLs that are disallowed
by the robots.txt file are reported as external URLs.
- Because the HTML parser is a bit slow, very large HTML files are
skipped. The size limit can be set with the -m option.
- Before fetching a page, it guesses its type based on its extension.
If it is a known extension and the type is not text/http, the page is
@@ -103,6 +108,7 @@ import htmllib
import formatter
import mimetypes
import robotparser
# Tunable parameters
@@ -110,6 +116,7 @@ DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
MAXPAGE = 50000 # Ignore files bigger than this
ROUNDSIZE = 50 # Number of links processed per round
DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
AGENTNAME = "webchecker" # Agent name for robots.txt parser
# Global variables
@@ -208,11 +215,32 @@ class Checker:
self.bad = {}
self.urlopener = MyURLopener()
self.round = 0
self.robots = {}
def __getstate__(self):
return (self.roots, self.todo, self.done,
self.ext, self.bad, self.round)
def __setstate__(self, state):
(self.roots, self.todo, self.done,
self.ext, self.bad, self.round) = state
for root in self.roots:
self.addrobot(root)
def addroot(self, root):
if root not in self.roots:
self.roots.append(root)
self.todo[root] = []
self.addrobot(root)
def addrobot(self, root):
self.robots[root] = rp = robotparser.RobotFileParser()
if verbose > 3:
print "Parsing robots.txt file"
rp.debug = 1
url = urlparse.urljoin(root, "/robots.txt")
rp.set_url(url)
rp.read()
def run(self):
while self.todo:
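The __getstate__/__setstate__ pair added above keeps self.robots out of the pickled checkpoint and rebuilds it through addrobot when a dump is reloaded, so the robots.txt data is re-fetched rather than stored. Below is a minimal sketch of that pattern in the same old-style Python as the module; the Tracker class and its attribute names are hypothetical, standing in for any object whose helper state is better recreated than pickled.

import pickle

class Tracker:
    # Hypothetical example of the checkpoint pattern used above:
    # plain data goes into the pickle, self.handles is rebuilt on load.
    def __init__(self, names):
        self.names = names
        self.handles = {}
        for name in names:
            self.addhandle(name)

    def addhandle(self, name):
        # Stand-in for state worth recreating on restore (an open file,
        # a parsed robots.txt, a network connection, ...).
        self.handles[name] = open(name, "a")

    def __getstate__(self):
        return (self.names,)              # only the plain data is pickled

    def __setstate__(self, state):
        (self.names,) = state
        self.handles = {}
        for name in self.names:           # rebuild what was left out
            self.addhandle(name)

t = pickle.loads(pickle.dumps(Tracker(["a.log", "b.log"])))
print t.handles.keys()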
@@ -332,7 +360,7 @@ class Checker:
def inroots(self, url):
for root in self.roots:
if url[:len(root)] == root:
return 1
return self.robots[root].can_fetch(AGENTNAME, url)
return 0
def getpage(self, url):
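Taken together, addrobot and the new inroots check drive Skip Montanaro's robotparser module: one RobotFileParser per root, pointed at the root's /robots.txt, with every candidate URL filtered through can_fetch under the hardwired AGENTNAME. A standalone sketch of that flow, in the same Python dialect as the module (the root URL is only an example, not taken from the diff):

import robotparser
import urlparse

AGENTNAME = "webchecker"
root = "http://www.python.org/"          # example root

rp = robotparser.RobotFileParser()
rp.set_url(urlparse.urljoin(root, "/robots.txt"))
rp.read()                                # fetch and parse the robots.txt file

for url in (root, urlparse.urljoin(root, "/private/page.html")):
    if rp.can_fetch(AGENTNAME, url):
        print "allowed:   ", url
    else:
        print "disallowed:", url         # webchecker reports these as external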
@@ -348,6 +376,13 @@ class Checker:
try:
f = self.urlopener.open(url)
except IOError, msg:
if (type(msg) == TupleType and
len(msg) >= 4 and
msg[0] == 'http error' and
type(msg[3]) == InstanceType):
# Remove the Message instance -- it may contain
# a file object which prevents pickling.
msg = msg[:3] + msg[4:]
if verbose > 0:
print "Error ", msg
if verbose > 0:
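The msg tuple trimmed above comes out of urllib's HTTP error handling; its fourth element is a header object (a mimetools.Message-style instance) that usually still references the response's open file, and pickle refuses to serialize anything containing an open file. A small sketch of the failure and of the fix applied here, using a hypothetical Holder class as a stand-in for that Message instance:

import pickle

class Holder:
    # Stand-in for the Message instance: it keeps a reference to an
    # open file, which the pickle module cannot serialize.
    def __init__(self, fp):
        self.fp = fp

msg = ('http error', 404, 'Not Found', Holder(open("dummy.txt", "w")))

try:
    pickle.dumps(msg)
except TypeError, err:
    print "cannot pickle the full tuple:", err

# Dropping the offending element, as the diff does, makes it picklable.
pickle.dumps(msg[:3] + msg[4:])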
@@ -360,7 +395,7 @@ class Checker:
ctype = string.lower(info['content-type'])
if nurl != url:
if verbose > 1:
print "Redirected to", nurl
print " Redirected to", nurl
if not ctype:
ctype, encoding = mimetypes.guess_type(nurl)
if ctype != 'text/html':
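The ctype fallback just above uses mimetypes.guess_type, the same extension-based guess described in the README hunk near the top of this diff. For reference, guess_type maps a filename to a (content-type, encoding) pair; the filenames below are made up and the commented results are what the standard type table typically returns:

import mimetypes

print mimetypes.guess_type("index.html")       # ('text/html', None)
print mimetypes.guess_type("photo.gif")        # ('image/gif', None)
print mimetypes.guess_type("archive.tar.gz")   # ('application/x-tar', 'gzip')
print mimetypes.guess_type("notes.foo")        # (None, None) -- unknown extension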