Commit 2237b73b authored by Guido van Rossum

Several changes:

- Change the code that looks for robots.txt to always look in /, even
if the "root" path is somewhere deep down below.

- Add link processing in <AREA> tags.

- Change safeclose() to avoid crashing when the file has no geturl()
method.
parent dc0f00ad
...@@ -251,11 +251,21 @@ class Checker: ...@@ -251,11 +251,21 @@ class Checker:
def addroot(self, root): def addroot(self, root):
if root not in self.roots: if root not in self.roots:
self.roots.append(root) troot = root
scheme, netloc, path, params, query, fragment = \
urlparse.urlparse(root)
i = string.rfind(path, "/") + 1
if 0 < i < len(path):
path = path[:i]
troot = urlparse.urlunparse((scheme, netloc, path,
params, query, fragment))
self.roots.append(troot)
self.addrobot(root) self.addrobot(root)
self.newlink(root, ("<root>", root)) self.newlink(root, ("<root>", root))
def addrobot(self, root): def addrobot(self, root):
root = urlparse.urljoin(root, "/")
if self.robots.has_key(root): return
url = urlparse.urljoin(root, "/robots.txt") url = urlparse.urljoin(root, "/robots.txt")
self.robots[root] = rp = robotparser.RobotFileParser() self.robots[root] = rp = robotparser.RobotFileParser()
if verbose > 2: if verbose > 2:
...@@ -357,6 +367,7 @@ class Checker: ...@@ -357,6 +367,7 @@ class Checker:
def inroots(self, url): def inroots(self, url):
for root in self.roots: for root in self.roots:
if url[:len(root)] == root: if url[:len(root)] == root:
root = urlparse.urljoin(root, "/")
return self.robots[root].can_fetch(AGENTNAME, url) return self.robots[root].can_fetch(AGENTNAME, url)
return 0 return 0
...@@ -528,6 +539,9 @@ class MyHTMLParser(sgmllib.SGMLParser): ...@@ -528,6 +539,9 @@ class MyHTMLParser(sgmllib.SGMLParser):
def end_a(self): pass def end_a(self): pass
def do_area(self, attributes):
self.link_attr(attributes, 'href')
def do_img(self, attributes): def do_img(self, attributes):
self.link_attr(attributes, 'src', 'lowsrc') self.link_attr(attributes, 'src', 'lowsrc')
...@@ -580,11 +594,15 @@ def sanitize(msg): ...@@ -580,11 +594,15 @@ def sanitize(msg):
def safeclose(f): def safeclose(f):
url = f.geturl() try:
if url[:4] == 'ftp:' or url[:7] == 'file://': url = f.geturl()
# Apparently ftp connections don't like to be closed except AttributeError:
# prematurely... pass
text = f.read() else:
if url[:4] == 'ftp:' or url[:7] == 'file://':
# Apparently ftp connections don't like to be closed
# prematurely...
text = f.read()
f.close() f.close()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment