Commit d22368ff authored by Martin v. Löwis

Patch #499513: use readline() instead of readlines(). Removed the unnecessary redirection limit code which is already in FancyURLopener.
parent 73e61873
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
You can choose between two licenses when using this package: You can choose between two licenses when using this package:
1) GNU GPLv2 1) GNU GPLv2
2) PYTHON 2.0 OPEN SOURCE LICENSE 2) PSF license for Python 2.2
The robots.txt Exclusion Protocol is implemented as specified in The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
...@@ -42,7 +42,11 @@ class RobotFileParser: ...@@ -42,7 +42,11 @@ class RobotFileParser:
def read(self): def read(self):
opener = URLopener() opener = URLopener()
f = opener.open(self.url) f = opener.open(self.url)
lines = f.readlines() lines = []
line = f.readline()
while line:
lines.append(line.strip())
line = f.readline()
self.errcode = opener.errcode self.errcode = opener.errcode
if self.errcode == 401 or self.errcode == 403: if self.errcode == 401 or self.errcode == 403:
self.disallow_all = 1 self.disallow_all = 1
...@@ -63,7 +67,6 @@ class RobotFileParser: ...@@ -63,7 +67,6 @@ class RobotFileParser:
entry = Entry() entry = Entry()
for line in lines: for line in lines:
line = line.strip()
linenumber = linenumber + 1 linenumber = linenumber + 1
if not line: if not line:
if state==1: if state==1:
...@@ -209,25 +212,12 @@ class URLopener(urllib.FancyURLopener): ...@@ -209,25 +212,12 @@ class URLopener(urllib.FancyURLopener):
def __init__(self, *args): def __init__(self, *args):
apply(urllib.FancyURLopener.__init__, (self,) + args) apply(urllib.FancyURLopener.__init__, (self,) + args)
self.errcode = 200 self.errcode = 200
self.tries = 0
self.maxtries = 10
def http_error_default(self, url, fp, errcode, errmsg, headers): def http_error_default(self, url, fp, errcode, errmsg, headers):
self.errcode = errcode self.errcode = errcode
return urllib.FancyURLopener.http_error_default(self, url, fp, errcode, return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
errmsg, headers) errmsg, headers)
def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
self.tries += 1
if self.tries >= self.maxtries:
return self.http_error_default(url, fp, 500,
"Internal Server Error: Redirect Recursion",
headers)
result = urllib.FancyURLopener.http_error_302(self, url, fp, errcode,
errmsg, headers, data)
self.tries = 0
return result
def _check(a,b): def _check(a,b):
if not b: if not b:
ac = "access denied" ac = "access denied"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment