Commit 663f6c2a authored by Skip Montanaro's avatar Skip Montanaro

rewrite of robotparser.py by Bastian Kleineidam. Closes patch 102229.

parent a5d23a19
""" robotparser.py
Copyright (C) 2000 Bastian Kleineidam
Robots.txt file parser class. Accepts a list of lines or robots.txt URL as
input, builds a set of rules from that list, then answers questions about
fetchability of other URLs.
You can choose between two licenses when using this package:
1) GNU GPLv2
2) PYTHON 2.0 OPEN SOURCE LICENSE
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import re,string,urlparse,urllib
class RobotFileParser:
debug = 0
def __init__(self):
self.rules = {}
self.debug = 0
self.url = ''
def _debug(msg):
if debug: print msg
class RobotFileParser:
    """Parse a robots.txt file and answer questions about the fetchability
    of other URLs for a given user agent.

    Accepts either a robots.txt URL (set_url() + read()) or a list of
    lines (parse()).  The Robots Exclusion Protocol is implemented as
    specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
    """

    def __init__(self, url=''):
        # parsed records (Entry instances), in file order
        self.entries = []
        # server answered 401/403: nothing may be fetched
        self.disallow_all = 0
        # robots.txt unavailable (other status >= 400): everything allowed
        self.allow_all = 0
        self.set_url(url)
        # time the rules were last loaded; see mtime()/modified()
        self.last_checked = 0

    def mtime(self):
        """Return the time the robots.txt rules were last loaded."""
        # NOTE(review): body hidden behind the diff hunk in this view;
        # reconstructed from the attribute maintained in __init__/modified().
        return self.last_checked

    def modified(self):
        """Record that the rules were (re)loaded now."""
        # NOTE(review): not visible in this chunk, but parse() calls it;
        # reconstructed as the counterpart of mtime().
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the robots.txt URL and cache its host and path components."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Fetch self.url over HTTP and parse it, following at most five
        301/302 redirects.  401/403 disallows everything; any other
        status >= 400 allows everything."""
        import httplib
        tries = 0
        while tries < 5:
            connection = httplib.HTTP(self.host)
            connection.putrequest("GET", self.path)
            connection.putheader("Host", self.host)
            connection.endheaders()
            status, text, mime = connection.getreply()
            if status in [301, 302] and mime:
                # follow the redirect and retry against the new location
                tries = tries + 1
                newurl = mime.get("Location", mime.get("Uri", ""))
                newurl = urlparse.urljoin(self.url, newurl)
                self.set_url(newurl)
            else:
                break
        if status == 401 or status == 403:
            self.disallow_all = 1
        elif status >= 400:
            self.allow_all = 1
        else:
            # status < 400
            self.parse(connection.getfile().readlines())

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # state machine: 0 = expecting user-agent, 1 = saw user-agent(s),
        # 2 = saw at least one allow/disallow for the current entry
        state = 0
        linenumber = 0
        entry = Entry()
        for line in lines:
            line = string.strip(line)
            linenumber = linenumber + 1
            if not line:
                # a blank line terminates the current record
                if state == 1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self.entries.append(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = string.find(line, '#')
            if i >= 0:
                line = line[:i]
            line = string.strip(line)
            if not line:
                continue
            line = string.split(line, ':', 1)
            if len(line) == 2:
                line[0] = string.lower(string.strip(line[0]))
                line[1] = string.strip(line[1])
                if line[0] == "user-agent":
                    if state == 2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self.entries.append(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 0))
                        state = 2
                elif line[0] == "allow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 1))
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                                                                 line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state == 2:
            # file ended without a trailing blank line
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))
        self.modified()

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        _debug("Checking robot.txt allowance for\n%s\n%s" % (useragent, url))
        if self.disallow_all:
            return 0
        if self.allow_all:
            return 1
        # search for given user agent matches
        # the first match counts
        useragent = string.lower(useragent)
        url = urllib.quote(urlparse.urlparse(url)[2])
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # agent not found ==> access granted
        return 1

    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret
class RuleLine:
    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
    (allowance==0) followed by a path."""

    def __init__(self, path, allowance):
        # the stored path is URL-quoted so it compares against the quoted
        # path produced by RobotFileParser.can_fetch()
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        """Return true if this rule matches *filename* ("*" matches all;
        otherwise the path is used as a regex prefix match)."""
        return self.path == "*" or re.match(self.path, filename)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
class Entry:
    """An entry has one or more user-agents and zero or more rulelines."""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: " + agent + "\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        "check if this entry applies to the specified agent"
        for agent in self.useragents:
            # '*' matches every agent; otherwise the stored agent name is
            # matched against useragent as a regular expression
            if agent == "*":
                return 1
            if re.match(agent, useragent):
                return 1
        # NOTE(review): terminator not visible in this chunk; returning a
        # false value here matches the implicit None of a fall-through.
        return 0

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        # no rule matched ==> access granted
        return 1
def _test():
global debug
import sys
rp = RobotFileParser()
rp.debug = 1
debug = 1
if len(sys.argv) <= 1:
rp.set_url('http://www.musi-cal.com/robots.txt')
rp.read()
print rp.rules
print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
print rp.can_fetch('Musi-Cal-Robot',
'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
else:
rp.parse(open(sys.argv[1]).readlines())
print rp.can_fetch('*', 'http://www.musi-cal.com/')
print rp.can_fetch('Musi-Cal-Robot/1.0',
'http://www.musi-cal.com/cgi-bin/event-search'
'?city=San+Francisco')
if __name__ == "__main__":
if __name__ == '__main__':
_test()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment