Commit 663f6c2a authored by Skip Montanaro

rewrite of robotparser.py by Bastian Kleineidam. Closes patch 102229.

parent a5d23a19
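For a quick feel for what the rewrite adds (per-agent entries, Allow: directives, parsing from in-memory lines), here is a minimal sketch that drives the new class without any network access. It assumes the rewritten module below is importable as robotparser; the sample robots.txt rules, agent names, and URLs are made up for illustration, and the expected 0/1 results are noted in comments.

# Illustrative sketch only -- not part of the commit.  Agent names, paths and
# URLs are made up; lowercase agent names are used because applies_to()
# does a case-sensitive re.match() against the lowered user agent string.
import robotparser

lines = [
    "User-agent: examplebot\n",
    "Allow: /private/public-report\n",   # Allow: is newly supported; listed
    "Disallow: /private\n",              # first because the first match wins
    "\n",
    "User-agent: *\n",
    "Disallow: /cgi-bin\n",
]

rp = robotparser.RobotFileParser()
rp.parse(lines)

print rp.can_fetch("examplebot/1.0", "http://example.com/private/public-report")  # expected 1
print rp.can_fetch("examplebot/1.0", "http://example.com/private/secret")         # expected 0
print rp.can_fetch("SomeOtherBot", "http://example.com/cgi-bin/search")           # expected 0
print rp.can_fetch("SomeOtherBot", "http://example.com/index.html")               # expected 1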
""" """ robotparser.py
Copyright (C) 2000 Bastian Kleineidam
Robots.txt file parser class. Accepts a list of lines or robots.txt URL as You can choose between two licenses when using this package:
input, builds a set of rules from that list, then answers questions about 1) GNU GPLv2
fetchability of other URLs. 2) PYTHON 2.0 OPEN SOURCE LICENSE
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
""" """
import re,string,urlparse,urllib
class RobotFileParser: debug = 0
def __init__(self): def _debug(msg):
self.rules = {} if debug: print msg
self.debug = 0
self.url = ''
class RobotFileParser:
def __init__(self, url=''):
self.entries = []
self.disallow_all = 0
self.allow_all = 0
self.set_url(url)
self.last_checked = 0 self.last_checked = 0
def mtime(self): def mtime(self):
@@ -23,75 +34,183 @@ class RobotFileParser:
    def set_url(self, url):
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        import httplib
        tries = 0
        while tries<5:
            connection = httplib.HTTP(self.host)
            connection.putrequest("GET", self.path)
            connection.putheader("Host", self.host)
            connection.endheaders()
            status, text, mime = connection.getreply()
            if status in [301,302] and mime:
                tries = tries + 1
                newurl = mime.get("Location", mime.get("Uri", ""))
                newurl = urlparse.urljoin(self.url, newurl)
                self.set_url(newurl)
            else:
                break
        if status==401 or status==403:
            self.disallow_all = 1
        elif status>=400:
            self.allow_all = 1
        else:
            # status < 400
            self.parse(connection.getfile().readlines())
    def parse(self, lines):
        """parse the input lines from a robot.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines."""
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            line = string.strip(line)
            linenumber = linenumber + 1
            if not line:
                if state==1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state==2:
                    self.entries.append(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = string.find(line, '#')
            if i>=0:
                line = line[:i]
            line = string.strip(line)
            if not line:
                continue
            line = string.split(line, ':', 1)
            if len(line) == 2:
                line[0] = string.lower(string.strip(line[0]))
                line[1] = string.strip(line[1])
                if line[0] == "user-agent":
                    if state==2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self.entries.append(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 0))
                        state = 2
                elif line[0] == "allow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 1))
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                           line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state==2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))
        self.modified()
    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        _debug("Checking robot.txt allowance for\n%s\n%s" % (useragent, url))
        if self.disallow_all:
            return 0
        if self.allow_all:
            return 1
        # search for given user agent matches
        # the first match counts
        useragent = string.lower(useragent)
        url = urllib.quote(urlparse.urlparse(url)[2])
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # agent not found ==> access granted
        return 1

    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret
class RuleLine:
    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
       (allowance==0) followed by a path."""
    def __init__(self, path, allowance):
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path=="*" or re.match(self.path, filename)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow")+": "+self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: "+agent+"\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        "check if this entry applies to the specified agent"
        for agent in self.useragents:
            if agent=="*":
                return 1
            if re.match(agent, useragent):
                return 1
        return 0

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return 1
def _test():
    global debug
    import sys
    rp = RobotFileParser()
    debug = 1
    if len(sys.argv) <= 1:
        rp.set_url('http://www.musi-cal.com/robots.txt')
        rp.read()
    else:
        rp.parse(open(sys.argv[1]).readlines())
    print rp.can_fetch('*', 'http://www.musi-cal.com/')
    print rp.can_fetch('Musi-Cal-Robot/1.0',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco')

if __name__ == '__main__':
    _test()
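One behavior worth calling out: RuleLine.applies_to() uses re.match() on the URL-quoted path, so a directive like Disallow: /cgi-bin acts as a prefix rule (it also covers /cgi-bin/event-search), and within an entry the first line that matches decides the answer. A small sketch, again with made-up paths:

# Illustrative only; paths are hypothetical.
from robotparser import RuleLine

line = RuleLine("/cgi-bin", 0)                 # i.e. "Disallow: /cgi-bin"
print line.applies_to("/cgi-bin/event-search") # match object (truthy): prefix applies
print line.applies_to("/index.html")           # None: rule does not apply
print str(line)                                # "Disallow: /cgi-bin"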