cpython · Commit eb5b6479
Authored Sep 11, 2016 by Vinay Sajip

    Merged upstream changes.

Parents: 68532323, 4da0fd06

Showing 1 changed file with 139 additions and 220 deletions:

Lib/test/test_robotparser.py  (+139, -220)
@@ -10,84 +10,49 @@ except ImportError:
     threading = None
 
 
-class RobotTestCase(unittest.TestCase):
-    def __init__(self, index=None, parser=None, url=None, good=None,
-                 agent=None, request_rate=None, crawl_delay=None):
-        # workaround to make unittest discovery work (see #17066)
-        if not isinstance(index, int):
-            return
-        unittest.TestCase.__init__(self)
-        if good:
-            self.str = "RobotTest(%d, good, %s)" % (index, url)
-        else:
-            self.str = "RobotTest(%d, bad, %s)" % (index, url)
-        self.parser = parser
-        self.url = url
-        self.good = good
-        self.agent = agent
-        self.request_rate = request_rate
-        self.crawl_delay = crawl_delay
-
-    def runTest(self):
-        if isinstance(self.url, tuple):
-            agent, url = self.url
-        else:
-            url = self.url
-            agent = self.agent
-        if self.good:
-            self.assertTrue(self.parser.can_fetch(agent, url))
-            self.assertEqual(self.parser.crawl_delay(agent), self.crawl_delay)
-            # if we have actual values for request rate
-            if self.request_rate and self.parser.request_rate(agent):
-                self.assertEqual(
-                    self.parser.request_rate(agent).requests,
-                    self.request_rate.requests
-                )
-                self.assertEqual(
-                    self.parser.request_rate(agent).seconds,
-                    self.request_rate.seconds
-                )
-            self.assertEqual(self.parser.request_rate(agent), self.request_rate)
-        else:
-            self.assertFalse(self.parser.can_fetch(agent, url))
-
-    def __str__(self):
-        return self.str
-
-
-tests = unittest.TestSuite()
-
-def RobotTest(index, robots_txt, good_urls, bad_urls,
-              request_rate, crawl_delay, agent="test_robotparser"):
-
-    lines = io.StringIO(robots_txt).readlines()
-    parser = urllib.robotparser.RobotFileParser()
-    parser.parse(lines)
-    for url in good_urls:
-        tests.addTest(RobotTestCase(index, parser, url, 1, agent,
-                      request_rate, crawl_delay))
-    for url in bad_urls:
-        tests.addTest(RobotTestCase(index, parser, url, 0, agent,
-                      request_rate, crawl_delay))
-
-# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)
-
-# 1.
-doc = """
+
+class BaseRobotTest:
+    robots_txt = ''
+    agent = 'test_robotparser'
+    good = []
+    bad = []
+
+    def setUp(self):
+        lines = io.StringIO(self.robots_txt).readlines()
+        self.parser = urllib.robotparser.RobotFileParser()
+        self.parser.parse(lines)
+
+    def get_agent_and_url(self, url):
+        if isinstance(url, tuple):
+            agent, url = url
+            return agent, url
+        return self.agent, url
+
+    def test_good_urls(self):
+        for url in self.good:
+            agent, url = self.get_agent_and_url(url)
+            with self.subTest(url=url, agent=agent):
+                self.assertTrue(self.parser.can_fetch(agent, url))
+
+    def test_bad_urls(self):
+        for url in self.bad:
+            agent, url = self.get_agent_and_url(url)
+            with self.subTest(url=url, agent=agent):
+                self.assertFalse(self.parser.can_fetch(agent, url))
+
+
+class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
 User-agent: *
 Disallow: /cyberworld/map/ # This is an infinite virtual URL space
 Disallow: /tmp/ # these will soon disappear
 Disallow: /foo.html
-"""
-
-good = ['/','/test.html']
-bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']
-request_rate = None
-crawl_delay = None
-
-RobotTest(1, doc, good, bad, request_rate, crawl_delay)
-
-# 2.
-doc = """
+    """
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']
+
+
+class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
 # robots.txt for http://www.example.com/
 
 User-agent: *
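The added BaseRobotTest mixin drives everything through urllib.robotparser. For reference, here is a minimal standalone sketch (not part of the commit) of what UserAgentWildcardTest asserts, using the same rules and the same parser calls as the test:

import io
import urllib.robotparser

rules = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""
parser = urllib.robotparser.RobotFileParser()
parser.parse(io.StringIO(rules).readlines())

# Mirrors the good/bad URL lists of UserAgentWildcardTest above.
assert parser.can_fetch('test_robotparser', '/')
assert parser.can_fetch('test_robotparser', '/test.html')
assert not parser.can_fetch('test_robotparser', '/cyberworld/map/index.html')
assert not parser.can_fetch('test_robotparser', '/tmp/xxx')
assert not parser.can_fetch('test_robotparser', '/foo.html')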
@@ -98,34 +63,23 @@ Disallow: /cyberworld/map/ # This is an infinite virtual URL space
 
 # Cybermapper knows where to go.
 User-agent: cybermapper
 Disallow:
-
-"""
-
-good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
-bad = ['/cyberworld/map/index.html']
-request_rate = None  # The parameters should be equal to None since they
-crawl_delay = None  # don't apply to the cybermapper user agent
-RobotTest(2, doc, good, bad, request_rate, crawl_delay)
-
-# 3.
-doc = """
+    """
+    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
+    bad = ['/cyberworld/map/index.html']
+
+
+class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
 # go away
 User-agent: *
 Disallow: /
-"""
-
-good = []
-bad = ['/cyberworld/map/index.html','/','/tmp/']
-request_rate = None
-crawl_delay = None
-RobotTest(3, doc, good, bad, request_rate, crawl_delay)
-
-# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)
-
-# 4.
-doc = """
+    """
+    good = []
+    bad = ['/cyberworld/map/index.html', '/', '/tmp/']
+
+
+class CrawlDelayAndRequestRateTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
 User-agent: figtree
 Crawl-delay: 3
 Request-rate: 9/30
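CrawlDelayAndCustomAgentTest (like old test 2) exercises per-agent sections: the wildcard entry blocks /cyberworld/map/, while the dedicated cybermapper entry, whose Disallow: line is empty, allows that agent everywhere. A sketch of that behaviour with an abridged copy of the rules (not part of the commit):

import io
import urllib.robotparser

rules = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/

User-agent: cybermapper
Disallow:
"""
parser = urllib.robotparser.RobotFileParser()
parser.parse(io.StringIO(rules).readlines())

# The default agent is blocked, the named agent is not.
assert not parser.can_fetch('test_robotparser', '/cyberworld/map/index.html')
assert parser.can_fetch('cybermapper', '/cyberworld/map/index.html')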
@@ -133,28 +87,43 @@ Disallow: /tmp
 Disallow: /a%3cd.html
 Disallow: /a%2fb.html
 Disallow: /%7ejoe/index.html
-"""
-
-good = [] # XFAIL '/a/b.html'
-bad = ['/tmp','/tmp.html','/tmp/a.html',
-       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
-       '/~joe/index.html']
-request_rate = namedtuple('req_rate', 'requests seconds')
-request_rate.requests = 9
-request_rate.seconds = 30
-crawl_delay = 3
-request_rate_bad = None  # not actually tested, but we still need to parse it
-crawl_delay_bad = None  # in order to accommodate the input parameters
-
-RobotTest(4, doc, good, bad, request_rate, crawl_delay, 'figtree')
-RobotTest(5, doc, good, bad, request_rate_bad, crawl_delay_bad,
-          'FigTree Robot libwww-perl/5.04')
-
-# 6.
-doc = """
+    """
+    agent = 'figtree'
+    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
+    crawl_delay = 3
+    good = [('figtree', '/foo.html')]
+    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
+           '/a%2fb.html', '/~joe/index.html']
+
+    def test_request_rate(self):
+        for url in self.good:
+            agent, url = self.get_agent_and_url(url)
+            with self.subTest(url=url, agent=agent):
+                if self.crawl_delay:
+                    self.assertEqual(
+                        self.parser.crawl_delay(agent), self.crawl_delay
+                    )
+                if self.request_rate and self.parser.request_rate(agent):
+                    self.assertEqual(
+                        self.parser.request_rate(agent).requests,
+                        self.request_rate.requests
+                    )
+                    self.assertEqual(
+                        self.parser.request_rate(agent).seconds,
+                        self.request_rate.seconds
+                    )
+
+
+class DifferentAgentTest(CrawlDelayAndRequestRateTest):
+    agent = 'FigTree Robot libwww-perl/5.04'
+    # these are not actually tested, but we still need to parse it
+    # in order to accommodate the input parameters
+    request_rate = None
+    crawl_delay = None
+
+
+class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
 User-agent: *
 Disallow: /tmp/
 Disallow: /a%3Cd.html
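The figtree example is where the Crawl-delay and Request-rate accessors (added to urllib.robotparser in Python 3.6) are exercised; request_rate() returns a named tuple with requests and seconds fields. A short sketch of what CrawlDelayAndRequestRateTest checks, with an abridged rule set (not part of the commit):

import io
import urllib.robotparser

rules = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
"""
parser = urllib.robotparser.RobotFileParser()
parser.parse(io.StringIO(rules).readlines())

assert parser.crawl_delay('figtree') == 3
rate = parser.request_rate('figtree')
assert (rate.requests, rate.seconds) == (9, 30)
# Per the comments in the removed suite, malformed values such as
# "Request-rate: 9/banana" or "Crawl-delay: pears" make these accessors
# return None instead.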
@@ -162,141 +131,102 @@ Disallow: /a/b.html
 Disallow: /%7ejoe/index.html
 Crawl-delay: 3
 Request-rate: 9/banana
-"""
-
-good = ['/tmp',] # XFAIL: '/a%2fb.html'
-bad = ['/tmp/','/tmp/a.html',
-       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
-       '/%7Ejoe/index.html']
-crawl_delay = 3
-request_rate = None  # since request rate has invalid syntax, return None
-
-RobotTest(6, doc, good, bad, None, None)
-
-# From bug report #523041
-
-# 7.
-doc = """
+    """
+    good = ['/tmp']
+    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
+           '/%7Ejoe/index.html']
+    crawl_delay = 3
+
+
+class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
+    # From bug report #523041
+    robots_txt = """\
 User-Agent: *
 Disallow: /.
 Crawl-delay: pears
-"""
-
-good = ['/foo.html']
-bad = [] # bug report says "/" should be denied, but that is not in the RFC
-
-crawl_delay = None  # since crawl delay has invalid syntax, return None
-request_rate = None
-
-RobotTest(7, doc, good, bad, crawl_delay, request_rate)
-
-# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364
-
-# 8.
-doc = """
+    """
+    good = ['/foo.html']
+    # bug report says "/" should be denied, but that is not in the RFC
+    bad = []
+
+
+class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
+    # also test that Allow and Diasallow works well with each other
+    robots_txt = """\
 User-agent: Googlebot
 Allow: /folder1/myfile.html
 Disallow: /folder1/
 Request-rate: whale/banana
-"""
-
-good = ['/folder1/myfile.html']
-bad = ['/folder1/anotherfile.html']
-crawl_delay = None
-request_rate = None  # invalid syntax, return none
-
-RobotTest(8, doc, good, bad, crawl_delay, request_rate, agent="Googlebot")
-
-# 9. This file is incorrect because "Googlebot" is a substring of
-# "Googlebot-Mobile", so test 10 works just like test 9.
-doc = """
+    """
+    agent = 'Googlebot'
+    good = ['/folder1/myfile.html']
+    bad = ['/folder1/anotherfile.html']
+
+
+class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
+    # the order of User-agent should be correct. note
+    # that this file is incorrect because "Googlebot" is a
+    # substring of "Googlebot-Mobile"
+    robots_txt = """\
 User-agent: Googlebot
 Disallow: /
 
 User-agent: Googlebot-Mobile
 Allow: /
-"""
-
-good = []
-bad = ['/something.jpg']
-
-RobotTest(9, doc, good, bad, None, None, agent="Googlebot")
-
-good = []
-bad = ['/something.jpg']
-
-RobotTest(10, doc, good, bad, None, None, agent="Googlebot-Mobile")
-
-# 11. Get the order correct.
-doc = """
-User-agent: Googlebot-Mobile
-Allow: /
-
-User-agent: Googlebot
-Disallow: /
-"""
-
-good = []
-bad = ['/something.jpg']
-
-RobotTest(11, doc, good, bad, None, None, agent="Googlebot")
-
-good = ['/something.jpg']
-bad = []
-
-RobotTest(12, doc, good, bad, None, None, agent="Googlebot-Mobile")
-
-# 13. Google also got the order wrong in #8. You need to specify the
-# URLs from more specific to more general.
-doc = """
+    """
+    agent = 'Googlebot'
+    bad = ['/something.jpg']
+
+
+class UserAgentGoogleMobileTest(UserAgentOrderingTest):
+    agent = 'Googlebot-Mobile'
+
+
+class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
+    # Google also got the order wrong. You need
+    # to specify the URLs from more specific to more general
+    robots_txt = """\
 User-agent: Googlebot
 Allow: /folder1/myfile.html
 Disallow: /folder1/
-"""
-
-good = ['/folder1/myfile.html']
-bad = ['/folder1/anotherfile.html']
-
-RobotTest(13, doc, good, bad, None, None, agent="googlebot")
-
-# 14. For issue #6325 (query string support)
-doc = """
+    """
+    agent = 'googlebot'
+    good = ['/folder1/myfile.html']
+    bad = ['/folder1/anotherfile.html']
+
+
+class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
+    # see issue #6325 for details
+    robots_txt = """\
 User-agent: *
 Disallow: /some/path?name=value
-"""
-
-good = ['/some/path']
-bad = ['/some/path?name=value']
-
-RobotTest(14, doc, good, bad, None, None)
-
-# 15. For issue #4108 (obey first * entry)
-doc = """
+    """
+    good = ['/some/path']
+    bad = ['/some/path?name=value']
+
+
+class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
+    # obey first * entry (#4108)
+    robots_txt = """\
 User-agent: *
 Disallow: /some/path
 
 User-agent: *
 Disallow: /another/path
-"""
-
-good = ['/another/path']
-bad = ['/some/path']
-
-RobotTest(15, doc, good, bad, None, None)
-
-# 16. Empty query (issue #17403). Normalizing the url first.
-doc = """
+    """
+    good = ['/another/path']
+    bad = ['/some/path']
+
+
+class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
+    # normalize the URL first (#17403)
+    robots_txt = """\
 User-agent: *
 Allow: /some/path?
 Disallow: /another/path?
-"""
-
-good = ['/some/path?']
-bad = ['/another/path?']
-
-RobotTest(16, doc, good, bad, None, None)
+    """
+    good = ['/some/path?']
+    bad = ['/another/path?']
 
 
 class RobotHandler(BaseHTTPRequestHandler):
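With the refactor, adding a case no longer means writing a numbered doc string plus a RobotTest() call: a subclass supplies robots_txt and good/bad lists and inherits setUp(), test_good_urls() and test_bad_urls() from BaseRobotTest. A hypothetical illustration of the pattern (class name and rules invented here, not part of the commit; assumes the module's existing imports and BaseRobotTest):

class ExampleImageDirTest(BaseRobotTest, unittest.TestCase):
    # hypothetical case: block only the /images/ directory for all agents
    robots_txt = """\
User-agent: *
Disallow: /images/
    """
    good = ['/index.html']
    bad = ['/images/logo.png']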
@@ -329,9 +259,6 @@ class PasswordProtectedSiteTestCase(unittest.TestCase):
         self.t.join()
         self.server.server_close()
 
-    def runTest(self):
-        self.testPasswordProtectedSite()
-
     def testPasswordProtectedSite(self):
         addr = self.server.server_address
         url = 'http://' + support.HOST + ':' + str(addr[1])
@@ -341,8 +268,6 @@ class PasswordProtectedSiteTestCase(unittest.TestCase):
         parser.read()
         self.assertFalse(parser.can_fetch("*", robots_url))
 
-    def __str__(self):
-        return '%s' % self.__class__.__name__
-
 class NetworkTestCase(unittest.TestCase):
@@ -356,11 +281,5 @@ class NetworkTestCase(unittest.TestCase):
         self.assertTrue(
             parser.can_fetch("*", "http://www.python.org/robots.txt"))
 
-def load_tests(loader, suite, pattern):
-    suite = unittest.makeSuite(NetworkTestCase)
-    suite.addTest(tests)
-    suite.addTest(PasswordProtectedSiteTestCase())
-    return suite
-
 if __name__ == '__main__':
     unittest.main()
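Because load_tests() and the module-level tests suite are removed, the module now relies on ordinary unittest discovery of its TestCase subclasses. A minimal sketch of running it (assuming a CPython checkout where the test package is importable; network-dependent cases may be skipped depending on which test resources are enabled):

import unittest
from test import test_robotparser

# Discovers every *Test class that mixes BaseRobotTest into unittest.TestCase.
unittest.main(module=test_robotparser, verbosity=2, exit=False)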