Commit 41e4faa8 authored by Johannes Gijsbers

Patch #712317: In URLs such as http://www.example.com?query=spam, treat '?' as
a delimiter. Previously, the 'network location' (<authority> in RFC 2396) would
become 'www.example.com?query=spam', while RFC 2396 does not allow a '?' in
<authority>. See bug #548176 for further discussion.
parent cdd625a7
...@@ -8,20 +8,22 @@ RFC1808_BASE = "http://a/b/c/d;p?q#f" ...@@ -8,20 +8,22 @@ RFC1808_BASE = "http://a/b/c/d;p?q#f"
RFC2396_BASE = "http://a/b/c/d;p?q" RFC2396_BASE = "http://a/b/c/d;p?q"
class UrlParseTestCase(unittest.TestCase): class UrlParseTestCase(unittest.TestCase):
def test_frags(self):
for url, parsed, split in [ def checkRoundtrips(self, url, parsed, split):
('http://www.python.org', result = urlparse.urlparse(url)
('http', 'www.python.org', '', '', '', ''), self.assertEqual(result, parsed)
('http', 'www.python.org', '', '', '')), # put it back together and it should be the same
('http://www.python.org#abc', result2 = urlparse.urlunparse(result)
('http', 'www.python.org', '', '', '', 'abc'), self.assertEqual(result2, url)
('http', 'www.python.org', '', '', 'abc')),
('http://www.python.org/#abc', # check the roundtrip using urlsplit() as well
('http', 'www.python.org', '/', '', '', 'abc'), result = urlparse.urlsplit(url)
('http', 'www.python.org', '/', '', 'abc')), self.assertEqual(result, split)
(RFC1808_BASE, result2 = urlparse.urlunsplit(result)
('http', 'a', '/b/c/d', 'p', 'q', 'f'), self.assertEqual(result2, url)
('http', 'a', '/b/c/d;p', 'q', 'f')),
def test_roundtrips(self):
testcases = [
('file:///tmp/junk.txt', ('file:///tmp/junk.txt',
('file', '', '/tmp/junk.txt', '', '', ''), ('file', '', '/tmp/junk.txt', '', '', ''),
('file', '', '/tmp/junk.txt', '', '')), ('file', '', '/tmp/junk.txt', '', '')),
...@@ -29,20 +31,41 @@ class UrlParseTestCase(unittest.TestCase): ...@@ -29,20 +31,41 @@ class UrlParseTestCase(unittest.TestCase):
('imap', 'mail.python.org', '/mbox1', '', '', ''), ('imap', 'mail.python.org', '/mbox1', '', '', ''),
('imap', 'mail.python.org', '/mbox1', '', '')), ('imap', 'mail.python.org', '/mbox1', '', '')),
('mms://wms.sys.hinet.net/cts/Drama/09006251100.asf', ('mms://wms.sys.hinet.net/cts/Drama/09006251100.asf',
('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', '', '', ''), ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf',
('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', '', '')), '', '', ''),
]: ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf',
result = urlparse.urlparse(url) '', '')),
self.assertEqual(result, parsed) ]
# put it back together and it should be the same for url, parsed, split in testcases:
result2 = urlparse.urlunparse(result) self.checkRoundtrips(url, parsed, split)
self.assertEqual(result2, url)
def test_http_roundtrips(self):
# check the roundtrip using urlsplit() as well # urlparse.urlsplit treats 'http:' as an optimized special case,
result = urlparse.urlsplit(url) # so we test both 'http:' and 'https:' in all the following.
self.assertEqual(result, split) # Three cheers for white box knowledge!
result2 = urlparse.urlunsplit(result) testcases = [
self.assertEqual(result2, url) ('://www.python.org',
('www.python.org', '', '', '', ''),
('www.python.org', '', '', '')),
('://www.python.org#abc',
('www.python.org', '', '', '', 'abc'),
('www.python.org', '', '', 'abc')),
('://www.python.org?q=abc',
('www.python.org', '', '', 'q=abc', ''),
('www.python.org', '', 'q=abc', '')),
('://www.python.org/#abc',
('www.python.org', '/', '', '', 'abc'),
('www.python.org', '/', '', 'abc')),
('://a/b/c/d;p?q#f',
('a', '/b/c/d', 'p', 'q', 'f'),
('a', '/b/c/d;p', 'q', 'f')),
]
for scheme in ('http', 'https'):
for url, parsed, split in testcases:
url = scheme + url
parsed = (scheme,) + parsed
split = (scheme,) + split
self.checkRoundtrips(url, parsed, split)
def checkJoin(self, base, relurl, expected): def checkJoin(self, base, relurl, expected):
self.assertEqual(urlparse.urljoin(base, relurl), expected, self.assertEqual(urlparse.urljoin(base, relurl), expected,
......
...@@ -63,6 +63,15 @@ def _splitparams(url): ...@@ -63,6 +63,15 @@ def _splitparams(url):
i = url.find(';') i = url.find(';')
return url[:i], url[i+1:] return url[:i], url[i+1:]
def _splitnetloc(url, start=0):
    """Split the network location off `url`, beginning at index `start`.

    The network location ends at the earliest '/', '?' or '#' found at or
    after `start`, or at the end of `url` when none of them occurs.

    Returns a (netloc, rest) tuple; `rest` begins with the delimiter that
    terminated the netloc, or is empty if there was none.
    """
    delim = len(url)  # default: the netloc runs to the end of the string
    for c in '/?#':
        pos = url.find(c, start)
        if pos >= 0:
            # Keep the earliest delimiter of any kind.  Stopping at the
            # first *kind* that matches would let a '/' inside the query
            # or fragment pull a '?' or '#' into the netloc.
            delim = min(delim, pos)
    return url[start:delim], url[delim:]
def urlsplit(url, scheme='', allow_fragments=1): def urlsplit(url, scheme='', allow_fragments=1):
"""Parse a URL into 5 components: """Parse a URL into 5 components:
<scheme>://<netloc>/<path>?<query>#<fragment> <scheme>://<netloc>/<path>?<query>#<fragment>
...@@ -82,13 +91,7 @@ def urlsplit(url, scheme='', allow_fragments=1): ...@@ -82,13 +91,7 @@ def urlsplit(url, scheme='', allow_fragments=1):
scheme = url[:i].lower() scheme = url[:i].lower()
url = url[i+1:] url = url[i+1:]
if url[:2] == '//': if url[:2] == '//':
i = url.find('/', 2) netloc, url = _splitnetloc(url, 2)
if i < 0:
i = url.find('#')
if i < 0:
i = len(url)
netloc = url[2:i]
url = url[i:]
if allow_fragments and '#' in url: if allow_fragments and '#' in url:
url, fragment = url.split('#', 1) url, fragment = url.split('#', 1)
if '?' in url: if '?' in url:
...@@ -101,12 +104,8 @@ def urlsplit(url, scheme='', allow_fragments=1): ...@@ -101,12 +104,8 @@ def urlsplit(url, scheme='', allow_fragments=1):
break break
else: else:
scheme, url = url[:i].lower(), url[i+1:] scheme, url = url[:i].lower(), url[i+1:]
if scheme in uses_netloc: if scheme in uses_netloc and url[:2] == '//':
if url[:2] == '//': netloc, url = _splitnetloc(url, 2)
i = url.find('/', 2)
if i < 0:
i = len(url)
netloc, url = url[2:i], url[i:]
if allow_fragments and scheme in uses_fragment and '#' in url: if allow_fragments and scheme in uses_fragment and '#' in url:
url, fragment = url.split('#', 1) url, fragment = url.split('#', 1)
if scheme in uses_query and '?' in url: if scheme in uses_query and '?' in url:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment