Commit 55ac5b3f authored by Antoine Pitrou

Issue #22118: Switch urllib.parse to use RFC 3986 semantics for the resolution...

Issue #22118: Switch urllib.parse to use RFC 3986 semantics for the resolution of relative URLs, rather than RFCs 1808 and 2396.

Patch by Demian Brecht.
parent a7eb7462
...@@ -267,6 +267,11 @@ or on combining URL components into a URL string. ...@@ -267,6 +267,11 @@ or on combining URL components into a URL string.
:func:`urlunsplit`, removing possible *scheme* and *netloc* parts. :func:`urlunsplit`, removing possible *scheme* and *netloc* parts.
.. versionchanged:: 3.5
Behaviour updated to match the semantics defined in :rfc:`3986`.
.. function:: urldefrag(url) .. function:: urldefrag(url)
If *url* contains a fragment identifier, return a modified version of *url* If *url* contains a fragment identifier, return a modified version of *url*
......
...@@ -211,10 +211,6 @@ class UrlParseTestCase(unittest.TestCase): ...@@ -211,10 +211,6 @@ class UrlParseTestCase(unittest.TestCase):
# "abnormal" cases from RFC 1808: # "abnormal" cases from RFC 1808:
self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f') self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f')
self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g')
self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g')
self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g')
self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g')
self.checkJoin(RFC1808_BASE, 'g.', 'http://a/b/c/g.') self.checkJoin(RFC1808_BASE, 'g.', 'http://a/b/c/g.')
self.checkJoin(RFC1808_BASE, '.g', 'http://a/b/c/.g') self.checkJoin(RFC1808_BASE, '.g', 'http://a/b/c/.g')
self.checkJoin(RFC1808_BASE, 'g..', 'http://a/b/c/g..') self.checkJoin(RFC1808_BASE, 'g..', 'http://a/b/c/g..')
...@@ -229,6 +225,13 @@ class UrlParseTestCase(unittest.TestCase): ...@@ -229,6 +225,13 @@ class UrlParseTestCase(unittest.TestCase):
#self.checkJoin(RFC1808_BASE, 'http:g', 'http:g') #self.checkJoin(RFC1808_BASE, 'http:g', 'http:g')
#self.checkJoin(RFC1808_BASE, 'http:', 'http:') #self.checkJoin(RFC1808_BASE, 'http:', 'http:')
# XXX: The following tests are no longer compatible with RFC3986
# self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g')
# self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g')
# self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g')
# self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g')
def test_RFC2368(self): def test_RFC2368(self):
# Issue 11467: path that starts with a number is not parsed correctly # Issue 11467: path that starts with a number is not parsed correctly
self.assertEqual(urllib.parse.urlparse('mailto:1337@example.org'), self.assertEqual(urllib.parse.urlparse('mailto:1337@example.org'),
...@@ -259,10 +262,6 @@ class UrlParseTestCase(unittest.TestCase): ...@@ -259,10 +262,6 @@ class UrlParseTestCase(unittest.TestCase):
self.checkJoin(RFC2396_BASE, '../../', 'http://a/') self.checkJoin(RFC2396_BASE, '../../', 'http://a/')
self.checkJoin(RFC2396_BASE, '../../g', 'http://a/g') self.checkJoin(RFC2396_BASE, '../../g', 'http://a/g')
self.checkJoin(RFC2396_BASE, '', RFC2396_BASE) self.checkJoin(RFC2396_BASE, '', RFC2396_BASE)
self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g')
self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g')
self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g')
self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g')
self.checkJoin(RFC2396_BASE, 'g.', 'http://a/b/c/g.') self.checkJoin(RFC2396_BASE, 'g.', 'http://a/b/c/g.')
self.checkJoin(RFC2396_BASE, '.g', 'http://a/b/c/.g') self.checkJoin(RFC2396_BASE, '.g', 'http://a/b/c/.g')
self.checkJoin(RFC2396_BASE, 'g..', 'http://a/b/c/g..') self.checkJoin(RFC2396_BASE, 'g..', 'http://a/b/c/g..')
...@@ -278,10 +277,17 @@ class UrlParseTestCase(unittest.TestCase): ...@@ -278,10 +277,17 @@ class UrlParseTestCase(unittest.TestCase):
self.checkJoin(RFC2396_BASE, 'g#s/./x', 'http://a/b/c/g#s/./x') self.checkJoin(RFC2396_BASE, 'g#s/./x', 'http://a/b/c/g#s/./x')
self.checkJoin(RFC2396_BASE, 'g#s/../x', 'http://a/b/c/g#s/../x') self.checkJoin(RFC2396_BASE, 'g#s/../x', 'http://a/b/c/g#s/../x')
# XXX: The following tests are no longer compatible with RFC3986
# self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g')
# self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g')
# self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g')
# self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g')
def test_RFC3986(self): def test_RFC3986(self):
# Test cases from RFC3986 # Test cases from RFC3986
self.checkJoin(RFC3986_BASE, '?y','http://a/b/c/d;p?y') self.checkJoin(RFC3986_BASE, '?y','http://a/b/c/d;p?y')
self.checkJoin(RFC2396_BASE, ';x', 'http://a/b/c/;x') self.checkJoin(RFC3986_BASE, ';x', 'http://a/b/c/;x')
self.checkJoin(RFC3986_BASE, 'g:h','g:h') self.checkJoin(RFC3986_BASE, 'g:h','g:h')
self.checkJoin(RFC3986_BASE, 'g','http://a/b/c/g') self.checkJoin(RFC3986_BASE, 'g','http://a/b/c/g')
self.checkJoin(RFC3986_BASE, './g','http://a/b/c/g') self.checkJoin(RFC3986_BASE, './g','http://a/b/c/g')
...@@ -305,17 +311,17 @@ class UrlParseTestCase(unittest.TestCase): ...@@ -305,17 +311,17 @@ class UrlParseTestCase(unittest.TestCase):
self.checkJoin(RFC3986_BASE, '../..','http://a/') self.checkJoin(RFC3986_BASE, '../..','http://a/')
self.checkJoin(RFC3986_BASE, '../../','http://a/') self.checkJoin(RFC3986_BASE, '../../','http://a/')
self.checkJoin(RFC3986_BASE, '../../g','http://a/g') self.checkJoin(RFC3986_BASE, '../../g','http://a/g')
self.checkJoin(RFC3986_BASE, '../../../g', 'http://a/g')
#Abnormal Examples #Abnormal Examples
# The 'abnormal scenarios' are incompatible with RFC3986 parsing # The 'abnormal scenarios' are incompatible with RFC3986 parsing
# Tests are here for reference. # Tests are here for reference.
#self.checkJoin(RFC3986_BASE, '../../../g','http://a/g') self.checkJoin(RFC3986_BASE, '../../../g','http://a/g')
#self.checkJoin(RFC3986_BASE, '../../../../g','http://a/g') self.checkJoin(RFC3986_BASE, '../../../../g','http://a/g')
#self.checkJoin(RFC3986_BASE, '/./g','http://a/g') self.checkJoin(RFC3986_BASE, '/./g','http://a/g')
#self.checkJoin(RFC3986_BASE, '/../g','http://a/g') self.checkJoin(RFC3986_BASE, '/../g','http://a/g')
self.checkJoin(RFC3986_BASE, 'g.','http://a/b/c/g.') self.checkJoin(RFC3986_BASE, 'g.','http://a/b/c/g.')
self.checkJoin(RFC3986_BASE, '.g','http://a/b/c/.g') self.checkJoin(RFC3986_BASE, '.g','http://a/b/c/.g')
self.checkJoin(RFC3986_BASE, 'g..','http://a/b/c/g..') self.checkJoin(RFC3986_BASE, 'g..','http://a/b/c/g..')
...@@ -355,10 +361,8 @@ class UrlParseTestCase(unittest.TestCase): ...@@ -355,10 +361,8 @@ class UrlParseTestCase(unittest.TestCase):
self.checkJoin(SIMPLE_BASE, '../g','http://a/b/g') self.checkJoin(SIMPLE_BASE, '../g','http://a/b/g')
self.checkJoin(SIMPLE_BASE, '../..','http://a/') self.checkJoin(SIMPLE_BASE, '../..','http://a/')
self.checkJoin(SIMPLE_BASE, '../../g','http://a/g') self.checkJoin(SIMPLE_BASE, '../../g','http://a/g')
self.checkJoin(SIMPLE_BASE, '../../../g','http://a/../g')
self.checkJoin(SIMPLE_BASE, './../g','http://a/b/g') self.checkJoin(SIMPLE_BASE, './../g','http://a/b/g')
self.checkJoin(SIMPLE_BASE, './g/.','http://a/b/c/g/') self.checkJoin(SIMPLE_BASE, './g/.','http://a/b/c/g/')
self.checkJoin(SIMPLE_BASE, '/./g','http://a/./g')
self.checkJoin(SIMPLE_BASE, 'g/./h','http://a/b/c/g/h') self.checkJoin(SIMPLE_BASE, 'g/./h','http://a/b/c/g/h')
self.checkJoin(SIMPLE_BASE, 'g/../h','http://a/b/c/h') self.checkJoin(SIMPLE_BASE, 'g/../h','http://a/b/c/h')
self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g') self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g')
...@@ -372,6 +376,10 @@ class UrlParseTestCase(unittest.TestCase): ...@@ -372,6 +376,10 @@ class UrlParseTestCase(unittest.TestCase):
self.checkJoin('svn://pathtorepo/dir1', 'dir2', 'svn://pathtorepo/dir2') self.checkJoin('svn://pathtorepo/dir1', 'dir2', 'svn://pathtorepo/dir2')
self.checkJoin('svn+ssh://pathtorepo/dir1', 'dir2', 'svn+ssh://pathtorepo/dir2') self.checkJoin('svn+ssh://pathtorepo/dir1', 'dir2', 'svn+ssh://pathtorepo/dir2')
# XXX: The following tests are no longer compatible with RFC3986
# self.checkJoin(SIMPLE_BASE, '../../../g','http://a/../g')
# self.checkJoin(SIMPLE_BASE, '/./g','http://a/./g')
def test_RFC2732(self): def test_RFC2732(self):
str_cases = [ str_cases = [
('http://Test.python.org:5432/foo/', 'test.python.org', 5432), ('http://Test.python.org:5432/foo/', 'test.python.org', 5432),
......
...@@ -409,11 +409,13 @@ def urljoin(base, url, allow_fragments=True): ...@@ -409,11 +409,13 @@ def urljoin(base, url, allow_fragments=True):
return url return url
if not url: if not url:
return base return base
base, url, _coerce_result = _coerce_args(base, url) base, url, _coerce_result = _coerce_args(base, url)
bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
urlparse(base, '', allow_fragments) urlparse(base, '', allow_fragments)
scheme, netloc, path, params, query, fragment = \ scheme, netloc, path, params, query, fragment = \
urlparse(url, bscheme, allow_fragments) urlparse(url, bscheme, allow_fragments)
if scheme != bscheme or scheme not in uses_relative: if scheme != bscheme or scheme not in uses_relative:
return _coerce_result(url) return _coerce_result(url)
if scheme in uses_netloc: if scheme in uses_netloc:
...@@ -421,9 +423,7 @@ def urljoin(base, url, allow_fragments=True): ...@@ -421,9 +423,7 @@ def urljoin(base, url, allow_fragments=True):
return _coerce_result(urlunparse((scheme, netloc, path, return _coerce_result(urlunparse((scheme, netloc, path,
params, query, fragment))) params, query, fragment)))
netloc = bnetloc netloc = bnetloc
if path[:1] == '/':
return _coerce_result(urlunparse((scheme, netloc, path,
params, query, fragment)))
if not path and not params: if not path and not params:
path = bpath path = bpath
params = bparams params = bparams
...@@ -431,29 +431,42 @@ def urljoin(base, url, allow_fragments=True): ...@@ -431,29 +431,42 @@ def urljoin(base, url, allow_fragments=True):
query = bquery query = bquery
return _coerce_result(urlunparse((scheme, netloc, path, return _coerce_result(urlunparse((scheme, netloc, path,
params, query, fragment))) params, query, fragment)))
segments = bpath.split('/')[:-1] + path.split('/')
# XXX The stuff below is bogus in various ways... base_parts = bpath.split('/')
if segments[-1] == '.': if base_parts[-1] != '':
segments[-1] = '' # the last item is not a directory, so will not be taken into account
while '.' in segments: # in resolving the relative path
segments.remove('.') del base_parts[-1]
while 1:
i = 1 # for rfc3986, ignore all base path should the first character be root.
n = len(segments) - 1 if path[:1] == '/':
while i < n: segments = path.split('/')
if (segments[i] == '..' else:
and segments[i-1] not in ('', '..')): segments = base_parts + path.split('/')
del segments[i-1:i+1]
break resolved_path = []
i = i+1
for seg in segments:
if seg == '..':
try:
resolved_path.pop()
except IndexError:
# ignore any .. segments that would otherwise cause an IndexError
# when popped from resolved_path if resolving for rfc3986
pass
elif seg == '.':
continue
else: else:
break resolved_path.append(seg)
if segments == ['', '..']:
segments[-1] = '' if segments[-1] in ('.', '..'):
elif len(segments) >= 2 and segments[-1] == '..': # do some post-processing here. if the last segment was a relative dir,
segments[-2:] = [''] # then we need to append the trailing '/'
return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments), resolved_path.append('')
params, query, fragment)))
return _coerce_result(urlunparse((scheme, netloc, '/'.join(
resolved_path), params, query, fragment)))
def urldefrag(url): def urldefrag(url):
"""Removes any existing fragment from URL. """Removes any existing fragment from URL.
......
...@@ -124,6 +124,10 @@ Core and Builtins ...@@ -124,6 +124,10 @@ Core and Builtins
Library Library
------- -------
- Issue #22118: Switch urllib.parse to use RFC 3986 semantics for the
resolution of relative URLs, rather than RFCs 1808 and 2396.
Patch by Demian Brecht.
- Issue #21549: Added the "members" parameter to TarFile.list(). - Issue #21549: Added the "members" parameter to TarFile.list().
- Issue #19628: Allow compileall recursion depth to be specified with a -r - Issue #19628: Allow compileall recursion depth to be specified with a -r
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment