Fix Issue754016 - urlparse goes wrong with IP:port without scheme

84c7d9f8 · Senthil Kumaran · 4aa0d4d2 · 84c7d9f8 · 84c7d9f8 · 84c7d9f8
Commit 84c7d9f8 authored Aug 04, 2010 by Senthil Kumaran
Hide whitespace changes
Inline Side-by-side

Showing with 44 additions and 5 deletions

Doc/library/urllib.parse.rst Doc/library/urllib.parse.rst +17 -0

Lib/test/test_urlparse.py Lib/test/test_urlparse.py +21 -0

Lib/urllib/parse.py Lib/urllib/parse.py +6 -5

No files found.
--- a/Doc/library/urllib.parse.rst
+++ b/Doc/library/urllib.parse.rst
@@ -48,6 +48,23 @@ The :mod:`urllib.parse` module defines the following functions:
      >>> o.geturl()
      'http://www.cwi.nl:80/%7Eguido/Python.html'

+   If the scheme value is not specified, urlparse, following the syntax
+   specifications from RFC 1808, expects the netloc value to start with '//'.
+   Otherwise, it is not possible to distinguish between the netloc and path
+   components, and urlparse classifies the indistinguishable component as the
+   path, as in a relative URL.
+
+       >>> from urllib.parse import urlparse
+       >>> urlparse('//www.cwi.nl:80/%7Eguido/Python.html')
+       ParseResult(scheme='', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html',
+                  params='', query='', fragment='')
+       >>> urlparse('www.cwi.nl:80/%7Eguido/Python.html')
+       ParseResult(scheme='', netloc='', path='www.cwi.nl:80/%7Eguido/Python.html',
+                  params='', query='', fragment='')
+       >>> urlparse('help/Python.html')
+       ParseResult(scheme='', netloc='', path='help/Python.html', params='',
+                  query='', fragment='')
+
   If the *scheme* argument is specified, it gives the default addressing
   scheme, to be used only if the URL does not specify one.  The default value for
   this argument is the empty string.

--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -461,6 +461,27 @@ class UrlParseTestCase(unittest.TestCase):
        self.assertEqual(urllib.parse.urlparse("http://example.com?blahblah=/foo"),
                         ('http', 'example.com', '', '', 'blahblah=/foo', ''))

+    def test_withoutscheme(self):
+        # Test urlparse without scheme
+        # Issue 754016: urlparse goes wrong with IP:port without scheme
+        # RFC 1808 specifies that netloc should start with //, urlparse expects
+        # the same, otherwise it classifies the portion of url as path.
+        self.assertEqual(urllib.parse.urlparse("path"),
+                ('','','path','','',''))
+        self.assertEqual(urllib.parse.urlparse("//www.python.org:80"),
+                ('','www.python.org:80','','','',''))
+        self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"),
+                ('http','www.python.org:80','','','',''))
+
+    def test_portseparator(self):
+        # Issue 754016: tell the port separator ':' from the scheme separator
+        self.assertEqual(urllib.parse.urlparse("path:80"),
+                ('','','path:80','','',''))
+        self.assertEqual(urllib.parse.urlparse("http:"),('http','','','','',''))
+        self.assertEqual(urllib.parse.urlparse("https:"),('https','','','','',''))
+        self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"),
+                ('http','www.python.org:80','','','',''))
+
    def test_usingsys(self):
        # Issue 3314: sys module is used in the error
        self.assertRaises(TypeError, urllib.parse.urlencode, "foo")

--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -192,11 +192,12 @@ def urlsplit(url, scheme='', allow_fragments=True):
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
-        for c in url[:i]:
-            if c not in scheme_chars:
-                break
-        else:
-            scheme, url = url[:i].lower(), url[i+1:]
+        if url.endswith(':') or not url[i+1].isdigit():
+            for c in url[:i]:
+                if c not in scheme_chars:
+                    break
+            else:
+                scheme, url = url[:i].lower(), url[i+1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or