Commit 90e01e50 authored by postmasters's avatar postmasters Committed by Victor Stinner

urllib: Simplify splithost by calling into urlparse. (#1849)

The current regex based splitting produces a wrong result. For example::

  http://abc#@def

Web browsers parse that URL as ``http://abc/#@def``, that is, the host
is ``abc``, the path is ``/``, and the fragment is ``#@def``.
parent 5cc7ac24
...@@ -755,28 +755,35 @@ class UrlParseTestCase(unittest.TestCase): ...@@ -755,28 +755,35 @@ class UrlParseTestCase(unittest.TestCase):
def test_parse_fragments(self): def test_parse_fragments(self):
# Exercise the allow_fragments parameter of urlparse() and urlsplit() # Exercise the allow_fragments parameter of urlparse() and urlsplit()
tests = ( tests = (
("http:#frag", "path"), ("http:#frag", "path", "frag"),
("//example.net#frag", "path"), ("//example.net#frag", "path", "frag"),
("index.html#frag", "path"), ("index.html#frag", "path", "frag"),
(";a=b#frag", "params"), (";a=b#frag", "params", "frag"),
("?a=b#frag", "query"), ("?a=b#frag", "query", "frag"),
("#frag", "path"), ("#frag", "path", "frag"),
("abc#@frag", "path", "@frag"),
("//abc#@frag", "path", "@frag"),
("//abc:80#@frag", "path", "@frag"),
("//abc#@frag:80", "path", "@frag:80"),
) )
for url, attr in tests: for url, attr, expected_frag in tests:
for func in (urllib.parse.urlparse, urllib.parse.urlsplit): for func in (urllib.parse.urlparse, urllib.parse.urlsplit):
if attr == "params" and func is urllib.parse.urlsplit: if attr == "params" and func is urllib.parse.urlsplit:
attr = "path" attr = "path"
with self.subTest(url=url, function=func): with self.subTest(url=url, function=func):
result = func(url, allow_fragments=False) result = func(url, allow_fragments=False)
self.assertEqual(result.fragment, "") self.assertEqual(result.fragment, "")
self.assertTrue(getattr(result, attr).endswith("#frag")) self.assertTrue(
getattr(result, attr).endswith("#" + expected_frag))
self.assertEqual(func(url, "", False).fragment, "") self.assertEqual(func(url, "", False).fragment, "")
result = func(url, allow_fragments=True) result = func(url, allow_fragments=True)
self.assertEqual(result.fragment, "frag") self.assertEqual(result.fragment, expected_frag)
self.assertFalse(getattr(result, attr).endswith("frag")) self.assertFalse(
self.assertEqual(func(url, "", True).fragment, "frag") getattr(result, attr).endswith(expected_frag))
self.assertEqual(func(url).fragment, "frag") self.assertEqual(func(url, "", True).fragment,
expected_frag)
self.assertEqual(func(url).fragment, expected_frag)
def test_mixed_types_rejected(self): def test_mixed_types_rejected(self):
# Several functions that process either strings or ASCII encoded bytes # Several functions that process either strings or ASCII encoded bytes
...@@ -983,6 +990,26 @@ class Utility_Tests(unittest.TestCase): ...@@ -983,6 +990,26 @@ class Utility_Tests(unittest.TestCase):
self.assertEqual(splithost('/foo/bar/baz.html'), self.assertEqual(splithost('/foo/bar/baz.html'),
(None, '/foo/bar/baz.html')) (None, '/foo/bar/baz.html'))
# bpo-30500: # starts a fragment.
self.assertEqual(splithost('//127.0.0.1#@host.com'),
('127.0.0.1', '/#@host.com'))
self.assertEqual(splithost('//127.0.0.1#@host.com:80'),
('127.0.0.1', '/#@host.com:80'))
self.assertEqual(splithost('//127.0.0.1:80#@host.com'),
('127.0.0.1:80', '/#@host.com'))
# Empty host is returned as empty string.
self.assertEqual(splithost("///file"),
('', '/file'))
# Trailing semicolon, question mark and hash symbol are kept.
self.assertEqual(splithost("//example.net/file;"),
('example.net', '/file;'))
self.assertEqual(splithost("//example.net/file?"),
('example.net', '/file?'))
self.assertEqual(splithost("//example.net/file#"),
('example.net', '/file#'))
def test_splituser(self): def test_splituser(self):
splituser = urllib.parse.splituser splituser = urllib.parse.splituser
self.assertEqual(splituser('User:Pass@www.python.org:080'), self.assertEqual(splituser('User:Pass@www.python.org:080'),
......
...@@ -947,7 +947,7 @@ def splithost(url): ...@@ -947,7 +947,7 @@ def splithost(url):
"""splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
global _hostprog global _hostprog
if _hostprog is None: if _hostprog is None:
_hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL) _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
match = _hostprog.match(url) match = _hostprog.match(url)
if match: if match:
......
...@@ -1091,6 +1091,7 @@ Max Neunhöffer ...@@ -1091,6 +1091,7 @@ Max Neunhöffer
Anthon van der Neut Anthon van der Neut
George Neville-Neil George Neville-Neil
Hieu Nguyen Hieu Nguyen
Nam Nguyen
Johannes Nicolai Johannes Nicolai
Samuel Nicolary Samuel Nicolary
Jonathan Niehof Jonathan Niehof
......
...@@ -365,6 +365,11 @@ Extension Modules ...@@ -365,6 +365,11 @@ Extension Modules
Library Library
------- -------
- [Security] bpo-30500: Fix urllib.parse.splithost() to correctly parse
fragments. For example, ``splithost('http://127.0.0.1#@evil.com/')`` now
correctly returns the ``127.0.0.1`` host, instead of treating ``@evil.com``
as the host in an authentification (``login@host``).
- bpo-30038: Fix race condition between signal delivery and wakeup file - bpo-30038: Fix race condition between signal delivery and wakeup file
descriptor. Patch by Nathaniel Smith. descriptor. Patch by Nathaniel Smith.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment