Commit 171c2459 authored by Julien Muchembled's avatar Julien Muchembled

Rewrite 'urlretrieve' helper to fix various download-related issues

- Py3: stop using legacy API of urllib.request and
       fix download of http(s) URLs containing user:passwd@
- Py2: avoid OOM when downloading huge files

This is implemented as a method in case we want to make it configurable
via [buildout].
parent c9e9b267
......@@ -20,3 +20,16 @@ class UserError(Exception):
def __str__(self):
return " ".join(map(str, self.args))
# Used for Python 2-3 compatibility
if str is bytes:
bytes2str = str2bytes = lambda s: s
def unicode2str(s):
return s.encode('utf-8')
def bytes2str(s):
return s.decode()
def str2bytes(s):
return s.encode()
def unicode2str(s):
return s
......@@ -20,35 +20,17 @@ except ImportError:
# Python 3
from urllib.request import urlretrieve
from urllib.parse import urlparse
from urllib.request import Request, splituser, urlopen
from urllib.parse import urlparse, urlunparse
except ImportError:
# Python 2
import base64
from urlparse import urlparse
from urlparse import urlunparse
import urllib2
def urlretrieve(url, tmp_path):
"""Work around Python issue 24599 includig basic auth support
scheme, netloc, path, params, query, frag = urlparse(url)
auth, host = urllib2.splituser(netloc)
if auth:
url = urlunparse((scheme, host, path, params, query, frag))
req = urllib2.Request(url)
base64string = base64.encodestring(auth)[:-1]
basic = "Basic " + base64string
req.add_header("Authorization", basic)
req = urllib2.Request(url)
url_obj = urllib2.urlopen(req)
with open(tmp_path, 'wb') as fp:
return tmp_path,
from urllib2 import Request, splituser, urlopen
from zc.buildout.easy_install import realpath
from base64 import b64encode
from contextlib import closing
import logging
import os
import os.path
......@@ -56,7 +38,7 @@ import re
import shutil
import tempfile
import zc.buildout
from . import bytes2str, str2bytes
class ChecksumError(zc.buildout.UserError):
......@@ -213,13 +195,14 @@ class Download(object):
nc.get('signature-certificate-list'), md5sum):
# Download from original url if not cached or md5sum doesn't match.
tmp_path, headers = urlretrieve(url, tmp_path)
tmp_path, headers = self.urlretrieve(url, tmp_path)
except HTTPError:
if not alternate_url:
raise'using alternate URL: %s', alternate_url)
download_url = alternate_url
tmp_path, headers = urlretrieve(alternate_url, tmp_path)
tmp_path, headers = self.urlretrieve(
alternate_url, tmp_path)
if not check_md5sum(tmp_path, md5sum):
raise ChecksumError(
'MD5 checksum mismatch downloading %r' % download_url)
......@@ -272,6 +255,21 @@ class Download(object):
url_host, url_port = parsed[-2:]
return '%s:%s' % (url_host, url_port)
def urlretrieve(self, url, tmp_path):
scheme, netloc, path, params, query, frag = urlparse(url)
req = url
if scheme in ('http', 'https'):
auth, host = splituser(netloc)
if auth:
url = urlunparse((scheme, host, path, params, query, frag))
req = Request(url)
"Basic " + bytes2str(b64encode(str2bytes(auth))))
with closing(urlopen(req)) as src:
with open(tmp_path, 'wb') as dst:
shutil.copyfileobj(src, dst)
return tmp_path,
def check_md5sum(path, md5sum):
"""Tell whether the MD5 checksum of the file at path matches.
......@@ -152,6 +152,19 @@ This is a foo text.
>>> remove(path)
HTTP basic authentication:
>>> download = Download()
>>> user_url = server_url.replace('/localhost:', '/%s@localhost:') + 'private/'
>>> path, is_temp = download(user_url % 'foo:' + 'foo:')
>>> is_temp; remove(path)
>>> path, is_temp = download(user_url % 'foo:bar' + 'foo:bar')
>>> is_temp; remove(path)
>>> download(user_url % 'bar:' + 'foo:')
Traceback (most recent call last):
UserError: Error downloading ...: HTTP Error 403: Forbidden
Downloading using the download cache
......@@ -23,6 +23,7 @@ except ImportError:
from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
from urllib2 import urlopen
import base64
import errno
import logging
import os
......@@ -364,6 +365,23 @@ class Handler(BaseHTTPRequestHandler):
self.__server.__log = False
return k()
if self.path.startswith('/private/'):
auth = self.headers.get('Authorization')
if auth and auth.startswith('Basic ') and \
self.path[9:].encode() == base64.b64decode(
return k()
# But not returning 401+WWW-Authenticate, we check that the client
# skips auth challenge, which is not free (in terms of performance)
# and useless for what we support.
self.send_response(403, 'Forbidden')
out = '<html><body>Forbidden</body></html>'.encode()
self.send_header('Content-Length', str(len(out)))
self.send_header('Content-Type', 'text/html')
path = os.path.abspath(os.path.join(self.tree, *self.path.split('/')))
if not (
((path == self.tree) or path.startswith(self.tree+os.path.sep))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment