...
 
Commits (3)
......@@ -66,6 +66,7 @@ except ImportError:
from zc.buildout.easy_install import realpath
import hashlib
import logging
import os
import os.path
......@@ -122,7 +123,7 @@ class Download(object):
if self.download_cache is not None:
return os.path.join(self.download_cache, self.namespace or '')
def __call__(self, url, md5sum=None, path=None):
def __call__(self, url, md5sum=None, path=None, hashes=None):
"""Download a file according to the utility's configuration.
url: URL to download
......@@ -133,13 +134,13 @@ class Download(object):
"""
if self.cache:
local_path, is_temp = self.download_cached(url, md5sum)
local_path, is_temp = self.download_cached(url, md5sum, hashes)
else:
local_path, is_temp = self.download(url, md5sum, path)
local_path, is_temp = self.download(url, md5sum, path, hashes)
return locate_at(local_path, path), is_temp
def download_cached(self, url, md5sum=None):
def download_cached(self, url, md5sum=None, hashes=None):
"""Download a file from a URL using the cache.
This method assumes that the cache has been configured. Optionally, it
......@@ -164,7 +165,7 @@ class Download(object):
is_temp = False
if self.fallback:
try:
_, is_temp = self.download(url, md5sum, cached_path)
_, is_temp = self.download(url, md5sum, cached_path, hashes)
except ChecksumError:
raise
except Exception:
......@@ -174,15 +175,19 @@ class Download(object):
raise ChecksumError(
'MD5 checksum mismatch for cached download '
'from %r at %r' % (url, cached_path))
if not check_integrity(cached_path, hashes):
raise ChecksumError(
'Checksum mismatch for cached download '
'from %r at %r' % (url, cached_path))
self.logger.debug('Using cache file %s' % cached_path)
else:
self.logger.debug('Cache miss; will cache %s as %s' %
(url, cached_path))
_, is_temp = self.download(url, md5sum, cached_path)
_, is_temp = self.download(url, md5sum, cached_path, hashes)
return cached_path, is_temp
def download(self, url, md5sum=None, path=None):
def download(self, url, md5sum=None, path=None, hashes=None):
"""Download a file from a URL to a given or temporary path.
An online resource is always downloaded to a temporary file and moved
......@@ -204,6 +209,10 @@ class Download(object):
raise ChecksumError(
'MD5 checksum mismatch for local resource at %r.' %
url_path)
if not check_integrity(url_path, hashes):
raise ChecksumError(
'Checksum mismatch for local resource at %r.' %
url_path)
return locate_at(url_path, path), False
if self.offline:
......@@ -225,6 +234,10 @@ class Download(object):
if not check_md5sum(tmp_path, md5sum):
raise ChecksumError(
'MD5 checksum mismatch downloading %r' % url)
if not check_integrity(tmp_path, hashes):
raise ChecksumError(
'Checksum mismatch downloading %r' % url)
# Upload the file to network cache.
if nc.get('upload-cache-url') and nc.get('upload-dir-url'):
upload_network_cached(
......@@ -280,6 +293,36 @@ class Download(object):
return '%s:%s' % (url_host, url_port)
def check_integrity(path, hashes):
    """Tell whether the checksum of the file at path matches any of the hashes.

    hashes is a string following the format `algorithm:hash`, or None.
    Multiple hashes can be specified, separated by spaces; in that case a
    single matching hash is enough.

    hashes being None is considered a match.
    """
    if hashes is None:
        return True
    for algorithm_and_expected_hash in hashes.split():
        algorithm, expected_hash = algorithm_and_expected_hash.split(':', 1)
        checksum = hashlib.new(algorithm)
        # Read in binary mode and in chunks so large files don't have to fit
        # in memory, and line-ending conversion can't alter the digest.
        with open(path, 'rb') as f:
            chunk = f.read(2**16)
            while chunk:
                checksum.update(chunk)
                chunk = f.read(2**16)
        if checksum.hexdigest() == expected_hash:
            return True
    return False
def check_md5sum(path, md5sum):
    """Tell whether the MD5 checksum of the file at path matches.

    md5sum being None is considered a match.
    """
    if md5sum is None:
        return True
    # Legacy entry point: delegate to the generic integrity check using
    # the `algorithm:hash` format it expects.
    return check_integrity(path, 'md5:' + md5sum)
def remove(path):
......
......@@ -69,19 +69,56 @@ the local file itself:
>>> download(join(server_data, 'foo.txt'))
('/sample_files/foo.txt', False)
We can also have the downloaded file's MD5 sum checked:
We can also have the downloaded file's integrity checked:
>>> import hashlib
>>> path, is_temp = download(server_url+'foo.txt',
... hashes='sha256:%s' % hashlib.sha256('This is a foo text.'.encode()).hexdigest())
>>> is_temp
True
>>> remove(path)
>>> download(server_url+'foo.txt',
... hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: Checksum mismatch downloading 'http://localhost/foo.txt'
We can specify multiple hashes; as long as one matches, the download is satisfied.
XXX not sure it makes sense to expose this here ...
XXX but then hashe*s* is a bad name - maybe integrity ?
>>> path, is_temp = download(
... server_url+'foo.txt',
... hashes='sha512:1234567-wrong-hash sha256:%s' % hashlib.sha256('This is a foo text.'.encode()).hexdigest())
>>> is_temp
True
>>> remove(path)
The error message in the event of a checksum mismatch for a local file
reads somewhat differently:
>>> download(join(server_data, 'foo.txt'),
... hashes='sha256:%s' % hashlib.sha256('This is a foo text.'.encode()).hexdigest())
('/sample_files/foo.txt', False)
>>> download(join(server_data, 'foo.txt'),
... hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: Checksum mismatch for local resource at '/sample_files/foo.txt'.
>>> try: from hashlib import md5
... except ImportError: from md5 import new as md5
This also supports MD5 sums for legacy compatibility:
>>> path, is_temp = download(server_url+'foo.txt',
... md5('This is a foo text.'.encode()).hexdigest())
... hashlib.md5('This is a foo text.'.encode()).hexdigest())
>>> is_temp
True
>>> remove(path)
>>> download(server_url+'foo.txt',
... md5('The wrong text.'.encode()).hexdigest())
... hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
......@@ -89,11 +126,11 @@ The error message in the event of an MD5 checksum mismatch for a local file
reads somewhat differently:
>>> download(join(server_data, 'foo.txt'),
... md5('This is a foo text.'.encode()).hexdigest())
... hashlib.md5('This is a foo text.'.encode()).hexdigest())
('/sample_files/foo.txt', False)
>>> download(join(server_data, 'foo.txt'),
... md5('The wrong text.'.encode()).hexdigest())
... hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch for local resource at '/sample_files/foo.txt'.
......@@ -165,10 +202,17 @@ the file on the server to see this:
>>> cat(path)
This is a foo text.
If we specify an MD5 checksum for a file that is already in the cache, the
If we specify hashes for a file that is already in the cache, the
cached copy's checksum will be verified:
>>> download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest())
>>> download(server_url+'foo.txt', hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: Checksum mismatch for cached download
from 'http://localhost/foo.txt' at '/download-cache/foo.txt'
Same for legacy MD5 checksums:
>>> download(server_url+'foo.txt', hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch for cached download
from 'http://localhost/foo.txt' at '/download-cache/foo.txt'
......@@ -247,7 +291,14 @@ This is a foo text.
However, resources with checksum mismatches will not be copied to the cache:
>>> download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest())
>>> download(server_url+'foo.txt', hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: Checksum mismatch downloading 'http://localhost/foo.txt'
>>> ls(cache)
Same for legacy MD5 checksum:
>>> download(server_url+'foo.txt', hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
>>> ls(cache)
......@@ -347,7 +398,7 @@ the test is run, so we don't actually know the full URL of the file. Let's
check that the checksum actually belongs to the particular URL used:
>>> (path.lower() ==
... join(cache, md5((server_url+'foo.txt').encode()).hexdigest()).lower())
... join(cache, hashlib.md5((server_url+'foo.txt').encode()).hexdigest()).lower())
True
The cached copy is used when downloading the file again:
......@@ -370,7 +421,7 @@ cache under a different name:
>>> path == path2
False
>>> (path2.lower() ==
... join(cache, md5((server_url+'other/foo.txt').encode()).hexdigest()
... join(cache, hashlib.md5((server_url+'other/foo.txt').encode()).hexdigest()
... ).lower())
True
>>> cat(path)
......@@ -451,12 +502,19 @@ When trying to download a resource whose checksum does not match, the cached
copy will neither be used nor overwritten:
>>> write(server_data, 'foo.txt', 'This is a foo text.')
>>> download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest())
>>> download(server_url+'foo.txt', hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
ChecksumError: Checksum mismatch downloading 'http://localhost/foo.txt'
>>> cat(cache, 'foo.txt')
The wrong text.
This is also the case with legacy MD5
>>> download(server_url+'foo.txt', hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
>>> cat(cache, 'foo.txt')
The wrong text.
Configuring the download utility from buildout options
------------------------------------------------------
......@@ -538,7 +596,7 @@ True
Regressions
-----------
MD5 checksum calculation needs to be reliable on all supported systems, which
Checksum calculation needs to be reliable on all supported systems, which
requires text files to be treated as binary to avoid implicit line-ending
conversions:
......@@ -547,7 +605,10 @@ conversions:
>>> _ = f.write(text.encode())
>>> f.close()
>>> path, is_temp = Download()(server_url+'foo.txt',
... md5(text.encode()).hexdigest())
... hashlib.md5(text.encode()).hexdigest())
>>> remove(path)
>>> path, is_temp = Download()(server_url+'foo.txt',
... hashes='sha512:%s' % hashlib.sha512(text.encode()).hexdigest())
>>> remove(path)
When "downloading" a directory given by file-system path or ``file:`` URL and
......
......@@ -58,11 +58,11 @@ from setuptools.package_index import HREF, URL_SCHEME, \
try:
# Python 3
from urllib.error import HTTPError
from urllib.parse import urljoin
from urllib.parse import urljoin, urlparse
except ImportError:
# Python 2
from urllib2 import HTTPError
from urlparse import urljoin
from urlparse import urljoin, urlparse
default_index_url = os.environ.get(
'BUILDOUT_TESTING_INDEX_URL',
......@@ -380,13 +380,13 @@ class Installer:
if name.lower() in line.lower()]
return '\n '.join(output)
def _satisfied(self, req, source=None):
def _satisfied(self, req, source=None, hashes=None):
dists = [dist for dist in self._env[req.project_name] if dist in req]
if not dists:
logger.debug('We have no distributions for %s that satisfies %r.',
req.project_name, str(req))
return None, self._obtain(req, source)
return None, self._obtain(req, source, hashes)
# Note that dists are sorted from best to worst, as promised by
# env.__getitem__
......@@ -424,7 +424,7 @@ class Installer:
# any are newer. We only do this if we're willing to install
# something, which is only true if dest is not None:
best_available = self._obtain(req, source)
best_available = self._obtain(req, source, hashes)
if best_available is None:
# That's a bit odd. There aren't any distros available.
......@@ -547,11 +547,12 @@ class Installer:
finally:
shutil.rmtree(tmp)
def _obtain(self, requirement, source=None):
def _obtain(self, requirement, source=None, hashes=None):
# get the non-patched version
req = str(requirement)
if PATCH_MARKER in req:
requirement = pkg_resources.Requirement.parse(re.sub(orig_versions_re, '', req))
# XXX hashes: maybe we can just put the hash in requirement ??
# initialize out index for this project:
index = self._index
......@@ -561,12 +562,13 @@ class Installer:
return None
# Filter the available dists for the requirement and source flag
wheel = getattr(requirement, 'wheel', False)
# Accept tarballs and pure-python wheels as sources
dists = [dist for dist in index[requirement.project_name]
if ((dist in requirement)
and (dist.precedence == WHL_DIST) == wheel and
and
((not source) or
(dist.precedence == pkg_resources.SOURCE_DIST)
((dist.precedence == pkg_resources.SOURCE_DIST) or
(dist.precedence == WHL_DIST and dist.platform is None))
)
)
]
......@@ -581,6 +583,42 @@ class Installer:
# There are final dists, so only use those
dists = fdists
# if we are using hashes, filter for dists matching our hashes
if hashes is not None:
def dist_key(dist):
# optimization: sort to try first the distributions with the
# hash in the URL, so that if there's a matching dist we use
# it directly without trying all dists
for hash_ in hashes:
# the --hash format uses alg:hash, but in the URL fragments
# it is alg=hash
if hash_.replace(':', '=') in urlparse(dist.location).fragment:
return 0
return 1
# TODO: can we reuse download cache here ????
#tmp = self._download_cache
#if tmp is None:
# tmp = tempfile.mkdtemp('check_hash')
downloader = zc.buildout.download.Download()
def hash_match(dist):
try:
downloaded_filename, is_temp = downloader(dist.location, hashes=hashes)
if is_temp:
os.remove(downloaded_filename)
except zc.buildout.download.ChecksumError:
return False
return True
dists.sort(key=dist_key)
if hash_match(dists[0]):
dists = [dists[0]]
else:
dists = [dist for dist in dists[1:] if hash_match(dist)]
# Now find the best one:
best = []
bestv = None
......@@ -656,12 +694,12 @@ class Installer:
return dist.clone(location=new_location)
def _get_dist(self, requirement, ws, for_buildout_run=False):
def _get_dist(self, requirement, ws, for_buildout_run=False, hashes=None):
__doing__ = 'Getting distribution for %r.', str(requirement)
# Maybe an existing dist is already the best dist that satisfies the
# requirement
dist, avail = self._satisfied(requirement)
dist, avail = self._satisfied(requirement, hashes=hashes)
if dist is None:
if self._dest is None:
......@@ -719,6 +757,10 @@ class Installer:
if tmp != self._download_cache:
shutil.rmtree(tmp)
# TODO hashes if we:
# 1. install version1 h1
# 2. change to version1 h2 (to get another lower wheel build number)
# ... will it keep using h1 ??? maybe this is not something we can support.
self._env.scan([self._dest])
dist = self._env.best_match(requirement, ws)
logger.info("Got %s.", dist)
......@@ -786,6 +828,9 @@ class Installer:
"""Return requirement with optional [versions] constraint added."""
constraint = self._versions.get(requirement.project_name.lower())
if constraint:
# accept `<egg>:whl` as just `<egg>` for backward compatibility
# because once we were handling :whl suffix as explicit request to
# install via wheel and parts of slapos still use this.
wheel = constraint.endswith(':whl')
if wheel:
constraint = constraint[:-4]
......@@ -796,8 +841,6 @@ class Installer:
logger.info(self._version_conflict_information(
requirement.project_name.lower()))
raise
if wheel:
requirement.wheel = True
return requirement
......@@ -817,6 +860,22 @@ class Installer:
requirements = [self._constrain(pkg_resources.Requirement.parse(spec))
for spec in specs]
def get_hashes(spec):
    # Extract the hashes from a pinned version string such as
    # "1.0 --hash sha256:abcd --hash md5:ef01" and return them as a
    # space-separated "algorithm:hash" string, or None when the pin
    # carries no --hash option.
    version = self._versions.get(spec, '')
    if '--hash' in version:
    # TODO see pip actual implementation
    # TODO for test, there can be multiple --hash like with pip
    hashes = []
    next_is_hash = False
    for part in version.split():
    if next_is_hash:
    hashes.append(part)
    # each token immediately following a '--hash' token is a hash value
    next_is_hash = part == '--hash'
    return ' '.join(hashes)
    return None
hashes_dict = {spec: get_hashes(spec) for spec in specs}
if working_set is None:
ws = pkg_resources.WorkingSet([])
else:
......@@ -827,7 +886,8 @@ class Installer:
self._env.scan(
self.build(str(requirement), {}, patch_dict=patch_dict))
for dist in self._get_dist(requirement, ws,
for_buildout_run=for_buildout_run):
for_buildout_run=for_buildout_run,
hashes=hashes_dict.get(requirement.project_name)):
ws.add(dist)
self._maybe_add_setuptools(ws, dist)
......@@ -904,6 +964,8 @@ class Installer:
requirement = self._constrain(pkg_resources.Requirement.parse(spec))
# XXX hashes: this branch is not covered by the tests; a leftover
# `pdb.set_trace()` debugger breakpoint was removed here — it must not
# ship, as it would hang any non-interactive buildout run.
dist, avail = self._satisfied(requirement, 1)
if dist is not None:
return [dist.location]
......@@ -1764,6 +1826,8 @@ def redo_pyc(egg):
call_subprocess(args)
def _constrained_requirement(constraint, requirement):
if "--hash" in constraint:
constraint = constraint.split()[0]
if constraint[0] not in '<>':
if constraint.startswith('='):
assert constraint.startswith('==')
......@@ -1863,3 +1927,9 @@ def _move_to_eggs_dir_and_compile(dist, dest):
# Remember that temporary directories must be removed
shutil.rmtree(tmp_dest)
return newloc
# TODO hashes: study more:
# https://pip.pypa.io/en/stable/cli/pip_install/#requirements-file-format
# https://www.python.org/dev/peps/pep-0440/
# https://pip.pypa.io/en/stable/cli/pip_install/#hash-checking-mode