Commit 6a9722bb authored by Jérome Perrin's avatar Jérome Perrin

download: support more hash algorithms, not just md5sum

TODO: see XXX comments in the doctest

This changes to use hashlib directly, the module was introduced in python2.5
we don't need to support older versions.
parent 50e0e3b7
......@@ -66,6 +66,7 @@ except ImportError:
from zc.buildout.easy_install import realpath
import hashlib
import logging
import os
import os.path
......@@ -122,7 +123,7 @@ class Download(object):
if self.download_cache is not None:
return os.path.join(self.download_cache, self.namespace or '')
def __call__(self, url, md5sum=None, path=None):
def __call__(self, url, md5sum=None, path=None, hashes=None):
"""Download a file according to the utility's configuration.
url: URL to download
......@@ -133,13 +134,13 @@ class Download(object):
"""
if self.cache:
local_path, is_temp = self.download_cached(url, md5sum)
local_path, is_temp = self.download_cached(url, md5sum, hashes)
else:
local_path, is_temp = self.download(url, md5sum, path)
local_path, is_temp = self.download(url, md5sum, path, hashes)
return locate_at(local_path, path), is_temp
def download_cached(self, url, md5sum=None):
def download_cached(self, url, md5sum=None, hashes=None):
"""Download a file from a URL using the cache.
This method assumes that the cache has been configured. Optionally, it
......@@ -164,7 +165,7 @@ class Download(object):
is_temp = False
if self.fallback:
try:
_, is_temp = self.download(url, md5sum, cached_path)
_, is_temp = self.download(url, md5sum, cached_path, hashes)
except ChecksumError:
raise
except Exception:
......@@ -174,15 +175,19 @@ class Download(object):
raise ChecksumError(
'MD5 checksum mismatch for cached download '
'from %r at %r' % (url, cached_path))
if not check_integrity(cached_path, hashes):
raise ChecksumError(
'Checksum mismatch for cached download '
'from %r at %r' % (url, cached_path))
self.logger.debug('Using cache file %s' % cached_path)
else:
self.logger.debug('Cache miss; will cache %s as %s' %
(url, cached_path))
_, is_temp = self.download(url, md5sum, cached_path)
_, is_temp = self.download(url, md5sum, cached_path, hashes)
return cached_path, is_temp
def download(self, url, md5sum=None, path=None):
def download(self, url, md5sum=None, path=None, hashes=None):
"""Download a file from a URL to a given or temporary path.
An online resource is always downloaded to a temporary file and moved
......@@ -204,6 +209,10 @@ class Download(object):
raise ChecksumError(
'MD5 checksum mismatch for local resource at %r.' %
url_path)
if not check_integrity(url_path, hashes):
raise ChecksumError(
'Checksum mismatch for local resource at %r.' %
url_path)
return locate_at(url_path, path), False
if self.offline:
......@@ -225,6 +234,10 @@ class Download(object):
if not check_md5sum(tmp_path, md5sum):
raise ChecksumError(
'MD5 checksum mismatch downloading %r' % url)
if not check_integrity(tmp_path, hashes):
raise ChecksumError(
'Checksum mismatch downloading %r' % url)
# Upload the file to network cache.
if nc.get('upload-cache-url') and nc.get('upload-dir-url'):
upload_network_cached(
......@@ -280,6 +293,36 @@ class Download(object):
return '%s:%s' % (url_host, url_port)
def check_integrity(path, hashes):
    """Tell whether the checksum of the file at path matches any of the hashes.

    path: file-system path of the file to verify; it is read in binary mode.
    hashes: None, or a string of one or more ``algorithm:hash`` entries
        separated by whitespace, where ``algorithm`` is a name accepted by
        ``hashlib.new`` and ``hash`` is the expected hexadecimal digest.

    Returns True as soon as one entry matches the file's content, False if
    none does. hashes being None is considered a match; a string containing
    no entry at all never matches. Hexadecimal digests are compared
    case-insensitively.

    Raises ValueError if an evaluated entry has no ``:`` separator or names
    an algorithm that hashlib does not support. Entries following the first
    matching one are not evaluated.
    """
    if hashes is None:
        return True
    for algorithm_and_expected_hash in hashes.split():
        algorithm, expected_hash = algorithm_and_expected_hash.split(':', 1)
        # Build the hash object before opening the file: hashlib.new() raises
        # ValueError for an unsupported algorithm, and that must not leave
        # an open file handle behind.
        checksum = hashlib.new(algorithm)
        with open(path, 'rb') as f:
            # Read in 64 KiB chunks so large downloads are never loaded
            # into memory at once.
            for chunk in iter(lambda: f.read(2**16), b''):
                checksum.update(chunk)
        # hexdigest() is always lowercase; normalise the expected value so
        # that an upper-case digest is accepted too.
        if checksum.hexdigest() == expected_hash.lower():
            return True
    return False
def check_md5sum(path, md5sum):
"""Tell whether the MD5 checksum of the file at path matches.
......@@ -289,16 +332,7 @@ def check_md5sum(path, md5sum):
if md5sum is None:
return True
f = open(path, 'rb')
checksum = md5()
try:
chunk = f.read(2**16)
while chunk:
checksum.update(chunk)
chunk = f.read(2**16)
return checksum.hexdigest() == md5sum
finally:
f.close()
return check_integrity(path, 'md5:' + md5sum)
def remove(path):
......
......@@ -69,19 +69,56 @@ the local file itself:
>>> download(join(server_data, 'foo.txt'))
('/sample_files/foo.txt', False)
We can also have the downloaded file's MD5 sum checked:
We can also have the downloaded file's integrity checked:
>>> import hashlib
>>> path, is_temp = download(server_url+'foo.txt',
... hashes='sha256:%s' % hashlib.sha256('This is a foo text.'.encode()).hexdigest())
>>> is_temp
True
>>> remove(path)
>>> download(server_url+'foo.txt',
... hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: Checksum mismatch downloading 'http://localhost/foo.txt'
We can specify multiple hashes; as long as one of them matches, the download is accepted.
XXX not sure it makes sense to expose this here ...
XXX but then hashe*s* is a bad name - maybe integrity ?
>>> path, is_temp = download(
... server_url+'foo.txt',
... hashes='sha512:1234567-wrong-hash sha256:%s' % hashlib.sha256('This is a foo text.'.encode()).hexdigest())
>>> is_temp
True
>>> remove(path)
The error message in the event of a checksum mismatch for a local file
reads somewhat differently:
>>> download(join(server_data, 'foo.txt'),
... hashes='sha256:%s' % hashlib.sha256('This is a foo text.'.encode()).hexdigest())
('/sample_files/foo.txt', False)
>>> download(join(server_data, 'foo.txt'),
... hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: Checksum mismatch for local resource at '/sample_files/foo.txt'.
>>> try: from hashlib import md5
... except ImportError: from md5 import new as md5
MD5 sums are still supported for legacy use:
>>> path, is_temp = download(server_url+'foo.txt',
... md5('This is a foo text.'.encode()).hexdigest())
... hashlib.md5('This is a foo text.'.encode()).hexdigest())
>>> is_temp
True
>>> remove(path)
>>> download(server_url+'foo.txt',
... md5('The wrong text.'.encode()).hexdigest())
... hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
......@@ -89,11 +126,11 @@ The error message in the event of an MD5 checksum mismatch for a local file
reads somewhat differently:
>>> download(join(server_data, 'foo.txt'),
... md5('This is a foo text.'.encode()).hexdigest())
... hashlib.md5('This is a foo text.'.encode()).hexdigest())
('/sample_files/foo.txt', False)
>>> download(join(server_data, 'foo.txt'),
... md5('The wrong text.'.encode()).hexdigest())
... hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch for local resource at '/sample_files/foo.txt'.
......@@ -165,10 +202,17 @@ the file on the server to see this:
>>> cat(path)
This is a foo text.
If we specify an MD5 checksum for a file that is already in the cache, the
If we specify hashes for a file that is already in the cache, the
cached copy's checksum will be verified:
>>> download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest())
>>> download(server_url+'foo.txt', hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: Checksum mismatch for cached download
from 'http://localhost/foo.txt' at '/download-cache/foo.txt'
Same for legacy MD5 checksums:
>>> download(server_url+'foo.txt', hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch for cached download
from 'http://localhost/foo.txt' at '/download-cache/foo.txt'
......@@ -247,7 +291,14 @@ This is a foo text.
However, resources with checksum mismatches will not be copied to the cache:
>>> download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest())
>>> download(server_url+'foo.txt', hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: Checksum mismatch downloading 'http://localhost/foo.txt'
>>> ls(cache)
Same for legacy MD5 checksum:
>>> download(server_url+'foo.txt', hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
>>> ls(cache)
......@@ -347,7 +398,7 @@ the test is run, so we don't actually know the full URL of the file. Let's
check that the checksum actually belongs to the particular URL used:
>>> (path.lower() ==
... join(cache, md5((server_url+'foo.txt').encode()).hexdigest()).lower())
... join(cache, hashlib.md5((server_url+'foo.txt').encode()).hexdigest()).lower())
True
The cached copy is used when downloading the file again:
......@@ -370,7 +421,7 @@ cache under a different name:
>>> path == path2
False
>>> (path2.lower() ==
... join(cache, md5((server_url+'other/foo.txt').encode()).hexdigest()
... join(cache, hashlib.md5((server_url+'other/foo.txt').encode()).hexdigest()
... ).lower())
True
>>> cat(path)
......@@ -451,12 +502,19 @@ When trying to download a resource whose checksum does not match, the cached
copy will neither be used nor overwritten:
>>> write(server_data, 'foo.txt', 'This is a foo text.')
>>> download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest())
>>> download(server_url+'foo.txt', hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
ChecksumError: Checksum mismatch downloading 'http://localhost/foo.txt'
>>> cat(cache, 'foo.txt')
The wrong text.
This is also the case with legacy MD5
>>> download(server_url+'foo.txt', hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
>>> cat(cache, 'foo.txt')
The wrong text.
Configuring the download utility from buildout options
------------------------------------------------------
......@@ -538,7 +596,7 @@ True
Regressions
-----------
MD5 checksum calculation needs to be reliable on all supported systems, which
Checksum calculation needs to be reliable on all supported systems, which
requires text files to be treated as binary to avoid implicit line-ending
conversions:
......@@ -547,7 +605,10 @@ conversions:
>>> _ = f.write(text.encode())
>>> f.close()
>>> path, is_temp = Download()(server_url+'foo.txt',
... md5(text.encode()).hexdigest())
... hashlib.md5(text.encode()).hexdigest())
>>> remove(path)
>>> path, is_temp = Download()(server_url+'foo.txt',
... hashes='sha512:%s' % hashlib.sha512(text.encode()).hexdigest())
>>> remove(path)
When "downloading" a directory given by file-system path or ``file:`` URL and
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment