Commit 0a6d8c19 authored by Julien Muchembled's avatar Julien Muchembled Committed by Xavier Thompson

[fix/opti] download: clean-up, fix, optimization

An optimization is to avoid temporary file when possible: a rename
(or hard link) is not always possible (different mount points).

Another one is to not check md5sum twice when using cache file.

Fall-back mode is ignored if an MD5 checksum is given.

In case of checksum mismatch for a cached path, remove it and
download again, mainly to cover the following cases:
- the url content changes and the user updates the checksum
- buildout killed while downloading directly to cache
  (see above optimization)
- shutil.copyfile is interrupted
parent 722bc55d
......@@ -54,9 +54,9 @@ import os
import os.path
import re
import shutil
import sys
import tempfile
import zc.buildout
from .rmtree import rmtree
class ChecksumError(zc.buildout.UserError):
......@@ -74,7 +74,8 @@ class Download(object):
cache: path to the download cache (excluding namespaces)
namespace: namespace directory to use inside the cache
offline: whether to operate in offline mode
fallback: whether to use the cache as a fallback (try downloading first)
fallback: whether to use the cache as a fallback (try downloading first),
when an MD5 checksum is not given
hash_name: whether to use a hash of the URL as cache file name
logger: an optional logger to receive download-related log messages
......@@ -107,7 +108,8 @@ class Download(object):
if self.download_cache is not None:
return os.path.join(self.download_cache, self.namespace or '')
def __call__(self, url, md5sum=None, path=None):
@property
def __call__(self):
"""Download a file according to the utility's configuration.
url: URL to download
......@@ -117,19 +119,14 @@ class Download(object):
Returns the path to the downloaded file.
"""
if self.cache:
local_path, is_temp = self.download_cached(url, md5sum)
else:
local_path, is_temp = self.download(url, md5sum, path)
return self.download_cached if self.cache else self.download
return locate_at(local_path, path), is_temp
def download_cached(self, url, md5sum=None):
def download_cached(self, url, md5sum=None, path=None):
"""Download a file from a URL using the cache.
This method assumes that the cache has been configured. Optionally, it
raises a ChecksumError if a cached copy of a file has an MD5 mismatch,
but will not remove the copy in that case.
This method assumes that the cache has been configured.
If a cached copy of a file has an MD5 mismatch, download
and update the cache on success.
"""
if not os.path.exists(self.download_cache):
......@@ -146,26 +143,43 @@ class Download(object):
self.logger.debug('Searching cache at %s' % cache_dir)
if os.path.exists(cached_path):
is_temp = False
if self.fallback:
try:
_, is_temp = self.download(url, md5sum, cached_path)
except ChecksumError:
if check_md5sum(cached_path, md5sum):
if md5sum or not self.fallback:
self.logger.debug('Using cache file %s', cached_path)
return locate_at(cached_path, path), False
else:
self.logger.warning(
'MD5 checksum mismatch for cached download from %r at %r',
url, cached_path)
# Don't download directly to cached_path to minimize
# the probability to alter old data if download fails.
try:
path, is_temp = self.download(url, md5sum, path)
except ChecksumError:
raise
except Exception:
if md5sum:
raise
except Exception:
pass
if not check_md5sum(cached_path, md5sum):
raise ChecksumError(
'MD5 checksum mismatch for cached download '
'from %r at %r' % (url, cached_path))
self.logger.debug('Using cache file %s' % cached_path)
self.logger.debug("Fallback to cache using %s",
cached_path, exception=1)
else:
samefile = getattr(os.path, 'samefile', None)
if not (samefile and samefile(path, cached_path)):
# update cache
try:
os.remove(cached_path)
except OSError as e:
if e.errno != errno.EISDIR:
raise
rmtree(cached_path)
locate_at(path, cached_path)
return path, is_temp
else:
self.logger.debug('Cache miss; will cache %s as %s' %
(url, cached_path))
_, is_temp = self.download(url, md5sum, cached_path)
self.download(url, md5sum, cached_path)
return cached_path, is_temp
return locate_at(cached_path, path), False
def download(self, url, md5sum=None, path=None):
"""Download a file from a URL to a given or temporary path.
......@@ -196,27 +210,25 @@ class Download(object):
"Couldn't download %r in offline mode." % url)
self.logger.info('Downloading %s' % url)
handle, tmp_path = tempfile.mkstemp(prefix='buildout-')
os.close(handle)
tmp_path = path
cleanup = True
try:
if not path:
handle, tmp_path = tempfile.mkstemp(prefix='buildout-')
os.close(handle)
tmp_path, headers = urlretrieve(url, tmp_path)
if not check_md5sum(tmp_path, md5sum):
raise ChecksumError(
'MD5 checksum mismatch downloading %r' % url)
except IOError:
e = sys.exc_info()[1]
os.remove(tmp_path)
raise zc.buildout.UserError("Error downloading extends for URL "
"%s: %s" % (url, e))
except Exception:
os.remove(tmp_path)
raise
if path:
shutil.move(tmp_path, path)
return path, False
else:
return tmp_path, True
cleanup = False
except IOError as e:
raise zc.buildout.UserError("Error downloading %s: %s"
% (url, e))
finally:
if cleanup and tmp_path:
remove(tmp_path)
return tmp_path, not path
def filename(self, url):
"""Determine a file name from a URL according to the configuration.
......
......@@ -165,14 +165,6 @@ the file on the server to see this:
>>> cat(path)
This is a foo text.
If we specify an MD5 checksum for a file that is already in the cache, the
cached copy's checksum will be verified:
>>> download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch for cached download
from 'http://localhost/foo.txt' at '/download-cache/foo.txt'
Trying to access another file at a different URL which has the same base name
will result in the cached copy being used:
......@@ -184,6 +176,14 @@ will result in the cached copy being used:
>>> cat(path)
This is a foo text.
If we specify an MD5 checksum for a file that is already in the cache, the
cached copy's checksum will be verified and the cache will be refreshed:
>>> path, is_temp = download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest())
>>> is_temp
True
>>> remove(path)
Given a target path for the download, the utility will provide a copy of the
file at that location both when first downloading the file and when using a
cached copy:
......@@ -259,7 +259,7 @@ If the file is completely missing it should notify the user of the error:
>>> download(server_url+'bar.txt') # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
Traceback (most recent call last):
...
UserError: Error downloading extends for URL http://localhost/bar.txt:
UserError: Error downloading http://localhost/bar.txt:
...404...
>>> ls(cache)
......@@ -442,18 +442,22 @@ However, when downloading the file normally with the cache being used in
fall-back mode, the file will be downloaded from the net and the cached copy
will be replaced with the new content:
>>> cat(download(server_url+'foo.txt')[0])
>>> path, is_temp = download(server_url+'foo.txt')
>>> cat(path)
The wrong text.
>>> cat(cache, 'foo.txt')
The wrong text.
>>> is_temp
True
>>> remove(path)
When trying to download a resource whose checksum does not match, the cached
copy will neither be used nor overwritten:
Fall-back mode is meaningless if md5sum is given. If the checksum of the
cached copy matches, the resource is not downloaded:
>>> write(server_data, 'foo.txt', 'This is a foo text.')
>>> download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
>>> path, is_temp = download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest())
>>> print_(path)
/download-cache/foo.txt
>>> cat(cache, 'foo.txt')
The wrong text.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment