1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!${buildout:executable}
import hashlib
import json
import os
import re
import subprocess
import sys
# stolen from download_file.in
def md5Checksum(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 8192-byte chunks, so
    the whole content is never held in memory at once.
    """
    with open(file_path, 'rb') as stream:
        digest = hashlib.md5()
        # iter() with a sentinel keeps calling read() until it returns b'',
        # which is what a binary file yields once the end is reached.
        for chunk in iter(lambda: stream.read(8192), b''):
            digest.update(chunk)
    return digest.hexdigest()
# Note: Assuring only one running instance is not done, as this script is only
# run from supervisord, which does it already
if __name__ == "__main__":
    # Download the images listed in a JSON configuration into the configured
    # destination directory, verifying each download against its md5sum.
    #
    # Exactly five positional arguments are expected, in this order:
    #   configuration    - JSON file read for 'error-amount', 'image-list',
    #                      'destination-directory' and 'config-md5sum'
    #   curl             - curl executable used to download
    #   md5sum_fail_file - JSON file counting checksum failures per image
    #   error_state_file - file receiving the error report of this run
    #   processed_md5sum - file receiving config['config-md5sum'] at the end
    #
    # The process exits with the number of errors found (capped at 255).
    configuration, curl, md5sum_fail_file, error_state_file, \
        processed_md5sum = sys.argv[1:]
    error_list = []

    # build currently wanted list
    with open(configuration) as fh:
        try:
            config = json.load(fh)
        except Exception as e:
            print('ERR: Problem loading configuration: %s' % (e,))
            sys.exit(1)
    if config['error-amount'] != 0:
        print('ERR: There are problems with configuration')

    print('INF: Storing errors in %s' % (error_state_file,))

    # switch to error state during image download
    with open(error_state_file, 'w') as fh:
        fh.write('INF Download in progress')

    # clean the destination directory: anything which is not a wanted image
    # (including leftovers of interrupted downloads) is dropped
    file_to_keep_set = {image['destination'] for image in config['image-list']}
    for fname in os.listdir(config['destination-directory']):
        if fname not in file_to_keep_set:
            print('INF: Removing obsolete %s' % (fname,))
            os.remove(os.path.join(config['destination-directory'], fname))

    # prepare state dicts
    # current and new are used to drop entries of configurations which do not
    # exist anymore, and also to allow re-adding some configuration
    try:
        with open(md5sum_fail_file) as fh:
            md5sum_state_dict = json.load(fh)
    except (IOError, OSError, ValueError):
        # missing, unreadable, empty or invalid state: start from scratch
        md5sum_state_dict = {}
    new_md5sum_state_dict = {}

    # fetch the wanted list
    for image in config['image-list']:
        destination = os.path.join(
            config['destination-directory'], image['destination'])
        url = image['url']
        if os.path.exists(destination):
            # Note: There is no need to recheck md5sum here.
            #       The image name is its md5sum, so if it exists, it means
            #       it has correct md5sum.
            #       Calculating md5sum of big images takes more time than
            #       processing of the partition and running promises, and
            #       this leads to an endless loop of promise failures.
            #       Of course, someone nasty can come to the partition and
            #       damage this image, but it's another story, and shall not
            #       be fixed during download phase.
            print('INF: %s : already downloaded' % (url,))
            continue

        # key is str, as the dict is dumped to JSON which does not accept
        # tuples
        md5sum_state_key = '%s#%s' % (url, image['md5sum'])
        md5sum_state_amount = md5sum_state_dict.get(md5sum_state_key, 0)
        if md5sum_state_amount >= 4:
            # too many checksum failures: keep the counter and give up
            new_md5sum_state_dict[md5sum_state_key] = md5sum_state_amount
            error_list.append(
                'ERR: %s : Checksum is incorrect after %s tries, will not retry'
                % (url, md5sum_state_amount))
            continue

        print('INF: %s : Downloading' % (url,))
        destination_tmp = os.path.join(
            config['destination-directory'], image['destination-tmp'])
        try:
            subprocess.check_output([
                curl,
                '--location',  # follow redirects
                '--no-progress-meter',  # do not tell too much
                '--max-time', '14400',  # maximum time for download is 4 hours
                '--max-filesize', '10737418240',  # maximum 10G for an image
                '--output', destination_tmp, url],
                stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            error_list.append('ERR: %s : Problem while downloading: %r' % (
                url, e.output.strip()))
            continue

        if not os.path.exists(destination_tmp):
            # the format argument was missing here, so the report used to
            # contain a literal '%s' instead of the image URL
            error_list.append(
                'ERR: %s : Image disappeared, will retry later' % (url,))
            continue

        computed_md5sum = md5Checksum(destination_tmp)
        if computed_md5sum != image['md5sum']:
            try:
                os.remove(destination_tmp)
            except OSError:
                # best effort: the cleanup at the start of the next run
                # removes the leftover anyway
                pass
            error_list.append(
                'ERR: %s : MD5 mismatch expected is %s but got instead %s' % (
                    url, image['md5sum'], computed_md5sum))
            # Store yet another failure while computing md5sum for this
            new_md5sum_state_dict[md5sum_state_key] = md5sum_state_amount + 1
        else:
            os.rename(destination_tmp, destination)
            print('INF: %s : Stored with checksum %s' % (
                url, image['md5sum']))

    # opening in 'w' mode truncates the file, so if no problems are reported
    # the file is simply left empty
    with open(md5sum_fail_file, 'w') as fh:
        if new_md5sum_state_dict:
            json.dump(new_md5sum_state_dict, fh, indent=2)

    with open(error_state_file, 'w') as fh:
        fh.write('\n'.join(error_list))
    with open(processed_md5sum, 'w') as fh:
        fh.write(config['config-md5sum'])
    # Exit statuses are truncated to 8 bits, so an unclamped count of 256
    # errors would be reported as success.
    sys.exit(min(len(error_list), 255))