Commit a9881575 authored by Eteri

embulk-input-filename: add last modification date. add python scripts to test uploads

parent 9cec4744
import argparse
from datetime import datetime, timedelta, date
import glob
def parse_moment(date_text, time_text):
    """Build a datetime from a 'year-month-day' and an 'hour-minute' string.

    Both strings are split on '-'; seconds are always set to 0.
    Raises ValueError / IndexError when a component is missing or not numeric.
    """
    date_parts = date_text.split("-")
    time_parts = time_text.split("-")
    return datetime(
        int(date_parts[0]), int(date_parts[1]), int(date_parts[2]),
        int(time_parts[0]), int(time_parts[1]), 0,
    )


def expected_file_names(path, data_name, start, end, step_minutes=5):
    """List the file names expected between *start* and *end* (both inclusive).

    One name is produced every *step_minutes* minutes, formatted as
    ``<path>/<year-month-day>_<hour-minute-second>.<data_name>``.
    A trailing '/' on *path* is ignored so the names line up with what
    ``glob.glob(path + "/*." + data_name)`` returns.
    Returns an empty list when *start* is later than *end*.
    """
    if step_minutes <= 0:
        # A non-positive step would never reach *end*.
        raise ValueError("step_minutes must be positive")
    directory = path.rstrip("/")
    step = timedelta(minutes=step_minutes)
    names = []
    moment = start
    while moment <= end:
        stamp = moment.strftime("%Y-%m-%d_%H-%M-%S")
        names.append(directory + "/" + stamp + "." + data_name)
        moment += step
    return names


def find_missing_files(expected, present):
    """Return the entries of *expected* that are absent from *present*, in order."""
    present_names = set(present)  # O(1) membership instead of scanning a list
    return [name for name in expected if name not in present_names]


def main():
    """Parse the command line and report which expected files are missing."""
    parser = argparse.ArgumentParser(description='Test if files in a given range are on the server')
    parser.add_argument("-path", "--path", required=True,
                        help="Specify the path, e.g. /mic/L0444-001/duo/MDA01")
    parser.add_argument("-data", "--data", required=True,
                        help="Specify the data name, e.g. 002 or 06 or Nsp")
    parser.add_argument("-start_date", "--start_date", required=True,
                        help="Specify the start date : year-month-day, e.g. 2017-11-15")
    parser.add_argument("-start_time", "--start_time", required=True,
                        help="Specify the start time : hour-minute, e.g. 11-10")
    parser.add_argument("-end_date", "--end_date", required=True,
                        help="Specify the end date : year-month-day, e.g. 2017-11-17")
    parser.add_argument("-end_time", "--end_time", required=True,
                        help="Specify the end time : hour-minute, e.g. 09-15")
    args = parser.parse_args()

    start = parse_moment(args.start_date, args.start_time)
    end = parse_moment(args.end_date, args.end_time)

    # NOTE(review): the 5-minute cadence and the inclusive start are
    # reconstructed from the surrounding code -- confirm against the uploader.
    list_of_files_to_check = expected_file_names(args.path, args.data, start, end)
    all_files = glob.glob(args.path.rstrip("/") + "/*." + args.data)
    list_of_missing_files = find_missing_files(list_of_files_to_check, all_files)

    print("List of missing files")
    print(list_of_missing_files)
    print("Number of missing files")
    print(len(list_of_missing_files))


if __name__ == "__main__":
    main()
import os
import requests
import hashlib
import argparse
def get_not_uploaded_files_list(path, path_with_dots):
    """Compare one local file with its uploaded counterpart and record the outcome.

    Two GET requests are sent (authenticated with the module-level
    ``username`` / ``password``): one to
    ``request_string_has_bucket_key + path_with_dots`` and one to
    ``request_string_md5sum + path_with_dots``. The response text of the
    first is compared with the string 'True'; the response text of the second
    is compared with the md5 hex digest of the local file.

    path           -- local file to hash
    path_with_dots -- suffix appended to both request strings

    Returns the module-level lists ``files_not_uploaded`` and
    ``uploaded_with_diff_md5sum``; *path* is appended to the matching list
    when the file is missing on the server or its checksum differs.
    """
    credentials = (username, password)

    r = requests.get(request_string_has_bucket_key + path_with_dots, auth=credentials)
    has_bucket_key_res = str(r.text)

    # Hash in chunks inside a context manager so the handle is always closed
    # and large files are not read into memory at once.
    digest = hashlib.md5()
    with open(path, 'rb') as local_file:
        for chunk in iter(lambda: local_file.read(65536), b''):
            digest.update(chunk)
    md5sum_local_file = digest.hexdigest()

    r = requests.get(request_string_md5sum + path_with_dots, auth=credentials)
    md5sum_uploaded_file = r.text  # md5 sum reported for the uploaded file

    print(path)
    if has_bucket_key_res != 'True':
        print("NOT UPLOADED")
        # Record the file; without this the returned list stays empty.
        files_not_uploaded.append(path)
    elif md5sum_local_file == md5sum_uploaded_file:
        print("local file ", md5sum_local_file)
        print("uploaded file= ", md5sum_uploaded_file)
        print("UPLOADED")
    else:
        print("File is uploaded BUT md5sum is Different")
        print("local file ", md5sum_local_file)
        print("uploaded file= ", md5sum_uploaded_file)
        uploaded_with_diff_md5sum.append(path)
    return files_not_uploaded, uploaded_with_diff_md5sum
def check_files (files, *args, **kwargs):
    """Check every eligible file in *files* against the server.

    files  -- list of file paths
    start, end (keyword, optional) -- when both are given, only the slice
        ``files[start:end]`` is checked (``end`` itself is excluded, as with
        any Python slice); otherwise all files are checked.

    A file is skipped (and reported on stdout) when its size is not 1792
    bytes or its base name starts with '.'. For every other file the part of
    the path after 'background/' is taken, '/' is replaced by '.', and
    ``get_not_uploaded_files_list`` is called with the result.
    Raises IndexError if an eligible path does not contain 'background/'.

    Returns two sorted lists: files not uploaded, and files uploaded with a
    different md5 sum. Both are empty when no file was eligible.
    """
    start = kwargs.get('start', None)
    end = kwargs.get('end', None)
    chunked = end is not None and start is not None
    candidates = files[start:end] if chunked else files

    # Bind the results up front so the final return is valid even when the
    # loop never reaches get_not_uploaded_files_list.
    files_not_uploaded = []
    uploaded_with_diff_md5sum = []

    for file_path in candidates:
        file_size = os.stat(file_path).st_size  # check only files with the proper size
        # Test the base name: a full path such as './dir/x' starts with '.'
        # without being a hidden file.
        if file_size != 1792 or os.path.basename(file_path).startswith("."):
            print(file_path)
            # The two modes have always used slightly different wording.
            if chunked:
                print("Not loaded because of size or hidden")
            else:
                print("Not uploaded because of size or hidden")
            continue

        path_after_background = file_path.split('background/')[1]
        # e.g. 2017/10/17280145.BMR -> 2017.10.17280145.BMR
        path_with_dots = path_after_background.replace('/', '.')
        if chunked:
            print("path_after_background", path_after_background)
            print("path_with_dots", path_with_dots)
        files_not_uploaded, uploaded_with_diff_md5sum = get_not_uploaded_files_list(file_path, path_with_dots)

    return sorted(files_not_uploaded), sorted(uploaded_with_diff_md5sum)
# Script entry: parse the command line, collect the files under the given
# directory and compare each of them with its uploaded counterpart.
parser = argparse.ArgumentParser(description='Test if all files are uploaded')
parser.add_argument("-p", "--path", required=True,
                    help="Path of the files to be uploaded, e.g. /mic/syscomtestuser/syscom/SYSCOM02-12400555/background/")
parser.add_argument("-string_md5sum", "--request_string_md5sum", required=True,
                    help="Request string to get md5sum, e.g.")
parser.add_argument("-string_has_bucket_key", "--request_string_has_bucket_key", required=True,
                    help="Request string to get hasBucketKey value, e.g.")
parser.add_argument("-user", "--username", help="Username", required=True)
parser.add_argument("-pswd", "--password", help="Password", required=True)
parser.add_argument("-c", "--chunk", help="Check only chunk of files", required=False, action='store_true')
parser.add_argument("-s", "--start", help="Start of the chunk. Used only when -c", required=False, default="")
parser.add_argument("-e", "--end", help="End of the chunk. Used only when -c and -s", required=False, default="")
args = parser.parse_args()

chunk_start = ""
chunk_end = ""
if args.chunk:
    print("chunk is set")
    if not args.start or not args.end:
        # Stop here: continuing would only fail later while looking up an
        # empty chunk boundary.
        parser.error("Start and/or End of the chunk is not given")
    chunk_start = args.start
    chunk_end = args.end

# These names stay at module level: get_not_uploaded_files_list reads them
# as globals.
dir_path = args.path
request_string_md5sum = args.request_string_md5sum
request_string_has_bucket_key = args.request_string_has_bucket_key
username = args.username
password = args.password

print("Directory to be uploaded")
print(dir_path)
print("Request string to get md5sum")
print(request_string_md5sum)
print("Request string to get the value of hasBucketKey")
print(request_string_has_bucket_key)
print("Username")
print(username)
print("Password")
print("********")  # never echo the secret to the console or to logs

# some initializations
files_not_uploaded = []
files_full_path = []
uploaded_with_diff_md5sum = []

# walk through directories and collect the full path of every non-hidden file
for root, dirs, files in os.walk(dir_path):
    files_full_path.extend(
        os.path.join(root, name) for name in files if not name.startswith('.')
    )
files_full_path = sorted(files_full_path)

# compare md5sums with the uploaded ones, optionally for one chunk only
if args.chunk and len(files_full_path) != 0:
    # The chunk boundaries are looked up as entries of the sorted path list.
    try:
        index_start = files_full_path.index(chunk_start)
        index_end = files_full_path.index(chunk_end)
    except ValueError:
        parser.error("Start and/or End of the chunk is not among the files found under " + dir_path)
    files_not_uploaded, uploaded_with_diff_md5sum = check_files(files_full_path, start=index_start, end=index_end)
else:
    files_not_uploaded, uploaded_with_diff_md5sum = check_files(files_full_path)

print("number of not uploaded files : %s" % (len(files_not_uploaded)))
print("list of not uploaded files: %s" % (files_not_uploaded))
print("number of uploaded files with different md5sum : %s" % (len(uploaded_with_diff_md5sum)))
print("list of uploaded files with different md5sum: %s" % (uploaded_with_diff_md5sum))
