ebulk: features to handle modifications and deletions

parent 2912f542
*~
ebulk-data/config/*config.yml
...@@ -221,10 +221,10 @@ function checkCurl { ...@@ -221,10 +221,10 @@ function checkCurl {
function checkSoftware { function checkSoftware {
# CHECK JAVA VERSION # CHECK JAVA VERSION
if type -p java >/dev/null; then if [[ -n "$JAVA_HOME" ]] && [[ -x "$JAVA_HOME/bin/java" ]]; then
_java=java
elif [[ -n "$JAVA_HOME" ]] && [[ -x "$JAVA_HOME/bin/java" ]]; then
_java="$JAVA_HOME/bin/java" _java="$JAVA_HOME/bin/java"
elif type -p java >/dev/null; then
_java=java
else else
javaNotInstalled >&2; return 1 javaNotInstalled >&2; return 1
fi fi
......
require_relative 'filelogger'
require 'digest/md5'
# class that handles dataset tasks report
# Tracks the synchronisation state of a local dataset directory through three
# hidden files stored in that directory:
#
# * DATASET_REPORT_FILE    - one line per file, formatted as
#                            "reference;status;size;hash;modification_date"
# * DATASET_COMPLETED_FILE - empty marker file (see reportUpToDate)
# * RESUME_OPERATION_FILE  - a single "operation;reference" line
#
# NOTE: every path is built by plain string concatenation, so the directory
# given to the constructor is expected to end with "/" (assumed to be ensured
# by callers - TODO confirm).
#
# NOTE: the public methods rescue Exception (not StandardError) and log instead
# of raising. This is kept on purpose to preserve the existing best-effort
# contract; only getHash re-raises.
class DatasetUtils
  DATASET_REPORT_FILE = ".dataset-task-report"
  DATASET_COMPLETED_FILE = ".dataset-completed"
  RESUME_OPERATION_FILE = ".resume-operation"
  RUN_DONE = "done"
  RUN_ERROR = "error"
  RUN_ABORTED = "aborted"
  DELETE = "DELETE"
  INGESTION = "ingestion"
  MEGA = 1000000

  # data_set_directory - String path of the local dataset directory.
  def initialize(data_set_directory)
    @data_set_directory = data_set_directory
    @logger = LogManager.instance()
    @task_report_file = @data_set_directory + DATASET_REPORT_FILE
    @completed_file = @data_set_directory + DATASET_COMPLETED_FILE
    @resume_operation_file = @data_set_directory + RESUME_OPERATION_FILE
  end

  # Returns the report entries whose status is RUN_DONE as a Hash
  # reference => {"size", "hash", "status", "modification_date"}.
  #
  # remove - optional reference to leave out of the result.
  #
  # Errors (e.g. missing report file) are logged and whatever was collected so
  # far is returned.
  def getLocalFiles(remove=nil)
    local_files = {}
    begin
      readReportRecords.each do |record|
        next unless record["status"] == RUN_DONE
        next if !remove.nil? && remove == record["reference"]
        local_files[record["reference"]] = reportEntry(record["size"], record["hash"],
                                                       record["status"], record["modification_date"])
      end
    rescue Exception => e
      logError('getLocalFiles', e)
    end
    return local_files
  end

  # Replaces the whole report file with the given entries.
  #
  # local_files - Hash reference => {"status", "size", "hash", "modification_date"}.
  #               An empty Hash produces an empty report file.
  #
  # Errors are logged, not raised.
  def saveReport(local_files)
    # Build every line before touching the file so that a malformed entry
    # cannot leave a half-written report behind.
    lines = local_files.map do |reference, entry|
      [reference, entry["status"], entry["size"].to_s, entry["hash"], entry["modification_date"]].join(";")
    end
    # 'wb' truncates the previous report; the file is opened only once.
    File.open(@task_report_file, 'wb') do |file|
      lines.each { |line| file.puts(line) }
    end
  rescue Exception => e
    logError('saveReport', e)
  end

  # Removes the resume-operation file, if any.
  def removeCurrentOperation()
    File.delete(@resume_operation_file) if File.exist?(@resume_operation_file)
  end

  # Records "operation;reference" as the operation in progress, replacing any
  # previously recorded one.
  def saveCurrentOperation(operation, reference)
    File.open(@resume_operation_file, 'w') { |file| file.puts(operation+";"+reference) }
  end

  # Tells whether the local report can be considered in sync with the remote
  # data stream list.
  #
  # data_stream_dict - Hash with "status_code" and "result" (list of data
  #                    stream Hashes, see getRemoteChangedDataStreams).
  #                    NOTE(review): status code 2 is treated as "not up to
  #                    date" and any other non-zero code as "up to date"; the
  #                    meaning of the codes is defined by the caller - confirm.
  #
  # Returns true or false; any error is logged and reported as false.
  def reportUpToDate(data_stream_dict)
    report_exists = reportFileExist()
    completed = completedFileExist()
    # directory never downloaded -new or used for partial ingestions-
    return true if !report_exists && !completed
    # download not finished
    return false if report_exists && !completed
    return false if data_stream_dict["status_code"] == 2
    return true if data_stream_dict["status_code"] != 0
    changes = getRemoteChangedDataStreams(data_stream_dict["result"])
    return true if changes.empty?
    # check if the unique detected change corresponds to an interrupted ingestion
    if changes.length == 1 && File.exist?(@resume_operation_file)
      # File.read closes the handle (File.open(...).read leaked it).
      operation = File.read(@resume_operation_file).chomp.split(";")
      if operation[0] == INGESTION && operation[1] == changes[0]["reference"]
        File.delete(@resume_operation_file)
        return true
      end
    end
    return false
  rescue Exception => e
    logError('reportUpToDate', e)
    return false
  end

  # Removes the completed marker file, if any.
  def deleteCompletedFile()
    File.delete(@completed_file) if File.exist?(@completed_file)
  end

  # Creates (or truncates) the empty completed marker file.
  def createCompletedFile()
    File.open(@completed_file, 'w') {}
  end

  def completedFileExist()
    return File.exist?(@completed_file)
  end

  # Creates (or truncates) an empty report file.
  def createReportFile()
    File.open(@task_report_file, 'w') {}
  end

  def reportFileExist()
    return File.exist?(@task_report_file)
  end

  # Adds the entry for +reference+ to the report, or replaces it in place when
  # it is already listed. The modification date is taken from the local file
  # the reference maps to ("not-modification-date" when it does not exist).
  #
  # The report is rewritten only when the update succeeded: on error the
  # problem is logged and the existing report is left untouched (it used to be
  # overwritten with an empty/partial content).
  def addToReport(reference, status, size, hash, data_set)
    file_path = pathForReference(reference, data_set)
    modification_date = File.exist?(file_path) ? modificationDate(file_path) : "not-modification-date"
    createReportFile() unless reportFileExist()
    new_entry = reportEntry(size, hash, status, modification_date)
    local_files = {}
    readReportRecords.each do |record|
      if reference.to_s == record["reference"]
        # keeps the position the reference already had in the report
        local_files[reference] = new_entry
      else
        local_files[record["reference"]] = reportEntry(record["size"], record["hash"],
                                                       record["status"], record["modification_date"])
      end
    end
    # appended at the end when the reference was not listed yet
    local_files[reference] = new_entry
    saveReport(local_files)
  rescue Exception => e
    logError('addToReport', e)
  end

  # Rewrites the report without +reference+. As the entries come from
  # getLocalFiles, only RUN_DONE entries are kept. +status+ is unused.
  def deleteFromReport(reference, status)
    saveReport(getLocalFiles(reference))
  end

  # Returns the MD5 hex digest of +file+, read in chunks of 4 * MEGA bytes.
  # Errors are logged and re-raised.
  def getHash(file)
    chunk_size = 4 * MEGA
    md5 = Digest::MD5.new
    # File.open rather than Kernel#open: the latter would run a command for a
    # name starting with "|".
    File.open(file, 'rb') do |f|
      while chunk = f.read(chunk_size)
        md5.update(chunk)
      end
    end
    return md5.hexdigest
  rescue Exception => e
    @logger.error("An error occurred while getting hash of file " + file + ":" + e.to_s, true)
    @logger.error(e.backtrace)
    raise
  end

  # Compares the RUN_DONE report entries with the given local file paths.
  #
  # files    - Array of local file paths. NOTE: it is modified in place, every
  #            path already listed in the report is removed from it.
  # data_set - dataset reference, with or without trailing "/".
  #
  # Returns an Array of {"path", "size", "hash"} Hashes:
  # * modified file (modification date differs and size or hash differs):
  #   current size and hash
  # * file listed in the report but missing from +files+: hash is DELETE
  # * file not matched by any RUN_DONE entry: empty size and hash
  #
  # Errors are logged and the changes collected so far are returned.
  def getLocalChanges(files, data_set)
    new_files = []
    begin
      if reportFileExist()
        readReportRecords.each do |record|
          next unless record["status"] == RUN_DONE
          file_path = pathForReference(record["reference"], data_set)
          if files.include? file_path
            if modificationDate(file_path) != record["modification_date"]
              size = File.size(file_path).to_s
              hash = getHash(file_path).to_s
              if size != record["size"] || hash != record["hash"]
                new_files.push({"path" => file_path, "size" => size, "hash" => hash })
              end
            end
            files.delete(file_path)
          else
            new_files.push({"path" => file_path, "size" => "", "hash" => DELETE })
          end
        end
      end
      files.each do |path|
        new_files.push({"path" => path, "size" => "", "hash" => "" })
      end
    rescue Exception => e
      logError('getLocalChanges', e)
    end
    return new_files
  end

  # Compares the remote data streams with the RUN_DONE report entries.
  #
  # data_streams - Array of Hashes with "reference", "size" and "hash".
  #
  # Returns the data streams that are not in the report or whose size/hash
  # differ (an empty remote hash matches any local hash), followed by a
  # {"reference", "hash" => DELETE} Hash for every report entry that has no
  # remote counterpart. Returns [] when there is no report file. Errors are
  # logged and the list collected so far is returned.
  def getRemoteChangedDataStreams(data_streams)
    pending_data_streams = []
    begin
      if reportFileExist()
        local_files = {}
        readReportRecords.each do |record|
          next unless record["status"] == RUN_DONE
          local_files[record["reference"]] = {"size" => record["size"], "hash" => record["hash"] }
        end
        # Hash used as a set: constant time membership test below
        remote_references = {}
        data_streams.each do |data_stream|
          reference = data_stream["reference"]
          remote_references[reference] = true
          local = local_files[reference]
          up_to_date = local &&
                       local["size"].to_s == data_stream["size"].to_s &&
                       (local["hash"] == data_stream["hash"] || data_stream["hash"] == "")
          unless up_to_date
            local_files.delete(reference)
            pending_data_streams.push(data_stream)
          end
        end
        local_files.each_key do |reference|
          unless remote_references.key?(reference)
            pending_data_streams.push({"reference" => reference, "hash" => DELETE })
          end
        end
      end
    rescue Exception => e
      logError('getRemoteChangedDataStreams', e)
    end
    return pending_data_streams
  end

  private

  # Parses the report file into an Array of Hashes with the keys "reference",
  # "status", "size", "hash" and "modification_date" (all Strings). Blank
  # lines are skipped and missing trailing fields become "" instead of making
  # the whole read fail. Raises if the report file cannot be read.
  def readReportRecords
    records = []
    File.readlines(@task_report_file).each do |line|
      fields = line.chomp.split(";")
      next if fields[0].to_s.empty?
      records.push({"reference" => fields[0],
                    "status" => fields[1].to_s,
                    "size" => fields[2].to_s,
                    "hash" => fields[3].to_s,
                    "modification_date" => fields[4].to_s })
    end
    records
  end

  # Builds the Hash stored for one file of the report.
  def reportEntry(size, hash, status, modification_date)
    {"size" => size, "hash" => hash, "status" => status, "modification_date" => modification_date }
  end

  # Maps a reference to a local path: the last "/" of the reference becomes
  # ".", the first occurrence of "<data_set>/" is dropped, the result is
  # appended to the dataset directory and a trailing ".none" is removed.
  def pathForReference(reference, data_set)
    data_set = data_set.end_with?("/") ? data_set : data_set + "/"
    # reverse + sub + reverse replaces the LAST "/" only
    relative_path = reference.reverse.sub("/", ".").reverse.sub(data_set, "")
    file_path = @data_set_directory + relative_path
    file_path = file_path[0...-5] if file_path.end_with?(".none")
    file_path
  end

  # Modification time of +file_path+ in the format stored in the report.
  def modificationDate(file_path)
    File.mtime(file_path).strftime("%Y-%m-%d-%H-%M-%S")
  end

  # Logs the error message and backtrace of a failure in +method_name+.
  def logError(method_name, e)
    @logger.error("An error occurred in DatasetUtils method '#{method_name}':" + e.to_s)
    @logger.error(e.backtrace)
  end
end
...@@ -38,11 +38,13 @@ class LogManager ...@@ -38,11 +38,13 @@ class LogManager
log(message, print, type=ERROR) log(message, print, type=ERROR)
end end
def abortExecution() def abortExecution(error=TRUE)
puts
info("PROCESS ABORTED") info("PROCESS ABORTED")
unless @path.nil? if error
puts "PROCESS ABORTED : For more detailed information, please refer to the log file '#{@path}'" puts
unless @path.nil?
puts "PROCESS ABORTED : For more detailed information, please refer to the log file '#{@path}'"
end
end end
exec("Process.kill 9, Process.pid >/dev/null 2>&1") exec("Process.kill 9, Process.pid >/dev/null 2>&1")
end end
......
require 'base64' require 'base64'
require 'fileutils' require 'fileutils'
require_relative '../dataset_utils'
require_relative '../filelogger' require_relative '../filelogger'
module Embulk module Embulk
...@@ -39,14 +40,23 @@ module Embulk ...@@ -39,14 +40,23 @@ module Embulk
if ref.end_with?(".none") if ref.end_with?(".none")
ref = ref[0...-5] ref = ref[0...-5]
end end
dirname = File.dirname(data_set_directory + ref) file_path = data_set_directory + ref
unless File.directory?(dirname) write_mode = 'ab'
FileUtils.mkdir_p(dirname) if record[3] == DatasetUtils::DELETE
File.delete(file_path) if File.exist?(file_path)
else
if record[3] == TRUE.to_s
write_mode = 'w'
end
dirname = File.dirname(data_set_directory + ref)
unless File.directory?(dirname)
FileUtils.mkdir_p(dirname)
end
File.open(file_path, write_mode) { |file| file.write(data_chunk) }
end end
File.open(data_set_directory + ref, 'ab') { |file| file.write(data_chunk) }
end end
rescue Exception => e rescue Exception => e
@logger.error("An error occurred while writing file.", print=TRUE) @logger.error("An error occurred while procesing file.", print=TRUE)
@logger.error(e.backtrace) @logger.error(e.backtrace)
raise e raise e
end end
......
require 'base64' require 'base64'
require_relative '../wendelin_client' require_relative '../wendelin_client'
require_relative '../dataset_utils'
module Embulk module Embulk
module Output module Output
...@@ -41,10 +42,17 @@ module Embulk ...@@ -41,10 +42,17 @@ module Embulk
extension = record[3] extension = record[3]
eof = record[5] eof = record[5]
data_chunk = record[4] data_chunk = record[4]
reference = [supplier, dataset, filename, extension, eof].join("/") size = record[6]
hash = record[7]
begin begin
if not @wendelin.ingest(reference, data_chunk) if eof == DatasetUtils::DELETE
raise "could not ingest" reference = [dataset, filename, extension].join("/")
@wendelin.delete(reference)
else
reference = [supplier, dataset, filename, extension, eof, size, hash].join("/")
if not @wendelin.ingest(reference, data_chunk)
raise "could not ingest"
end
end end
rescue Exception => e rescue Exception => e
raise e raise e
......
...@@ -40,7 +40,9 @@ module Embulk ...@@ -40,7 +40,9 @@ module Embulk
Column.new(2, "file", :string), Column.new(2, "file", :string),
Column.new(3, "extension", :string), Column.new(3, "extension", :string),
Column.new(4, "data_chunk", :string), Column.new(4, "data_chunk", :string),
Column.new(5, "eof", :string) Column.new(5, "eof", :string),
Column.new(6, "size", :string),
Column.new(7, "hash", :string)
] ]
yield(task, columns) yield(task, columns)
...@@ -78,22 +80,27 @@ module Embulk ...@@ -78,22 +80,27 @@ module Embulk
data = next_byte data = next_byte
if not next_byte if not next_byte
if first if first
values = [task['supplier'], task['data_set'], filename, extension, "", EOF] # this means this is an empty file
values = [task['supplier'], task['data_set'], filename, extension, "", "", "", ""]
yield(values) yield(values)
end end
break break
end end
first = FALSE
data += file.read(chunk_size) data += file.read(chunk_size)
next_byte = file.read(1) next_byte = file.read(1)
if not next_byte if not next_byte
eof = EOF eof = EOF
if first
# this means that the whole file will be ingested at once (not split)
eof = ""
end
else else
npart += 1 npart += 1
eof = npart.to_s.rjust(3, "0") eof = npart.to_s.rjust(3, "0")
end end
content = Base64.encode64(data) content = Base64.encode64(data)
values = [task['supplier'], task['data_set'], filename, extension, content, eof] values = [task['supplier'], task['data_set'], filename, extension, content, eof, "", ""]
first = FALSE
yield(values) yield(values)
end end
end end
......
...@@ -16,11 +16,6 @@ class WendelinClient ...@@ -16,11 +16,6 @@ class WendelinClient
@last_ingestion = Time.new - 2 @last_ingestion = Time.new - 2
end end
def removeEOF(reference)
root = reference.dup
return root[0...root.rindex('/')]
end
def exists(reference) def exists(reference)
uri = URI("#{@erp5_url}/ingestionReferenceExists?reference=#{reference}") uri = URI("#{@erp5_url}/ingestionReferenceExists?reference=#{reference}")
begin begin
...@@ -34,20 +29,25 @@ class WendelinClient ...@@ -34,20 +29,25 @@ class WendelinClient
end end
end end
def delete(reference)
@logger.info("Deletion requested for reference #{reference}", print=TRUE)
uri = URI("#{@erp5_url}/ERP5Site_invalidateIngestionObjects?reference=#{reference}")
res = handleRequest(uri)
if res == FALSE
@logger.abortExecution()
end
end
def ingest(reference, data_chunk) def ingest(reference, data_chunk)
@logger.info("Ingestion reference: #{reference}", print=TRUE) @logger.info("Ingestion reference: #{reference}", print=TRUE)
if @banned_references_list.include? removeEOF(reference)
return FALSE
end
if Time.new - @last_ingestion < 2 if Time.new - @last_ingestion < 2
# avoid send ingestions to close (specially for split ones) # avoid send ingestions to close (specially for split ones)
sleep 3 sleep 2
end end
if exists(reference) if exists(reference)
@logger.info("There is another ingestion already done for the pair data_set-filename. Reference "\ @logger.info("There is another ingestion already done for the pair data_set-filename. Reference "\
+ removeEOF(reference), print=TRUE) + reference, print=TRUE)
@logger.info("Rename your reference or delete the older ingestion.", print=TRUE) @logger.info("Rename your reference or delete the older ingestion.", print=TRUE)
@banned_references_list << removeEOF(reference)
return FALSE return FALSE
end end
if reference.include? "#" or reference.include? "+" if reference.include? "#" or reference.include? "+"
...@@ -91,7 +91,6 @@ class WendelinClient ...@@ -91,7 +91,6 @@ class WendelinClient
end end
def getDataStreams(data_set_reference) def getDataStreams(data_set_reference)
@logger.info("Getting file list for dataset '#{data_set_reference}'", print=TRUE)
uri = URI("#{@erp5_url}getDataStreamList?data_set_reference=#{data_set_reference}") uri = URI("#{@erp5_url}getDataStreamList?data_set_reference=#{data_set_reference}")
str = handleRequest(uri) str = handleRequest(uri)
if str == FALSE if str == FALSE
...@@ -115,7 +114,6 @@ class WendelinClient ...@@ -115,7 +114,6 @@ class WendelinClient
req.set_form_data('data_chunk' => data_chunk) req.set_form_data('data_chunk' => data_chunk)
rescue java.lang.OutOfMemoryError rescue java.lang.OutOfMemoryError
@logger.logOutOfMemoryError(reference) @logger.logOutOfMemoryError(reference)
@banned_references_list << removeEOF(reference)
return FALSE return FALSE
end end
@logger.info("Sending record:'#{reference}'...", print=TRUE) @logger.info("Sending record:'#{reference}'...", print=TRUE)
...@@ -125,7 +123,7 @@ class WendelinClient ...@@ -125,7 +123,7 @@ class WendelinClient
res = Net::HTTP.start(uri.hostname, uri.port, res = Net::HTTP.start(uri.hostname, uri.port,
:use_ssl => (uri.scheme == 'https'), :use_ssl => (uri.scheme == 'https'),
:verify_mode => OpenSSL::SSL::VERIFY_NONE, :verify_mode => OpenSSL::SSL::VERIFY_NONE,
:ssl_timeout => 32000, :open_timeout => 32000, :read_timeout => 32000, :ssl_timeout => 20, :open_timeout => 20, :read_timeout => 20,
) do |http| ) do |http|
http.request(req) http.request(req)
end end
...@@ -135,7 +133,7 @@ class WendelinClient ...@@ -135,7 +133,7 @@ class WendelinClient
return FALSE return FALSE
else else
if res.kind_of?(Net::HTTPSuccess) # res.code is 2XX if res.kind_of?(Net::HTTPSuccess) # res.code is 2XX
@logger.info("Done", print=TRUE) @logger.info("Done")
return res.body return res.body
else else
@logger.error("HTTP FAIL - code: #{res.code}", print=TRUE) @logger.error("HTTP FAIL - code: #{res.code}", print=TRUE)
...@@ -146,7 +144,6 @@ class WendelinClient ...@@ -146,7 +144,6 @@ class WendelinClient
@logger.abortExecution() @logger.abortExecution()
else else
@logger.error("Sorry, an error ocurred. If the error persists, please contact the administrator.", print=TRUE) @logger.error("Sorry, an error ocurred. If the error persists, please contact the administrator.", print=TRUE)
#@logger.error(res.value)
end end
return FALSE return FALSE
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment