ebulk fix resume initial ingestion

parent c6e035d3
*~
ebulk-data/config/*config.yml
exec:
  max_threads: 1
  min_output_tasks: 1
in:
  type: file
  path_prefix: ./csv/
  parser:
    charset: UTF-8
    type: csv
    delimiter: ';'
    columns:
    - {name: id, type: string}
    - {name: id2, type: string}
    - {name: id3, type: string}
    - {name: id4, type: string}
out:
  type: wendelin
  erp5_url: "https://softinst102878.host.vifib.net/erp5/portal_ingestion_policies/wendelin_embulk"
  user: "zope"
  password: "asd"
exec:
  max_threads: 1
  min_output_tasks: 1
in:
  type: file
  path_prefix: ./csv/
  parser:
    charset: UTF-8
    # newline: CRLF
    type: csv
    delimiter: ';'
    # quote: '"'
    # escape: ''
    # null_string: 'NULL'
    columns:
    - {name: id, type: string}
    - {name: id2, type: string}
    - {name: id3, type: string}
    - {name: id4, type: string}
out:
  type: wendelin
  erp5_url: "https://softinst102878.host.vifib.net/erp5/portal_ingestion_policies/wendelin_embulk"
  user: "zope"
  password: "asd"
exec:
  max_threads: 1
  min_output_tasks: 1
in:
  type: wendelin
  erp5_url: "https://softinst102878.host.vifib.net/erp5/"
  user: "asd"
  password: "asd"
  data_set: "sample"
  chunk_size: "50"
  output_path: "sample"
  tool_dir: "."
out:
  type: fif
  output_path: "sample"
  tool_dir: "."
exec:
  max_threads: 1
  min_output_tasks: 1
in:
  type: wendelin
  erp5_url: $DOWN_URL
  user: $USER
  password: $pwd
  data_set: $DATA_SET
  chunk_size: $CHUNK
  output_path: $DATASET_DIR
  tool_dir: $TOOL_DIR
out:
  type: fif
  output_path: $DATASET_DIR
  tool_dir: $TOOL_DIR
in:
type: fif
path_prefix: ["input/"]
supplier: [SUPPLIER]
data_set: [DATA_SET]
chunk_size: 0
out:
type: wendelin
erp5_url: 'https://softinst79462.host.vifib.net/erp5/portal_ingestion_policies/wendelin_embulk'
user: [USER]
password: [PASSWORD]
tag: supplier.dataset.filename.extension.end
exec:
max_threads: 1
min_output_tasks: 1
in:
type: fif
path_prefix: [$DATASET_DIR]
supplier: $USER
data_set: $DATA_SET
chunk_size: $CHUNK
erp5_url: $DOWN_URL
user: $USER
password: $pwd
tool_dir: $TOOL_DIR
out:
type: wendelin
erp5_url: $ING_URL
user: $USER
password: $pwd
tool_dir: $TOOL_DIR
# CUSTOM CONFIGURATION FILE
# PLEASE FILL THE FILE WITH THE CONFIGURATION OF YOUR CUSTOM EMBULK PLUGIN
# ONLY THE 'IN' SECTION, OTHERS MUST REMAIN AS THEY ARE
# PLEASE FILL THE 'IN' SECTION ACCORDING TO YOUR PLUGIN
in:
  # FOR EXAMPLE CSV FILES
  # type: file
  # path_prefix: MY_CSV_DIRECTORY
  # FOR EXAMPLE AWS-S3 storage:
  # type: s3
  # bucket: MY_BUCKET
  # path_prefix: ""
  # access_key_id: MY_KEY_ID
  # secret_access_key: MY_SECRET_KEY
# PLEASE LEAVE THE SECTIONS BELOW AS THEY ARE (unless you know what you are doing)
  parser:
    type: binary
    supplier: $USER
    data_set: $DATA_SET
    tool_dir: $TOOL_DIR
    chunk_size: $CHUNK
    input_plugin: $STORAGE
out:
  type: wendelin
  erp5_url: $ING_URL
  user: $USER
  password: $pwd
exec:
  max_threads: 1
  min_output_tasks: 1
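For illustration only (this is not one of the repository files), the custom template above could be filled in following its commented S3 example; the bucket name and credentials below are made-up placeholders, and the parser, out and exec sections are left exactly as in the template:

in:
  type: s3
  bucket: MY_BUCKET
  path_prefix: ""
  access_key_id: MY_KEY_ID
  secret_access_key: MY_SECRET_KEY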
# FTP CONFIGURATION FILE
# PLEASE FILL THE FILE WITH THE CONFIGURATION OF YOUR FTP STORAGE
# ONLY THE 'IN' SECTION, OTHERS MUST REMAIN AS THEY ARE
in:
  type: ftp
  host: $FTP_HOST
  user: $FTP_USER
  password: $FTP_PASSWORD
  path_prefix: $FTP_PATH
  #ssl_verify: false
  #port: 21
# PLEASE LEAVE THE SECTIONS BELOW AS THEY ARE (unless you know what you are doing)
  parser:
    type: binary
    supplier: $USER
    data_set: $DATA_SET
    tool_dir: $TOOL_DIR
    chunk_size: $CHUNK
    storage: $STORAGE
out:
  type: wendelin
  erp5_url: $ING_URL
  user: $USER
  password: $pwd
exec:
  max_threads: 1
  min_output_tasks: 1
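As a sketch only (host, credentials and path are hypothetical, not taken from the repository), a filled-in FTP 'in' section might look like the following, with the parser, out and exec sections left untouched as the template requires:

in:
  type: ftp
  host: ftp.example.com
  user: ebulk
  password: my-ftp-password
  path_prefix: /datasets/sample/
  #ssl_verify: false
  #port: 21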
# HTTP CONFIGURATION FILE
# PLEASE FILL THE FILE WITH THE CONFIGURATION OF YOUR HTTP URL
# ONLY THE 'IN' SECTION, OTHERS MUST REMAIN AS THEY ARE
in:
  type: http
  url: "http://archive.ics.uci.edu/ml/machine-learning-databases/00000/Donnees%20conso%20autos.txt"
  method: "get"
  # basic_auth:
  #   user: MyUser
  #   password: MyPassword
  # params:
  #   - {name: paramA, value: valueA}
  #   - {name: paramB, value: valueB}
# PLEASE LEAVE THE SECTIONS BELOW AS THEY ARE (unless you know what you are doing)
  parser:
    type: binary
    supplier: "zope"
    data_set: "http"
    tool_dir: "."
    chunk_size: "50"
    storage: "http"
    path_prefix:
out:
  type: wendelin
  erp5_url: "https://softinst102878.host.vifib.net/erp5/portal_ingestion_policies/wendelin_embulk"
  user: "zope"
  password: "telecom"
exec:
  max_threads: 1
  min_output_tasks: 1
# HTTP CONFIGURATION FILE
# PLEASE FILL THE FILE WITH THE CONFIGURATION OF YOUR HTTP URL
# ONLY THE 'IN' SECTION, OTHERS MUST REMAIN AS THEY ARE
in:
  type: http
  url: $HTTP_URL
  method: $HTTP_METHOD
  # basic_auth:
  #   user: MyUser
  #   password: MyPassword
  # params:
  #   - {name: paramA, value: valueA}
  #   - {name: paramB, value: valueB}
# PLEASE LEAVE THE SECTIONS BELOW AS THEY ARE (unless you know what you are doing)
  parser:
    type: binary
    supplier: $USER
    data_set: $DATA_SET
    tool_dir: $TOOL_DIR
    chunk_size: $CHUNK
    storage: $STORAGE
    path_prefix: $HTTP_PREFIX
out:
  type: wendelin
  erp5_url: $ING_URL
  user: $USER
  password: $pwd
exec:
  max_threads: 1
  min_output_tasks: 1
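None of the HTTP examples above fills in the commented basic_auth and params options. If the endpoint required them, the 'in' section could be sketched as below, following the option names shown in the commented template; the URL, credentials and parameters are hypothetical:

in:
  type: http
  url: "https://data.example.com/export"
  method: "get"
  basic_auth:
    user: MyUser
    password: MyPassword
  params:
    - {name: format, value: csv}
    - {name: year, value: "2019"}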
exec:
  max_threads: 1
  min_output_tasks: 1
in:
  type: s3
  bucket: "roque5"
  path_prefix: ""
  access_key_id: "AKIAJLY3N4YBNAJMBLGQ"
  secret_access_key: "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"
  parser:
    type: binary
    supplier: "zope"
    data_set: "encoding"
    tool_dir: "."
    chunk_size: "5"
    input_plugin: "s3"
out:
  type: wendelin
  erp5_url: "https://softinst102878.host.vifib.net/erp5/portal_ingestion_policies/wendelin_embulk"
  user: "zope"
  password: "telecom"
# S3 CONFIGURATION FILE
# PLEASE FILL THE FILE WITH THE CONFIGURATION OF YOUR S3 BUCKET
# ONLY THE 'IN' SECTION, OTHERS MUST REMAIN AS THEY ARE
in:
  type: s3
  bucket: $S3_BUCKET
  path_prefix: $S3_PREFIX
  access_key_id: $S3_ACCESS_KEY
  secret_access_key: $S3_SECRET_KEY
  auth_method: $S3_AUTH_METHOD
  # endpoint:
  # region:
  # path_match_pattern:
  # http_proxy:
  # host:
  # port:
# PLEASE LEAVE THE SECTIONS BELOW AS THEY ARE (unless you know what you are doing)
  parser:
    type: binary
    supplier: $USER
    data_set: $DATA_SET
    tool_dir: $TOOL_DIR
    chunk_size: $CHUNK
    storage: $STORAGE
    path_prefix: $S3_PREFIX
out:
  type: wendelin
  erp5_url: $ING_URL
  user: $USER
  password: $pwd
exec:
  max_threads: 1
  min_output_tasks: 1
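The commented endpoint and region options in the S3 template are never shown filled in. For an S3-compatible object store they might be used roughly as follows; all values are hypothetical and only reuse option names already present in the template:

in:
  type: s3
  bucket: my-bucket
  path_prefix: ""
  access_key_id: MY_KEY_ID
  secret_access_key: MY_SECRET_KEY
  auth_method: basic
  endpoint: s3.my-provider.example
  region: eu-west-1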
@@ -7,6 +7,7 @@ class DatasetUtils
   DATASET_REPORT_FILE = ".dataset-task-report"
   DATASET_COMPLETED_FILE = ".dataset-completed"
   RESUME_OPERATION_FILE = ".resume-operation"
+  INITIAL_INGESTION_FILE = ".initial-ingestion"
   RUN_DONE = "done"
   RUN_ERROR = "error"
@@ -22,6 +23,7 @@ class DatasetUtils
     @task_report_file = @data_set_directory + DATASET_REPORT_FILE
     @completed_file = @data_set_directory + DATASET_COMPLETED_FILE
     @resume_operation_file = @data_set_directory + RESUME_OPERATION_FILE
+    @initial_ingestion_file = @data_set_directory + INITIAL_INGESTION_FILE
   end
 
   def getLocalFiles(remove=nil)
@@ -130,6 +132,18 @@ class DatasetUtils
     return File.exist?(@task_report_file)
   end
 
+  def deleteInitialIngestionFile()
+    File.delete(@initial_ingestion_file) if File.exist?(@initial_ingestion_file)
+  end
+
+  def createInitialIngestionFile()
+    File.open(@initial_ingestion_file, 'w') {}
+  end
+
+  def initialIngestionFileExist()
+    return File.exist?(@initial_ingestion_file)
+  end
+
   def addToReport(reference, status, size, hash, data_set)
     local_files = {}
     begin
@@ -183,7 +197,7 @@ class DatasetUtils
   end
 
   def getLocalChanges(files, data_set)
-    new_files = []
+    all_files, new_files, modified_files, deleted_files = [], [], [], []
     begin
       if reportFileExist()
         File.readlines(@task_report_file).each do |line|
@@ -199,27 +213,31 @@ class DatasetUtils
                 hash = getHash(file_path).to_s
                 if size == record[2].to_s
                   if hash != record[3].chomp
-                    new_files.push({"path" => file_path, "size" => size, "hash" => hash })
+                    all_files.push({"path" => file_path, "size" => size, "hash" => hash })
+                    modified_files.push(file_path)
                   end
                 else
-                  new_files.push({"path" => file_path, "size" => size, "hash" => hash })
+                  all_files.push({"path" => file_path, "size" => size, "hash" => hash })
+                  modified_files.push(file_path)
                 end
               end
               files.delete(file_path)
             else
-              new_files.push({"path" => file_path, "size" => "", "hash" => DELETE })
+              all_files.push({"path" => file_path, "size" => "", "hash" => DELETE })
+              deleted_files.push(file_path)
             end
           end
         end
       end
       files.each do |path|
-        new_files.push({"path" => path, "size" => "", "hash" => "" })
+        all_files.push({"path" => path, "size" => "", "hash" => "" })
+        new_files.push(path)
       end
     rescue Exception => e
       @logger.error("An error occurred in DatasetUtils method 'getLocalChanges':" + e.to_s)
       @logger.error(e.backtrace)
     end
-    return new_files
+    return all_files, new_files, modified_files, deleted_files
   end
 
   def getRemoteChangedDataStreams(data_streams)
......
@@ -10,6 +10,10 @@ module Embulk
       Plugin.register_input("fif", self)
 
+      NEW = "New"
+      MODIFIED = "Modified"
+      DELETED = "Deleted"
+      EOF = "EOF"
       CHUNK_SIZE = 50000000 #50mb
       MEGA = 1000000
@@ -24,6 +28,21 @@ module Embulk
         {"name"=>"hash", "type"=>"string"}
       ]
 
+      def self.showChangesList(changes, type, print_short)
+        if not changes.empty?
+          puts
+          @logger.info("#{type} file(s):", print=TRUE)
+          if print_short and changes.length > 50
+            limit = changes.length > 130 ? 130/3 : changes.length/3
+            @logger.info(changes[0, limit], print=TRUE)
+            @logger.info("....", print=TRUE)
+            @logger.info(changes[changes.length-limit, changes.length-1], print=TRUE)
+          else
+            @logger.info(changes, print=TRUE)
+          end
+        end
+      end
+
       def self.transaction(config, &control)
         begin
           tool_dir = config.param('tool_dir', :string)
@@ -36,6 +55,7 @@ module Embulk
           if task['chunk_size'] == 0
             task['chunk_size'] = CHUNK_SIZE
           end
+          @data_set = task['data_set']
           paths = config.param('path_prefix', :array)
           paths[0] = paths[0].end_with?("/") ? paths[0] : paths[0] + "/"
           @data_set_directory = paths[0]
@@ -50,13 +70,17 @@ module Embulk
           @logger.info("Checking remote dataset...", print=TRUE)
           data_stream_dict = @wendelin.getDataStreams(task['data_set'])
           @dataset_utils = DatasetUtils.new(@data_set_directory)
-          if @dataset_utils.reportFileExist()
-            @logger.info("Checking local dataset...", print=TRUE)
-            if not @dataset_utils.reportUpToDate(data_stream_dict)
-              puts
-              @logger.error("Your current dataset is outdated. Please, run a download to update it before ingest your changes.", print=TRUE)
-              puts
-              @logger.abortExecution(error=FALSE)
+          if not @dataset_utils.reportFileExist()
+            @dataset_utils.createInitialIngestionFile()
+          else
+            if not @dataset_utils.initialIngestionFileExist()
+              @logger.info("Checking local dataset...", print=TRUE)
+              if not @dataset_utils.reportUpToDate(data_stream_dict)
+                puts
+                @logger.error("Your current dataset is outdated. Please, run a download to update it before ingest your changes.", print=TRUE)
+                puts
+                @logger.abortExecution(error=FALSE)
+              end
             end
           end
           if data_stream_dict["status_code"] != 0
@@ -79,20 +103,18 @@ module Embulk
             @logger.abortExecution()
           end
-          task['paths'] = @dataset_utils.getLocalChanges(task['paths'], task['data_set'])
+          task['paths'], new_files, modified_files, deleted_files = @dataset_utils.getLocalChanges(task['paths'], task['data_set'])
           if task['paths'].empty?
             puts
             @logger.info("No changes in '#{@data_set_directory}'. Everything up-to-date.", print=TRUE)
             @logger.abortExecution(error=FALSE)
           end
-          @logger.info("#{task['paths'].length} change(s) detected for ingestion: ", print=TRUE)
-          if task['paths'].length > 15
-            @logger.info(task['paths'][0, 5], print=TRUE)
-            @logger.info(".....", print=TRUE)
-            @logger.info(task['paths'][task['paths'].length-5, task['paths'].length-1], print=TRUE)
-          else
-            @logger.info(task['paths'], print=TRUE)
-          end
+          changes = @dataset_utils.reportFileExist() ? "change" : "new file"
+          @logger.info("#{task['paths'].length} #{changes}(s) detected for ingestion: ", print=TRUE)
+          print_short = task['paths'].length > 500
+          self.showChangesList(new_files, NEW, print_short)
+          self.showChangesList(modified_files, MODIFIED, print_short)
+          self.showChangesList(deleted_files, DELETED, print_short)
           puts
           @logger.info("Continue with ingestion? (y/n)", print=TRUE)
           option = gets
@@ -101,6 +123,9 @@ module Embulk
             @logger.info("Ingestion cancelled by user.", print=TRUE)
             @logger.abortExecution()
           end
+          if not @dataset_utils.reportFileExist()
+            @dataset_utils.createReportFile()
+          end
           columns = [
             Column.new(0, "supplier", :string),
@@ -139,9 +164,11 @@ module Embulk
           @logger.info(task_reports, print=TRUE)
         end
         next_config_diff = task_reports.map{|hash| hash["done"]}.flatten.compact
-        @logger.info("#{next_config_diff.length} file(s) ingested.", print=TRUE)
+        changes = @dataset_utils.initialIngestionFileExist() ? "new file" : "change"
+        @logger.info("#{next_config_diff.length} #{changes}(s) ingested.", print=TRUE)
         if(next_config_diff.length == count)
           @logger.info("Dataset successfully ingested.", print=TRUE)
+          @wendelin.increaseDatasetVersion(@data_set)
         else
           next_config_diff = task_reports.map{|hash| hash["error"]}.flatten.compact
           puts
......
@@ -24,7 +24,7 @@ module Embulk
           next [] unless Dir.exist?(path)
           Dir[(path + '/**/*').gsub! '//', '/']
         }.flatten.select{ |file| File.file?(file) }
-        local_changes = @dataset_utils.getLocalChanges(local_files, data_set)
+        local_changes, a, b, c = @dataset_utils.getLocalChanges(local_files, data_set)
         data_set = @data_set.end_with?("/") ? @data_set : @data_set + "/"
         remote_changes = remote_streams.map { |remote|
           remote = @data_set_directory + remote["reference"].reverse.sub("/".reverse, ".".reverse).reverse.sub(data_set, "")
@@ -178,12 +178,15 @@ module Embulk
             @logger.info("This dataset was already downloaded. What do you want to do?", print=TRUE)
             puts
             self.askUserForAction(task, action=UPDATE)
-          else
+          elsif not @dataset_utils.initialIngestionFileExist()
             puts
             @logger.info("There was a previous attempt to download this dataset but it did not finish successfully.", print=TRUE)
             @logger.info("What do you want to do?", print=TRUE)
             puts
             self.askUserForAction(task, action=RESUME)
+          else
+            puts
+            self.askUserForAction(task, action=UPDATE)
           end
         else
           dir_entries = Dir.entries(@data_set_directory).length
@@ -203,6 +206,7 @@ module Embulk
           end
           @dataset_utils.createReportFile()
         end
+        @dataset_utils.deleteInitialIngestionFile()
        columns = [
          Column.new(0, "reference", :string),
          Column.new(1, "data_chunk", :string),
......
@@ -38,6 +38,21 @@ class WendelinClient
     end
   end
 
+  def increaseDatasetVersion(reference)
+    if reference == ""
+      @logger.warn("Could not increase data set version because dataset reference is empty.")
+    else
+      @logger.info("Increasing dataset version")
+      uri = URI("#{@erp5_url}/ERP5Site_increaseDatasetVersion?reference=#{reference}")
+      begin
+        res = open(uri, http_basic_authentication: [@user, @password]).read
+      rescue Exception => e
+        @logger.error("An error occurred while increasing dataset version: " + e.to_s)
+        @logger.error(e.backtrace)
+      end
+    end
+  end
+
   def ingest(reference, data_chunk)
     @logger.info("Ingestion reference: #{reference}", print=TRUE)
     if Time.new - @last_ingestion < 2
......