Commit 452e2f8d authored by Roque

Ebulk changes

See merge request nexedi/ebulk!1
parents 428caf79 0d56bdcc
#! /usr/bin/env bash
# Data-lake endpoint and tool version configuration.
# Several variables below are assigned more than once in a row; in shell the
# later assignment is the one that takes effect.
DATA_LAKE_URL='https://wendelin.io/'
DEFAULT_DATA_LAKE_URL='https://wendelin.io/'
# The next three assignments are overwritten a few lines further down.
# Note that DATA_LAKE_URL already ends in "/", so this DOWN_URL value would
# contain a double slash.
DOWN_URL="$DATA_LAKE_URL/"
ING_POLICY="portal_ingestion_policies/default_embulk"
ING_URL="$DATA_LAKE_URL$ING_POLICY"
# Effective values: the download url is the default url unchanged, and the
# ingestion url is the default url followed by the ingestion policy path.
DOWN_URL="$DEFAULT_DATA_LAKE_URL"
ING_POLICY="portal_ingestion_policies/default_ebulk"
ING_URL="$DEFAULT_DATA_LAKE_URL$ING_POLICY"
# The second assignment (1.1.5) overrides the first.
EBULK_VERSION="1.1.3"
EBULK_VERSION="1.1.5"
EMBULK_VERSION="0.9.7"
# Data directory located under the user's home directory.
EBULK_DATA_PATH=~/.ebulk
EBULK_DATASET_FILE_NAME="/.ebulk_dataset"
......@@ -49,7 +49,7 @@ ASK="A"
# When the file named by DATA_LAKE_URL_FILE exists and has non-empty content,
# that content replaces the download and ingestion urls.
if [ -f "$DATA_LAKE_URL_FILE" ]; then
# stderr of cat is discarded, so a read failure simply leaves URL empty.
URL=$(cat "$DATA_LAKE_URL_FILE" 2>/dev/null)
if [[ "$URL" != "" ]]; then
# DOWN_URL is assigned twice; the second value (without the extra "/") wins.
DOWN_URL="$URL/"
DOWN_URL="$URL"
ING_URL="$URL$ING_POLICY"
fi
fi
......@@ -221,13 +221,22 @@ function setDataLakeUrl {
echo -e "[INFO] Please enter a valid url.${NC}"
echo >&2; return 1
fi
if [ "${URL: -1}" != "/" ] ; then
URL="$URL/"
fi
echo "$URL" > "$DATA_LAKE_URL_FILE" 2>/dev/null
rm -f ${CREDENTIALS_FILE}
echo
echo "[INFO] Data-lake url set to '$URL'"
}
# Resets the data-lake url to the default one.
# Overwrites the stored url file with an empty line, points DOWN_URL and
# ING_URL at DEFAULT_DATA_LAKE_URL, removes the credentials file and prints
# an informational message.
function defaultDataLakeUrl {
# Replace the stored custom url with a single empty line.
echo "" > "$DATA_LAKE_URL_FILE" 2>/dev/null
# These two assignments are immediately overwritten by the two that follow.
DOWN_URL="$DATA_LAKE_URL/"
ING_URL="$DATA_LAKE_URL$ING_POLICY"
DOWN_URL="$DEFAULT_DATA_LAKE_URL"
ING_URL="$DEFAULT_DATA_LAKE_URL$ING_POLICY"
# NOTE(review): CREDENTIALS_FILE is expanded unquoted, so a path containing
# whitespace would be split into several arguments - confirm and quote.
rm -f ${CREDENTIALS_FILE}
echo
echo "[INFO] Data-lake url set to default '$DEFAULT_DATA_LAKE_URL'"
}
function updateConfigFile {
......@@ -305,9 +314,9 @@ function runProcess {
if [ -z "$STATUS" ]; then
if [ ! -z "$CHUNK" ]; then
if [ "$CHUNK" -eq "0" ]; then
echo "[INFO] Default chunk size: $DEFAULT_CHUNK_SIZE Mb."
echo "[INFO] Default chunk size: $DEFAULT_CHUNK_SIZE MB."
else
echo "[INFO] Chunk size set in $CHUNK Mb."
echo "[INFO] Chunk size set in $CHUNK MB."
fi
fi
if [ "$DATASET_DESCRIPTION" != "" ] ; then
......
......@@ -138,9 +138,9 @@ module Embulk
@logger.error("Your current dataset is outdated. Please, run a download to update it before ingest your changes.", print=TRUE)
puts
@logger.abortExecution(error=FALSE)
end
end
end
end
end
end
@logger.info("Supplier: #{task['supplier']}")
@logger.info("Dataset name: #{task['data_set']}")
......@@ -239,13 +239,19 @@ module Embulk
filename, extension, reference = @dataset_utils.getPathInfo(path, @dataset)
operation = rename ? DatasetUtils::RENAME : DatasetUtils::INGESTION
@dataset_utils.saveCurrentOperation(operation, reference, new_reference)
resume_split = @dataset_utils.splitOperationFileExist(reference) ? @dataset_utils.getLastSplitOperation(operation, reference, hash, @chunk_size) : 0
resume_split, large_hash = 0, ""
if @dataset_utils.splitOperationFileExist(reference)
resume_split, large_hash = @dataset_utils.getLastSplitOperation(operation, reference, hash, @chunk_size, large_hash=TRUE)
end
each_chunk(path, filename, extension, size, hash, schema[1..-1].map{|elm| elm.name}, @chunk_size, delete, new_reference, resume_split) do |entry|
@dataset_utils.createSplitOperationControlFile(reference) if split
large_hash += entry[8]
#no need to send large hash to server
entry.pop()
@page_builder.add(entry)
if ! delete && ! rename && entry[5] != ""
split = TRUE
@dataset_utils.saveSplitOperation(operation, reference, entry[5], hash, @chunk_size)
@dataset_utils.saveSplitOperation(operation, reference, entry[5], hash, @chunk_size, large_hash)
end
end
@page_builder.finish
......@@ -269,7 +275,7 @@ module Embulk
end
else
if @dataset_utils.reportFileExist()
@dataset_utils.addToReport(reference, return_value, size, hash, task['data_set'], new_reference)
@dataset_utils.addToReport(reference, return_value, size, hash, large_hash, task['data_set'], new_reference)
end
end
end
......@@ -282,11 +288,11 @@ module Embulk
def each_chunk(path, filename, extension, size, hash, fields, chunk_size=DatasetUtils::CHUNK_SIZE, delete=FALSE, new_reference=FALSE, resume_split=0)
if delete
File.delete(path) if File.exist?(path)
values = [@supplier, @dataset, filename, extension, "", DatasetUtils::DELETE, "", ""]
values = [@supplier, @dataset, filename, extension, "", DatasetUtils::DELETE, "", "", ""]
yield(values)
elsif new_reference
File.delete(path) if File.exist?(path)
values = [@supplier, @dataset, filename, extension, new_reference, DatasetUtils::RENAME, "", ""]
values = [@supplier, @dataset, filename, extension, new_reference, DatasetUtils::RENAME, "", "", ""]
yield(values)
else
file_object = File.open(path, "rb")
......@@ -297,7 +303,7 @@ module Embulk
data = next_byte
if not next_byte
if first # this means this is an empty file
values = [@supplier, @dataset, filename, extension, "", "", size, hash]
values = [@supplier, @dataset, filename, extension, "", "", size, hash, hash]
yield(values)
end
break
......@@ -320,7 +326,8 @@ module Embulk
eof = npart.to_s.rjust(3, "0")
end
content = Base64.encode64(data)
values = [@supplier, @dataset, filename, extension, content, eof, size, hash]
chunk_hash = @dataset_utils.getHashFromChunk(data)
values = [@supplier, @dataset, filename, extension, content, eof, size, hash, chunk_hash]
first = FALSE
yield(values)
end
......
......@@ -94,7 +94,6 @@ module Embulk
@erp5_url = config.param('erp5_url', :string)
@data_set = config.param('data_set', :string)
@logger.info("Dataset name: #{@data_set}")
@chunk_size = config.param('chunk_size', :float, default: 0) * DatasetUtils::MEGA
@output_path = config.param("output_path", :string, :default => nil)
if not File.directory?(@output_path)
@logger.error("Output directory not found.", print=TRUE)
......@@ -103,14 +102,14 @@ module Embulk
task = {
'erp5_url' => @erp5_url,
'data_set' => @data_set,
'chunk_size' => @chunk_size,
'chunk_size' => DatasetUtils::CHUNK_SIZE + 10,
'output_path' => @output_path,
'tool_dir' => @tool_dir
}
if task['chunk_size'] == 0
task['chunk_size'] = DatasetUtils::CHUNK_SIZE
end
@logger.info("Chunk size set in #{task['chunk_size']/DatasetUtils::MEGA}MB")
@logger.info("Download chunk size relies on server file chunks.")
@dataset_utils = DatasetUtils.new("")
task['data_set_directory'] = @dataset_utils.appendSlashTo(@output_path)
@data_set_directory = task['data_set_directory']
......@@ -242,7 +241,6 @@ module Embulk
def initialize(task, schema, index, page_builder)
super
@data_set = task['data_set']
@chunk_size = task['chunk_size']
@data_set_directory = task['data_set_directory']
@wendelin = WendelinClient.new(task['erp5_url'], task['user'], task['password'])
@logger = LogManager.instance()
......@@ -250,46 +248,61 @@ module Embulk
end
def run
data_stream = task['data_streams'][@index]
id = data_stream["id"]
reference = data_stream["reference"]
size = data_stream["size"]
hash = data_stream["hash"]
renamed = data_stream["status"] == DatasetUtils::STATUS_RENAMED
deleted = hash.to_s == DatasetUtils::DELETE
remote_file = task['data_streams'][@index]
reference = remote_file["reference"]
size = remote_file["full-size"]
large_hash = remote_file["large-hash"]
data_stream_chunk_list = remote_file["data-stream-list"]
renamed = remote_file["status"] == DatasetUtils::STATUS_RENAMED
deleted = large_hash.to_s == DatasetUtils::DELETE
begin
if deleted
entry = [reference, "", @data_set, DatasetUtils::DELETE, renamed]
page_builder.add(entry)
elsif renamed
new_reference = data_stream["new_reference"]
new_reference = remote_file["new_reference"]
entry = [reference, new_reference, @data_set, TRUE, renamed]
page_builder.add(entry)
else
@logger.info("Discarding local change on '#{data_stream["path"]}'", print=TRUE) if task['discard_changes']
chunk_size = data_stream_chunk_list[0]["size"] #first chunk size
@logger.info("Discarding local change on '#{remote_file["path"]}'", print=TRUE) if task['discard_changes']
@logger.info("Getting content from remote #{reference}", print=TRUE)
@logger.info("Downloading...", print=TRUE)
resume_split = @dataset_utils.splitOperationFileExist(reference) ? @dataset_utils.getLastSplitOperation(DatasetUtils::DOWNLOAD, reference, hash, @chunk_size) : 0
n_chunk = resume_split == 0 ? 0 : resume_split+1
split = n_chunk > 0
@logger.info("Resuming interrupted split download...", print=TRUE) if split
@wendelin.eachDataStreamContentChunk(id, @chunk_size, n_chunk) do |chunk|
content = chunk.nil? || chunk.empty? ? "" : Base64.encode64(chunk)
begin_of_file = n_chunk == 0
split = n_chunk > 0
@dataset_utils.createSplitOperationControlFile(reference) if split
entry = [reference, content, @data_set, begin_of_file, renamed]
page_builder.add(entry)
@dataset_utils.saveSplitOperation(DatasetUtils::DOWNLOAD, reference, n_chunk, hash, @chunk_size) if split
n_chunk += 1
end
resume_split = @dataset_utils.splitOperationFileExist(reference) ? @dataset_utils.getLastSplitOperation(DatasetUtils::DOWNLOAD, reference, large_hash, chunk_size) : 0
n_chunk = resume_split == 0 ? 0 : resume_split+1
split = n_chunk > 0
if split
@logger.info("Resuming interrupted split download...", print=TRUE)
else
if data_stream_chunk_list.length > 1
@logger.info("Downloading large file split in chunks...", print=TRUE)
else
@logger.info("Downloading...", print=TRUE)
end
end
data_stream_chunk_list.each_with_index do |data_stream_chunk, index|
#skip datastreams/chunks already downloaded
if n_chunk == index
content = ""
@wendelin.eachDataStreamContentChunk(data_stream_chunk["id"], chunk_size + 10, 0, data_stream_chunk_list.length > 1) do |chunk|
content = chunk.nil? || chunk.empty? ? "" : Base64.encode64(chunk)
end
begin_of_file = n_chunk == 0
split = n_chunk > 0
@dataset_utils.createSplitOperationControlFile(reference) if split
entry = [reference, content, @data_set, begin_of_file, renamed]
page_builder.add(entry)
@dataset_utils.saveSplitOperation(DatasetUtils::DOWNLOAD, reference, n_chunk, large_hash, chunk_size) if split
n_chunk += 1
end
end
@logger.info("Done", print=TRUE) if data_stream_chunk_list.length > 1
end
page_builder.finish
@dataset_utils.deleteSplitOperationFile(reference) if split
rescue java.lang.OutOfMemoryError
@logger.logOutOfMemoryError(reference)
return_value = DatasetUtils::RUN_ABORTED
rescue Exception => e
rescue Exception => e
@logger.error(e.to_s, print=TRUE)
@logger.error(e.backtrace)
puts "[INFO] For more detailed information, please refer to the log file: " + @logger.getLogPath()
......@@ -302,7 +315,9 @@ module Embulk
if deleted
@dataset_utils.deleteFromReport(reference, return_value)
else
@dataset_utils.addToReport(reference, return_value, size, hash, task['data_set'], new_reference)
file_path = renamed ? @dataset_utils.referenceToPath(new_reference, @data_set_directory, @data_set) : @dataset_utils.referenceToPath(reference, @data_set_directory, @data_set)
hash = @dataset_utils.getHash(file_path).to_s
@dataset_utils.addToReport(reference, return_value, size, hash, large_hash, task['data_set'], new_reference)
end
end
return {return_value => reference}
......
......@@ -60,7 +60,7 @@ module Embulk
File.open(file_path, write_mode) { |file| file.write(data_chunk) }
end
end
rescue Exception => e
rescue Exception => e
@logger.error("An error occurred while procesing file.", print=TRUE)
@logger.error(e.backtrace)
raise e
......
......@@ -34,7 +34,7 @@ class WendelinClient
def exists(reference)
checkReferenceChars(reference)
uri = URI(URI.escape("#{@erp5_url}/ingestionReferenceExists?reference=#{reference}"))
uri = URI(URI.escape("#{@erp5_url}/ERP5Site_checkIngestionReferenceExists?reference=#{reference}"))
begin
response = handleRequest(uri)
rescue Exception => e
......@@ -139,14 +139,14 @@ class WendelinClient
return {"success"=>TRUE, "message"=>"success"}
end
def eachDataStreamContentChunk(id, chunk_size, n_chunk=0)
def eachDataStreamContentChunk(id, chunk_size, n_chunk=0, split_operation=FALSE)
n_part = n_chunk
done = FALSE
first = TRUE
while not done
start_offset = n_part*chunk_size
end_offset = n_part*chunk_size+chunk_size
uri = URI(URI.escape("#{@erp5_url}getDataStreamChunk?id=#{id}&start_offset=#{start_offset}&end_offset=#{end_offset}"))
uri = URI(URI.escape("#{@erp5_url}ERP5Site_getDataStreamChunk?id=#{id}&start_offset=#{start_offset}&end_offset=#{end_offset}"))
success = FALSE
n_retry = 0
while ! success && n_retry < 10
......@@ -158,7 +158,11 @@ class WendelinClient
if first
yield chunk
end
@logger.info("Done", print=TRUE)
if split_operation
@logger.info("File chunk downloaded", print=TRUE)
else
@logger.info("Done", print=TRUE)
end
done = TRUE
else
first = FALSE
......@@ -181,7 +185,7 @@ class WendelinClient
end
def getDataStreams(data_set_reference)
uri = URI(URI.escape("#{@erp5_url}getDataStreamList?data_set_reference=#{data_set_reference}"))
uri = URI(URI.escape("#{@erp5_url}ERP5Site_getDataStreamList?data_set_reference=#{data_set_reference}"))
response = handleRequest(uri)
if response["success"] == FALSE
@logger.abortExecution()
......@@ -235,6 +239,8 @@ class WendelinClient
return {"success"=>TRUE, "message"=>res.body}
else
@logger.error("HTTP FAIL - code: #{res.code}", print=TRUE)
@logger.error("During request to " + uri.hostname.to_s, print=TRUE)
@logger.error(uri.to_s)
if res.code == '500' or res.code == '502' or res.code == '503'
@logger.error(HTTP_MESSAGE_5XX, print=TRUE)
elsif res.code == '401'
......
......@@ -16,6 +16,7 @@ commands:
-h, --help Tool help
-r, --readme Opens README file
-e, --examples Shows some tool usage examples
-v, --version Ebulk tool version
store-credentials Stores user and password for automatic authentication
set-data-lake-url Sets the data lake url where to ingest/download
default-data-lake-url Sets the data lake url to default
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment