ebulk: discard changes features

- --discard-changes parameter on command pull - new help and example files - several fixes

ebulk: discard changes features
- --discard-changes parameter on command pull - new help and example files - several fixes
7c13154d · roqueporchetto@gmail.com · f7de6621 · 7c13154d · 7c13154d · 7c13154d
Commit 7c13154d authored Oct 10, 2018 by roqueporchetto@gmail.com
11 changed files
--- a/ebulk
+++ b/ebulk
@@ -4,6 +4,10 @@ DOWN_URL='https://softinst104003.host.vifib.net/erp5/'
 ING_URL='https://softinst104003.host.vifib.net/erp5/portal_ingestion_policies/wendelin_embulk'
 EBULK_DATA_PATH=~/.ebulk
+EBULK_DATASET_FILE_NAME="/.ebulk_dataset"
+DATASET_REPORT_FILE_NAME="/.dataset-task-report"
+DATASET_COMPLETE_FILE_NAME="/.dataset-completed"
+DISCARD_CHANGES_FILE_NAME="/.discard-changes"
 LOG_DIR="$EBULK_DATA_PATH/logs"
 TOOL_PATH="$(dirname "$0")/ebulk-data"
 DOWN_FILE="$EBULK_DATA_PATH/download-config.yml"
@@ -46,6 +50,11 @@ function checkParameters {
    fi
    if [ "$STORAGE" = "" ] ; then
      if [ ! -d "$DATASET_DIR" ]; then
+	  if [ "$STATUS" ]; then
+		echo
+		echo -e "${ORANGE}[ERROR] ${GREEN}'$DATASET_DIR'${ORANGE} is not a dataset directory.${NC}"
+		echo >&2; return 1
+	  fi
          echo
          mkdir "$DATASET_DIR" 2>/dev/null
 	  if [ ! $? -eq 0 ]; then
@@ -56,16 +65,29 @@ function checkParameters {
 	      helpReadme >&2; return 1
 	  fi
      fi
-      EBULK_DATASET_FILE="$DATASET_DIR/.ebulk_dataset"
+      EBULK_DATASET_FILE="$DATASET_DIR$EBULK_DATASET_FILE_NAME"
      if [[ $DATASET_DIR != $REFERENCE ]]; then
 	  if [ "$REFERENCE" = "." ] ; then
 		REFERENCE=$(basename "$DATASET_DIR")
 	  fi
 	  DATA_SET=$REFERENCE
+	  if [ -f "$EBULK_DATASET_FILE" ]; then
+	      PREVIOUS_DATA_SET=$(cat "$EBULK_DATASET_FILE" 2>/dev/null)
+	      if [[ "$PREVIOUS_DATA_SET" != "$REFERENCE" ]]; then
+		  DATASET_REPORT_FILE="$DATASET_DIR$DATASET_REPORT_FILE_NAME"
+		  if [ -f "$DATASET_REPORT_FILE" ]; then
+		  	rm -f ${DATASET_REPORT_FILE}
+		  fi
+		  DATASET_COMPLETE_FILE="$DATASET_DIR$DATASET_COMPLETE_FILE_NAME"
+		  if [ -f "$DATASET_COMPLETE_FILE" ]; then
+		  	rm -f ${DATASET_COMPLETE_FILE}
+		  fi
+	      fi
+	  fi
 	  echo $REFERENCE > "$EBULK_DATASET_FILE" 2>/dev/null
      else
 	  if [ -f "$EBULK_DATASET_FILE" ]; then
-	      DATA_SET=$(cat "$DATASET_DIR/.ebulk_dataset" 2>/dev/null)
+	      DATA_SET=$(cat "$EBULK_DATASET_FILE" 2>/dev/null)
 	  else
 	      DATA_SET=$(basename "$DATASET_DIR")
 	      if [ "$DATA_SET" != "." ] ; then
@@ -403,7 +425,7 @@ function askS3parameters {
 }
 function stage {
-	EBULK_DATASET_FILE="./.ebulk_dataset"
+	EBULK_DATASET_FILE=".$EBULK_DATASET_FILE_NAME"
 	if [ ! -f "$EBULK_DATASET_FILE" ]; then
 		echo
 		echo -e "${ORANGE}[ERROR] You are not in a dataset directory."
@@ -461,6 +483,8 @@ while [ "$1" != "" ]; do
 					;;
 	-a | --advanced )		ADVANCED=true
 					;;
+	-dc | --discard-changes )	DISCARD_CHANGES=true
+					;;
 	-c | --chunk )          	shift
 					CHUNK=$1
 					;;
@@ -490,7 +514,7 @@ while [ "$1" != "" ]; do
    shift
 done
-for ELEMENT in '' '-d' '--directory' '-s' '--storage' '-cs' '--custom-storage' '-a' '--advanced' '-c' '--chunk'; do
+for ELEMENT in '' '-d' '--directory' '-s' '--storage' '-cs' '--custom-storage' '-a' '--advanced' '-c' '--chunk' '-dc' '--discard-changes'; do
  if [ "$ELEMENT" = "$REFERENCE" ]; then
 	REFERENCE="."
  fi
@@ -554,7 +578,13 @@ case $OPERATION in
 	fi
 	echo "### DATASET DOWNLOAD ###"
 	echo
-	echo -e "** The dataset will be downloaded in the specified directory: $DATASET_DIR"
+	if [ "$DISCARD_CHANGES" != "" ] ; then
+	  DISCARD_CHANGES_FILE="$DATASET_DIR$DISCARD_CHANGES_FILE_NAME"
+	  touch "$DISCARD_CHANGES_FILE" 2>/dev/null
+	  echo -e "** Discard all local changes in directory: $DATASET_DIR"
+	else
+	  echo -e "** The dataset will be downloaded in the specified directory: $DATASET_DIR"
+	fi
 	echo
 	read -n 1 -s -r -p "Press any key to continue"
 	echo
@@ -588,8 +618,9 @@ case $OPERATION in
 				PARAMETER_FUNCTION=askFTPparameters
 				STORAGE_GEM=embulk-input-ftp
 				;;
-			*)	echo -e "${ORANGE}[ERROR] '$STORAGE' storage is not available in ebulk tool yet.${NC}"
+			*)	echo -e "${ORANGE}[ERROR] '$STORAGE' storage is not available in ebulk tool yet or it is not a valid storage.${NC}"
 				echo "[INFO] If you want to configure yourself this storage, you can run the tool with parameter --custom-storage"
+				echo "[INFO] Current Ebulk version has the following storages available: ftp, http, s3."
 				echo
 				exit
 		esac

--- a/ebulk-data/config/ingestion-config_template.yml
+++ b/ebulk-data/config/ingestion-config_template.yml
@@ -11,6 +11,7 @@ in:
  user: $USER
  password: $pwd
  tool_dir: $TOOL_DIR
+  status: $STATUS
 out: 
  type: wendelin

--- a/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/dataset_utils.rb
+++ b/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/dataset_utils.rb
--- a/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/input/fif.rb
+++ b/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/input/fif.rb
@@ -19,7 +19,7 @@ module Embulk
 		 {"name"=>"eof", "type"=>"string"},
 		 {"name"=>"size", "type"=>"string"},
 		 {"name"=>"hash", "type"=>"string"}
-               ]
+	       ]
      def self.status(task, push=FALSE)
 	partial_ingestion = @dataset_utils.initialIngestionFileExist()
@@ -94,6 +94,7 @@ module Embulk
 	  @dataset_utils = DatasetUtils.new(@data_set_directory)
 	  @status = config.param('status', :string, default: FALSE)
 	  @status = @status == "" ? FALSE : @status
+	  @dataset_utils.deleteDiscardChangesFile()
 	  if @status
 	    if not @dataset_utils.initialIngestionFileExist()
 	      if not @dataset_utils.reportFileExist()
@@ -102,7 +103,7 @@ module Embulk
 		@logger.abortExecution()
 	      elsif not @dataset_utils.completedFileExist()
 		puts
-		@logger.error("There is an interrumped download operation in dataset directory. Please resume the download first.", print=TRUE)
+		@logger.error("There is an interrupted download operation in dataset directory. Please resume the download first.", print=TRUE)
 		@logger.abortExecution()
 	      end
 	    end
@@ -120,7 +121,7 @@ module Embulk
 	  if data_stream_dict["status_code"] != 0
 	    @logger.error(data_stream_dict["error_message"], print=TRUE)
 	    @logger.abortExecution()
-          end
+	  end
 	  task['data_streams'] = data_stream_dict["result"]
 	  if not @dataset_utils.reportFileExist()
@@ -128,11 +129,11 @@ module Embulk
 	  else
 	    if not @dataset_utils.initialIngestionFileExist()
 	      @logger.info("Checking local dataset...", print=TRUE)
-	      if not @dataset_utils.reportUpToDate(data_stream_dict)
+	      if not @dataset_utils.reportUpToDate(data_stream_dict, @data_set)
-	        puts
+		puts
-	        @logger.error("Your current dataset is outdated. Please, run a download to update it before ingest your changes.", print=TRUE)
+		@logger.error("Your current dataset is outdated. Please, run a download to update it before ingest your changes.", print=TRUE)
-	        puts
+		puts
-	        @logger.abortExecution(error=FALSE)
+		@logger.abortExecution(error=FALSE)
 	      end
 	    end
 	  end
@@ -145,12 +146,12 @@ module Embulk
 	    @logger.error("Could not find any valid file.", print=TRUE)
 	    @logger.error("Please make sure your dataset directory contains files for ingestion.", print=TRUE)
 	    @logger.abortExecution()
-          end
+	  end
 	  self.status(task, push=TRUE)
 	  @logger.info("Continue with ingestion? (y/n)", print=TRUE)
-          option = gets
+	  option = gets
-          option = option.chomp
+	  option = option.chomp
 	  if option == "n"
 	    @logger.info("Ingestion cancelled by user.", print=TRUE)
 	    @logger.abortExecution()
@@ -160,20 +161,20 @@ module Embulk
 	  end
 	  columns = [
-            Column.new(0, "supplier", :string),
+	    Column.new(0, "supplier", :string),
-            Column.new(1, "data_set", :string),
+	    Column.new(1, "data_set", :string),
 	    Column.new(2, "file", :string),
 	    Column.new(3, "extension", :string),
 	    Column.new(4, "data_chunk", :string),
 	    Column.new(5, "eof", :string),
 	    Column.new(6, "size", :string),
 	    Column.new(7, "hash", :string)
-          ]
+	  ]
 	  commit_reports = yield(task, columns, task['paths'].length)
-          done = commit_reports.map{|hash| hash["done"]}.flatten.compact
+	  done = commit_reports.map{|hash| hash["done"]}.flatten.compact
-          resume(task, columns, task['paths'].length, &control)
+	  resume(task, columns, task['paths'].length, &control)
 	rescue Exception => e
 	  @logger.error("An error occurred during operation: " + e.to_s, print=TRUE)
 	  @logger.error(e.backtrace)
@@ -184,9 +185,9 @@ module Embulk
      def self.resume(task, columns, count, &control)
 	@logger = LogManager.instance()
-        task_reports = yield(task, columns, count)
+	task_reports = yield(task, columns, count)
-	@dataset_utils.showTaskReport(task_reports)
+	next_config_diff = task_reports.map{|hash| hash[DatasetUtils::RUN_DONE]}.flatten.compact
-        next_config_diff = task_reports.map{|hash| hash["done"]}.flatten.compact
+	@dataset_utils.showTaskReport(next_config_diff)
 	element_output = @dataset_utils.initialIngestionFileExist() ? "new file" : "change"
 	@logger.info("#{next_config_diff.length} #{element_output}(s) ingested.", print=TRUE)
 	if(next_config_diff.length == count)
@@ -194,7 +195,7 @@ module Embulk
 	  @wendelin.increaseDatasetVersion(@data_set)
 	  @dataset_utils.deleteStagedFile()
 	else
-	  failed_tasks = task_reports.map{|hash| hash["error"]}.flatten.compact
+	  failed_tasks = task_reports.map{|hash| hash[DatasetUtils::RUN_ERROR] || hash[DatasetUtils::RUN_ABORTED] }.flatten.compact
 	  @dataset_utils.showTaskErrors(failed_tasks)
 	end
 	next_config_diff = {}
@@ -202,9 +203,9 @@ module Embulk
      end
      def initialize(task, schema, index, page_builder)
-        super
+	super
-        @supplier = task['supplier']
+	@supplier = task['supplier']
-        @dataset = task['data_set']
+	@dataset = task['data_set']
 	@chunk_size = task['chunk_size']
 	@data_set_directory = task['data_set_directory']
 	@logger = LogManager.instance()
@@ -219,16 +220,19 @@ module Embulk
 	  size = file_dict["size"]
 	  hash = file_dict["hash"]
 	  delete = hash == DatasetUtils::DELETE
+	  rename = file_dict["status"] == DatasetUtils::STATUS_RENAMED
 	  if size == "" and hash == "" #new file
 	    size = File.size(path)
 	    hash = @dataset_utils.getHash(path)
 	  end
+	  new_filename, new_extension, new_reference = @dataset_utils.getPathInfo(file_dict["new_path"], @dataset) if rename
 	  filename, extension, reference = @dataset_utils.getPathInfo(path, @dataset)
-	  @dataset_utils.saveCurrentOperation(DatasetUtils::INGESTION, reference)
+	  operation = rename ? DatasetUtils::RENAME : DatasetUtils::INGESTION
-	  each_chunk(path, filename, extension, size, hash, schema[1..-1].map{|elm| elm.name}, @chunk_size, delete) do |entry|
+	  @dataset_utils.saveCurrentOperation(operation, reference, new_reference)
+	  each_chunk(path, filename, extension, size, hash, schema[1..-1].map{|elm| elm.name}, @chunk_size, delete, new_reference) do |entry|
 	    @page_builder.add(entry)
-          end
+	  end
-          @page_builder.finish
+	  @page_builder.finish
 	rescue java.lang.OutOfMemoryError
 	  @logger.logOutOfMemoryError(path)
 	  return_value = DatasetUtils::RUN_ABORTED
@@ -247,7 +251,7 @@ module Embulk
 	    end
 	  else
 	    if @dataset_utils.reportFileExist()
-	      @dataset_utils.addToReport(reference, return_value, size, hash, task['data_set'])
+	      @dataset_utils.addToReport(reference, return_value, size, hash, task['data_set'], new_reference)
 	    end
 	  end
 	end
@@ -257,29 +261,33 @@ module Embulk
      private
-      def each_chunk(path, filename, extension, size, hash, fields, chunk_size=DatasetUtils::CHUNK_SIZE, delete=FALSE)
+      def each_chunk(path, filename, extension, size, hash, fields, chunk_size=DatasetUtils::CHUNK_SIZE, delete=FALSE, new_reference=FALSE)
 	if delete
 	  File.delete(path) if File.exist?(path)
 	  values = [@supplier, @dataset, filename, extension, "", DatasetUtils::DELETE, "", ""]
 	  yield(values)
+	elsif new_reference
+	  File.delete(path) if File.exist?(path)
+	  values = [@supplier, @dataset, filename, extension, new_reference, DatasetUtils::RENAME, "", ""]
+	  yield(values)
 	else
-          file_object = File.open(path, "rb")
+	  file_object = File.open(path, "rb")
 	  npart = 0
-          next_byte = file_object.read(1)
+	  next_byte = file_object.read(1)
 	  first = TRUE
-          while true
+	  while true
-              data = next_byte
+	      data = next_byte
-              if not next_byte
+	      if not next_byte
 		  if first # this means this is an empty file
 		    values = [@supplier, @dataset, filename, extension, "", "", size, hash]
 		    yield(values)
 		  end
 		  break
 	      end
-              data += file_object.read(chunk_size)
+	      data += file_object.read(chunk_size)
-              next_byte = file_object.read(1)
+	      next_byte = file_object.read(1)
-              if not next_byte
+	      if not next_byte
-                  eof = DatasetUtils::EOF
+		  eof = DatasetUtils::EOF
 		  if first # this means that the whole file will be ingested at once (not split)
 		    eof = ""
 		  end

--- a/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/input/wendelin.rb
+++ b/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/input/wendelin.rb
--- a/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/output/fif.rb
+++ b/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/output/fif.rb
@@ -11,15 +11,15 @@ module Embulk
      def self.transaction(config, schema, count, &control)
 	@logger = LogManager.instance()
-        task = { "output_path" => config.param("output_path", :string,  :default => nil) }
+	task = { "output_path" => config.param("output_path", :string,  :default => nil) }
 	if File.directory?(task['output_path'])
 	else
      	  @logger.error("Output directory not found.", print=TRUE)
 	  @logger.abortExecution()
 	end
-        task_reports = yield(task)
+	task_reports = yield(task)
-        next_config_diff = {}
+	next_config_diff = {}
-        return next_config_diff
+	return next_config_diff
      end
      def init
@@ -32,29 +32,37 @@ module Embulk
      def add(page)
 	begin
-          page.each do |record|
+	  page.each do |record|
 	    reference = record[0]
-	    data_chunk = Base64.decode64(record[1])
 	    @dataset_utils = DatasetUtils.new("")
 	    data_set_directory = @dataset_utils.appendSlashTo(@output_path)
 	    file_path = @dataset_utils.referenceToPath(reference, data_set_directory, record[2])
 	    write_mode = 'ab'
 	    if record[3] == DatasetUtils::DELETE
+	      @logger.info("Deleting '#{file_path}'", print=TRUE)
 	      File.delete(file_path) if File.exist?(file_path)
+	    elsif record[4] == TRUE.to_s # if renamed
+	      new_file_path = @dataset_utils.referenceToPath(record[1], data_set_directory, record[2])
+	      @logger.info("Renaming '#{file_path}' to '#{new_file_path}'", print=TRUE)
+	      unless File.directory?(File.dirname(new_file_path))
+		FileUtils.mkdir_p(File.dirname(new_file_path))
+	      end
+	      FileUtils.mv(file_path, new_file_path) if File.exist?(file_path)
 	    else
+	      data_chunk = Base64.decode64(record[1])
 	      if record[3] == TRUE.to_s
-	        write_mode = 'w'
+		write_mode = 'w'
 	      end
 	      dirname = File.dirname(file_path)
 	      unless File.directory?(dirname)
-	        FileUtils.mkdir_p(dirname)
+		FileUtils.mkdir_p(dirname)
 	      end
 	      File.open(file_path, write_mode) { |file| file.write(data_chunk) }
 	    end
-          end
+	  end
 	rescue Exception => e  
-          @logger.error("An error occurred while procesing file.", print=TRUE)
+	  @logger.error("An error occurred while procesing file.", print=TRUE)
-          @logger.error(e.backtrace)
+	  @logger.error(e.backtrace)
 	  raise e
 	end
      end
@@ -66,8 +74,8 @@ module Embulk
      end
      def commit
-        task_report = {}
+	task_report = {}
-        return task_report
+	return task_report
      end
    end

--- a/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/output/wendelin.rb
+++ b/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/output/wendelin.rb
@@ -9,33 +9,33 @@ module Embulk
      Plugin.register_output("wendelin", self)
      def self.transaction(config, schema, count, &control)
-        task = {
+	task = {
-          "erp5_url" => config.param("erp5_url", :string),
+	  "erp5_url" => config.param("erp5_url", :string),
-          "user" => config.param("user", :string, defualt: nil),
+	  "user" => config.param("user", :string, defualt: nil),
-          "password" => config.param("password", :string, default: nil),
+	  "password" => config.param("password", :string, default: nil),
-          "path_prefix" => config.param("path_prefix", :string,  :default => nil),
+	  "path_prefix" => config.param("path_prefix", :string,  :default => nil),
-        }
+	}
-        task_reports = yield(task)
+	task_reports = yield(task)
-        next_config_diff = {}
+	next_config_diff = {}
-        @logger = LogManager.instance()
+	@logger = LogManager.instance()
 	@logger.info("Your ingested files will be available in the site in a few minutes. Thank for your patience.", print=TRUE)
-        return next_config_diff
+	return next_config_diff
      end
      def init
-        credentials = {}
+	credentials = {}
-        @erp5_url = task["erp5_url"]
+	@erp5_url = task["erp5_url"]
 	@user = task["user"]
-        @password = task["password"]
+	@password = task["password"]
-        @logger = LogManager.instance()
+	@logger = LogManager.instance()
-        @wendelin = WendelinClient.new(@erp5_url, @user, @password)
+	@wendelin = WendelinClient.new(@erp5_url, @user, @password)
      end
      def close
      end
      def add(page)
-        page.each do |record|
+	page.each do |record|
 	  supplier = (record[0].nil? || record[0].empty?) ? "default" : record[0]
 	  dataset = (record[1].nil? || record[1].empty?) ? "default" : record[1]
 	  filename = record[2]
@@ -48,18 +48,21 @@ module Embulk
 	    if eof == DatasetUtils::DELETE
 	      reference = [dataset, filename, extension].join(DatasetUtils::REFERENCE_SEPARATOR)
 	      @wendelin.delete(reference)
+	    elsif eof == DatasetUtils::RENAME
+	      reference = [dataset, filename, extension].join(DatasetUtils::REFERENCE_SEPARATOR)
+	      @wendelin.rename(reference, record[4].to_s)
 	    else
 	      reference = [supplier, dataset, filename, extension, eof, size, hash].join(DatasetUtils::REFERENCE_SEPARATOR)
 	      split = eof != ""
 	      if not @wendelin.ingest(reference, data_chunk, split)
-	        raise "could not ingest"
+		raise "could not ingest"
 	      end
 	    end
 	  rescue Exception => e  
 	    raise e
-            @logger.error(e.backtrace)
+	    @logger.error(e.backtrace)
 	  end
-        end
+	end
      end
      def finish
@@ -69,8 +72,8 @@ module Embulk
      end
      def commit
-        task_report = {}
+	task_report = {}
-        return task_report
+	return task_report
      end
    end

--- a/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/parser/binary.rb
+++ b/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/parser/binary.rb
@@ -24,42 +24,42 @@ module Embulk
 	tool_dir = config.param('tool_dir', :string, default: ".")
 	@logger = LogManager.instance()
 	@logger.setFilename(tool_dir, "parser")
-        task = {
+	task = {
 	  chunk_size: config.param('chunk_size', :float, default: 0) * DatasetUtils::MEGA,
 	  supplier: config.param("supplier", :string, default: "parser"),
 	  data_set: config.param("data_set", :string),
 	  input_plugin: config.param("storage", :string, default: "parser"),
 	  date: Time.now.strftime("%Y-%m-%d_%H-%M-%S")
-        }
+	}
 	if task['chunk_size'] == 0
 	  task['chunk_size'] = DatasetUtils::CHUNK_SIZE
 	end
 	columns = [
-            Column.new(0, "supplier", :string),
+	    Column.new(0, "supplier", :string),
-            Column.new(1, "data_set", :string),
+	    Column.new(1, "data_set", :string),
 	    Column.new(2, "file", :string),
 	    Column.new(3, "extension", :string),
 	    Column.new(4, "data_chunk", :string),
 	    Column.new(5, "eof", :string),
 	    Column.new(6, "size", :string),
 	    Column.new(7, "hash", :string)
-          ]
+	  ]
-        yield(task, columns)
+	yield(task, columns)
      end
      def run(file_input)
-        @index = Index.instance().get()
+	@index = Index.instance().get()
 	@logger = LogManager.instance()
-        while file = file_input.next_file
+	while file = file_input.next_file
 	  begin
 	    filename = "file_from_#{task['input_plugin']}_#{task['date']}"
 	    each_chunk(file, filename, task['chunk_size']) do |record|
-              @page_builder.add(record)
+	      @page_builder.add(record)
-            end
+	    end
-            @page_builder.finish
+	    @page_builder.finish
 	    Index.instance().increase()
-          rescue java.lang.OutOfMemoryError
+	  rescue java.lang.OutOfMemoryError
 	    @logger.logOutOfMemoryError(path)
 	    return
 	  rescue Exception => e
@@ -67,18 +67,18 @@ module Embulk
 	    @logger.error(e.backtrace)
 	    puts "[INFO] For more detailed information, please refer to the log file: " + @logger.getLogPath()
 	  end
-        end
+	end
      end
      private
      def each_chunk(file, filename, chunk_size=DatasetUtils::CHUNK_SIZE)
 	extension = @index.to_s.rjust(3, "0")
 	npart = 0
-        next_byte = file.read(1)
+	next_byte = file.read(1)
 	first = TRUE
-        while true
+	while true
-            data = next_byte
+	    data = next_byte
-            if not next_byte
+	    if not next_byte
 		if first
 		  # this means this is an empty file
 		  values = [task['supplier'], task['data_set'], filename, extension, "", "", "", ""]
@@ -86,10 +86,10 @@ module Embulk
 		end
 		break
 	    end
-            data += file.read(chunk_size)
+	    data += file.read(chunk_size)
-            next_byte = file.read(1)
+	    next_byte = file.read(1)
-            if not next_byte
+	    if not next_byte
-                eof = DatasetUtils::EOF
+		eof = DatasetUtils::EOF
 		if first
 		  # this means that the whole file will be ingested at once (not split)
 		  eof = ""

--- a/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/wendelin_client.rb
+++ b/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/wendelin_client.rb
@@ -16,8 +16,15 @@ class WendelinClient
    @last_ingestion = Time.new - 2
  end
+  def checkReferenceChars(reference)
+    if ["&", ";", "#", "%", '"', "+"].any? { |char| reference.include?(char) }
+      raise "invalid char in filename. Following chars are not allowed for filenames: \& \; \% \" \+ \# Please rename it."
+    end
+  end
  def exists(reference)
-    uri = URI("#{@erp5_url}/ingestionReferenceExists?reference=#{reference}")
+    checkReferenceChars(reference)
+    uri = URI(URI.escape("#{@erp5_url}/ingestionReferenceExists?reference=#{reference}"))
    begin
      res = open(uri, http_basic_authentication: [@user, @password]).read
    rescue Exception => e
@@ -34,11 +41,25 @@ class WendelinClient
  def delete(reference)
    @logger.info("Deletion requested for reference #{reference}", print=TRUE)
-    uri = URI("#{@erp5_url}/ERP5Site_invalidateIngestionObjects?reference=#{reference}")
+    checkReferenceChars(reference)
+    uri = URI(URI.escape("#{@erp5_url}/ERP5Site_invalidateIngestionObjects?reference=#{reference}"))
    res = handleRequest(uri)
    if res == FALSE
      @logger.abortExecution()
    end
+    @logger.info("Remote file successfully ingested.", print=TRUE)
+  end
+  def rename(reference, new_reference)
+    @logger.info("Rename requested for reference #{reference}, new reference #{new_reference}", print=TRUE)
+    checkReferenceChars(reference)
+    checkReferenceChars(new_reference)
+    uri = URI(URI.escape("#{@erp5_url}/ERP5Site_renameIngestion?reference=#{reference}&new_reference=#{new_reference}"))
+    res = handleRequest(uri)
+    if res == FALSE
+      @logger.abortExecution()
+    end
+    @logger.info("Remote file successfully renamed.", print=TRUE)
  end
  def increaseDatasetVersion(reference)
@@ -46,12 +67,12 @@ class WendelinClient
      @logger.warn("Could not increase data set version because dataset reference is empty.")
    else
      @logger.info("Increasing dataset version")
-      uri = URI("#{@erp5_url}/ERP5Site_increaseDatasetVersion?reference=#{reference}")
+      uri = URI(URI.escape("#{@erp5_url}/ERP5Site_increaseDatasetVersion?reference=#{reference}"))
      begin
-        res = open(uri, http_basic_authentication: [@user, @password]).read
+	res = open(uri, http_basic_authentication: [@user, @password]).read
      rescue Exception => e
-        @logger.error("An error occurred while increasing dataset version: " + e.to_s)
+	@logger.error("An error occurred while increasing dataset version: " + e.to_s)
-        @logger.error(e.backtrace)
+	@logger.error(e.backtrace)
      end
    end
  end
@@ -63,21 +84,13 @@ class WendelinClient
 	sleep 3
      end
      if exists(reference)
-        @logger.info("There is another ingestion already done for the pair dataset-filename. Reference "\
+	@logger.info("There is another ingestion already done for the pair dataset-filename. Reference "\
-              + reference, print=TRUE)
+	      + reference, print=TRUE)
 	@logger.info("Rename your file or download the full dataset to make local changes.", print=TRUE)
-        return FALSE
+	return FALSE
-      end
-      if reference.include? "#" or reference.include? "+"
-	raise "invalid chars in file name. Please rename it."
-      end
-      begin
-      	uri = URI("#{@erp5_url}/ingest?reference=#{reference}")
-      rescue Exception => e
-        @logger.error("An error occurred while generating url: " + e.to_s)
-        @logger.error(e.backtrace)
-	raise "invalid chars in file name. Please rename it."
      end
+      checkReferenceChars(reference)
+      uri = URI(URI.escape("#{@erp5_url}/ingest?reference=#{reference}"))
      response = handleRequest(uri, reference, data_chunk)
      if response == FALSE
 	return FALSE
@@ -88,28 +101,28 @@ class WendelinClient
  end
  def eachDataStreamContentChunk(id, chunk_size)
-    uri = URI("#{@erp5_url}#{id}/getData")
+    uri = URI(URI.escape("#{@erp5_url}#{id}/getData"))
    @logger.info("Downloading...", print=TRUE)
    first = TRUE
    res = open(uri, http_basic_authentication: [@user, @password])  { 
      |content| 
-        while true
+	while true
-          chunk = content.read(chunk_size)
+	  chunk = content.read(chunk_size)
-          if chunk.nil?
+	  if chunk.nil?
 	    if first
 	      yield chunk
 	    end
 	    @logger.info("Done", print=TRUE)
-            break
+	    break
-          end
+	  end
 	  first = FALSE
-          yield chunk
+	  yield chunk
-        end
+	end
     }
   end
  def getDataStreams(data_set_reference)
-    uri = URI("#{@erp5_url}getDataStreamList?data_set_reference=#{data_set_reference}")
+    uri = URI(URI.escape("#{@erp5_url}getDataStreamList?data_set_reference=#{data_set_reference}"))
    str = handleRequest(uri)
    if str == FALSE
      @logger.abortExecution()
@@ -127,44 +140,44 @@ class WendelinClient
      req.basic_auth @user, @password
      if data_chunk != nil
-        @logger.info("Setting request form data...", print=TRUE)
+	@logger.info("Setting request form data...", print=TRUE)
-        begin
+	begin
-          req.set_form_data('data_chunk' => data_chunk)
+	  req.set_form_data('data_chunk' => data_chunk)
-        rescue java.lang.OutOfMemoryError
+	rescue java.lang.OutOfMemoryError
-          @logger.logOutOfMemoryError(reference)
+	  @logger.logOutOfMemoryError(reference)
-          return FALSE
+	  return FALSE
-        end
+	end
-        @logger.info("Sending record:'#{reference}'...", print=TRUE)
+	@logger.info("Sending record:'#{reference}'...", print=TRUE)
      end
      begin
-          res = Net::HTTP.start(uri.hostname, uri.port,
+	  res = Net::HTTP.start(uri.hostname, uri.port,
-	          :use_ssl      => (uri.scheme == 'https'),
+		  :use_ssl      => (uri.scheme == 'https'),
-	          :verify_mode  => OpenSSL::SSL::VERIFY_NONE,
+		  :verify_mode  => OpenSSL::SSL::VERIFY_NONE,
-	          :ssl_timeout  => 300, :open_timeout => 300, :read_timeout => 300,
+		  :ssl_timeout  => 300, :open_timeout => 300, :read_timeout => 300,
-	        ) do |http|
+		) do |http|
 		  http.request(req)
 		end
      rescue Exception => e  
-          @logger.error("HTTP ERROR: " + e.to_s, print=TRUE)
+	  @logger.error("HTTP ERROR: " + e.to_s, print=TRUE)
-          @logger.error(e.backtrace)
+	  @logger.error(e.backtrace)
 	  return FALSE
      else
-          if res.kind_of?(Net::HTTPSuccess) # res.code is 2XX
+	  if res.kind_of?(Net::HTTPSuccess) # res.code is 2XX
 	      @logger.info("Done")
 	      return res.body
-          else
+	  else
-              @logger.error("HTTP FAIL -  code: #{res.code}", print=TRUE)
+	      @logger.error("HTTP FAIL -  code: #{res.code}", print=TRUE)
 	      if res.code == '500' or res.code == '502' or res.code == '503'
-                @logger.error("Internal Server Error: if the error persists, please contact the administrator.", print=TRUE)
+		@logger.error("Internal Server Error: if the error persists, please contact the administrator.", print=TRUE)
 	      elsif res.code == '401'
-                @logger.error("Unauthorized access. Please check your user credentials and try again.", print=TRUE)
+		@logger.error("Unauthorized access. Please check your user credentials and try again.", print=TRUE)
 		@logger.abortExecution()
 	      else
-                @logger.error("Sorry, an error ocurred. If the error persists, please contact the administrator.", print=TRUE)
+		@logger.error("Sorry, an error ocurred. If the error persists, please contact the administrator.", print=TRUE)
 	      end
 	      return FALSE
-          end
+	  end
      end 
  end
 end
--- a/ebulk-data/example.md
+++ b/ebulk-data/example.md
+ebulk ingest-download tool examples
+Basic ingestion/download
+   ebulk pull <DATASET>
+      * downloads the content of target dataset
+   ebulk push <DATASET>
+      * ingests files into the target dataset
+   ebulk pull <DATASET> -d <PATH>
+      * downloads the content of target dataset in target PATH
+      * future operations on PATH directory will use the DATASET reference implicitly
+   ebulk push <DATASET> -c 20
+      * ingests files into the <DATASET> splitting them in chunks of 20MB
+   ebulk push <DATASET> -s <STORAGE>
+      * ingests the content of the input storage [http, ftp, s3] into the target dataset
+   ebulk push <DATASET> -s <STORAGE> --advanced
+      * allows the user to edit the configuration file of the selected storage
+   ebulk push <DATASET> --custom-storage 
+      * allows the user to install and configure a new input storage plugin
+Manage local changes
+   ebulk status <DATASET>
+      * checks local changes of target dataset
+   ebulk add <PATH>
+      * marks files in path for ingestion
+   ebulk remove <PATH>
+      * marks files in path for deletion
+   ebulk reset <PATH>
+      * resets marked files in path
+   ebulk pull --discard-changes
+      * discards local changes by checking the remote dataset
--- a/ebulk-data/help.md
+++ b/ebulk-data/help.md
 ebulk ingest-download tool help
-usage: ebulk <command> <dataset> [options...]
+  ebulk [-h|--help] [-r|--readme] [-e|--examples] <command> [<args>]
+	[-d|--directory <path>] [-c|--chunk <size>]
+	[-s|--storage <storage>] [-cs|--custom-storage]
+	[-a|--advanced] [-dc|--discard-changes]
 commands:
-   pull  <dataset>       Downloads the content of the target dataset from the site into the output folder
+   pull    [<dataset>]   Downloads the content of the target dataset from the site into the output location
-   push  <dataset>       Ingests the content of the input folder into a target dataset on the site
+   push    [<dataset>]   Ingests the content of the input location into a target dataset on the site
-   -h,   --help          Tool help
+   status  [<dataset>]   Lists the local changes of the target dataset
-   -r,   --readme        Opens README file
+   add     <path>	 Marks new or modified files in path for ingestion
+   remove  <path>	 Marks files in path for removal
+   reset   <path>	 Resets marked files in path
+   -h,  --help           Tool help
+   -r,  --readme         Opens README file
+   -e,  --examples       Shows some tool usage examples
 argument:
-   dataset     		 Mandatory. Unique reference for the target dataset
+   dataset argument	 Unique reference for the target dataset
+			 If omitted, the current directory is used as the dataset directory and its name as the reference
 			 It must start with a letter, and only alphanumerics, dots ( . ), underscores ( _ ) and hyphens ( - ) are allowed
 			 * For download, the reference must be one of the available datasets on the site
 			 * For ingestion, an existing reference will append the files to the corresponding dataset
 			 * A new reference will create a new dataset on the site
-			 It could be a path, then the last directory will be interpreted as the reference
+			 It can also be a path; in that case the name of its last directory is used as the dataset reference
 			 e.g. pull my_directory/sample/  -->  dataset reference will be "sample"
 options:
   -d,  --directory  <path>	  In addition to the dataset reference, sets the dataset directory and links that location to the reference
-   -c,  --chunk  <chunk>          Sets the chunk size (in megabytes) to split large files
+   -c,  --chunk  <size>           Sets the chunk size (in megabytes) to split large files
   -s,  --storage  <storage>	  Uses the selected input storage from this set: [http, ftp, s3]
   -cs, --custom-storage  	  Allows the user to set a new input storage.
   -a,  --advanced	  	  Allows the user to edit the Embulk configuration file of the input storage
+   -dc,  --discard-changes	  Discards local changes by checking the remote dataset
-examples:
-   ebulk pull <DATASET>
-      * downloads the content of target dataset
-   ebulk push <DATASET>
-      * ingests files into the target dataset
-   ebulk pull <DATASET> -d <PATH>
-      * downloads the content of target dataset in target PATH
-      * future operations on PATH directory will use the DATASET reference implicitly
-   ebulk push <DATASET> -c 20
-      * ingests files into the <DATASET> splitting them in chunks of 20MB
-   ebulk push <DATASET> -s <STORAGE>
-      * ingests the content of the input storage [http, ftp, s3] into the target dataset
-   ebulk push <DATASET> -s <STORAGE> --advanced
-      * allows the user to edit the configuration file of the selected storage
-   ebulk push <DATASET> --custom-storage 
-      * user can install and configure a new input plugin storage