Commit a3fe2add authored by yu's avatar yu

Finish the multi thread

parent e226df76
Copyright (C) 2016 Nexedi SA and Contributors
Klaus Wölfel <klaus@nexedi.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
MIT License
http://www.apache.org/licenses/LICENSE-2.0
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Filename file input plugin for Embulk
# Filename input plugin for Embulk
An Embulk file input plugin, similar to the local file input plugin, that overrides the FileInputStream read methods so that the filename is provided in the first bytes of the stream.
TODO: Write short description here and build.gradle file.
## Overview
* **Plugin type**: file input
* **Plugin type**: input
* **Resume supported**: yes
* **Cleanup supported**: yes
* **Guess supported**: no
## Configuration
- **option1**: path_prefix (string, required)
- **option1**: description (integer, required)
- **option2**: description (string, default: `"myvalue"`)
- **option3**: description (string, default: `null`)
## Example
seed.yml:
```yaml
exec:
min_output_tasks: 1
in:
type: filename
path_prefix: /path/to/my/files
parser:
type: none-bin
out:
type: wendelin
tag: my_tag
streamtool_uri: https://my_instance.host.vifib.net:/erp5/portal_ingestion_policies/my_ingestion_policy
user: my_user
password: my_password
```
## Install
```
$ embulk gem install embulk-input-filename embulk-parser-none-bin embulk-output-wendelin
option1: example1
option2: example2
```
## Run
```
$ embulk run seed.yml -c diff.yml
```
## Build
```
$ ./gradlew package
```
## Build Package
```
$ ./gradlew gem # -t to watch change of files and rebuild continuously
```
......@@ -14,16 +14,20 @@ configurations {
}
version = "0.1.0"
sourceCompatibility = 1.8
targetCompatibility = 1.8
dependencies {
compile "org.embulk:embulk-core:0.8.23"
provided "org.embulk:embulk-core:0.8.23"
compile "org.embulk:embulk-standards:0.8.23"
provided "org.embulk:embulk-standards:0.8.23"
compile "org.embulk:embulk-core:0.8.27"
provided "org.embulk:embulk-core:0.8.27"
compile "org.embulk:embulk-standards:0.8.27"
provided "org.embulk:embulk-standards:0.8.27"
compile "commons-codec:commons-codec:1.9"
// compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
testCompile "commons-codec:commons-codec:1.9"
testCompile "junit:junit:4.+"
testCompile "org.embulk:embulk-core:0.8.23:tests"
testCompile 'org.embulk:embulk-test:0.8.23'
testCompile "org.embulk:embulk-core:0.8.27:tests"
testCompile 'org.embulk:embulk-test:0.8.27'
}
test {
......@@ -31,6 +35,7 @@ test {
testLogging.showStandardStreams = true
}
task classpath(type: Copy, dependsOn: ["jar"]) {
doFirst { file("classpath").deleteDir() }
from (configurations.runtime - configurations.provided + files(jar.archivePath))
......@@ -66,9 +71,11 @@ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
script "pkg/${project.name}-${project.version}.gem"
}
task "package"(dependsOn: ["gemspec", "classpath"]) << {
println "> Build succeeded."
println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
// Aggregate build task: runs after "gemspec" and "classpath" (its declared
// dependencies) and then prints a hint on how to load the freshly built
// plugin from this directory. The messages are emitted in the task's
// doLast action, i.e. when the task executes.
task "package"(dependsOn: ["gemspec", "classpath"]) {
doLast {
println "> Build succeeded."
// file(".") is resolved by Gradle; its absolute path is the value to pass to embulk's -L option.
println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
}
}
task gemspec {
......@@ -79,12 +86,12 @@ task gemspec {
Gem::Specification.new do |spec|
spec.name = "${project.name}"
spec.version = "${project.version}"
spec.authors = ["Klaus W\xC3\xB6lfel"]
spec.summary = %[Filename file input plugin for Embulk]
spec.description = %[Reads files stored on Filename.]
spec.email = ["klaus@nexedi.com"]
spec.authors = ["yu"]
spec.summary = %[Filename input plugin for Embulk]
spec.description = %[Loads records from Filename.]
spec.email = ["icaiyu0618@gmail.com"]
spec.licenses = ["MIT"]
# TODO set this: spec.homepage = "https://github.com/klaus/embulk-input-filename"
# TODO set this: spec.homepage = "https://github.com/icaiyu0618/embulk-input-filename"
spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
spec.test_files = spec.files.grep(%r"^(test|spec)/")
......
# Writes a test data file made of a repeated 10-character pattern.
#   ARGV[0] - path of the file to (over)write
#   ARGV[1] - number of times the pattern "abcdefghij" is repeated, so the
#             resulting file holds 10 * ARGV[1] characters
pattern = "abcdefghij"
repeat_count = ARGV[1].to_i
payload = pattern * repeat_count
File.open(ARGV[0], 'w') do |out|
  out.write(payload)
end
#Wed Jan 13 12:41:02 JST 2016
#Sun Jan 08 00:35:58 PST 2017
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-3.2.1-bin.zip
......@@ -6,12 +6,30 @@
##
##############################################################################
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS=""
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS=""
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
......@@ -30,6 +48,7 @@ die ( ) {
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
CYGWIN* )
cygwin=true
......@@ -40,26 +59,11 @@ case "`uname`" in
MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
esac
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
......@@ -85,7 +89,7 @@ location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
......@@ -157,4 +161,9 @@ function splitJvmOpts() {
eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
if [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]]; then
cd "$(dirname "$0")"
fi
exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
......@@ -8,14 +8,14 @@
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=
@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome
......@@ -46,10 +46,9 @@ echo location of your Java installation.
goto fail
:init
@rem Get command-line arguments, handling Windowz variants
@rem Get command-line arguments, handling Windows variants
if not "%OS%" == "Windows_NT" goto win9xME_args
if "%@eval[2+2]" == "4" goto 4NT_args
:win9xME_args
@rem Slurp the command line arguments.
......@@ -60,11 +59,6 @@ set _SKIP=2
if "x%~1" == "x" goto execute
set CMD_LINE_ARGS=%*
goto execute
:4NT_args
@rem Get arguments from the 4NT Shell from JP Software
set CMD_LINE_ARGS=%$
:execute
@rem Setup the command line
......
Embulk::JavaPlugin.register_input(
"filename", "org.embulk.input.filename.FilenameFileInputPlugin",
"filename", "org.embulk.input.filename.FilenameInputPlugin",
File.expand_path('../../../../classpath', __FILE__))
package org.embulk.input.filename;
import java.util.List;
import java.util.ArrayList;
import com.google.common.base.Optional;
......@@ -31,14 +33,15 @@ public class JoinfileOutputPlugin
public interface PluginTask
extends Task
{
// configuration option 1 (required integer)
@Config("path_prefix")
public String getPathPrefix();
// configuration option 2 (optional string, null is not allowed)
@Config("file_ext")
public String getFileExt();
@Config("sum_type")
@ConfigDefault("filename")
public String getSumType();
......@@ -49,6 +52,8 @@ public class JoinfileOutputPlugin
private static FileOutputStream output = null;
private static ArrayList<String> lastP = new ArrayList<String> ();
private static String sumType;
@Override
......@@ -57,19 +62,16 @@ public class JoinfileOutputPlugin
OutputPlugin.Control control)
{
PluginTask task = config.loadConfig(PluginTask.class);
sumType = task.getSumType();
// retryable (idempotent) output:
// return resume(task.dump(), schema, taskCount, control);
// non-retryable (non-idempotent) output:
log.info("In the transaction " + config);
String path = task.getPathPrefix() + task.getFileExt();
sumType = task.getSumType();
log.info("The SumType is: " + sumType);
try {
output = new FileOutputStream(new File(path));
} catch (FileNotFoundException ex) {
......@@ -77,12 +79,17 @@ public class JoinfileOutputPlugin
}
// for the ConfigDiff, we set the last Path of each task is "" as default.
for (int i = 0 ; i< taskCount; i++)
{
lastP.add("");
}
control.run(task.dump());
closeFile();
log.info("In the transaction ");
return Exec.newConfigDiff();
}
......@@ -106,34 +113,37 @@ public class JoinfileOutputPlugin
{
PluginTask task = taskSource.loadTask(PluginTask.class);
log.info("In the open " + taskSource.toString()+ " # " + taskIndex);
final int ind = taskIndex;
return new TransactionalPageOutput(){
//private final List<String> filenames = new ArrayList<>() ;
public void add(Page page){
log.info("The ADD: " + page.getStringReferences() + " ## " +page.getValueReferences());
//log.info("The ADD: " + page.getStringReferences() + " ## " +page.getValueReferences());
try {
//log.info("The content: " + page.getStringReference(0));
if (sumType.equals("filename")){
String line = page.getStringReference(1) + "\n";
output.write(line.getBytes());
} else{
String line = page.getStringReference(0) + "\n";
output.write(line.getBytes());
}
List<String> pageArray = page.getStringReferences();
String content = page.getStringReference(0);
String line = page.getStringReference(1) + "\n";
String tag = page.getStringReference(1);
if (sumType.equals("filename")){
output.write(line.getBytes());
}else{
output.write(content.getBytes());
}
lastP.set(ind ,tag);
} catch (IOException ex) {
throw new RuntimeException(ex);
throw new RuntimeException(ex);
}
}
public void finish(){
log.info("Finished");
//log.info("Finished");
}
public void close(){
log.info("closed");
//log.info("closed");
}
public void abort(){
......
package org.embulk.input.filename;
import org.embulk.config.Config;
import org.embulk.config.ConfigDefault;
import org.embulk.config.ConfigDiff;
import org.embulk.config.ConfigSource;
import org.embulk.config.Task;
import org.embulk.config.TaskSource;
import org.embulk.spi.ParserPlugin;
import org.embulk.spi.FileInput;
import org.embulk.spi.PageOutput;
import org.embulk.spi.Schema;
import org.embulk.spi.SchemaConfig;
import org.embulk.spi.Exec;
import org.embulk.spi.PageBuilder;
import org.embulk.spi.util.FileInputInputStream;
import org.embulk.spi.ColumnConfig;
import java.io.IOException;
import java.util.Arrays;
import java.util.ArrayList;
import org.apache.commons.codec.binary.Base64;
import static org.embulk.spi.type.Types.STRING;
import org.slf4j.Logger;
/**
 * Parser plugin that turns each input file into one or more two-column
 * records: the first column carries a Base64-encoded chunk of the file's
 * bytes, the second column ("tag") carries the path read from the head of
 * the stream.
 *
 * <p>Expected stream layout per file, as consumed by {@link #run}: up to
 * {@code MAX_NAME_LENGTH} bytes holding the path (terminated/padded with
 * zero bytes), followed by the payload bytes until end of file.
 */
public class NoneBinParserPlugin
        implements ParserPlugin
{
    /** Size in bytes of the path header that precedes the payload of every file. */
    static int MAX_NAME_LENGTH = 255;

    // Kept on the instance (not only passed to control.run) because unit tests
    // convert the parser output back to Java objects based on this schema.
    Schema schema;

    public interface PluginTask
            extends Task //, LineDecoder.DecoderTask //, TimestampParser.Task
    {
        /** Name of the column that receives the Base64 payload; defaults to "payload". */
        @Config("column_name")
        @ConfigDefault("\"payload\"")
        public String getColumnName();
    }

    private final Logger log;

    public NoneBinParserPlugin()
    {
        this.log = Exec.getLogger(NoneBinParserPlugin.class);
    }

    /**
     * Builds the two-column STRING schema (configured payload column + "tag")
     * and hands it, together with the dumped task, to the control.
     */
    @Override
    public void transaction(ConfigSource config, ParserPlugin.Control control)
    {
        PluginTask task = config.loadConfig(PluginTask.class);
        log.info("The ConfigSource is: " + config.toString());

        ArrayList<ColumnConfig> columns = new ArrayList<ColumnConfig>();
        final String columnName = task.getColumnName();
        columns.add(new ColumnConfig(columnName, STRING, config));
        columns.add(new ColumnConfig("tag", STRING, config));

        this.schema = new SchemaConfig(columns).toSchema();
        control.run(task.dump(), this.schema);
    }

    /**
     * Reads every file of {@code input}: first the path header, then the
     * payload, which is emitted as Base64 strings in chunks of at most 1 MiB
     * (one record per chunk, each tagged with the path).
     *
     * <p>The {@link PageBuilder} is managed by try-with-resources so that it is
     * closed even when reading or record building throws.
     */
    @Override
    public void run(TaskSource taskSource, Schema schema,
            FileInput input, PageOutput output)
    {
        PluginTask task = taskSource.loadTask(PluginTask.class);
        log.info("The taskSource of the Parser: "+ taskSource.toString());

        FileInputInputStream dataIn = new FileInputInputStream(input);
        final int chunksize = 1024 * 1024 * 1;

        try (PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
            while (input.nextFile()) {
                String path = readPath(dataIn);

                // The payload is read byte by byte into a chunk buffer; every
                // time the buffer is full it is flushed as one record.
                byte[] bytesArray = new byte[chunksize];
                int len = 0;
                int nextByte = dataIn.read();
                while (nextByte != -1) {
                    bytesArray[len] = (byte) nextByte;
                    nextByte = dataIn.read();
                    len += 1;
                    if (len == chunksize) {
                        log.info(path);
                        pageBuilder.setString(0, Base64.encodeBase64String(bytesArray));
                        pageBuilder.setString(1, path);
                        pageBuilder.addRecord();
                        len = 0;
                    }
                }

                // Flush the trailing partial chunk, if any.
                if (len != 0) {
                    pageBuilder.setString(0, Base64.encodeBase64String(Arrays.copyOfRange(bytesArray, 0, len)));
                    pageBuilder.setString(1, path);
                    pageBuilder.addRecord();
                }
            }
            pageBuilder.finish();
        }
    }

    /**
     * Consumes the path header from the current file and returns the path.
     *
     * <p>Reads until a zero byte, end of file, or {@code MAX_NAME_LENGTH}
     * bytes, whichever comes first. After a zero byte the rest of the
     * fixed-size header is skipped so the stream is positioned at the payload.
     */
    private static String readPath(FileInputInputStream dataIn)
    {
        byte[] pathBytes = new byte[MAX_NAME_LENGTH];
        int length = 0;
        for (; length < MAX_NAME_LENGTH; length++) {
            int c = dataIn.read();
            if (c == -1) {
                break;
            }
            if (c == 0) {
                // Skip the remaining padding bytes of the header.
                for (int j = length + 1; j < MAX_NAME_LENGTH; j++) {
                    dataIn.read();
                }
                break;
            }
            pathBytes[length] = (byte) c;
        }
        // NOTE(review): decodes with the platform default charset, as before;
        // presumably the input plugin encodes the path the same way - confirm.
        return new String(pathBytes, 0, length);
    }
}
......@@ -8,7 +8,6 @@ import java.nio.file.attribute.BasicFileAttributeView;
import java.nio.file.attribute.FileTime;
import java.util.Comparator;
import org.apache.commons.codec.binary.Base64;
import org.embulk.config.ConfigSource;
import org.embulk.config.ConfigDiff;
......@@ -37,7 +36,7 @@ import static org.embulk.test.EmbulkTests.readSortedFile;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
public class TestFilenameFileInputPlugin
public class TestFilenameInputPlugin
{
public static FileTime getCreationTime(String filename) throws IOException{
......@@ -59,11 +58,10 @@ public class TestFilenameFileInputPlugin
@Rule
public TestHelper embulk = TestHelper.builder()
.registerPlugin(InputPlugin.class,"filename",FilenameFileInputPlugin.class)
.registerPlugin(ParserPlugin.class,"none-bin",NoneBinParserPlugin.class)
.registerPlugin(InputPlugin.class,"filename",FilenameInputPlugin.class)
.registerPlugin(OutputPlugin.class,"joinfile",JoinfileOutputPlugin.class)
.build();
@Test
public void testOrderByModifiedTime() throws Exception{
......@@ -72,12 +70,14 @@ public class TestFilenameFileInputPlugin
.set("max_threads","1");
Path path_src = Paths.get("src/test/resources/testModifiedOrder");
ArrayList<String> multi_dir = new ArrayList<String> ();
multi_dir.add(path_src.toAbsolutePath().toString()+"/sample_");
ConfigSource inConfig = embulk.newConfig()
.set("type","filename")
.set("path_prefix",path_src.toAbsolutePath().toString()+"/sample_")
.set("order_by_modified_time","2")
.set("parser",embulk.newConfig().set("type","none-bin"));
.set("multi_dir",multi_dir)
.set("order_by_modified_time","2");
Path tmp = embulk.createTempDir();
ConfigSource outConfig = embulk.newConfig()
......@@ -110,8 +110,8 @@ public class TestFilenameFileInputPlugin
}
});
//System.out.println(lines);
//System.out.println(actual);
//System.out.println("The lines" + lines);
//System.out.println("The actual" + actual);
assertEquals(lines,actual);
inConfig.set("order_by_modified_time","1");
......@@ -143,10 +143,7 @@ public class TestFilenameFileInputPlugin
.set("type","filename")
.set("order_by_modified_time","2")
.set("multi_dir",multi_dir)
.set("multi_tag",multi_tag)
.set("path_prefix","/home/chronos/user/Downloads/embulk-input-filename/src/test/resources/testDirList/example/example_")
.set("parser",embulk.newConfig().set("type","none-bin"));
.set("multi_tag",multi_tag);
System.out.println(inConfig);
Path tmp = embulk.createTempDir();
......@@ -238,9 +235,7 @@ public class TestFilenameFileInputPlugin
.set("type","filename")
.set("order_by_modified_time","2")
.set("multi_dir",multi_dir)
.set("path_prefix","/home/chronos/user/Downloads/embulk-input-filename/src/test/resources/testDirList/example/example_")
.set("parser",embulk.newConfig().set("type","none-bin"));
.set("path_prefix","/home/chronos/user/Downloads/embulk-input-filename/src/test/resources/testDirList/example/example_");
Path tmp = embulk.createTempDir();
ConfigSource outConfig = embulk.newConfig()
......@@ -298,17 +293,20 @@ public class TestFilenameFileInputPlugin
assertEquals(lines,dir1);
}
@Test
public void testBase64() throws Exception{
public void testContent() throws Exception{
ConfigSource execConfig = embulk.newConfig()
.set("max_threads","1");
Path path_src = Paths.get("src/test/resources/data");
ArrayList<String> multi_dir = new ArrayList<String> ();
multi_dir.add(path_src.toAbsolutePath().toString()+"/test.csv");
ConfigSource inConfig = embulk.newConfig()
.set("type","filename")
.set("path_prefix",path_src.toAbsolutePath().toString()+"/test.csv")
.set("multi_dir",multi_dir)
.set("parser",embulk.newConfig().set("type","none-bin"));
Path tmp = embulk.createTempDir();
......@@ -324,10 +322,10 @@ public class TestFilenameFileInputPlugin
List<String> lines = Files.readAllLines(Paths.get(tmp.toString()+"/outputfile.txt"));
List<String> actual = Files.readAllLines(Paths.get(path_src+"/test.csv"));
//System.out.println(lines);
String ans = String.join("\n",actual) + "\n";
String actual_bytes = Base64.encodeBase64String(ans.getBytes());
assertEquals(lines.get(0),actual_bytes);
//System.out.println("The lines " + lines);
//System.out.println("The actual " + actual);
assertEquals(actual,lines);
}
}
afdasfgdfagdjg;ashdgklhdg;khdkjgndk;sagbnkadbnkghadskjgnvkdavbdfjbngkj;ldng;khg
hakd;hfehfkajdlgdabdba;hjag;sdgnkdngk;adsngjghkhjlkjljojaldfjlanf;aknhgk;adhg;ajg;lag
asdfgalkdhgkajdbngkahdgkahdkgndksjngkhkhjljiangladfgsdf
adfbkaldfhakdslhfkaldsh
abcdefghijlkjfafhodmjjmkdf
afkhjdofa;j;djfl;ajflkasjdfk;ankfjlndhkajlhgkalhgklahglkl
afhgakdhfgklasdhgkahknkdanfkhkhnkljahdfkanfhjjianlgla
afjljl;j;ajkajkldfakfhakjfdlajfldsjflajdslfjaldjfl
afjlkadsjflajlfjdlasfjlas
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment