Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
E
embulk-input-filename
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Klaus Wölfel
embulk-input-filename
Commits
554744bd
Commit
554744bd
authored
Aug 08, 2017
by
yu
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'fixTheLastPath' into multiThread
fix the multi dir problem and single dir problem
parents
58d3b89e
996b5c7a
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
74 additions
and
84 deletions
+74
-84
src/main/java/org/embulk/input/filename/FilenameInputPlugin.java
...n/java/org/embulk/input/filename/FilenameInputPlugin.java
+66
-75
src/test/java/org/embulk/input/filename/TestFilenameInputPlugin.java
...va/org/embulk/input/filename/TestFilenameInputPlugin.java
+8
-9
No files found.
src/main/java/org/embulk/input/filename/FilenameInputPlugin.java
View file @
554744bd
...
...
@@ -21,7 +21,11 @@ import java.io.ByteArrayOutputStream;
import
com.google.common.base.Optional
;
import
org.apache.commons.codec.binary.Base64
;
//import org.apache.commons.io.IOUtils;
import
org.slf4j.Logger
;
import
org.embulk.config.Config
;
import
org.embulk.config.ConfigDefault
;
import
org.embulk.config.ConfigDiff
;
...
...
@@ -57,25 +61,18 @@ public class FilenameInputPlugin
@ConfigDefault
(
"[]"
)
ArrayList
<
String
>
getMultiTag
();
@Config
(
"last_path"
)
@ConfigDefault
(
"null"
)
Optional
<
String
>
getLastPath
();
@Config
(
"order_by_modified_time"
)
@ConfigDefault
(
"0"
)
int
getOrderByModifiedTime
();
@Config
(
"order_by_creation_time"
)
@ConfigDefault
(
"0"
)
int
getOrderByCreationTime
();
@Config
(
"lastPaths"
)
@ConfigDefault
(
"[]"
)
ArrayList
<
String
>
getLastPaths
();
@Config
(
"chunk_size"
)
@ConfigDefault
(
"10485760"
)
int
getChunkSize
();
@Config
(
"file_size"
)
@ConfigDefault
(
"null"
)
Optional
<
Integer
>
getFileSize
();
@Config
(
"load_order"
)
@ConfigDefault
(
"\"\""
)
String
getLoadOrder
();
@Config
(
"follow_symlinks"
)
@ConfigDefault
(
"false"
)
...
...
@@ -95,11 +92,10 @@ public class FilenameInputPlugin
private
final
static
Path
CURRENT_DIR
=
Paths
.
get
(
"."
).
normalize
();
private
static
ArrayList
<
String
>
tagList
;
private
static
ArrayList
<
String
>
lastPaths
;
private
static
int
chunkSize
;
private
static
ArrayList
<
String
>
last_p
=
new
ArrayList
<
String
>();
@Override
public
ConfigDiff
transaction
(
ConfigSource
config
,
InputPlugin
.
Control
control
)
...
...
@@ -108,6 +104,7 @@ public class FilenameInputPlugin
chunkSize
=
task
.
getChunkSize
();
ArrayList
<
String
>
dirList
=
task
.
getMultiDir
();
ArrayList
<
String
>
lastPaths
=
task
.
getLastPaths
();
ArrayList
<
ArrayList
<
String
>>
allFiles
=
new
ArrayList
<
ArrayList
<
String
>>();
tagList
=
task
.
getMultiTag
();
...
...
@@ -117,20 +114,27 @@ public class FilenameInputPlugin
// If there are fewer tags than directories, the missing tags default to ""
tagList
.
add
(
""
);
}
while
(
lastPaths
.
size
()
<
dirList
.
size
()){
lastPaths
.
add
(
""
);
}
}
else
{
throw
new
RuntimeException
(
"The multi_dir should contain at least 1 directory."
);
}
for
(
String
dir
:
dirList
){
ArrayList
<
String
>
files
=
listFiles
(
task
,
Paths
.
get
(
dir
).
normalize
());
for
(
int
i
=
0
;
i
<
dirList
.
size
();
i
++
){
String
dir
=
dirList
.
get
(
i
);
String
lastPath
=
lastPaths
.
get
(
i
);
String
order
=
task
.
getLoadOrder
();
if
(
order
.
equals
(
""
)){
order
=
"ALPHABETICAL"
;}
ArrayList
<
String
>
files
=
listFiles
(
task
,
Paths
.
get
(
dir
).
normalize
(),
lastPath
,
order
);
// Sort the files of each directory
int
order_modified
=
task
.
getOrderByModifiedTime
();
int
order_creation
=
task
.
getOrderByCreationTime
();
if
(
order
_modified
==
0
&&
order_creation
==
0
){
if
(
order
.
equals
(
"ALPHABETICAL"
)
){
Collections
.
sort
(
files
);
}
else
if
(
order
_creation
==
0
){
}
else
if
(
order
.
equals
(
"ASCEND_MODIFIED"
)
||
order
.
equals
(
"DESCEND_MODIFIED"
)
){
Collections
.
sort
(
files
,
new
Comparator
<
String
>(){
@Override
public
int
compare
(
String
f1
,
String
f2
)
{
...
...
@@ -142,10 +146,8 @@ public class FilenameInputPlugin
return
0
;
}
});
if
(
order_modified
==
1
)
{
Collections
.
reverse
(
files
);
}
}
else
if
(
order_modified
==
0
){
if
(
order
.
equals
(
"DESCEND_MODIFIED"
)){
Collections
.
reverse
(
files
);
}
}
else
if
(
order
.
equals
(
"ASCEND_CREATION"
)
||
order
.
equals
(
"DESCEND_CREATION"
)
){
Collections
.
sort
(
files
,
new
Comparator
<
String
>(){
@Override
public
int
compare
(
String
f1
,
String
f2
)
{
...
...
@@ -158,39 +160,23 @@ public class FilenameInputPlugin
}
});
if
(
order
_creation
==
1
)
{
Collections
.
reverse
(
files
);}
if
(
order
.
equals
(
"DESCEND_CREATION"
)
)
{
Collections
.
reverse
(
files
);}
}
else
{
throw
new
RuntimeException
(
"
Could not order by creation time and lasModified time at the same time
"
);
throw
new
RuntimeException
(
"
Input a correct order
"
);
}
// End of sort
log
.
info
(
"The files is "
+
files
);
allFiles
.
add
(
files
);
last_p
.
add
(
files
.
get
(
0
));
}
int
taskCount
;
// If we upload only one directory, we set each file as a task.
// In this case the max_threads must equal 1 to keep the file uploading order
if
(
dirList
.
size
()
==
1
){
ArrayList
<
ArrayList
<
String
>>
oneFile
=
new
ArrayList
<
ArrayList
<
String
>>
();
for
(
String
f
:
allFiles
.
get
(
0
)){
ArrayList
<
String
>
file
=
new
ArrayList
<
String
>
();
file
.
add
(
f
);
oneFile
.
add
(
file
);
}
while
(
tagList
.
size
()<
oneFile
.
size
()){
tagList
.
add
(
tagList
.
get
(
0
));
}
task
.
setFiles
(
oneFile
);
taskCount
=
oneFile
.
size
();
last_p
=
new
ArrayList
<
String
>();
last_p
.
add
(
allFiles
.
get
(
0
).
get
(
0
));
}
else
{
int
taskCount
=
allFiles
.
size
();
task
.
setFiles
(
allFiles
);
taskCount
=
allFiles
.
size
();
}
ArrayList
<
ColumnConfig
>
columns
=
new
ArrayList
<
ColumnConfig
>();
//final String columnName = task.getColumnName();
...
...
@@ -203,6 +189,9 @@ public class FilenameInputPlugin
//Schema schema = task.getColumns().toSchema();
// number of run() method calls
log
.
info
(
"TASKCOUNT "
+
taskCount
);
return
resume
(
task
.
dump
(),
schema
,
taskCount
,
control
);
}
...
...
@@ -212,10 +201,7 @@ public class FilenameInputPlugin
InputPlugin
.
Control
control
)
{
control
.
run
(
taskSource
,
schema
,
taskCount
);
ConfigDiff
diff
=
Exec
.
newConfigDiff
();
diff
.
set
(
"last_path"
,
last_p
);
return
diff
;
return
Exec
.
newConfigDiff
();
}
@Override
...
...
@@ -234,6 +220,8 @@ public class FilenameInputPlugin
ArrayList
<
String
>
files
=
task
.
getFiles
().
get
(
taskIndex
);
log
.
info
(
"The files in the run:"
+
files
);
for
(
String
file
:
files
)
{
...
...
@@ -241,7 +229,6 @@ public class FilenameInputPlugin
{
int
nRead
;
byte
[]
data
=
new
byte
[
chunkSize
];
String
filename
=
new
File
(
file
).
getCanonicalPath
();
FileInputStream
dataIn
=
new
FileInputStream
(
file
);
ByteArrayOutputStream
buffer
=
new
ByteArrayOutputStream
();
...
...
@@ -250,18 +237,12 @@ public class FilenameInputPlugin
try
(
PageBuilder
pageBuilder
=
new
PageBuilder
(
Exec
.
getBufferAllocator
(),
schema
,
output
))
{
pageBuilder
.
setString
(
0
,
buffer
.
toString
());
//Base64.encodeBase64String(buffer.toByteArray()));
pageBuilder
.
setString
(
1
,
tagList
.
get
(
taskIndex
)
+
filename
);
pageBuilder
.
setString
(
1
,
tagList
.
get
(
taskIndex
)
+
new
File
(
file
).
getCanonicalPath
()
);
pageBuilder
.
addRecord
();
buffer
.
flush
();
pageBuilder
.
finish
();
}
}
if
(
last_p
.
size
()
>
1
)
{
last_p
.
set
(
taskIndex
,
filename
);
}
else
{
last_p
.
set
(
0
,
filename
);
}
}
catch
(
IOException
ex
){
ex
.
printStackTrace
();
}
...
...
@@ -279,7 +260,9 @@ public class FilenameInputPlugin
}
public
ArrayList
<
String
>
listFiles
(
PluginTask
task
,
Path
pathPrefix
)
public
ArrayList
<
String
>
listFiles
(
PluginTask
task
,
Path
pathPrefix
,
String
lastPath
,
String
order
)
{
//Path pathPrefix = Paths.get(task.getPathPrefix()).normalize();
final
Path
directory
;
...
...
@@ -295,8 +278,6 @@ public class FilenameInputPlugin
//final ImmutableList.Builder<String> builder = ImmutableList.builder();
final
ArrayList
<
String
>
filesArray
=
new
ArrayList
<
String
>();
final
String
lastPath
=
task
.
getLastPath
().
orNull
();
final
Integer
fileSize
=
task
.
getFileSize
().
orNull
();
try
{
log
.
info
(
"Listing local files at directory '{}' filtering filename by prefix '{}'"
,
directory
.
equals
(
CURRENT_DIR
)
?
"."
:
directory
.
toString
(),
fileNamePrefix
);
Files
.
walkFileTree
(
directory
,
new
SimpleFileVisitor
<
Path
>()
{
...
...
@@ -322,19 +303,29 @@ public class FilenameInputPlugin
@Override
public
FileVisitResult
visitFile
(
Path
path
,
BasicFileAttributes
attrs
)
{
if
(
lastPath
!=
null
&&
path
.
toString
().
compareTo
(
lastPath
)
<=
0
)
{
try
{
if
(
!
lastPath
.
equals
(
""
)
&&
order
.
equals
(
"ALPHABETICAL"
)
&&
path
.
toString
().
compareTo
(
lastPath
)
<=
0
)
{
return
FileVisitResult
.
CONTINUE
;
}
else
if
(!
lastPath
.
equals
(
""
)
&&
order
.
equals
(
"ASCEND_MODIFIED"
)
&&
getLastModifiedTime
(
path
.
toString
()).
compareTo
(
getLastModifiedTime
(
lastPath
))
<=
0
)
{
return
FileVisitResult
.
CONTINUE
;
}
else
if
(!
lastPath
.
equals
(
""
)
&&
order
.
equals
(
"DESCEND_MODIFIED"
)
&&
getLastModifiedTime
(
path
.
toString
()).
compareTo
(
getLastModifiedTime
(
lastPath
))
>=
0
){
return
FileVisitResult
.
CONTINUE
;
}
else
if
(!
lastPath
.
equals
(
""
)
&&
order
.
equals
(
"ASCEND_CREATION"
)
&&
getCreationTime
(
path
.
toString
()).
compareTo
(
getCreationTime
(
lastPath
))
<=
0
){
return
FileVisitResult
.
CONTINUE
;
}
else
if
(!
lastPath
.
equals
(
""
)
&&
order
.
equals
(
"DESCEND_MODIFIED"
)
&&
getCreationTime
(
path
.
toString
()).
compareTo
(
getCreationTime
(
lastPath
))
<=
0
)
{
return
FileVisitResult
.
CONTINUE
;
}
else
if
(
path
.
getFileName
().
toString
().
startsWith
(
"."
))
{
return
FileVisitResult
.
CONTINUE
;
}
else
{
if
(
path
.
getFileName
().
toString
().
startsWith
(
fileNamePrefix
))
{
if
(
fileSize
==
null
||
path
.
toFile
().
length
()
==
fileSize
)
{
//builder.add(path.toString());
filesArray
.
add
(
path
.
toString
());
}
}
return
FileVisitResult
.
CONTINUE
;
}
}
catch
(
IOException
e
){
throw
new
RuntimeException
(
"IOException during the uploading files"
);
}
}
});
}
catch
(
IOException
ex
)
{
...
...
src/test/java/org/embulk/input/filename/TestFilenameInputPlugin.java
View file @
554744bd
...
...
@@ -77,7 +77,7 @@ public class TestFilenameInputPlugin
ConfigSource
inConfig
=
embulk
.
newConfig
()
.
set
(
"type"
,
"filename"
)
.
set
(
"multi_dir"
,
multi_dir
)
.
set
(
"
order_by_modified_time"
,
"2
"
);
.
set
(
"
load_order"
,
"ASCEND_MODIFIED
"
);
Path
tmp
=
embulk
.
createTempDir
();
...
...
@@ -115,7 +115,7 @@ public class TestFilenameInputPlugin
//System.out.println("The actual" + actual);
assertEquals
(
lines
,
actual
);
inConfig
.
set
(
"
order_by_modified_time"
,
"1
"
);
inConfig
.
set
(
"
load_order"
,
"DESCEND_MODIFIED
"
);
res
=
embulk
.
runAllBuilder
(
execConfig
,
inConfig
,
outConfig
);
lines
=
Files
.
readAllLines
(
Paths
.
get
(
tmp
.
toString
()+
"/outputfile.txt"
));
...
...
@@ -143,7 +143,7 @@ public class TestFilenameInputPlugin
ConfigSource
inConfig
=
embulk
.
newConfig
()
.
set
(
"type"
,
"filename"
)
.
set
(
"
order_by_modified_time"
,
"2
"
)
.
set
(
"
load_order"
,
"ASCEND_MODIFIED
"
)
.
set
(
"multi_dir"
,
multi_dir
)
.
set
(
"multi_tag"
,
multi_tag
);
System
.
out
.
println
(
inConfig
);
...
...
@@ -236,9 +236,8 @@ public class TestFilenameInputPlugin
ConfigSource
inConfig
=
embulk
.
newConfig
()
.
set
(
"type"
,
"filename"
)
.
set
(
"order_by_modified_time"
,
"2"
)
.
set
(
"multi_dir"
,
multi_dir
)
.
set
(
"path_prefix"
,
"/home/chronos/user/Downloads/embulk-input-filename/src/test/resources/testDirList/example/example_"
);
.
set
(
"load_order"
,
"ASCEND_MODIFIED"
)
.
set
(
"multi_dir"
,
multi_dir
);
Path
tmp
=
embulk
.
createTempDir
();
ConfigSource
outConfig
=
embulk
.
newConfig
()
...
...
@@ -310,8 +309,8 @@ public class TestFilenameInputPlugin
multi_dir
.
add
(
path_src
.
toAbsolutePath
().
toString
()+
"/test.csv"
);
ConfigSource
inConfig
=
embulk
.
newConfig
()
.
set
(
"type"
,
"filename"
)
.
set
(
"
multi_dir"
,
multi_dir
)
.
set
(
"
parser"
,
embulk
.
newConfig
().
set
(
"type"
,
"none-bin"
)
);
.
set
(
"
load_order"
,
"ALPHABETICAL"
)
.
set
(
"
multi_dir"
,
multi_dir
);
Path
tmp
=
embulk
.
createTempDir
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment