Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cloud-consulting
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
nexedi
cloud-consulting
Commits
2c734ed1
Commit
2c734ed1
authored
Aug 03, 2013
by
Klaus Wölfel
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Adding automatic correction tool
parent
5a3853dc
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
193 additions
and
0 deletions
+193
-0
mltools/CorrectionRecommendation.py
mltools/CorrectionRecommendation.py
+193
-0
No files found.
mltools/CorrectionRecommendation.py
0 → 100644
View file @
2c734ed1
import
csv
,
datetime
,
os
,
subprocess
,
sys
,
time
,
xmlrpclib
def normalize(data):
    """Return a lowercased byte-string form of *data*.

    Unicode input is first encoded as UTF-8; anything else is passed
    through ``str()`` before lowercasing, matching the CSV export format.
    """
    value = data.encode('utf-8') if isinstance(data, unicode) else data
    return str(value).lower()
class
RFODTScorer
(
object
):
server_url
=
""
#command = "CloudIA/trunk/RF-ODT/auto-correct -D `expr $(wc -l < train.csv) - 1` -W 500 -T `wc -l < test.csv` -I 100 -S 150 -M 2 -E 0 train.csv test.csv"
command
=
"CloudIA/trunk/RF-ODT/auto-correct-knn -D `expr $(wc -l < train.csv) - 1` -W 500 -T `expr $(wc -l < test.csv) - 1` -K 2 train.csv test.csv"
def
__init__
(
self
,
home_path
):
self
.
skipped
=
[]
self
.
recommedation_lines_skipped
=
[]
self
.
base_path
=
os
.
path
.
join
(
home_path
,
"rfodt"
)
self
.
server
=
xmlrpclib
.
Server
(
self
.
server_url
)
self
.
question_group_list
=
self
.
server
.
question_module
.
QuestionModule_getQuestionGroupList
()
+
[]
def
writeTrainFile
(
self
,
train_file
,
valid_validation_state_tuple
=
(
'archived'
,
'released'
),
valid_causality_state_tuple
=
(
'validated'
),
language_tuple
=
(
'en'
,),
question_id_list
=
[]):
table
=
self
.
server
.
answer_set_module
.
\
AnswerSetModule_constructAnswerCorrectionTableToExport
(
valid_validation_state_tuple
,
valid_causality_state_tuple
,
language_tuple
,
question_id_list
)
train_writer
=
csv
.
writer
(
train_file
,
delimiter
=
';'
,
quoting
=
csv
.
QUOTE_NONE
)
train_writer
.
writerow
(
table
[
'header_row'
][
1
:])
for
row
in
table
[
'row_list'
]:
row
=
map
(
normalize
,
[
row
[
'question'
],
row
[
'answer'
]])
+
row
[
'correction_occurence_list'
]
train_writer
.
writerow
(
row
)
def
writeTestFile
(
self
,
test_file
,
valid_validation_state_tuple
=
(
'submitted'
,),
valid_causality_state_tuple
=
(
'draft'
,
'validated'
,
'invalidated'
),
language_tuple
=
(
'en'
,),
question_id_list
=
[],
answer_set_id_list
=
[]):
table
=
self
.
server
.
answer_set_module
.
\
AnswerSetModule_constructAnswerCorrectionTableToExport
(
valid_validation_state_tuple
,
valid_causality_state_tuple
,
language_tuple
,
question_id_list
,
answer_set_id_list
)
test_writer
=
csv
.
writer
(
test_file
,
delimiter
=
';'
,
quoting
=
csv
.
QUOTE_NONE
)
test_writer
.
writerow
(
[
'question'
,
'answer'
]
)
answer_key_dict
=
{}
for
row
in
table
[
'row_list'
]:
row_tuple
=
tuple
(
map
(
normalize
,
[
row
[
'question'
],
row
[
'answer'
]]))
test_writer
.
writerow
(
row_tuple
)
answer_key_dict
[
row_tuple
]
=
row
[
'answer_id'
]
#print row_tuple
return
answer_key_dict
def
setRecommendationDict
(
self
,
recommendation_file
,
answer_key_dict
,
memcached_key_prefix
):
for
number
,
row
in
enumerate
(
csv
.
reader
(
recommendation_file
,
delimiter
=
';'
)):
if
len
(
row
)
<
2
:
self
.
recommedation_lines_skipped
.
append
((
number
,
row
))
continue
answer_key
=
answer_key_dict
[(
row
[
0
],
row
[
1
][:
-
4
])]
scored_correction_list
=
[]
for
recommendation
in
row
[
2
:]:
correction
,
score
=
recommendation
.
split
(
':True('
)
score
=
float
(
score
[:
-
1
])
scored_correction_list
.
append
((
correction
,
score
))
print
answer_key
,
scored_correction_list
self
.
server
.
ERP5Site_setRecommendationItem
(
answer_key
,
scored_correction_list
,
memcached_key_prefix
)
def
getPath
(
self
,
group
):
return
os
.
path
.
join
(
self
.
base_path
,
'_'
.
join
(
group
)
or
"all"
)
def
getMemcachedKeyPrefix
(
self
,
group
):
if
group
:
return
'correction-recommendation-rf-odt'
else
:
return
'correction-recommendation-rf-odt-all'
def
downloadTrainDataForQuestionGroup
(
self
,
group
):
path
=
self
.
getPath
(
group
)
try
:
os
.
makedirs
(
path
)
except
OSError
,
e
:
if
e
.
errno
!=
17
:
raise
os
.
chdir
(
path
)
with
open
(
"train.csv"
,
"wb"
)
as
train_file
:
self
.
writeTrainFile
(
train_file
,
question_id_list
=
group
)
print
os
.
getcwd
()
print
subprocess
.
check_output
(
"dos2unix train.csv"
,
stderr
=
subprocess
.
STDOUT
,
shell
=
True
)
def
scoreQuestionGroup
(
self
,
group
,
answer_set_id_list
=
[]):
path
=
self
.
getPath
(
group
)
memcached_key_prefix
=
self
.
getMemcachedKeyPrefix
(
group
)
os
.
chdir
(
path
)
with
open
(
"test.csv"
,
"wb"
)
as
test_file
:
answer_key_dict
=
self
.
writeTestFile
(
test_file
,
question_id_list
=
group
,
answer_set_id_list
=
answer_set_id_list
)
print
os
.
getcwd
()
print
subprocess
.
check_output
(
"dos2unix test.csv"
,
stderr
=
subprocess
.
STDOUT
,
shell
=
True
)
try
:
print
subprocess
.
check_output
(
self
.
command
,
stderr
=
subprocess
.
STDOUT
,
shell
=
True
)
except
subprocess
.
CalledProcessError
as
e
:
print
e
.
output
if
e
.
output
.
find
(
"ldc must be >= MAX(N,1)"
)
!=
-
1
:
self
.
skipped
.
append
(
group
)
else
:
raise
else
:
with
open
(
"suggest.csv"
,
"rb"
)
as
recommendation_file
:
self
.
setRecommendationDict
(
recommendation_file
,
answer_key_dict
,
self
.
getMemcachedKeyPrefix
(
group
))
def
score
(
self
,
answer_set_id_list
=
[]):
for
group
in
self
.
question_group_list
:
self
.
scoreQuestionGroup
(
group
,
answer_set_id_list
)
print
"The following questions were skipped:"
print
self
.
skipped
self
.
skipped
=
[]
print
"The following recommendation lines were skipped:"
print
self
.
recommedation_lines_skipped
self
.
recommedation_lines_skipped
=
[]
def
downloadAllTrainingData
(
self
):
for
group
in
self
.
question_group_list
:
self
.
downloadTrainDataForQuestionGroup
(
group
)
def
scoreNewAnswerSets
():
answer_set_id_list
=
self
.
server
.
answer_set_module
.
AnswerSetModule_getNewSubmittedAnswerSetIdList
()
self
.
score
(
answer_set_id_list
)
class RapidScorer(object):
    """CSV exporter variant keyed by answer_id instead of question text.

    NOTE(review): this class defines no __init__ yet reads ``self.server``
    in both methods -- presumably an XML-RPC proxy is expected to be
    attached externally (cf. RFODTScorer); confirm before use.
    """

    def writeTrainFile(self, train_file,
                       valid_validation_state_tuple=('archived', 'released'),
                       # NOTE(review): ('validated') is a plain string, not a
                       # 1-tuple -- looks like a missing trailing comma; confirm
                       # what the server method expects.
                       valid_causality_state_tuple=('validated'),
                       language_tuple=('en',),
                       question_id_list=[], answer_set_id_list=[]):
        # Export the training table as "answer_id;answer;corrections..." CSV.
        table = self.server.answer_set_module.\
            AnswerSetModule_constructAnswerCorrectionTableToExport(
                valid_validation_state_tuple, valid_causality_state_tuple,
                language_tuple, question_id_list, answer_set_id_list)
        train_writer = csv.writer(train_file, delimiter=';',
                                  quoting=csv.QUOTE_NONE)
        # First header column is dropped from the export.
        train_writer.writerow(table['header_row'][1:])
        for row in table['row_list']:
            # Unlike RFODTScorer, normalize() is applied to the correction
            # occurrences as well, not just the leading two columns.
            row = map(normalize, [row['answer_id'], row['answer']]
                      + row['correction_occurence_list'])
            train_writer.writerow(row)

    def writeTestFile(self, test_file,
                      valid_validation_state_tuple=('submitted',),
                      valid_causality_state_tuple=('draft', 'validated',
                                                   'invalidated'),
                      language_tuple=('en',),
                      question_id_list=[]):
        # Export the test table as "answer_id;question_id;answer" CSV.
        # Note: no header row is written, and nothing is returned.
        table = self.server.answer_set_module.\
            AnswerSetModule_constructAnswerCorrectionTableToExport(
                valid_validation_state_tuple, valid_causality_state_tuple,
                language_tuple, question_id_list)
        test_writer = csv.writer(test_file, delimiter=';',
                                 quoting=csv.QUOTE_NONE)
        for row in table['row_list']:
            test_writer.writerow(
                map(normalize, [row['answer_id'], row['question_id'],
                                row['answer']])
            )
def
main
():
if
len
(
sys
.
argv
)
==
3
:
home_path
=
sys
.
argv
[
1
]
option
=
sys
.
argv
[
2
]
if
option
==
"scoreNewAnswerSets"
:
print
"Scoring new Answer Sets"
scorer
=
RFODTScorer
(
home_path
)
scorer
.
correctNewAnswerSets
()
return
elif
option
==
"downloadAllTrainingData"
:
print
"Downloading all training data"
scorer
=
RFODTScorer
(
home_path
)
scorer
.
downloadAllTrainingData
()
return
elif
option
==
"scoreAll"
:
print
"Scoring all Answer Sets"
scorer
=
RFODTScorer
(
home_path
)
scorer
.
score
()
return
elif
option
==
"downloadAndScoreAll"
:
print
"Downloading and scoring all Answer Sets"
scorer
=
RFODTScorer
(
home_path
)
scorer
.
downloadAllTrainingData
()
scorer
.
score
()
return
print
'Call with either "scoreNewAnswerSets", "downloadAllTrainingData", "scoreAll" or "downloadAndScoreAll" and "/data/path"'
if
__name__
==
"__main__"
:
main
()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment