Commit 2c734ed1 authored by Klaus Wölfel's avatar Klaus Wölfel

Adding automatic correction tool

parent 5a3853dc
import csv, datetime, os, subprocess, sys, time, xmlrpclib
def normalize(data):
    """Return *data* as a lowercased UTF-8 byte string.

    Unicode input is UTF-8 encoded first; any other value is passed
    through str() before lowercasing.
    """
    if isinstance(data, unicode):
        return data.encode('utf-8').lower()
    return str(data).lower()
class RFODTScorer(object):
server_url = ""
#command = "CloudIA/trunk/RF-ODT/auto-correct -D `expr $(wc -l < train.csv) - 1` -W 500 -T `wc -l < test.csv` -I 100 -S 150 -M 2 -E 0 train.csv test.csv"
command = "CloudIA/trunk/RF-ODT/auto-correct-knn -D `expr $(wc -l < train.csv) - 1` -W 500 -T `expr $(wc -l < test.csv) - 1` -K 2 train.csv test.csv"
def __init__(self, home_path):
self.skipped = []
self.recommedation_lines_skipped = []
self.base_path = os.path.join(home_path, "rfodt")
self.server = xmlrpclib.Server(self.server_url)
self.question_group_list = self.server.question_module.QuestionModule_getQuestionGroupList() + []
def writeTrainFile(self, train_file,
valid_validation_state_tuple = ('archived', 'released'),
valid_causality_state_tuple = ('validated'),
language_tuple = ('en',), question_id_list = []):
table = self.server.answer_set_module.\
AnswerSetModule_constructAnswerCorrectionTableToExport(
valid_validation_state_tuple, valid_causality_state_tuple,
language_tuple, question_id_list)
train_writer = csv.writer(train_file, delimiter=';', quoting=csv.QUOTE_NONE)
train_writer.writerow(table['header_row'][1:])
for row in table['row_list']:
row = map(normalize, [row['question'], row['answer']]) + row['correction_occurence_list']
train_writer.writerow(row)
def writeTestFile(self, test_file,
valid_validation_state_tuple = ('submitted',),
valid_causality_state_tuple = ('draft', 'validated','invalidated'),
language_tuple = ('en',), question_id_list = [],
answer_set_id_list = []):
table = self.server.answer_set_module.\
AnswerSetModule_constructAnswerCorrectionTableToExport(
valid_validation_state_tuple, valid_causality_state_tuple,
language_tuple, question_id_list, answer_set_id_list)
test_writer = csv.writer(test_file, delimiter=';', quoting=csv.QUOTE_NONE)
test_writer.writerow( ['question', 'answer'] )
answer_key_dict = {}
for row in table['row_list']:
row_tuple = tuple(map(normalize, [row['question'], row['answer']]))
test_writer.writerow( row_tuple )
answer_key_dict[row_tuple] = row['answer_id']
#print row_tuple
return answer_key_dict
def setRecommendationDict(self, recommendation_file, answer_key_dict, memcached_key_prefix):
for number, row in enumerate(csv.reader(recommendation_file, delimiter=';')):
if len(row) < 2:
self.recommedation_lines_skipped.append((number, row))
continue
answer_key = answer_key_dict[(row[0], row[1][:-4])]
scored_correction_list = []
for recommendation in row[2:]:
correction, score = recommendation.split(':True(')
score = float(score[:-1])
scored_correction_list.append((correction, score))
print answer_key, scored_correction_list
self.server.ERP5Site_setRecommendationItem(answer_key,
scored_correction_list, memcached_key_prefix)
def getPath(self, group):
return os.path.join(self.base_path, '_'.join(group) or "all")
def getMemcachedKeyPrefix(self, group):
if group:
return 'correction-recommendation-rf-odt'
else:
return 'correction-recommendation-rf-odt-all'
def downloadTrainDataForQuestionGroup(self, group):
path = self.getPath(group)
try:
os.makedirs(path)
except OSError, e:
if e.errno != 17:
raise
os.chdir(path)
with open("train.csv", "wb") as train_file:
self.writeTrainFile(train_file, question_id_list = group)
print os.getcwd()
print subprocess.check_output("dos2unix train.csv",
stderr=subprocess.STDOUT, shell=True)
def scoreQuestionGroup(self, group, answer_set_id_list = []):
path = self.getPath(group)
memcached_key_prefix = self.getMemcachedKeyPrefix(group)
os.chdir(path)
with open("test.csv", "wb") as test_file:
answer_key_dict = self.writeTestFile(test_file, question_id_list=group,
answer_set_id_list=answer_set_id_list)
print os.getcwd()
print subprocess.check_output("dos2unix test.csv", stderr=subprocess.STDOUT,
shell=True)
try:
print subprocess.check_output(self.command, stderr=subprocess.STDOUT,
shell=True)
except subprocess.CalledProcessError as e:
print e.output
if e.output.find("ldc must be >= MAX(N,1)") != -1:
self.skipped.append(group)
else:
raise
else:
with open("suggest.csv", "rb") as recommendation_file:
self.setRecommendationDict(recommendation_file, answer_key_dict,
self.getMemcachedKeyPrefix(group))
def score(self, answer_set_id_list = []):
for group in self.question_group_list:
self.scoreQuestionGroup(group, answer_set_id_list)
print "The following questions were skipped:"
print self.skipped
self.skipped = []
print "The following recommendation lines were skipped:"
print self.recommedation_lines_skipped
self.recommedation_lines_skipped = []
def downloadAllTrainingData(self):
for group in self.question_group_list:
self.downloadTrainDataForQuestionGroup(group)
def scoreNewAnswerSets():
answer_set_id_list = self.server.answer_set_module.AnswerSetModule_getNewSubmittedAnswerSetIdList()
self.score(answer_set_id_list)
class RapidScorer(object):
    """Export train/test CSVs keyed by answer id.

    NOTE(review): this class references ``self.server`` but defines no
    __init__ to create it -- it looks unfinished; callers must attach a
    ``server`` XML-RPC proxy to the instance before use. TODO confirm.
    """

    def writeTrainFile(self, train_file,
                       valid_validation_state_tuple=('archived', 'released'),
                       # BUG FIX: was ('validated') -- a plain string, not a
                       # one-element tuple (compare the test-file default).
                       valid_causality_state_tuple=('validated',),
                       language_tuple=('en',), question_id_list=[],
                       answer_set_id_list=[]):
        """Write the training CSV: answer_id;answer;correction counts...

        Unlike RFODTScorer.writeTrainFile, every cell (including the
        correction occurrence counts) goes through normalize().
        """
        table = self.server.answer_set_module.\
            AnswerSetModule_constructAnswerCorrectionTableToExport(
                valid_validation_state_tuple, valid_causality_state_tuple,
                language_tuple, question_id_list, answer_set_id_list)
        train_writer = csv.writer(train_file, delimiter=';',
                                  quoting=csv.QUOTE_NONE)
        # Drop the first header column, as in RFODTScorer.
        train_writer.writerow(table['header_row'][1:])
        for row in table['row_list']:
            train_writer.writerow(map(
                normalize,
                [row['answer_id'], row['answer']]
                + row['correction_occurence_list']))

    def writeTestFile(self, test_file,
                      valid_validation_state_tuple=('submitted',),
                      valid_causality_state_tuple=('draft', 'validated',
                                                   'invalidated'),
                      language_tuple=('en',), question_id_list=[]):
        """Write the test CSV: answer_id;question_id;answer per row."""
        table = self.server.answer_set_module.\
            AnswerSetModule_constructAnswerCorrectionTableToExport(
                valid_validation_state_tuple, valid_causality_state_tuple,
                language_tuple, question_id_list)
        test_writer = csv.writer(test_file, delimiter=';',
                                 quoting=csv.QUOTE_NONE)
        for row in table['row_list']:
            test_writer.writerow(map(
                normalize,
                [row['answer_id'], row['question_id'], row['answer']]))
def main():
if len(sys.argv) == 3:
home_path = sys.argv[1]
option = sys.argv[2]
if option == "scoreNewAnswerSets":
print "Scoring new Answer Sets"
scorer = RFODTScorer(home_path)
scorer.correctNewAnswerSets()
return
elif option == "downloadAllTrainingData":
print "Downloading all training data"
scorer = RFODTScorer(home_path)
scorer.downloadAllTrainingData()
return
elif option == "scoreAll":
print "Scoring all Answer Sets"
scorer = RFODTScorer(home_path)
scorer.score()
return
elif option == "downloadAndScoreAll":
print "Downloading and scoring all Answer Sets"
scorer = RFODTScorer(home_path)
scorer.downloadAllTrainingData()
scorer.score()
return
print 'Call with either "scoreNewAnswerSets", "downloadAllTrainingData", "scoreAll" or "downloadAndScoreAll" and "/data/path"'
if __name__ == "__main__":
main()
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment