Commit 54955290 authored by Sebastien Robin's avatar Sebastien Robin

Add a transformation to extract text from TIFF files, thanks to tesseract.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@42937 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent efc83957
......@@ -45,6 +45,7 @@ modules = [
#'textile_to_html',# textile, depends on PyTextile http://dom.eav.free.fr/python/textile-mirror-2.0.10.tar.gz
'web_intelligent_plain_text_to_html',
'html_to_web_intelligent_plain_text',
'tiff_to_text', # transforms tiff images to text
]
g = globals()
......
# -*- coding: utf-8 -*-
from Products.PortalTransforms.interfaces import ITransform
from Products.PortalTransforms.data import datastream
from Products.PortalTransforms.libtransforms.commandtransform \
import commandtransform
import os
import tempfile
from zope.interface import implements
class tiff_to_text(commandtransform):
    """Transform that turns a TIFF image into plain text.

    The conversion is delegated to the external ``tesseract`` executable:
    the image is written to a temporary directory, the binary is run on it,
    and the text file it produces is read back.
    """
    implements(ITransform)

    __name__ = "tiff_to_text"
    inputs = ('image/tiff',)
    output = 'text/plain'
    output_encoding = 'utf-8'
    __version__ = '2011-02-01.01'
    binaryName = "tesseract"
    binaryArgs = "%(infile)s "

    def __init__(self):
        # NOTE(review): presumably the base class resolves ``binaryName`` to
        # an executable path and stores it as ``self.binary`` -- confirm in
        # commandtransform.
        commandtransform.__init__(self, binary=self.binaryName)

    def convert(self, data, cache, **kwargs):
        """Run the binary on ``data`` and return the text as a datastream.

        ``data`` is the raw image content handed to ``initialize_tmpdir``.
        ``cache`` and ``kwargs`` are accepted for interface compatibility
        and are not used here.

        Returns a new ``datastream`` named ``'output.txt'`` holding the
        content of the text file written by the binary. If the binary does
        not produce that file, the error raised by ``open`` propagates.
        The temporary directory is removed in every case.
        """
        # Imported locally so this method does not rely on a module-level
        # import being present.
        import subprocess

        tmp_dir, input_file = self.initialize_tmpdir(data,
                                                     filename='input.tiff')
        try:
            output_file_path = os.path.join(tmp_dir, 'output')
            # Pass an argument list instead of a shell string: the
            # temporary paths are then never interpreted by a shell.
            # The exit status is deliberately not checked (as before); a
            # failed run surfaces below when the output file is missing.
            subprocess.call([self.binary, input_file, output_file_path])
            # The binary is given the base name 'output' and the result is
            # read from 'output.txt'.
            with open(output_file_path + '.txt', 'r') as output_file:
                out = output_file.read()
        finally:
            self.cleanDir(tmp_dir)
        result = datastream('output.txt')
        result.setData(out)
        return result
def register():
    """Create and return a fresh ``tiff_to_text`` transform instance."""
    transform = tiff_to_text()
    return transform
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment