diff --git a/product/PortalTransforms/transforms/__init__.py b/product/PortalTransforms/transforms/__init__.py
index e23700042558edf0db99f506465f940e00a4a1bd..ee36637bd69e7921567a35633fd1521c03b0da77 100644
--- a/product/PortalTransforms/transforms/__init__.py
+++ b/product/PortalTransforms/transforms/__init__.py
@@ -45,6 +45,7 @@ modules = [
     #'textile_to_html',# textile, depends on PyTextile http://dom.eav.free.fr/python/textile-mirror-2.0.10.tar.gz
     'web_intelligent_plain_text_to_html',
     'html_to_web_intelligent_plain_text',
+    'tiff_to_text', # transforms tiff images to text
     ]
 
 g = globals()
diff --git a/product/PortalTransforms/transforms/tiff_to_text.py b/product/PortalTransforms/transforms/tiff_to_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..b17b65daccd73dfb80e1add0c8eb99fe601d3e3c
--- /dev/null
+++ b/product/PortalTransforms/transforms/tiff_to_text.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+"""OCR transform: turn a TIFF image into plain text using tesseract."""
+from Products.PortalTransforms.interfaces import ITransform
+from Products.PortalTransforms.data import datastream
+from Products.PortalTransforms.libtransforms.commandtransform \
+    import commandtransform
+import os
+import subprocess
+import tempfile
+from zope.interface import implements
+
+
+class tiff_to_text(commandtransform):
+    """Convert 'image/tiff' data to 'text/plain' with the tesseract binary.
+
+    The input is stored in a temporary directory as 'input.tiff',
+    tesseract is run on it, and the text file it writes is read back.
+    """
+    implements(ITransform)
+    __name__ = "tiff_to_text"
+
+    inputs = ('image/tiff',)
+    output = 'text/plain'
+    output_encoding = 'utf-8'
+
+    __version__ = '2011-02-01.01'
+
+    binaryName = "tesseract"
+    # Not read by convert() below, which builds its own argument list.
+    binaryArgs = "%(infile)s "
+
+    def __init__(self):
+        commandtransform.__init__(self, binary=self.binaryName)
+
+    def convert(self, data, cache, **kwargs):
+        """Run tesseract on the image and return the recognised text.
+
+        data -- content of the TIFF image, handed to initialize_tmpdir
+                to be stored as 'input.tiff' (helper not shown here).
+        cache -- not used by this transform.
+        kwargs -- not used by this transform.
+
+        Returns a new datastream named 'output.txt' holding the text.
+        Raises IOError when the expected '<base>.txt' output is missing,
+        and OSError when the tesseract binary cannot be started.
+        """
+        tmp_dir, input_file = self.initialize_tmpdir(data,
+                                                     filename='input.tiff')
+        try:
+            # tesseract takes an output *base* name and appends '.txt'.
+            output_base = os.path.join(tmp_dir, 'output')
+            # Argument list instead of a shell command string, so that
+            # paths containing spaces or shell metacharacters are safe.
+            subprocess.call([self.binary, input_file, output_base])
+            output_file = open(output_base + '.txt', 'r')
+            try:
+                out = output_file.read()
+            finally:
+                # Close the handle even when reading fails.
+                output_file.close()
+        finally:
+            # Always remove the temporary directory, success or failure.
+            self.cleanDir(tmp_dir)
+
+        result = datastream('output.txt')
+        result.setData(out)
+        return result
+
+
+def register():
+    """Return the transform instance to register."""
+    return tiff_to_text()