Commit 73e72c45 authored by francois's avatar francois

component/ocropy Add the ocropy library for OCR

This commit adds the components needed for Optical Character Recognition:
Ocropy, and the patch that needs to be applied to make it work with ERP5.

The Ocropy egg on PyPI is empty, so this component has to be built from
source. The default neural network model is downloaded from the author's
website and installed in python2.7/share/ocropus.

The patch modifies how the setup works to allow the C functions in
native.py to be compiled during setup (removing the need to make Zope
use gcc) and provides a way to specify the model path for Ocropy.
The patched setup now needs to import some modules from different eggs,
whose paths need to be put into sys.path before it can be run.

More models are available here:
https://github.com/tmbdev/ocropy/wiki/Models
parent 46689238
[buildout]
# Component profiles this profile builds on. Each profile is listed once.
extends =
  ../git/buildout.cfg
  ../gzip/buildout.cfg
  ../scipy/buildout.cfg
  ../lxml-python/buildout.cfg
  ../matplotlib/buildout.cfg
  ../numpy/buildout.cfg
  ../patch/buildout.cfg
  ../pillow/buildout.cfg

# Parts installed when this profile is built on its own.
parts =
  matplotlib
  ocropy

# Default English model (en-default.pyrnn.gz) for Ocropy, downloaded into
# the share/ocropus directory of the python2.7 part.
[ocropy-eng-traineddata]
recipe = hexagonit.recipe.download
destination = ${python2.7:location}/share/ocropus
filename = en-default.pyrnn.gz
md5sum = cedd140c7d7650e910f0550ad0f04727
# Keep the downloaded .gz file as is instead of extracting it.
download-only = true
# NOTE(review): fetched over plain http; the md5sum above is the only
# integrity check on this file.
url = http://www.tmbdev.net/en-default.pyrnn.gz
# Builds and installs Ocropy from a pinned GitHub tarball. The extracted
# sources are patched first (see the "patch" option), then "setup.py install"
# is run with the interpreter named by "python-bin".
[ocropy]
recipe = slapos.recipe.build
egg = ocropy
setup-eggs =
  ${numpy:egg}
  ${matplotlib:egg}
  ${pillow-python:egg}
url = https://github.com/tmbdev/ocropy/tarball/4efbddca22bb2f0c639af0694e7a1386f2f097b5
md5sum = 240b8866dd7248816e01af469a328c09
patch = ${:_profile_base_location_}/ocropy.patch
python-bin = ${python2.7:location}/bin/python
script =
  import os
  import sys
  # The patched setup.py imports modules that live in other eggs: put every
  # entry of the develop-eggs and eggs directories at the front of sys.path
  # and export the result as PYTHONPATH for the setup.py subprocess.
  egg_roots = ['${buildout:develop-eggs-directory}', '${buildout:eggs-directory}']
  sys.path[0:0] = [os.path.join(root, name) for root in egg_roots for name in os.listdir(root)]
  os.environ['PYTHONPATH'] = os.pathsep.join(sys.path)
  # Tell the patched setup.py where the downloaded model file is.
  os.environ['OCROPY_MODEL_PATH'] = "${ocropy-eng-traineddata:location}/${ocropy-eng-traineddata:filename}"
  workingdir = guessworkdir(self.extract(self.download(%(url)r, %(md5sum)r)))
  os.chdir(workingdir)
  self.applyPatchList(self.options.get('patch'),
    patch_binary='${patch:location}/bin/patch',
    patch_options='-p0', cwd=workingdir)
  call([self.options.get('python-bin'), os.path.join(workingdir, 'setup.py'),
    'install'])
diff --git ocrolib/__init__.py ocrolib/__init__.py
index 1e0d627..81e85fb 100644
--- ocrolib/__init__.py
+++ ocrolib/__init__.py
@@ -1,7 +1,7 @@
__all__ = [
"binnednn","cairoextras","common","components","dbtables",
"fgen","gmmtree","gtkyield","hocr","lang","native",
- "mlp","multiclass","default","lineest"
+ "mlp","multiclass","default","lineest", "psegutils"
]
################################################################
@@ -9,5 +9,6 @@ __all__ = [
################################################################
import default
+from psegutils import *
from common import *
from default import traceback as trace
diff --git ocrolib/common.py ocrolib/common.py
index 27c0f26..b68c5c8 100644
--- ocrolib/common.py
+++ ocrolib/common.py
@@ -14,6 +14,7 @@ import unicodedata
import inspect
import glob
import cPickle
+import gzip
from ocrolib.exceptions import (BadClassLabel, BadInput, FileNotFound,
OcropusException)
@@ -428,6 +429,7 @@ def unpickle_find_global(mname,cname):
exec "import "+mname
return getattr(sys.modules[mname],cname)
+
def load_object(fname,zip=0,nofind=0,verbose=0):
"""Loads an object from disk. By default, this handles zipped files
and searches in the usual places for OCRopus. It also handles some
@@ -439,8 +441,7 @@ def load_object(fname,zip=0,nofind=0,verbose=0):
if zip==0 and fname.endswith(".gz"):
zip = 1
if zip>0:
- # with gzip.GzipFile(fname,"rb") as stream:
- with os.popen("gunzip < '%s'"%fname,"rb") as stream:
+ with gzip.GzipFile(fname,"rb") as stream:
unpickler = cPickle.Unpickler(stream)
unpickler.find_global = unpickle_find_global
return unpickler.load()
diff --git ocrolib/native.py ocrolib/native.py
index b7a207f..240450b 100644
--- ocrolib/native.py
+++ ocrolib/native.py
@@ -44,6 +44,7 @@ class CompileError(Exception):
def compile_and_find(c_string,prefix=".pynative",opt="-g -O4",libs="-lm",
options="-shared -fopenmp -std=c99 -fPIC",verbose=0):
+ prefix = os.path.join(os.path.dirname(__file__), prefix)
if not os.path.exists(prefix):
os.mkdir(prefix)
m = hashlib.md5()
diff --git setup.py setup.py
index 2ec5832..151bf55 100644
--- setup.py
+++ setup.py
@@ -10,7 +10,9 @@ assert sys.version_info[0]==2 and sys.version_info[1]>=7,\
from distutils.core import setup #, Extension, Command
#from distutils.command.install_data import install_data
-if not os.path.exists("models/en-default.pyrnn.gz"):
+models = ([m for m in os.environ.get('OCROPY_MODEL_PATH', '').split(':') if m]  # ''.split(':') is [''], so drop empty entries
+          or glob.glob("models/*pyrnn.gz"))
+if not models:
print()
print("You should download the default model 'en-default.pyrnn.gz'")
print("and put it into ./models.")
@@ -18,16 +20,23 @@ if not os.path.exists("models/en-default.pyrnn.gz"):
print("Check https://github.com/tmbdev/ocropy for the location")
print("of model files.")
print()
+ sys.exit(1)
-models = [c for c in glob.glob("models/*pyrnn.gz")]
scripts = [c for c in glob.glob("ocropus-*") if "." not in c and "~" not in c]
+# compile pynative files now and include them in the build
+sys.path.insert(0, os.curdir)
+import ocrolib.nutils
+pynative_files = [os.path.join(*c.split(os.path.sep)[1:]) \
+ for c in glob.glob('ocrolib/.pynative/*')]
+
setup(
name = 'ocropy',
version = 'v1.0',
author = "Thomas Breuel",
description = "The OCRopy RNN-based Text Line Recognizer",
packages = ["ocrolib"],
+ package_data = {'ocrolib': pynative_files},
data_files= [('share/ocropus', models)],
scripts = scripts,
)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment