Commit 2325d4c4 authored by francois's avatar francois

component/ocropy Add the ocropy library for OCR

This commit add component needed for Optical Character recognition:
ocropy and its dependancies

The Ocropy egg on Pypi is empty, so this component have to be built from
source. The default neural network model is downloaded from the author
website and installed in python2.7/share/ocropus.

The patch is here for to different thing: one is adding an
OCROPY_MODEL_PATH to be able to specify the position of models, the
other is importing for the first time a module that compile C code from
python, allowing us to have the static library built before runtime and
preventing us from using gcc through zope.

More models are available here:
https://github.com/tmbdev/ocropy/wiki/Models
parent c68e991b
[buildout]
extends =
../git/buildout.cfg
../gzip/buildout.cfg
../scipy/buildout.cfg
../lxml-python/buildout.cfg
../matplotlib/buildout.cfg
../numpy/buildout.cfg
../patch/buildout.cfg
../pillow/buildout.cfg
../numpy/buildout.cfg
parts = ocropy
[ocropy-eng-traineddata]
recipe = hexagonit.recipe.download
filename = en-default.pyrnn.gz
md5sum = cedd140c7d7650e910f0550ad0f04727
download-only = true
url = http://www.tmbdev.net/en-default.pyrnn.gz
[ocropy-env]
OCROPY_MODEL_PATH = ${ocropy-eng-traineddata:location}/${ocropy-eng-traineddata:filename}
HOME = ${ocropy:egg}
[ocropy]
recipe = zc.recipe.egg:custom
egg = ocropy
setup-eggs =
${numpy:egg}
${scipy:egg}
${matplotlib:egg}
${pillow-python:egg}
patches =
${:_profile_base_location_}/ocropy.patch
patch-options = -p0
patch-binary = ${patch:location}/bin/patch
environment = ocropy-env
find-links = https://github.com/tmbdev/ocropy/tarball/4efbddca22bb2f0c639af0694e7a1386f2f097b5/ocropy-1.0.tar.gz
md5sum = 240b8866dd7248816e01af469a328c09
diff --git ocrolib/__init__.py ocrolib/__init__.py
index 1e0d627..81e85fb 100644
--- ocrolib/__init__.py
+++ ocrolib/__init__.py
@@ -1,7 +1,7 @@
__all__ = [
"binnednn","cairoextras","common","components","dbtables",
"fgen","gmmtree","gtkyield","hocr","lang","native",
- "mlp","multiclass","default","lineest"
+ "mlp","multiclass","default","lineest", "psegutils"
]
################################################################
@@ -9,5 +9,6 @@ __all__ = [
################################################################
import default
+from psegutils import *
from common import *
from default import traceback as trace
diff --git ocrolib/common.py ocrolib/common.py
index 27c0f26..14f088f 100644
--- ocrolib/common.py
+++ ocrolib/common.py
@@ -14,6 +14,7 @@ import unicodedata
import inspect
import glob
import cPickle
+import gzip
from ocrolib.exceptions import (BadClassLabel, BadInput, FileNotFound,
OcropusException)
@@ -428,6 +429,7 @@ def unpickle_find_global(mname,cname):
exec "import "+mname
return getattr(sys.modules[mname],cname)
+
def load_object(fname,zip=0,nofind=0,verbose=0):
"""Loads an object from disk. By default, this handles zipped files
and searches in the usual places for OCRopus. It also handles some
@@ -439,8 +441,7 @@ def load_object(fname,zip=0,nofind=0,verbose=0):
if zip==0 and fname.endswith(".gz"):
zip = 1
if zip>0:
- # with gzip.GzipFile(fname,"rb") as stream:
- with os.popen("gunzip < '%s'"%fname,"rb") as stream:
+ with gzip.GzipFile(fname,"rb") as stream:
unpickler = cPickle.Unpickler(stream)
unpickler.find_global = unpickle_find_global
return unpickler.load()
@@ -618,7 +619,7 @@ def ocropus_find_file(fname, gz=True):
possible_prefixes.append(os.path.normpath(os.path.join(
os.path.dirname(inspect.getfile(inspect.currentframe())),
- os.pardir, os.pardir, os.pardir, os.pardir, "share", "ocropus")))
+ os.pardir, "share", "ocropus")))
possible_prefixes.append("/usr/local/share/ocropus")
diff --git ocrolib/native.py ocrolib/native.py
index b7a207f..240450b 100644
--- ocrolib/native.py
+++ ocrolib/native.py
@@ -44,6 +44,7 @@ class CompileError(Exception):
def compile_and_find(c_string,prefix=".pynative",opt="-g -O4",libs="-lm",
options="-shared -fopenmp -std=c99 -fPIC",verbose=0):
+ prefix = os.path.join(os.path.dirname(__file__), prefix)
if not os.path.exists(prefix):
os.mkdir(prefix)
m = hashlib.md5()
diff --git setup.py setup.py
index 2ec5832..6697b12 100644
--- setup.py
+++ setup.py
@@ -10,7 +10,9 @@ assert sys.version_info[0]==2 and sys.version_info[1]>=7,\
from distutils.core import setup #, Extension, Command
#from distutils.command.install_data import install_data
-if not os.path.exists("models/en-default.pyrnn.gz"):
+models = os.environ.get('OCROPY_MODEL_PATH', '').split(':') or \
+ [c for c in glob.glob("models/*pyrnn.gz")]
+if not models:
print()
print("You should download the default model 'en-default.pyrnn.gz'")
print("and put it into ./models.")
@@ -18,16 +20,23 @@ if not os.path.exists("models/en-default.pyrnn.gz"):
print("Check https://github.com/tmbdev/ocropy for the location")
print("of model files.")
print()
+ sys.exit(1)
-models = [c for c in glob.glob("models/*pyrnn.gz")]
scripts = [c for c in glob.glob("ocropus-*") if "." not in c and "~" not in c]
+# compile pynative files now and include them in the build
+sys.path.insert(0, os.curdir)
+import ocrolib.nutils
+pynative_files = [os.path.join(*c.split(os.path.sep)[1:]) \
+ for c in glob.glob('ocrolib/.pynative/*')]
+
setup(
name = 'ocropy',
version = 'v1.0',
author = "Thomas Breuel",
description = "The OCRopy RNN-based Text Line Recognizer",
packages = ["ocrolib"],
- data_files= [('share/ocropus', models)],
+ package_data = {'ocrolib': pynative_files},
+ data_files= [('share/ocropus', models), ("", ["LICENSE"])],
scripts = scripts,
)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment