Commit 9e375b8e authored by Jérome Perrin's avatar Jérome Perrin

Lighter processing for OCR activities

When running OCR, we sometimes have issues because processing is "too heavy":
 - [x] uses 2 or 3 GB of disk space for a one-page PDF created by erp5_document_scanner, because we convert pdf -> png -> tiff before sending to tesseract. Modern Ghostscript supports running tesseract directly, so we use it if it's available.
 - [x] uses 300% of CPU. Fixed by setting `OMP_THREAD_LIMIT=1` when running tesseract. This only applies when running OCR on images; the OCR embedded in Ghostscript does not seem to need this.
 - [x] ... and often crashes, so it is restarted. This is fixed by updating tesseract.

Updates of ghostscript and tesseract are part of nexedi/slapos!985

See merge request nexedi/erp5!1420
parents f084c646 f775724e
Pipeline #15826 failed with stage
in 0 seconds
...@@ -165,21 +165,66 @@ class PDFDocument(Image): ...@@ -165,21 +165,66 @@ class PDFDocument(Image):
security.declarePrivate('_convertToText') security.declarePrivate('_convertToText')
def _convertToText(self, format='txt'): # pylint: disable=redefined-builtin def _convertToText(self, format='txt'): # pylint: disable=redefined-builtin
""" """Convert the PDF to text
Convert the PDF text content to text with pdftotext
If the PDF have text, return the text, otherwise try to do OCR using
tesseract.
""" """
if not self.hasData(): if not self.hasData():
return '' return ''
data = str(self.getData())
try:
from PyPDF2 import PdfFileReader
from PyPDF2.utils import PdfReadError
except ImportError:
pass
else:
try:
if PdfFileReader(StringIO(data)).isEncrypted:
return ''
except PdfReadError:
return ''
mime_type = 'text/plain' mime_type = 'text/plain'
portal_transforms = self.getPortalObject().portal_transforms portal_transforms = self.getPortalObject().portal_transforms
filename = self.getFilename() filename = self.getFilename()
result = portal_transforms.convertToData(mime_type, str(self.getData()), result = portal_transforms.convertToData(mime_type, data,
context=self, filename=filename, context=self, filename=filename,
mimetype=self.getContentType()) mimetype=self.getContentType())
if result: if result:
return result return result
else: else:
# Try to use OCR # Try to use OCR from ghostscript, but tolerate that the command might
# not be available.
process = None
command = [
'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH', '-dNOPAUSE',
'-dNOPROMPT', '-sDEVICE=ocr', '-r300x300', '-o', '-', '-f', '-'
]
try:
process = Popen(
command,
stdin=PIPE,
stdout=PIPE,
stderr=PIPE,
close_fds=True,
)
output, error = process.communicate(data)
if process.returncode:
raise ConversionError(
"Error invoking ghostscript.\noutput:%s\nerror:%s" % (output, error))
return output.strip()
except OSError as e:
if e.errno != errno.ENOENT:
raise
finally:
del process
# We don't have ghostscript, fallback to the expensive pipeline using:
# pdf -- (Image._convert imagemagick) --> png
# -- (PortalTransforms.png_to_tiff imagemagick) --> tiff
# -- (PortalTransforms.tiff_to_text tesseract) --> text
#
# As high dpi images are required, it may take some times to convert the # As high dpi images are required, it may take some times to convert the
# pdf. # pdf.
# It may be required to use activities to fill the cache and at the end, # It may be required to use activities to fill the cache and at the end,
......
...@@ -71,6 +71,7 @@ from AccessControl import Unauthorized ...@@ -71,6 +71,7 @@ from AccessControl import Unauthorized
from Products.ERP5Type import Permissions from Products.ERP5Type import Permissions
from DateTime import DateTime from DateTime import DateTime
from ZTUtils import make_query from ZTUtils import make_query
import PyPDF2
QUIET = 0 QUIET = 0
...@@ -1981,13 +1982,34 @@ document.write('<sc'+'ript type="text/javascript" src="http://somosite.bg/utb.ph ...@@ -1981,13 +1982,34 @@ document.write('<sc'+'ript type="text/javascript" src="http://somosite.bg/utb.ph
def test_PDFDocument_asTextConversion(self): def test_PDFDocument_asTextConversion(self):
"""Test a PDF document with embedded images """Test a PDF document with embedded images
To force usage of Ocropus portal_transform chain To force usage of ghostscript with embedded tesseract OCR device
""" """
portal_type = 'PDF' document = self.portal.document_module.newContent(
module = self.portal.getDefaultModule(portal_type) portal_type='PDF',
upload_file = makeFileUpload('TEST.Embedded.Image.pdf') file=makeFileUpload('TEST.Embedded.Image.pdf'))
document = module.newContent(portal_type=portal_type, file=upload_file) self.assertEqual(document.asText(), 'ERP5 is a free software.')
self.assertEqual('ERP5 is a free software.', document.asText())
def test_broken_pdf_asText(self):
class StringIOWithFilename(StringIO.StringIO):
filename = 'broken.pdf'
document = self.portal.document_module.newContent(
portal_type='PDF',
file=StringIOWithFilename('broken'))
self.assertEqual(document.asText(), '')
self.tic() # no activity failure
def test_password_protected_pdf_asText(self):
pdf_reader = PyPDF2.PdfFileReader(makeFileUpload('TEST.Embedded.Image.pdf'))
pdf_writer = PyPDF2.PdfFileWriter()
pdf_writer.addPage(pdf_reader.getPage(0))
pdf_writer.encrypt('secret')
encrypted_pdf_stream = StringIO.StringIO()
pdf_writer.write(encrypted_pdf_stream)
document = self.portal.document_module.newContent(
portal_type='PDF',
file=encrypted_pdf_stream)
self.assertEqual(document.asText(), '')
self.tic() # no activity failure
def createRestrictedSecurityHelperScript(self): def createRestrictedSecurityHelperScript(self):
script_content_list = ['format=None, **kw', """ script_content_list = ['format=None, **kw', """
......
...@@ -34,9 +34,11 @@ class tiff_to_text(commandtransform): ...@@ -34,9 +34,11 @@ class tiff_to_text(commandtransform):
try: try:
output_file_path = os.path.join(tmp_dir, 'output') output_file_path = os.path.join(tmp_dir, 'output')
cmd = self.binary, input_file, output_file_path cmd = self.binary, input_file, output_file_path
process = subprocess.Popen(cmd, process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,) stderr=subprocess.STDOUT,
env=dict(os.environ, OMP_THREAD_LIMIT='1'))
stdout = process.communicate()[0] stdout = process.communicate()[0]
err = process.returncode err = process.returncode
if err: if err:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment