Lighter processing for OCR activities

When running OCR, we sometimes have issues because the processing is "too heavy":

- [x] It uses 2 or 3 GB of disk space for a one-page PDF created by erp5_document_scanner, because we convert pdf -> png -> tiff before sending the result to tesseract. Modern Ghostscript supports running tesseract directly, so we use that when it is available.
- [x] It uses 300% of CPU. Fixed by setting `OMP_THREAD_LIMIT` when running tesseract. This only applies when running OCR on images; the OCR embedded in Ghostscript does not seem to need it.
- [x] ... and it often crashes, so the activity is restarted. This is fixed by updating tesseract.

Updates of ghostscript and tesseract are part of nexedi/slapos!985

See merge request nexedi/erp5!1420

Lighter processing for OCR activities
When running OCR, we sometimes have issues because the processing is "too heavy":

- [x] It uses 2 or 3 GB of disk space for a one-page PDF created by erp5_document_scanner, because we convert pdf -> png -> tiff before sending the result to tesseract. Modern Ghostscript supports running tesseract directly, so we use that when it is available.
- [x] It uses 300% of CPU. Fixed by setting `OMP_THREAD_LIMIT` when running tesseract. This only applies when running OCR on images; the OCR embedded in Ghostscript does not seem to need it.
- [x] ... and it often crashes, so the activity is restarted. This is fixed by updating tesseract.

Updates of ghostscript and tesseract are part of nexedi/slapos!985

See merge request nexedi/erp5!1420
9e375b8e · Jérome Perrin · f084c646 · f775724e · 9e375b8e · 9e375b8e
Commit 9e375b8e authored Jun 04, 2021 by Jérome Perrin
3 changed files
--- a/bt5/erp5_dms/DocumentTemplateItem/portal_components/document.erp5.PDFDocument.py
+++ b/bt5/erp5_dms/DocumentTemplateItem/portal_components/document.erp5.PDFDocument.py
@@ -165,21 +165,66 @@ class PDFDocument(Image):

  security.declarePrivate('_convertToText')
  def _convertToText(self, format='txt'):  # pylint: disable=redefined-builtin
-    """
-      Convert the PDF text content to text with pdftotext
+    """Convert the PDF to text
+
+    If the PDF have text, return the text, otherwise try to do OCR using
+    tesseract.
    """
    if not self.hasData():
      return ''
+    data = str(self.getData())
+    try:
+      from PyPDF2 import PdfFileReader
+      from PyPDF2.utils import PdfReadError
+    except ImportError:
+      pass
+    else:
+      try:
+        if PdfFileReader(StringIO(data)).isEncrypted:
+          return ''
+      except PdfReadError:
+        return ''
+
    mime_type = 'text/plain'
    portal_transforms = self.getPortalObject().portal_transforms
    filename = self.getFilename()
-    result = portal_transforms.convertToData(mime_type, str(self.getData()),
+    result = portal_transforms.convertToData(mime_type, data,
                                             context=self, filename=filename,
                                             mimetype=self.getContentType())
    if result:
      return result
    else:
-      # Try to use OCR
+      # Try to use OCR from ghostscript, but tolerate that the command might
+      # not be available.
+      process = None
+      command = [
+          'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH', '-dNOPAUSE',
+          '-dNOPROMPT', '-sDEVICE=ocr', '-r300x300', '-o', '-', '-f', '-'
+      ]
+      try:
+        process = Popen(
+            command,
+            stdin=PIPE,
+            stdout=PIPE,
+            stderr=PIPE,
+            close_fds=True,
+        )
+        output, error = process.communicate(data)
+        if process.returncode:
+          raise ConversionError(
+              "Error invoking ghostscript.\noutput:%s\nerror:%s" % (output, error))
+        return output.strip()
+      except OSError as e:
+        if e.errno != errno.ENOENT:
+          raise
+      finally:
+        del process
+
+      # We don't have ghostscript, fallback to the expensive pipeline using:
+      #   pdf -- (Image._convert imagemagick) --> png
+      #       -- (PortalTransforms.png_to_tiff imagemagick) --> tiff
+      #       -- (PortalTransforms.tiff_to_text tesseract) --> text
+      #
      # As high dpi images are required, it may take some times to convert the
      # pdf.
      # It may be required to use activities to fill the cache and at the end,

--- a/bt5/erp5_dms/TestTemplateItem/portal_components/test.erp5.testDms.py
+++ b/bt5/erp5_dms/TestTemplateItem/portal_components/test.erp5.testDms.py
@@ -71,6 +71,7 @@ from AccessControl import Unauthorized
 from Products.ERP5Type import Permissions
 from DateTime import DateTime
 from ZTUtils import make_query
+import PyPDF2

 QUIET = 0

@@ -1981,13 +1982,34 @@ document.write('<sc'+'ript type="text/javascript" src="http://somosite.bg/utb.ph

  def test_PDFDocument_asTextConversion(self):
    """Test a PDF document with embedded images
-    To force usage of Ocropus portal_transform chain
+    To force usage of ghostscript with embedded tesseract OCR device
    """
-    portal_type = 'PDF'
-    module = self.portal.getDefaultModule(portal_type)
-    upload_file = makeFileUpload('TEST.Embedded.Image.pdf')
-    document = module.newContent(portal_type=portal_type, file=upload_file)
-    self.assertEqual('ERP5 is a free software.', document.asText())
+    document = self.portal.document_module.newContent(
+        portal_type='PDF',
+        file=makeFileUpload('TEST.Embedded.Image.pdf'))
+    self.assertEqual(document.asText(), 'ERP5 is a free software.')
+
+  def test_broken_pdf_asText(self):
+    class StringIOWithFilename(StringIO.StringIO):
+      filename = 'broken.pdf'
+    document = self.portal.document_module.newContent(
+        portal_type='PDF',
+        file=StringIOWithFilename('broken'))
+    self.assertEqual(document.asText(), '')
+    self.tic() # no activity failure
+
+  def test_password_protected_pdf_asText(self):
+    pdf_reader = PyPDF2.PdfFileReader(makeFileUpload('TEST.Embedded.Image.pdf'))
+    pdf_writer = PyPDF2.PdfFileWriter()
+    pdf_writer.addPage(pdf_reader.getPage(0))
+    pdf_writer.encrypt('secret')
+    encrypted_pdf_stream = StringIO.StringIO()
+    pdf_writer.write(encrypted_pdf_stream)
+    document = self.portal.document_module.newContent(
+        portal_type='PDF',
+        file=encrypted_pdf_stream)
+    self.assertEqual(document.asText(), '')
+    self.tic() # no activity failure

  def createRestrictedSecurityHelperScript(self):
    script_content_list = ['format=None, **kw', """

--- a/product/PortalTransforms/transforms/tiff_to_text.py
+++ b/product/PortalTransforms/transforms/tiff_to_text.py
@@ -34,9 +34,11 @@ class tiff_to_text(commandtransform):
      try:
        output_file_path = os.path.join(tmp_dir, 'output')
        cmd = self.binary, input_file, output_file_path
-        process = subprocess.Popen(cmd,
-                                   stdout=subprocess.PIPE,
-                                   stderr=subprocess.STDOUT,)
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            env=dict(os.environ, OMP_THREAD_LIMIT='1'))
        stdout = process.communicate()[0]
        err = process.returncode
        if err: