Lighter processing for OCR activities

When running OCR, we sometimes have issues because processing is "too heavy":

- [x] it uses 2 or 3 GB of disk space for a one-page PDF created by erp5_document_scanner, because we convert pdf -> png -> tiff before sending to tesseract. Modern Ghostscript supports running tesseract directly, so we use it if it's available.
- [x] it uses 300% of CPU. Fixed by setting `OMP_THREAD_LIMIT` when running tesseract. This only applies when running OCR on images; the OCR embedded in Ghostscript does not seem to need this.
- [x] ... and it often crashes, so it is restarted. This is fixed by updating tesseract.

Updates of ghostscript and tesseract are part of slapos!985.

See merge request !1420

Lighter processing for OCR activities
When running OCR, we sometimes have issues because processing is "too heavy":

- [x] it uses 2 or 3 GB of disk space for a one-page PDF created by erp5_document_scanner, because we convert pdf -> png -> tiff before sending to tesseract. Modern Ghostscript supports running tesseract directly, so we use it if it's available.
- [x] it uses 300% of CPU. Fixed by setting `OMP_THREAD_LIMIT` when running tesseract. This only applies when running OCR on images; the OCR embedded in Ghostscript does not seem to need this.
- [x] ... and it often crashes, so it is restarted. This is fixed by updating tesseract.

Updates of ghostscript and tesseract are part of slapos!985.

See merge request !1420
9e375b8e · Jérome Perrin · f084c646 · f775724e · 9e375b8e · 9e375b8e
Commit 9e375b8e authored Jun 04, 2021 by Jérome Perrin
3 changed files
--- a/bt5/erp5_dms/DocumentTemplateItem/portal_components/document.erp5.PDFDocument.py
+++ b/bt5/erp5_dms/DocumentTemplateItem/portal_components/document.erp5.PDFDocument.py
@@ -165,21 +165,66 @@ class PDFDocument(Image):

  security.declarePrivate('_convertToText')
  def _convertToText(self, format='txt'):  # pylint: disable=redefined-builtin
-    """
-      Convert the PDF text content to text with pdftotext
+    """Convert the PDF to text
+
+    If the PDF has text, return the text, otherwise try to do OCR using
+    tesseract.
    """
    if not self.hasData():
      return ''
+    data = str(self.getData())
+    try:
+      from PyPDF2 import PdfFileReader
+      from PyPDF2.utils import PdfReadError
+    except ImportError:
+      pass
+    else:
+      try:
+        if PdfFileReader(StringIO(data)).isEncrypted:
+          return ''
+      except PdfReadError:
+        return ''
+
    mime_type = 'text/plain'
    portal_transforms = self.getPortalObject().portal_transforms
    filename = self.getFilename()
-    result = portal_transforms.convertToData(mime_type, str(self.getData()),
+    result = portal_transforms.convertToData(mime_type, data,
                                             context=self, filename=filename,
                                             mimetype=self.getContentType())
    if result:
      return result
    else:
-      # Try to use OCR
+      # Try to use OCR from ghostscript, but tolerate that the command might
+      # not be available.
+      process = None
+      command = [
+          'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH', '-dNOPAUSE',
+          '-dNOPROMPT', '-sDEVICE=ocr', '-r300x300', '-o', '-', '-f', '-'
+      ]
+      try:
+        process = Popen(
+            command,
+            stdin=PIPE,
+            stdout=PIPE,
+            stderr=PIPE,
+            close_fds=True,
+        )
+        output, error = process.communicate(data)
+        if process.returncode:
+          raise ConversionError(
+              "Error invoking ghostscript.\noutput:%s\nerror:%s" % (output, error))
+        return output.strip()
+      except OSError as e:
+        if e.errno != errno.ENOENT:
+          raise
+      finally:
+        del process
+
+      # We don't have ghostscript, fallback to the expensive pipeline using:
+      #   pdf -- (Image._convert imagemagick) --> png
+      #       -- (PortalTransforms.png_to_tiff imagemagick) --> tiff
+      #       -- (PortalTransforms.tiff_to_text tesseract) --> text
+      #
      # As high dpi images are required, it may take some times to convert the
      # pdf.
      # It may be required to use activities to fill the cache and at the end,

--- a/bt5/erp5_dms/TestTemplateItem/portal_components/test.erp5.testDms.py
+++ b/bt5/erp5_dms/TestTemplateItem/portal_components/test.erp5.testDms.py
@@ -71,6 +71,7 @@ from AccessControl import Unauthorized
 from Products.ERP5Type import Permissions
 from DateTime import DateTime
 from ZTUtils import make_query
+import PyPDF2

 QUIET = 0

@@ -1981,13 +1982,34 @@ document.write('<sc'+'ript type="text/javascript" src="http://somosite.bg/utb.ph

  def test_PDFDocument_asTextConversion(self):
    """Test a PDF document with embedded images
-    To force usage of Ocropus portal_transform chain
+    To force usage of ghostscript with embedded tesseract OCR device
    """
-    portal_type = 'PDF'
-    module = self.portal.getDefaultModule(portal_type)
-    upload_file = makeFileUpload('TEST.Embedded.Image.pdf')
-    document = module.newContent(portal_type=portal_type, file=upload_file)
-    self.assertEqual('ERP5 is a free software.', document.asText())
+    document = self.portal.document_module.newContent(
+        portal_type='PDF',
+        file=makeFileUpload('TEST.Embedded.Image.pdf'))
+    self.assertEqual(document.asText(), 'ERP5 is a free software.')
+
+  def test_broken_pdf_asText(self):
+    class StringIOWithFilename(StringIO.StringIO):
+      filename = 'broken.pdf'
+    document = self.portal.document_module.newContent(
+        portal_type='PDF',
+        file=StringIOWithFilename('broken'))
+    self.assertEqual(document.asText(), '')
+    self.tic() # no activity failure
+
+  def test_password_protected_pdf_asText(self):
+    pdf_reader = PyPDF2.PdfFileReader(makeFileUpload('TEST.Embedded.Image.pdf'))
+    pdf_writer = PyPDF2.PdfFileWriter()
+    pdf_writer.addPage(pdf_reader.getPage(0))
+    pdf_writer.encrypt('secret')
+    encrypted_pdf_stream = StringIO.StringIO()
+    pdf_writer.write(encrypted_pdf_stream)
+    document = self.portal.document_module.newContent(
+        portal_type='PDF',
+        file=encrypted_pdf_stream)
+    self.assertEqual(document.asText(), '')
+    self.tic() # no activity failure

  def createRestrictedSecurityHelperScript(self):
    script_content_list = ['format=None, **kw', """

--- a/product/PortalTransforms/transforms/tiff_to_text.py
+++ b/product/PortalTransforms/transforms/tiff_to_text.py
@@ -34,9 +34,11 @@ class tiff_to_text(commandtransform):
      try:
        output_file_path = os.path.join(tmp_dir, 'output')
        cmd = self.binary, input_file, output_file_path
-        process = subprocess.Popen(cmd,
-                                   stdout=subprocess.PIPE,
-                                   stderr=subprocess.STDOUT,)
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            env=dict(os.environ, OMP_THREAD_LIMIT='1'))
        stdout = process.communicate()[0]
        err = process.returncode
        if err: