Public
Snippet $306 authored by Jérome Perrin

Deduplicate images in PDF

Edited
dedup.py
##############################################################################
#
# Copyright (c) 2016 Nexedi SA and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
##############################################################################

from StringIO import StringIO
from PyPDF2 import PdfFileWriter, PdfFileReader
from copy import deepcopy
import PyPDF2
import hashlib

def ERP5Site_optimizePDF(self, pdf_data):
  """Optimize a PDF not to include an image stream twice.

  Browse all pages of the PDF and if two XObjects (typically images) with
  the same stream content and same attributes are used, use a reference
  to the existing object so that the stream appears only once in the
  resulting pdf.

  Arguments:
    self -- not used by this function.
    pdf_data -- the raw content of the PDF to optimize.

  Returns the raw content of the rewritten PDF. Empty input (empty string
  or None) is returned unchanged.

  Raises AssertionError if an entry of a page /XObject resource dictionary
  is not a stream.
  """
  if not pdf_data:
    return pdf_data

  input_pdf = PyPDF2.PdfFileReader(StringIO(pdf_data))
  output_pdf = PyPDF2.PdfFileWriter()

  # mapping object signature -> reference to the object in `output_pdf`
  already_added_data = {}

  def getObjectSignature(xobject):
    """Compute the signature of an XObject stream.

    The signature covers the stream content and all its properties,
    followed recursively through arrays, dictionaries and indirect
    references, including the content of any nested stream (soft mask,
    ICC profile, ...). Two objects with the same signature are considered
    interchangeable.
    """
    image_hash = hashlib.md5()
    # Indirect references currently being resolved, to stop on reference
    # cycles (for example a form XObject listing itself in its resources).
    pending_references = set()

    def addToken(value):
      # Values are hashed with a length prefix, so that consecutive values
      # cannot be confused with each other ([1 23] versus [12 3]) nor with
      # the single character structure markers used below.
      if not isinstance(value, (bytes, type(u''))):
        value = str(value)
      if not isinstance(value, bytes):
        value = value.encode('utf8')
      image_hash.update(str(len(value)).encode('ascii') + b':')
      image_hash.update(value)

    # add all properties recursively, and resolve colorspace etc.
    def recurseAddProperties(obj):
      if isinstance(obj, PyPDF2.generic.IndirectObject):
        reference = (id(obj.pdf), obj.idnum, obj.generation)
        if reference in pending_references:
          # we are already hashing this object: do not loop forever.
          image_hash.update(b'@')
          return
        pending_references.add(reference)
        try:
          recurseAddProperties(obj.getObject())
        finally:
          pending_references.discard(reference)
      elif isinstance(obj, PyPDF2.generic.ArrayObject):
        image_hash.update(b'[')
        for v in obj:
          recurseAddProperties(v)
        image_hash.update(b']')
      elif isinstance(obj, PyPDF2.generic.DictionaryObject):
        image_hash.update(b'<')
        for k, v in sorted(obj.items()):
          addToken(k)
          recurseAddProperties(v)
        image_hash.update(b'>')
        if isinstance(obj, PyPDF2.generic.StreamObject):
          # A stream is a dictionary plus content; the content must be part
          # of the signature at every level, otherwise two images differing
          # only by the content of a nested stream would be merged.
          stream_data = getattr(obj, '_data', None)
          if stream_data is not None:
            addToken(stream_data)
      elif isinstance(obj, PyPDF2.generic.BooleanObject):
        # BooleanObject does not have a stable __str__ representation (includes object at 0x...)
        image_hash.update(b'T' if obj.value else b'F')
      elif isinstance(obj, PyPDF2.generic.NullObject):
        # NullObject does not have a stable __str__ representation either.
        image_hash.update(b'N')
      else:
        addToken(obj)
    recurseAddProperties(xobject)

    return image_hash.hexdigest()


  for page_num in range(input_pdf.getNumPages()):
    input_page = input_pdf.getPage(page_num)

    # A page does not necessarily have resources, nor XObjects in them.
    if '/Resources' in input_page and '/XObject' in input_page['/Resources']:
      xobject_dict = input_page['/Resources']['/XObject']
      # rewrite images from input pages with external references, making sure to reuse the same
      # reference if the stream content and attributes are same.
      # Iterate on a copy, because entries are replaced during the loop.
      for resource_id, resource_value in list(xobject_dict.items()):
        if isinstance(resource_value, PyPDF2.generic.IndirectObject) \
            and resource_value.pdf is output_pdf:
          # This entry was already rewritten, through a resource dictionary
          # shared with a previous page.
          continue

        resource_value = resource_value.getObject()
        if not isinstance(resource_value, PyPDF2.generic.StreamObject):
          # Explicit raise rather than an assert statement, so that the
          # check is still performed when python runs with -O.
          raise AssertionError(
            'XObject %r of page %r is not a stream' % (resource_id, page_num))

        image_signature = getObjectSignature(resource_value)
        existing_object = already_added_data.get(image_signature)

        if existing_object is None:
          # get an external reference and add it to `already_added_data` to reuse it next time
          resource_value = output_pdf._addObject(resource_value)
          already_added_data[image_signature] = resource_value
        else:
          # already added, just reuse another reference to that image
          resource_value = existing_object

        xobject_dict[resource_id] = resource_value

    output_pdf.addPage(input_page)


  outputStream = StringIO()
  output_pdf.write(outputStream)
  return outputStream.getvalue()