erp5_corporate_identity: improve slideshow rendering including displaying...

erp5_corporate_identity: improve slideshow rendering including displaying legacy presentation as slideshow

erp5_corporate_identity: improve slideshow rendering including displaying...
erp5_corporate_identity: improve slideshow rendering including displaying legacy presentation as slideshow
d32a1124 · Sven Franck · Xiaowu Zhang · a8b888ae · d32a1124 · d32a1124
Commit d32a1124 authored May 26, 2020 by Sven Franck Committed by Xiaowu Zhang May 27, 2020
3 changed files
--- a/bt5/erp5_corporate_identity/SkinTemplateItem/portal_skins/erp5_corporate_identity/Presentation_viewAsSlideshow.py
+++ b/bt5/erp5_corporate_identity/SkinTemplateItem/portal_skins/erp5_corporate_identity/Presentation_viewAsSlideshow.py
+"""
+================================================================================
+Try to convert old OpenOffice presentations into slideshows
+================================================================================
+"""
+# uses cloudooo to convert odp/sxi to html (quite buggy) and then salvages the
+# result into a slideshow html, which is passed on as remote_content to the
+# slideshow renderer
+
+# kw-parameters   (* default)
+# ------------------------------------------------------------------------------
+
+import re
+
+blank = ''
+flags = re.MULTILINE|re.DOTALL|re.IGNORECASE
+
+def getHeaderSlideTitle(my_doc):
+  # build the <h1> used as slide header from the title of my_doc
+  title = my_doc.getTitle()
+  return ''.join(['<h1>', title, '</h1>'])
+
+def getSlideList(content):
+  # return the inner markup of every <html>...</html> block found in content,
+  # in the order in which the blocks appear
+  page_pattern = r'<html>(.*?)</html>'
+  return re.findall(page_pattern, content, flags=flags)
+
+def getKey(item):
+  # sort key: the first element of item, converted to an integer
+  position = item[0]
+  return int(position)
+
+# -------------------------------- Setup ---------------------------------------
+if context.getPortalType() in ["Presentation"]:
+  portal = context.getPortalObject()
+  mimetype = 'text/html'
+  content_type = context.getContentType()
+  raw_data = portal.portal_transforms.convertToData(mimetype, str(context.getData() or ""), context=context, mimetype=content_type)
+  if raw_data is None:
+    raise ValueError("Failed to convert to %r" % mimetype)
+  if context.REQUEST is not None:
+    context.REQUEST.RESPONSE.setHeader("Content-Type", mimetype)
+
+  # get a list of slides
+  content = getSlideList(raw_data)
+
+  # every slide is in the raw_data twice, once with the title and image as text,
+  # once with the slide content without title. All slides are mixed randomly, so
+  # we need to find out which slide contains what and then put them in their
+  # correct order. We do this by extracting the links in the slide's navigation
+  # bar. This bar has a switch to change from image to text slides with the
+  # current slide number, so <a href="text3">Text</a> switches from Graphic
+  # slide 3 to Text slide 3. We use this to identify the current slide.
+  # slideshow is always bound, also if the conversion returned no slide at all
+  slideshow = []
+  for slide in content:
+    slide_nav = re.search(r'<center>(.*?)</center>', slide, flags=flags)
+    if slide_nav is None:
+      # no navigation bar: the slide number cannot be determined, skip it
+      continue
+    slide_nav_link_list = re.findall(r'<a(.*?)</a>', slide_nav.group(), flags=flags)
+    for link in slide_nav_link_list:
+
+      # at least one digit is required, an empty slide number could not be
+      # converted to an integer when sorting
+      pointer = re.search(r'(text|img)([0-9]+)\.', link, flags=flags)
+      if pointer is None:
+        continue
+
+      # the header slide. Contains header and extracted text from image
+      if re.search(r'>Graphic', link, flags=flags):
+        slide_header = re.search(r'<h1>(.*)?</h1>', slide, flags=flags)
+        if slide_header is not None:
+          slide_header = slide_header.group()
+        else:
+          # keep an (empty) header so header and content stay paired
+          slide_header = '<h1></h1>'
+        slideshow.append([str(pointer.group(2)), slide_header])
+
+      # the content slide. Contains image and notes
+      if re.search(r'>Text', link, flags=flags):
+        slideshow.append([str(pointer.group(2)), slide])
+
+  if not slideshow:
+    raise ValueError("Failed to extract slides from converted presentation")
+
+  # time to sort and add first slide header in case missing
+  # NOTE(review): this only looks at the first sorted entry, which may be the
+  # content part of a slide whose header comes second - confirm the ordering
+  # produced by the conversion
+  slideshow = sorted(slideshow, key=getKey)
+  if '<h1' not in slideshow[0][1]:
+    slideshow.insert(0, ["0", getHeaderSlideTitle(context)])
+
+  # entries are consumed in pairs below, pad so the last one is not dropped
+  # with a StopIteration
+  if len(slideshow) % 2:
+    slideshow.append(["", blank])
+
+  output = ""
+  section_start = '<section>'
+  section_end = '</section>'
+
+  # slideshow will contain <header>, <content>, <header>, <content>...
+  # so we need to go through it two slides at a time to assemble
+  # slides
+  slide_iter = iter(slideshow)
+  for slide in slide_iter:
+    slide_1st = slide
+    slide_2nd = next(slide_iter)
+
+    # we don't know whether header is on first or second position
+    if '<h1' not in slide_1st[1]:
+      go_1st = slide_2nd[1]
+      go_2nd = slide_1st[1]
+    else:
+      go_1st = slide_1st[1]
+      go_2nd = slide_2nd[1]
+
+    # drop everything from <head> up to the end of the navigation bar, then
+    # turn the notes into the <details> of the slide
+    slide_head = re.search(r'<head>.*?</center><br>', go_2nd, flags=flags)
+    if slide_head is not None:
+      go_2nd = go_2nd.replace(slide_head.group(), blank)
+    go_2nd = go_2nd.replace("<h3>Notes:</h3><br>", '<details open="open">')
+    go_2nd = go_2nd.replace("</body>", "</details>")
+    output = output + section_start + go_1st + go_2nd + section_end
+
+  kw["remote_content"] = output
+  return context.WebPage_viewAsSlideshowWIP(*args, **kw)
--- a/bt5/erp5_corporate_identity/SkinTemplateItem/portal_skins/erp5_corporate_identity/Presentation_viewAsSlideshow.xml
+++ b/bt5/erp5_corporate_identity/SkinTemplateItem/portal_skins/erp5_corporate_identity/Presentation_viewAsSlideshow.xml
+<?xml version="1.0"?>
+<ZopeData>
+  <record id="1" aka="AAAAAAAAAAE=">
+    <pickle>
+      <global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
+    </pickle>
+    <pickle>
+      <dictionary>
+        <item>
+            <key> <string>Script_magic</string> </key>
+            <value> <int>3</int> </value>
+        </item>
+        <item>
+            <key> <string>_bind_names</string> </key>
+            <value>
+              <object>
+                <klass>
+                  <global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
+                </klass>
+                <tuple/>
+                <state>
+                  <dictionary>
+                    <item>
+                        <key> <string>_asgns</string> </key>
+                        <value>
+                          <dictionary>
+                            <item>
+                                <key> <string>name_container</string> </key>
+                                <value> <string>container</string> </value>
+                            </item>
+                            <item>
+                                <key> <string>name_context</string> </key>
+                                <value> <string>context</string> </value>
+                            </item>
+                            <item>
+                                <key> <string>name_m_self</string> </key>
+                                <value> <string>script</string> </value>
+                            </item>
+                            <item>
+                                <key> <string>name_subpath</string> </key>
+                                <value> <string>traverse_subpath</string> </value>
+                            </item>
+                          </dictionary>
+                        </value>
+                    </item>
+                  </dictionary>
+                </state>
+              </object>
+            </value>
+        </item>
+        <item>
+            <key> <string>_params</string> </key>
+            <value> <string>*args, **kw</string> </value>
+        </item>
+        <item>
+            <key> <string>id</string> </key>
+            <value> <string>Presentation_viewAsSlideshow</string> </value>
+        </item>
+      </dictionary>
+    </pickle>
+  </record>
+</ZopeData>
--- a/bt5/erp5_corporate_identity/SkinTemplateItem/portal_skins/erp5_corporate_identity/WebPage_viewAsSlideshow.py
+++ b/bt5/erp5_corporate_identity/SkinTemplateItem/portal_skins/erp5_corporate_identity/WebPage_viewAsSlideshow.py
@@ -21,24 +21,27 @@ MAIN FILE: generate presentation in different output formats
 # display_note:             display slide notes (1) or not (0)*
 # display_svg:              display svg-images as svg or png*
 # ------
-# flag_ooo:                 convert legacy odp, sxi formats (not active)
+# remote_content:           html to render instead of the document text content (e.g. converted legacy odp, sxi)

 import re

 from base64 import b64encode

 blank = ''
+flags = re.MULTILINE|re.DOTALL|re.IGNORECASE
 details_separator = '</section><section class="ci-notes-continue"><section><h1>cont.</h1></section>'
 pref = context.getPortalObject().portal_preferences

 # ------------------ HTML cleanup/converter methods ----------------------------
+def getHeaderSlideTitle(my_doc):
+  # build the <h1> used as slide header from the title of my_doc
+  title = my_doc.getTitle()
+  return ''.join(['<h1>', title, '</h1>'])
+
 def getSlideList(my_content):
  return re.findall(r'<section[^>]*?>(.*?)</section>', my_content, re.S)

 #def getSectionSlideList(my_content):
 #  return re.findall(r'(<section[^>]*?>.*?</section>)', my_content, re.S)

-# https://regex101.com/r/8F8GTx/1/
 def getSlideDetailsList(my_content):
  return re.findall(r'<section.*?>\s?<section>.*?</details>\s?</section>', my_content, re.S)

@@ -51,6 +54,54 @@ def getDetailsList(my_slide):
 #def getNestedSection(my_content):
 #  return my_content.find("<section") > -1

+# please look the other direction until we can use beautifulsoup
+def getSlideFront(my_content):
+  # Return the markup to show on the front of a slide: the first <img .../>
+  # found in my_content or, without image, the first element of my_content.
+  # Returns None if my_content contains neither.
+  # Only my_content is inspected, the function no longer depends on a variable
+  # of the calling code.
+
+  # is there an image on the slide?
+  img = re.search(r'(<img.*?/>)', my_content, flags=flags)
+  if img:
+    return img.group()
+
+  # is there another tag on the slide?
+  tag = re.search(r'<(.*?)( |>)', my_content, flags=flags)
+  if tag:
+    # the tag name goes into a pattern, escape it so it is matched literally
+    key = re.escape(tag.group(1))
+    element = re.search(r'(<%s.*?</%s>)'%(key, key), my_content, flags=flags)
+    if element:
+      return element.group()
+
+  # empty slide
+  return None
+
+# opinionated
+# TODO h1: chapter, h2:slide ?
+def setH1AndH2AsSlideHeaders(my_content):
+  # turn every <h2> into a <h1>: opening tags first, then closing tags
+  opened = re.sub(r'<h2', '<h1', my_content, flags=flags)
+  return re.sub(r'\/h2>', '/h1>', opened, flags=flags)
+
+def removePlaceholders(my_content):
+  # Remove all ${...} placeholders from my_content and return the result.
+  # The match is non-greedy: every placeholder is removed on its own and the
+  # text between two placeholders of the same line is kept.
+  if my_content.find('${') > -1:
+    for substitution_string in re.findall(r'(\$\{.*?\})', my_content):
+      my_content = my_content.replace(substitution_string, blank)
+  return my_content
+
+def removeComments(my_content):
+  # collect all <!-- ... --> comments first, then drop them one after another
+  comment_list = re.findall(r'(<!--.*?-->)', my_content, flags=flags)
+  for comment in comment_list:
+    my_content = my_content.replace(comment, blank)
+  return my_content
+
+def removeImageWrappers(my_content):
+  # replace every centered paragraph by what it contains
+  wrapper_pattern = r'(<p style=\"text-align: center;\">(.*?)</p>)'
+  for wrapper, wrapped in re.findall(wrapper_pattern, my_content, flags=flags):
+    my_content = my_content.replace(wrapper, wrapped)
+  return my_content
+
+def removeLineBreaks(my_content):
+  # drop all newline and carriage return characters
+  for line_break in ('\n', '\r'):
+    my_content = my_content.replace(line_break, '')
+  return my_content
+
 def splitMultipleDetails(my_content):
  for slide in getSlideDetailsList(my_content):
    detail_list = getDetailsList(slide)
@@ -86,118 +137,101 @@ def splitMultipleDetails(my_content):
 def removeEmptyDetails(my_content):
  content = my_content.replace('<details open="open"></details>', blank)
  content = content.replace('<details></details>', blank)
+  content = content.replace('<details open=""></details>', blank)
  content = content.replace('<details>&nbsp;</details>', blank)
  content = content.replace('<details> </details>', blank)
  return content

-def getPageList(my_content):
-  return re.findall(r'<html>(.*?)</html>', my_content, re.S)
-
-def getPageTitle(my_full_page):
-  result = re.search('<title>(.+?)</title>', my_full_page)
-  if result:
-    return result.group(1)
-
-def getPageContent(my_full_page):
-  result_list = my_full_page.split("</center><br>")
-  if len(result_list) == 2:
-    return result_list[1].replace("</body>", blank)
-
-def addSlideContent(my_content, my_notes):
-  return ''.join([
-    '<section>',
-    my_content,
-    '<details open="open">',
-    my_notes,
-    '</details></section>'
-  ])
-
-def sortContent(my_page_list):
-  try:
-    page_content_list = []
-    page_tuple_first = None
-    page_tuple_last = None
-    for page in my_page_list:
-      page_title = getPageTitle(page)
-
-      # Note cloudooo default html transformation mixes slide order. dirty fix
-      if page_title.find("Commercial") > -1:
-        page_content = getPageContent(page)
-        if page_content.find("<center>") > -1:
-          page_tuple_last = (page_title, page_content, "first")
-      elif page_title.find("ERP5") > -1:
-        page_content = getPageContent(page)
-        if page_content.find("<center>") > -1:
-          page_tuple_first = (page_title, page_content, "last")
-      else:
-        page_content = getPageContent(page)
-        if page_title.find("Slide") > -1:
-          slide_number = int(page_title.replace("Slide ", ""))
-          page_content_list.append((slide_number, page_content, None))
-        else:
-          if page_content.find("<center>") > -1:
-            page_tuple_first = (page_title, page_content, "first")
-      sort_content_list = sorted(page_content_list, key=lambda page_foo: page_foo[0])
-    if page_tuple_last is not None:
-      sort_content_list.append(page_tuple_last)
-    if page_tuple_first is not None:
-      sort_content_list = [page_tuple_first] + sort_content_list
-    return sort_content_list
-
-  except Exception as e:
-    raise e
+def addLastSlide(my_last_slide):
+  # Return the text content of the last slide configured in the preferences,
+  # or blank if no slide has to be added.
+  # Nothing is added if my_last_slide contains exactly two <div, if no
+  # relative url is set in the preferences, or if nothing can be reached at
+  # this url.
+  if my_last_slide.count("<div") == 2:
+    return blank
+  last_slide_relative_url = pref.getPreferredCorporateIdentityTemplateSlideLastSlideRelativeUrl() or None
+  if not last_slide_relative_url:
+    return blank
+
+  # pass a default so an outdated url in the preferences does not raise
+  last_slide = doc.restrictedTraverse(last_slide_relative_url, None)
+  if last_slide is None:
+    return blank
+
+  # the result is concatenated to the document content, never return None
+  return last_slide.getTextContent() or blank

 # -------------------------- Setup ---------------------------------------------
 doc = context
 doc_prefix = pref.getPreferredCorporateIdentityTemplateSlideDocumentPrefix() or "Slideshow."
-doc_converted_content = None
+doc_upgraded_content = None
+doc_slide_iter = None
 doc_format = kw.get('format') or 'html'
 doc_display_notes = int(kw.get('display_note') or 0)
 doc_display_svg = kw.get('display_svg') or 'png'
 doc_download = int(kw.get('document_download') or 0)
 doc_save = int(kw.get('document_save') or 0)
-doc_ooo = int(kw.get('flag_ooo') or 0)
+doc_ooo = kw.get('remote_content') or None
+doc_content = doc_ooo or doc.getTextContent()
+doc_is_slideshow = getSlideList(doc_content) or None

 override_logo_reference = kw.get('override_logo_reference', None)
 override_source_organisation_title = kw.get("override_source_organisation_title", None)
 override_batch_mode = kw.get('batch_mode')
 override_source_person_title = None

-# ---------- backward compatability with legacy odp/sxi presentations ----------
-# note: this has to come first to convert file into html and then continue
-if doc_ooo:
-  doc_portal = doc.getPortalObject()
-  if doc.getPortalType() in ["Presentation"]:
-    raw_data = doc_portal.portal_transforms.convertToData(
-      "text/html",
-      str(doc.getData() or blank),
-      context=context,
-      mimetype=doc.getContentType()
-    )
-    if raw_data is None:
-      raise ValueError("Failed to convert to %r" % "text/html")
-
-    # got something
-    page_list = getPageList(raw_data)
-    if len(page_list) > 0:
-      page_content = sortContent(page_list)
-      doc_converted_content = blank
-      for slide in page_content:
-        if slide[1].find("<center>") > -1:
-          slide_content_list = slide[1].split("<h3>Notes:</h3>")
-          if len(slide_content_list) != 2:
-            slide_content = slide[1]
-            slide_notes = blank
-          else:
-            slide_content = slide_content_list[0]
-            slide_content = slide_content.replace("<center>", "")
-            slide_content = slide_content.replace("</center>", "")
-            slide_notes = slide_content_list[1]
-          doc_converted_content += addSlideContent(slide_content, slide_notes)
+# --------------------- Convert any page into a slideshow ----------------------
+# Note: mileage varies depending on the cleanliness of the HTML page
+if doc_is_slideshow is None:
+
+  doc_upgraded_content = removePlaceholders(doc_content)
+  doc_upgraded_content = removeComments(doc_upgraded_content)
+  doc_upgraded_content = removeImageWrappers(doc_upgraded_content)
+  doc_upgraded_content = setH1AndH2AsSlideHeaders(doc_upgraded_content)
+  doc_upgraded_content = removeLineBreaks(doc_upgraded_content)
+
+  doc_content = blank
+  last_slide_content = blank
+  section_start = '<section>'
+  details_start = '<details open="open">'
+  details_end = '</details>'
+  section_end = '</section>'
+
+  # separate by <h1>, these will be our slide headers. The pattern contains a
+  # group, so the headers themselves are kept in the resulting list
+  fake_slide_list = re.split(r'(<h1.*?/h1>)', doc_upgraded_content, flags=flags)
+
+  # insert page title if first element isn't a <h1>
+  if '<h1' not in fake_slide_list[0]:
+    fake_slide_list.insert(0, getHeaderSlideTitle(doc))
+
+  # opinionated add of a "Thank you" slide if the last slide doesn't
+  # contain the default two <div> columns
+  last_slide_content = addLastSlide(fake_slide_list[-1])
+
+  # the list is consumed in pairs below. Pad it if a header is left without
+  # content (for example a <h1 which is never closed), else next() would raise
+  # a StopIteration
+  if len(fake_slide_list) % 2:
+    fake_slide_list.append(blank)
+
+  # fake_slide_list will be <h1>,<content>,<h1>,<content> so we need to go
+  # over two items at a time
+  doc_slide_iter = iter(fake_slide_list)
+  for x in doc_slide_iter:
+    slide_header = x
+
+    # remove whitespace so we don't end up with empty <details>
+    slide_content = " ".join(next(doc_slide_iter).split())
+
+    # build slides assuming the first element after the header is on the slide
+    # (an img, a paragraph, a list, whatever). The rest goes into details. If
+    # there is an img on the slide, move it to the top
+    slide_front = getSlideFront(slide_content)
+    if slide_front:
+      slide_content = slide_content.replace(slide_front, blank)
+    else:
+      slide_front = blank
+
+    # build a new doc from slides
+    doc_content = doc_content + section_start + slide_header + slide_front \
+      + details_start + slide_content + details_end + section_end
+
+# other case: we have a slideshow, doc_is_slideshow contains the slides
+else:
+  last_slide_content = addLastSlide(doc_is_slideshow[-1])
+
+# add last slide if required
+doc_content = doc_content + last_slide_content

 # -------------------------- Document Parameters  ------------------------------
-doc_dirty_content = doc_converted_content or doc.getTextContent()
-doc_content = removeEmptyDetails(doc_dirty_content)
+doc_content = removeEmptyDetails(doc_content)
 doc_title = doc.getShortTitle() or doc.getTitle()
 doc_language = doc.getLanguage()
 doc_description = doc.getDescription()
@@ -271,9 +305,9 @@ for image in re.findall('(<img.*?/>)', doc_content):
 #
 #  for link in re.findall('(<a.*?<\/a>)', document_content):
 #    doc_content = doc_content.replace(link, doc.WebPage_validateLink(link_string=link, link_toc=true))
-#

-# ------------- backwards compatability with old slideshow ---------------------
+
+# ------------- backcompat: old slideshow -------------------------------------
 # requires to wrap content of slides that contain <details> into nested
 # <section> tags. Done here, after book, because it adds more complexity
 if getDetails(doc_content) > -1: