Commit d32a1124 authored by Sven Franck's avatar Sven Franck Committed by Xiaowu Zhang

erp5_corporate_identity: improve slideshow rendering including displaying...

erp5_corporate_identity: improve slideshow rendering including displaying legacy presentation as slideshow
parent a8b888ae
Pipeline #9598 failed with stage
in 0 seconds
"""
================================================================================
Try to convert old OpenOffice presentations into slideshows
================================================================================
"""
# uses cloudooo to convert odp/sxi to html (quite buggy) and then salvages the
# result into a slideshow html, which is passed on as remote_content to the
# slideshow renderer
# kw-parameters (* default)
# ------------------------------------------------------------------------------
import re
blank = ''
flags = re.MULTILINE|re.DOTALL|re.IGNORECASE
def getHeaderSlideTitle(my_doc):
  """Return the title of my_doc as an <h1> slide header.

  The title is document data, not markup, so the characters which are
  significant in HTML are replaced by entities before it is wrapped.

  my_doc -- object providing getTitle()
  """
  title = my_doc.getTitle()
  # "&" has to go first, otherwise the entities created for the other
  # characters would be escaped a second time
  for character, entity in (
    ('&', '&amp;'),
    ('<', '&lt;'),
    ('>', '&gt;'),
    ('"', '&quot;'),
  ):
    title = title.replace(character, entity)
  return '<h1>' + title + '</h1>'
  • don't we need to escape title here ? ( or move this to a page template which escapes everything )

    /cc @georgios.dagkakis

  • or move this to a page template which escapes everything

    I think this is the best. Handling HTML in Python should be avoided as much as possible

  • This is probably a big change, what is easy to do now is something like this:

    from Products.PythonScripts.standard import html_quote
    
    def getHeaderSlideTitle(my_doc):
      return '<h1>' + html_quote(my_doc.getTitle()) + '</h1>'
Please register or sign in to reply
def getSlideList(content):
  """Return the inner markup of every <html>...</html> page in content."""
  page_pattern = r'<html>(.*?)</html>'
  return re.findall(page_pattern, content, flags=flags)
def getKey(item):
  """Sort key: the first element of item converted to an integer."""
  slide_number = item[0]
  return int(slide_number)
# -------------------------------- Setup ---------------------------------------
# Only "Presentation" documents are converted. The converted markup is
# salvaged into <section> slides and stored in kw["remote_content"].
# NOTE(review): `kw` has to be declared in the script parameters (**kw),
# it is not created here -- confirm against the script metadata.
if context.getPortalType() in ["Presentation"]:
  portal = context.getPortalObject()
  mimetype = 'text/html'
  content_type = context.getContentType()
  raw_data = portal.portal_transforms.convertToData(
    mimetype,
    str(context.getData() or ""),
    context=context,
    mimetype=content_type
  )
  if raw_data is None:
    raise ValueError("Failed to convert to %r" % mimetype)
  if context.REQUEST is not None:
    context.REQUEST.RESPONSE.setHeader("Content-Type", mimetype)

  # get a list of slides
  content = getSlideList(raw_data)

  # every slide is in the raw_data twice, once with the title and image as
  # text, once with the slide content without title. All slides are mixed
  # randomly, so we need to find out which slide contains what and then put
  # them in their correct order. We do this by extracting the links in the
  # slide's navigation bar. This bar has a switch to change from image to text
  # slides with the current slide number, so <a href="text3">Text</a> switches
  # from Graphic slide 3 to Text slide 3. We use this to identify the current
  # slide
  if len(content) > 0:
    slideshow = []
    for slide in content:
      slide_nav = re.search(r'<center>(.*?)</center>', slide, flags=flags)
      if slide_nav is None:
        # without navigation bar the slide number cannot be determined
        continue
      slide_nav_link_list = re.findall(r'<a(.*?)</a>', slide_nav.group(), flags=flags)
      for link in slide_nav_link_list:
        pointer = re.search(r'(text|img)([0-9]*)\.', link, flags=flags)
        # links without a slide number cannot be sorted by getKey
        if pointer is None or not pointer.group(2):
          continue
        slide_number = str(pointer.group(2))

        # the header slide. Contains header and extracted text from image
        if re.search(r'>Graphic', link, flags=flags):
          slide_header = re.search(r'<h1>(.*)?</h1>', slide, flags=flags)
          if slide_header is None:
            # keep an (empty) header so header/content pairing stays aligned
            slideshow.append([slide_number, '<h1></h1>'])
          else:
            slideshow.append([slide_number, slide_header.group()])

        # the content slide. Contains image and notes
        if re.search(r'>Text', link, flags=flags):
          slideshow.append([slide_number, slide])

    # time to sort and add first slide header in case missing. Nothing is
    # added when no slide could be identified at all
    slideshow = sorted(slideshow, key=getKey)
    if slideshow and '<h1' not in slideshow[0][1]:
      slideshow.insert(0, ["0", getHeaderSlideTitle(context)])

    # slideshow will contain <header>, <content>, <header>, <content>...
    # so we need to go through it two slides at a time to assemble slides
    section_list = []
    slide_iter = iter(slideshow)
    for slide_1st in slide_iter:
      # a trailing entry without partner is rendered on its own
      slide_2nd = next(slide_iter, [slide_1st[0], blank])

      # we don't know whether header is on first or second position
      if '<h1' not in slide_1st[1]:
        go_1st, go_2nd = slide_2nd[1], slide_1st[1]
      else:
        go_1st, go_2nd = slide_1st[1], slide_2nd[1]

      # drop everything from <head> up to the navigation bar, if present
      slide_head = re.search(r'<head>.*?</center><br>', go_2nd, flags=flags)
      if slide_head is not None:
        go_2nd = go_2nd.replace(slide_head.group(), blank)
      go_2nd = go_2nd.replace("<h3>Notes:</h3><br>", '<details open="open">')
      go_2nd = go_2nd.replace("</body>", "</details>")
      section_list.append('<section>' + go_1st + go_2nd + '</section>')
    kw["remote_content"] = ''.join(section_list)
  • @frequent @xiaowu.zhang Can you please double check, kw is an undefined variable here and this cause a test failure

    https://nexedijs.erp5.net/#/test_result_module/20200527-1082CFAA/46?

  • strange, corporate identity codingstyle tests are not running on my test suite

    kw parameter is missing, i add it

    Edited by Xiaowu Zhang
  • Thanks ! That must be a bug in testnode or something.

    The list of tested business template is evaluated here

    maybe what happens is that this is called first and only after this the new git repository revision is used.

    coding style on this bt was enabled very recently xiaowu.zhang/erp5@73aef12e

    maybe first ERP5BusinessTemplateCodingStyleTestSuite.getTestList was called with the old version of the code, when this business template was still skipped.

    If that's really this, it should not often be a problem, so I will not look more into this unless we see it happening again. If you can fix the missing kw parameters (there also seems to be the same problem with the other script from this commit), that's good for now I would say. Thanks !

    Edited by Jérome Perrin
Please register or sign in to reply
# Delegate rendering to WebPage_viewAsSlideshowWIP, forwarding all arguments.
# NOTE(review): `args` and `kw` are not assigned anywhere in this script; they
# presumably have to come from the script parameter list (*args, **kw), while
# the `_params` metadata shown for this script is empty -- confirm.
return context.WebPage_viewAsSlideshowWIP(*args, **kw)
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>Script_magic</string> </key>
<value> <int>3</int> </value>
</item>
<item>
<key> <string>_bind_names</string> </key>
<value>
<object>
<klass>
<global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
</klass>
<tuple/>
<state>
<dictionary>
<item>
<key> <string>_asgns</string> </key>
<value>
<dictionary>
<item>
<key> <string>name_container</string> </key>
<value> <string>container</string> </value>
</item>
<item>
<key> <string>name_context</string> </key>
<value> <string>context</string> </value>
</item>
<item>
<key> <string>name_m_self</string> </key>
<value> <string>script</string> </value>
</item>
<item>
<key> <string>name_subpath</string> </key>
<value> <string>traverse_subpath</string> </value>
</item>
</dictionary>
</value>
</item>
</dictionary>
</state>
</object>
</value>
</item>
<item>
<key> <string>_params</string> </key>
<value> <string></string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>Presentation_viewAsSlideshow</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
......@@ -21,24 +21,27 @@ MAIN FILE: generate presentation in different output formats
# display_note: display slide notes (1) or not (0)*
# display_svg: display svg-images as svg or png*
# ------
# flag_ooo: convert legacy odp, sxi formats (not active)
# remote_content: convert legacy odp, sxi formats (not active)
import re
from base64 import b64encode
blank = ''
flags = re.MULTILINE|re.DOTALL|re.IGNORECASE
details_separator = '</section><section class="ci-notes-continue"><section><h1>cont.</h1></section>'
pref = context.getPortalObject().portal_preferences
# ------------------ HTML cleanup/converter methods ----------------------------
def getHeaderSlideTitle(my_doc):
  """Return the title of my_doc as an <h1> slide header.

  The title is document data, not markup, so the characters which are
  significant in HTML are replaced by entities before it is wrapped.

  my_doc -- object providing getTitle()
  """
  title = my_doc.getTitle()
  # "&" has to go first, otherwise the entities created for the other
  # characters would be escaped a second time
  for character, entity in (
    ('&', '&amp;'),
    ('<', '&lt;'),
    ('>', '&gt;'),
    ('"', '&quot;'),
  ):
    title = title.replace(character, entity)
  return '<h1>' + title + '</h1>'
def getSlideList(my_content):
  """Return the inner markup of every <section> element in my_content.

  Matching is non-greedy and spans line breaks, so a nested <section> ends
  the match at the first closing tag.
  """
  section_pattern = r'<section[^>]*?>(.*?)</section>'
  return re.findall(section_pattern, my_content, re.DOTALL)
#def getSectionSlideList(my_content):
# return re.findall(r'(<section[^>]*?>.*?</section>)', my_content, re.S)
# https://regex101.com/r/8F8GTx/1/
def getSlideDetailsList(my_content):
  """Return every outer <section> that wraps a nested <section> and <details>.

  The complete matched markup is returned, including the outer tags.
  """
  nested_slide_pattern = r'<section.*?>\s?<section>.*?</details>\s?</section>'
  return re.findall(nested_slide_pattern, my_content, re.DOTALL)
......@@ -51,6 +54,54 @@ def getDetailsList(my_slide):
#def getNestedSection(my_content):
# return my_content.find("<section") > -1
# please look the other direction until we can use beautifulsoup
def getSlideFront(my_content):
  """Return the markup to display on the slide itself, or None if empty.

  An <img .../> is preferred. Otherwise the first tag found in my_content
  is returned together with its content up to the first matching closing
  tag.

  my_content -- markup of a single slide (everything after its header)
  """
  # only the parameter is inspected, no variable of the calling code is read

  # is there an image on the slide?
  img = re.search(r'(<img.*?/>)', my_content, flags=flags)
  if img:
    return img.group()

  # is there another tag on the slide?
  tag = re.search(r'<(.*?)( |>)', my_content, flags=flags)
  if tag:
    # the tag name is arbitrary document text, so it is escaped before it
    # becomes part of a pattern
    key = re.escape(tag.group(1))
    element = re.search(r'(<%s.*?</%s>)' % (key, key), my_content, flags=flags)
    if element:
      return element.group()

  # empty slide
  return None
# opinionated
# TODO h1: chapter, h2:slide ?
def setH1AndH2AsSlideHeaders(my_content):
  """Turn every <h2> element into an <h1> so both start a new slide.

  Opening and closing tags are rewritten separately; matching follows the
  module-wide flags.
  """
  promoted_content = re.sub(r'<h2', '<h1', my_content, flags=flags)
  return re.sub(r'\/h2>', '/h1>', promoted_content, flags=flags)
def removePlaceholders(my_content):
  """Strip ${...} substitution strings from my_content.

  The pattern is greedy and does not span line breaks: on each line
  everything from the first "${" to the last "}" is removed.
  """
  if '${' not in my_content:
    return my_content
  for placeholder in re.findall(r'(\${.*})', my_content):
    my_content = my_content.replace(placeholder, blank)
  return my_content
def removeComments(my_content):
  """Strip HTML comments (<!-- ... -->) from my_content."""
  comment_pattern = r'(<!--.*?-->)'
  cleaned_content = my_content
  for comment in re.findall(comment_pattern, my_content, flags=flags):
    cleaned_content = cleaned_content.replace(comment, blank)
  return cleaned_content
def removeImageWrappers(my_content):
  """Unwrap content of centered paragraphs (<p style="text-align: center;">).

  Each such paragraph is replaced by whatever it contains.
  """
  wrapper_pattern = r'(<p style=\"text-align: center;\">(.*?)</p>)'
  for wrapper, inner in re.findall(wrapper_pattern, my_content, flags=flags):
    my_content = my_content.replace(wrapper, inner)
  return my_content
def removeLineBreaks(my_content):
  """Return my_content without newline and carriage return characters."""
  for line_break in ('\n', '\r'):
    my_content = my_content.replace(line_break, '')
  return my_content
def splitMultipleDetails(my_content):
for slide in getSlideDetailsList(my_content):
detail_list = getDetailsList(slide)
......@@ -86,118 +137,101 @@ def splitMultipleDetails(my_content):
def removeEmptyDetails(my_content):
  """Strip <details> elements without content from my_content.

  The known empty variants are removed one after the other, in the order
  listed below.
  """
  empty_details_list = (
    '<details open="open"></details>',
    '<details></details>',
    '<details open=""></details>',
    '<details>&nbsp;</details>',
    '<details> </details>',
  )
  content = my_content
  for empty_details in empty_details_list:
    content = content.replace(empty_details, blank)
  return content
def getPageList(my_content):
  """Return the inner markup of every <html>...</html> page in my_content."""
  page_pattern = r'<html>(.*?)</html>'
  return re.findall(page_pattern, my_content, re.DOTALL)
def getPageTitle(my_full_page):
  """Return the text of the first <title> element, or None without title.

  The title has to sit on a single line and must not be empty.
  """
  title_match = re.search('<title>(.+?)</title>', my_full_page)
  if title_match is None:
    return None
  return title_match.group(1)
def getPageContent(my_full_page):
  """Return what follows the single "</center><br>" marker of a page.

  "</body>" is removed from the returned part. None is returned when the
  marker is missing or occurs more than once.
  """
  part_list = my_full_page.split("</center><br>")
  if len(part_list) != 2:
    return None
  return part_list[1].replace("</body>", blank)
def addSlideContent(my_content, my_notes):
  """Return one <section> slide with my_notes in an open <details> element."""
  slide_front = '<section>' + my_content
  slide_notes = '<details open="open">' + my_notes + '</details>'
  return slide_front + slide_notes + '</section>'
def sortContent(my_page_list):
  """Return the converted pages as a list of tuples in presentation order.

  Pages titled "Slide <n>" are ordered by <n> as (n, content, None). One
  page may be put in front and one at the end, chosen by title, as
  (title, content, label).

  NOTE(review): the label reads "first" for the page appended at the end and
  "last" for a page put in front; it is not read inside this function --
  confirm whether any caller relies on it.
  """
  numbered_page_list = []
  front_page = None
  closing_page = None
  for page in my_page_list:
    page_title = getPageTitle(page)

    # Note cloudooo default html transformation mixes slide order. dirty fix
    is_commercial = page_title.find("Commercial") > -1
    page_content = getPageContent(page)
    if is_commercial:
      if page_content.find("<center>") > -1:
        closing_page = (page_title, page_content, "first")
      continue
    if page_title.find("ERP5") > -1:
      if page_content.find("<center>") > -1:
        front_page = (page_title, page_content, "last")
      continue
    if page_title.find("Slide") > -1:
      slide_number = int(page_title.replace("Slide ", ""))
      numbered_page_list.append((slide_number, page_content, None))
      continue
    if page_content.find("<center>") > -1:
      front_page = (page_title, page_content, "first")

  sorted_page_list = sorted(numbered_page_list, key=lambda numbered_page: numbered_page[0])
  if closing_page is not None:
    sorted_page_list.append(closing_page)
  if front_page is not None:
    sorted_page_list.insert(0, front_page)
  return sorted_page_list
def addLastSlide(my_last_slide):
  """Return the text content of the preferred closing slide, or blank.

  Nothing is added when my_last_slide contains exactly two "<div", when no
  closing slide url is set in the preferences, or when traversing to that
  url yields nothing.
  """
  if my_last_slide.count("<div") == 2:
    return blank
  closing_slide_url = pref.getPreferredCorporateIdentityTemplateSlideLastSlideRelativeUrl()
  if not closing_slide_url:
    return blank
  # errors raised while traversing are not caught here
  closing_slide = doc.restrictedTraverse(closing_slide_url)
  if not closing_slide:
    return blank
  return closing_slide.getTextContent()
# -------------------------- Setup ---------------------------------------------
doc = context
doc_prefix = pref.getPreferredCorporateIdentityTemplateSlideDocumentPrefix() or "Slideshow."
doc_converted_content = None
doc_upgraded_content = None
doc_slide_iter = None
doc_format = kw.get('format') or 'html'
doc_display_notes = int(kw.get('display_note') or 0)
doc_display_svg = kw.get('display_svg') or 'png'
doc_download = int(kw.get('document_download') or 0)
doc_save = int(kw.get('document_save') or 0)
doc_ooo = int(kw.get('flag_ooo') or 0)
doc_ooo = kw.get('remote_content') or None
doc_content = doc_ooo or doc.getTextContent()
doc_is_slideshow = getSlideList(doc_content) or None
override_logo_reference = kw.get('override_logo_reference', None)
override_source_organisation_title = kw.get("override_source_organisation_title", None)
override_batch_mode = kw.get('batch_mode')
override_source_person_title = None
# ---------- backward compatibility with legacy odp/sxi presentations ----------
# note: this has to come first to convert file into html and then continue
if doc_ooo:
doc_portal = doc.getPortalObject()
if doc.getPortalType() in ["Presentation"]:
raw_data = doc_portal.portal_transforms.convertToData(
"text/html",
str(doc.getData() or blank),
context=context,
mimetype=doc.getContentType()
)
if raw_data is None:
raise ValueError("Failed to convert to %r" % "text/html")
# got something
page_list = getPageList(raw_data)
if len(page_list) > 0:
page_content = sortContent(page_list)
doc_converted_content = blank
for slide in page_content:
if slide[1].find("<center>") > -1:
slide_content_list = slide[1].split("<h3>Notes:</h3>")
if len(slide_content_list) != 2:
slide_content = slide[1]
slide_notes = blank
else:
slide_content = slide_content_list[0]
slide_content = slide_content.replace("<center>", "")
slide_content = slide_content.replace("</center>", "")
slide_notes = slide_content_list[1]
doc_converted_content += addSlideContent(slide_content, slide_notes)
# --------------------- Convert any page into a slideshow ----------------------
# Note: mileage varies depending on the cleanliness of the HTML page
if doc_is_slideshow is None:
doc_upgraded_content = removePlaceholders(doc_content)
doc_upgraded_content = removeComments(doc_upgraded_content)
doc_upgraded_content = removeImageWrappers(doc_upgraded_content)
doc_upgraded_content = setH1AndH2AsSlideHeaders(doc_upgraded_content)
doc_upgraded_content = removeLineBreaks(doc_upgraded_content)
doc_content = blank
last_slide_content = blank
section_start = '<section>'
details_start = '<details open="open">'
details_end = '</details>'
section_end = '</section>'
# separate by <h1>, these will be our slide headers
fake_slide_list = re.split(r'(<h1.*?/h1>)', doc_upgraded_content, flags=flags)
# insert page title if first element isn't a <h1>
if '<h1' not in fake_slide_list[0]:
fake_slide_list.insert(0, getHeaderSlideTitle(doc))
# opinionated add of a "Thank you" slide if the last slide doesn't
# contain the default two <div> columns
last_slide_content = addLastSlide(fake_slide_list[-1])
# fake_slide_list will be <h1>,<content>,<h1>,<content> so we need to go
# over two items at a time
doc_slide_iter = iter(fake_slide_list)
for x in doc_slide_iter:
slide_header = x
# remove whitespace so we don't end up with empty <details>
slide_content = " ".join(next(doc_slide_iter).split())
# build slides assuming the first element after the header is on the slide
# (an img, a paragraph, a list, whatever). The rest goes into details. If
# there is an img on the slide, move it to the top
slide_front = getSlideFront(slide_content)
if slide_front:
slide_content = slide_content.replace(slide_front, blank)
else:
slide_front = blank
# build a new doc from slides
doc_content = doc_content + section_start + slide_header + slide_front \
+ details_start + slide_content + details_end + section_end \
# other case: we have a slideshow, doc_is_slideshow contains the slides
else:
last_slide_content = addLastSlide(doc_is_slideshow[-1])
# add last slide if required
doc_content = doc_content + last_slide_content
# -------------------------- Document Parameters ------------------------------
doc_dirty_content = doc_converted_content or doc.getTextContent()
doc_content = removeEmptyDetails(doc_dirty_content)
doc_content = removeEmptyDetails(doc_content)
doc_title = doc.getShortTitle() or doc.getTitle()
doc_language = doc.getLanguage()
doc_description = doc.getDescription()
......@@ -271,9 +305,9 @@ for image in re.findall('(<img.*?/>)', doc_content):
#
# for link in re.findall('(<a.*?<\/a>)', document_content):
# doc_content = doc_content.replace(link, doc.WebPage_validateLink(link_string=link, link_toc=true))
#
# ------------- backwards compatability with old slideshow ---------------------
# ------------- backcompat: old slideshow -------------------------------------
# requires to wrap content of slides that contain <details> into nested
# <section> tags. Done here, after book, because it adds more complexity
if getDetails(doc_content) > -1:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment