Commit a928abd2 authored by Jérome Perrin's avatar Jérome Perrin

web_renderjs_ui: use lxml to extract data-i18n messages

The previous regular-expression-based approach sometimes could not extract
messages properly. Using an XML parser simplifies the code and fixes several
messages that were not extracted properly, such as messages containing ", [] or {}.

This also fixes some problems when looking for message sources:
  - archived web pages were sometimes used instead of published ones
  - messages from gadgets implemented as page templates/OFS files were not
    extracted.

A few more unit tests for the scripts involved in this process are added.
parent c94035cd
Pipeline #13272 failed with stage
in 0 seconds
import lxml
import io
import re
def TextDocument_substituteTextContent(self, text, **kw):
  """Call `self._substituteTextContent` with `text` and the given keyword
  arguments, and return whatever it returns.
  """
  result = self._substituteTextContent(text, **kw)
  return result
def ERP5Site_extractTranslationMessageListFromHTML(self, text_content):
  """Extract translatable messages from the HTML source `text_content`.

  Messages are looked up in three places:
    - `data-i18n` attributes of elements; a leading `[...]` prefix, as in
      `data-i18n="[value]Submit"`, is dropped so that only `Submit` is kept;
    - `data-i18n=...` declarations inside HTML comments, one per line, with
      surrounding whitespace and quotes removed;
    - the text of `<script>` elements, which is parsed again as HTML because
      it can contain HTML templates.

  text_content -- HTML source, either bytes (decoded as UTF-8) or text.
    An empty value or None yields nothing.

  Returns a generator of messages. Empty messages are skipped; the same
  message may be yielded several times.
  """
  if not text_content:
    return
  # `import lxml` alone does not load the `etree` submodule, import it
  # explicitly so that this function does not depend on another module
  # having imported it before.
  import lxml.etree
  # `bytes` is `str` on python2, so this is the same check there.
  if isinstance(text_content, bytes):
    text_content = text_content.decode('utf-8')
  parser = lxml.etree.HTMLParser()
  tree = lxml.etree.parse(io.StringIO(text_content), parser)
  if tree.getroot() is None:
    return

  # find data-i18n attributes in HTML
  # message can be data-i18n="[value]Submit", in that case we return only Submit
  tag_re = re.compile(r'^\[.*?\]')
  for element in tree.xpath("//*[@data-i18n]"):
    message = tag_re.sub("", element.attrib["data-i18n"], 1)
    if message:
      yield message

  # find data-i18n= in comments
  comment_data_i18n_re = re.compile(r'data-i18n=(.*)')
  # if message is quoted, strip quotes to keep only message
  remove_quote_re = re.compile(r"^[\"']+(.*)[\"']+$")
  for comment in tree.xpath("//comment()"):
    for message in comment_data_i18n_re.findall(comment.text or ""):
      # In a single line comment such as <!-- data-i18n="The message" -->
      # the captured text ends with a space; strip it, otherwise the quotes
      # are not recognised and stay part of the message.
      message = message.strip()
      remove_quote_match = remove_quote_re.match(message)
      if remove_quote_match:
        message = remove_quote_match.group(1)
      if message:
        yield message

  # extract messages in scripts, they can be html templates
  for script in tree.xpath("//script"):
    for message in ERP5Site_extractTranslationMessageListFromHTML(self, script.text):
      yield message
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ExternalMethod" module="Products.ExternalMethod.ExternalMethod"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_function</string> </key>
<value> <string>ERP5Site_extractTranslationMessageListFromHTML</string> </value>
</item>
<item>
<key> <string>_module</string> </key>
<value> <string>WebRenderJSUI</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>ERP5Site_extractTranslationMessageListFromHTML</string> </value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string></string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
"""Returns the `text_content` that should be set on the translation data script for this RJS website.
"""
import re
import json
portal = context.getPortalObject()
......@@ -11,24 +10,33 @@ Base_translateString = context.Base_translateString
# <span data-18n="The message">The message</span>
# or in comments, like this:
# <!-- data-i18n="The message" -->
attribute_filter_re = re.compile(r"""(data-i18n)=["']?((?:.(?!["']?\s+(?:\S+)=|[>"']))+.)["']?""")
translatable_message_set = set([])
for web_page in portal.web_page_module.searchFolder(portal_type='Web Page',
reference=context.Base_getTranslationSourceFileList(only_html=1)):
data = attribute_filter_re.findall(web_page.getTextContent())
for attribute in data:
a = re.sub(r'[{|}]', "", attribute[1])
a = re.sub(r'\[.*?\]', "", a)
if a:
translatable_message_set.add(a)
# Web pages can be in web page module ...
web_page_reference_list = context.Base_getTranslationSourceFileList(only_html=1)
not_found_in_web_page_reference_set = set([])
for web_page_reference in web_page_reference_list:
web_page = context.getDocumentValue(web_page_reference)
if web_page is None:
not_found_in_web_page_reference_set.add(web_page_reference)
else:
for message in portal.ERP5Site_extractTranslationMessageListFromHTML(web_page.getTextContent()):
translatable_message_set.add(message)
# ... or in skin folders
for web_page_reference in not_found_in_web_page_reference_set:
if not '/' in web_page_reference:
web_page = context.restrictedTraverse(web_page_reference, None)
if web_page is not None and hasattr(web_page, 'manage_FTPget'):
for message in portal.ERP5Site_extractTranslationMessageListFromHTML(web_page.manage_FTPget()):
  • @jerome I'm not able to successfully run the upgrader from the alarm on my dev instance. I always get this error:

      File "/srv/slapgrid/slappart26/srv/runner/software/42b27ed03c0d12ef7ee7623
    31cc0d075/eggs/Products.PythonScripts-2.13.2-py2.7.egg/Products/PythonScript
    s/PythonScript.py", line 344, in _exec                                      
        result = f(*args, **kw)                                                 
      File "Script (Python)", line 25, in WebSite_getTranslationDataTextContent 
        web_page_text_content = web_page.manage_FTPget()                        
      File "/srv/slapgrid/slappart26/srv/runner/software/42b27ed03c0d12ef7ee7623
    31cc0d075/eggs/Zope2-2.13.30-py2.7.egg/OFS/Image.py", line 661, in manage_FT
    Pget                                                                        
        RESPONSE.setBase(None)                                                  
    AttributeError: TimerResponse instance has no attribute 'setBase'
  • Oh yes, thanks for letting me know.

    manage_FTPget does not seem to be the right API. I thought it was a good API to get the source of files and page templates, but after looking closer, it is not. Maybe TimerServer's responses should generally have a setBase API, but this script should not use an API that relies on the response.

    manage_FTPget does this:

            data = self.data
            if isinstance(data, str):
                RESPONSE.setBase(None)
                return data
    
            while data is not None:
                RESPONSE.write(data.data)
                data = data.next
    
            return ''

    so this would also not work at all for large files using PData. I suggest doing !1358 (merged)

Please register or sign in to reply
translatable_message_set.add(message)
tmp = {}
for language in context.getAvailableLanguageSet():
tmp[language] = {}
for word in translatable_message_set:
tmp[language][word] = Base_translateString(word, lang = language)
tmp[language][word] = unicode(Base_translateString(word, lang = language), 'utf-8')
return """/**
return u"""/**
* This translation data is generated automatically and updated with upgrader in post-upgarde.
* Do not edit manually, but use "Update Translation Data" action on web site to update from
* Localizer and from data-i18n tags on web pages.
......@@ -41,7 +49,7 @@ return """/**
// @ts-ignore
window.translation_data = %s;
}(window));
""" % ("\n ".join(
""" % (u"\n ".join(
json.dumps(
tmp,
sort_keys=True,
......
# coding: utf-8
##############################################################################
#
# Copyright (c) 2018 Nexedi SA and Contributors. All Rights Reserved.
......@@ -27,12 +28,11 @@
import textwrap
import time
from Products.ERP5Type.tests.utils import createZODBPythonScript
from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase
class TestRenderJSUpgrade(ERP5TypeTestCase):
"""Test Upgrader scripts for renderjs UI.
"""
class RenderJSUpgradeTestCase(ERP5TypeTestCase):
def afterSetUp(self):
self.login()
self.web_site = self.portal.web_site_module.newContent(
......@@ -57,6 +57,10 @@ class TestRenderJSUpgrade(ERP5TypeTestCase):
reference='{}.js'.format(self.id()))
self.javascript.publish()
class TestRenderJSUpgrade(RenderJSUpgradeTestCase):
"""Test Upgrader scripts for renderjs UI.
"""
def test_upgrade_empty_site(self):
self.assertEqual([], self.web_site.checkConsistency())
self.assertEqual([], self.web_site.fixConsistency())
......@@ -179,3 +183,178 @@ class TestRenderJSUpgrade(ERP5TypeTestCase):
self.assertGreater(
test_upgrade_site_translation_data_js.getModificationDate(),
test_upgrade_site_translation_data_js_modification_date)
class TestRenderUpdateTranslationData(RenderJSUpgradeTestCase):
  """Tests for "Update Translation Data" utilities on RJS web sites.

  The fixture adds to the parent test case a published translation data
  Web Script, a published translation gadget Web Page loading that script,
  two available languages and a precache manifest script listing the
  documents from which messages are extracted.
  """
  def afterSetUp(self):
    """Create the translation script, the translation gadget and the
    precache manifest script, and configure the web site to use them.
    """
    super(TestRenderUpdateTranslationData, self).afterSetUp()
    # create a translation script for this web site
    self.web_script_translation_data_js = self.portal.web_page_module.newContent(
        portal_type='Web Script',
        # the convention is that this script has suffix translation_data.js
        reference='{}_translation_data.js'.format(self.id()),
        text_content='// will be filled',
    )
    self.web_script_translation_data_js.publish()
    # create the translation gadget, an HTML page whose "custom script"
    # section loads the translation data script created above
    self.web_page_translation_gadget = self.portal.web_page_module.newContent(
        portal_type='Web Page',
        reference='{}.html'.format(self.id()),
        content_type='text/html',
        text_content=textwrap.dedent('''
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width" />
<title>Translation Gadget</title>
<link rel="http://www.renderjs.org/rel/interface" href="interface_translation.html">
<!-- renderjs -->
<script src="rsvp.js" type="text/javascript"></script>
<script src="renderjs.js" type="text/javascript"></script>
<!-- custom script -->
<script src="{translation_data_js_reference}" type="text/javascript"></script>
<script src="gadget_translation.js" type="text/javascript"></script>
</head>
<body>
</body>
</html>
''').format(translation_data_js_reference=self.web_script_translation_data_js.getReference()),
    )
    self.web_page_translation_gadget.publish()
    # register the gadget on the web site by its reference
    self.web_site.setProperty(
        'configuration_translation_gadget_url',
        self.web_page_translation_gadget.getReference()
    )
    self.web_site.setAvailableLanguageList(['en', 'fa'])
    # add a manifest to list web pages to extract messages from
    createZODBPythonScript(
        self.portal.portal_skins.custom,
        'WebSection_getTestPrecacheManifestList',
        '',
        textwrap.dedent('''
return [
'test_gadget_with_translation.html',
'test_gadget_with_translation.js',
'test_portal_skins_gadget.html',
]
'''))
    self.web_site.setProperty(
        'configuration_precache_manifest_script_list',
        'WebSection_getTestPrecacheManifestList')
    self.tic()

  def beforeTearDown(self):
    """Remove the web page that tests create with a fixed id."""
    super(TestRenderUpdateTranslationData, self).beforeTearDown()
    # several tests create a web page with this same id, so it has to be
    # removed for the next test to be able to create it again
    # NOTE(review): the page template created by
    # test_WebSite_getTranslationDataTextContent_extract_from_page_template
    # is not removed here -- confirm this is intended.
    if 'test_gadget_with_translation_html' in self.portal.web_page_module.objectIds():
      self.portal.web_page_module.manage_delObjects(ids=['test_gadget_with_translation_html'])
      self.tic()

  def test_WebSite_getTranslationDataWebScriptValue(self):
    """The translation data script of the web site is the Web Script
    created in afterSetUp.
    """
    self.assertEqual(
        self.web_site.WebSite_getTranslationDataWebScriptValue(),
        self.web_script_translation_data_js)

  def test_Base_getTranslationSourceFileList(self):
    """Entries of the manifest script are listed as translation sources;
    with only_html=True the .js entry is left out.
    """
    self.assertIn(
        'test_gadget_with_translation.html',
        self.web_site.Base_getTranslationSourceFileList())
    self.assertIn(
        'test_gadget_with_translation.js',
        self.web_site.Base_getTranslationSourceFileList())
    self.assertIn(
        'test_gadget_with_translation.html',
        self.web_site.Base_getTranslationSourceFileList(only_html=True))
    self.assertNotIn(
        'test_gadget_with_translation.js',
        self.web_site.Base_getTranslationSourceFileList(only_html=True))

  def test_WebSite_getTranslationDataTextContent_extract_from_web_page(self):
    """Messages of a published web page are found in comments, in
    attributes (with or without a [tag] prefix), in script content and
    when they contain quotes, {}, [], markup or non-ascii characters.
    """
    # the page also contains an empty data-i18n in a comment and in an
    # attribute; nothing is asserted about them
    self.portal.web_page_module.newContent(
        portal_type='Web Page',
        id='test_gadget_with_translation_html',
        reference='test_gadget_with_translation.html',
        text_content=textwrap.dedent('''
<html>
<!--
data-i18n=Message in comments
data-i18n="Quoted message in comments"
data-i18n=Message with "some parts" 'quoted' in comments
data-i18n=
-->
<h1 data-i18n="Message in attributes">Message in attributes</h1>
<input type="submit" data-i18n="[value]Message for attribute" value="Message for attribute"></input>
<h1 data-i18n="Message with {substitution}">Message with {substitution}</h1>
<h1 data-i18n="Message with [square brackets]">Message with [square brackets]</h1>
<div data-i18n="[html]Message in <a href='link'>HTML</a>">
</div>
<div data-i18n="">Empty data-i18n</div>
<script>
<span data-i18n="Message in script attributes">Message in script attributes</span>
</script>
<div data-i18n="メッサージュ"></div>
''')
    ).publish()
    self.tic()
    translation_data_text_content = self.web_site.WebSite_getTranslationDataTextContent()
    # each message is expected as a double-quoted key followed by a colon
    self.assertIn('"Message in comments":', translation_data_text_content)
    self.assertIn('"Quoted message in comments":', translation_data_text_content)
    self.assertIn('"Message with \\"some parts\\" \'quoted\' in comments":', translation_data_text_content)
    self.assertIn('"Message in attributes":', translation_data_text_content)
    self.assertIn('"Message for attribute":', translation_data_text_content)
    self.assertIn('"Message with {substitution}":', translation_data_text_content)
    self.assertIn('"Message with [square brackets]":', translation_data_text_content)
    self.assertIn('"Message in <a href=\'link\'>HTML</a>":', translation_data_text_content)
    self.assertIn('"Message in script attributes":', translation_data_text_content)
    self.assertIn(u'"メッサージュ":', translation_data_text_content)

  def test_WebSite_getTranslationDataTextContent_extract_from_page_template(self):
    """Messages are also extracted from a page template of a skin folder
    whose id is listed in the manifest.
    """
    self.portal.portal_skins.custom.manage_addProduct['PageTemplates'].manage_addPageTemplate(
        'test_portal_skins_gadget.html',
        text=textwrap.dedent('''
<html>
<!--
data-i18n=Message from page template
-->
</html>'''))
    self.portal.changeSkin(None) # refresh skin cache
    translation_data_text_content = self.web_site.WebSite_getTranslationDataTextContent()
    self.assertIn('"Message from page template":', translation_data_text_content)

  def test_WebSite_getTranslationDataTextContent_ignore_draft_web_page(self):
    """Messages of a web page that was never published are not extracted."""
    # NOTE(review): there is no self.tic() after creating the page, so it is
    # presumably not indexed when the script runs -- confirm that this test
    # really exercises the filtering of draft documents.
    self.portal.web_page_module.newContent(
        portal_type='Web Page',
        id='test_gadget_with_translation_html',
        reference='test_gadget_with_translation.html',
        text_content=textwrap.dedent('''
<html>
<!--
data-i18n=Message in draft web page
-->
</html>'''))
    translation_data_text_content = self.web_site.WebSite_getTranslationDataTextContent()
    self.assertNotIn('"Message in draft web page":', translation_data_text_content)

  def test_WebSite_getTranslationDataTextContent_ignore_archived_web_page(self):
    """Messages of a web page published and then archived are not
    extracted.
    """
    web_page = self.portal.web_page_module.newContent(
        portal_type='Web Page',
        id='test_gadget_with_translation_html',
        reference='test_gadget_with_translation.html',
        text_content=textwrap.dedent('''
<html>
<!--
data-i18n=Message in archived web page
-->
</html>'''))
    web_page.publish()
    web_page.archive()
    self.tic()
    translation_data_text_content = self.web_site.WebSite_getTranslationDataTextContent()
    self.assertNotIn('"Message in archived web page":', translation_data_text_content)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment