Commit d80b1d9d authored by Jérome Perrin's avatar Jérome Perrin

web_renderjs_ui: use lxml to extract data-i18n messages

The previous regular-expression-based approach sometimes could not extract
messages properly. Using an XML parser simplifies the code and fixes several
messages that were not extracted properly, such as messages containing ", [] or {}.

This also fixes some problems when looking for message sources:
  - archived web pages were sometimes used instead of published ones
  - messages from gadgets implemented as page templates/OFS files were not
    extracted.

A few more unit tests for the scripts involved in this process are added.
parent 9b38dca8
Pipeline #13144 running with stage
in 0 seconds
import lxml
import io
import re
def TextDocument_substituteTextContent(self, text, **kw):
  """Return `text` processed by the `_substituteTextContent` method of `self`.

  This is a thin wrapper: `text` and every keyword argument are forwarded
  unchanged to `self._substituteTextContent` and its result is returned as is.
  """
  return self._substituteTextContent(text, **kw)
def ERP5Site_extractTranslationMessageListFromHTML(self, text_content):
  """Extract translatable messages from the HTML source `text_content`.

  This is a generator yielding, in this order:
    - the value of each `data-i18n` attribute, without its leading
      `[target]` prefix (`data-i18n="[value]Submit"` yields `Submit`),
    - each `data-i18n=message` found in an HTML comment, with surrounding
      blanks and quotes removed,
    - the messages found by applying this same extraction to the text of
      each `<script>` element, because scripts can contain HTML templates.

  text_content -- HTML source, as text or as UTF-8 encoded bytes.

  Nothing is yielded when `text_content` is empty or when parsing it
  produces no root element. The same message can be yielded several times.
  """
  if not text_content:
    return
  # `import lxml` does not load the `etree` submodule: import it explicitly
  # instead of relying on another module having already imported it.
  import lxml.etree
  # `bytes` is `str` on python2, so the same inputs as before are decoded
  # there, and this check also does the right thing on python3.
  if isinstance(text_content, bytes):
    text_content = text_content.decode('utf-8')
  parser = lxml.etree.HTMLParser()
  tree = lxml.etree.parse(io.StringIO(text_content), parser)
  if tree.getroot() is None:
    return

  # find data-i18n attributes in HTML
  # message can be data-i18n="[value]Submit", in that case we return only Submit
  tag_re = re.compile(r'^\[.*?\]')
  for e in tree.xpath("//*[@data-i18n]"):
    yield tag_re.sub("", e.attrib["data-i18n"], 1)

  # find data-i18n= in comments
  comment_data_i18n_re = re.compile(r'data-i18n=(.*)')
  # if message is quoted, strip quotes to keep only message
  remove_quote_re = re.compile(r"^[\"']+(.*)[\"']+$")
  for comment in tree.xpath("//comment()"):
    for message in comment_data_i18n_re.findall(comment.text):
      # The pattern captures up to the end of the line, so for a one line
      # comment such as <!-- data-i18n="The message" --> the capture ends
      # with a blank. Strip it first, otherwise the quotes are not removed.
      message = message.strip()
      remove_quote_match = remove_quote_re.match(message)
      yield remove_quote_match.group(1) if remove_quote_match else message

  # extract messages in scripts, they can be html templates
  for script in tree.xpath("//script"):
    for message in ERP5Site_extractTranslationMessageListFromHTML(self, script.text):
      yield message
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ExternalMethod" module="Products.ExternalMethod.ExternalMethod"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_function</string> </key>
<value> <string>ERP5Site_extractTranslationMessageListFromHTML</string> </value>
</item>
<item>
<key> <string>_module</string> </key>
<value> <string>WebRenderJSUI</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>ERP5Site_extractTranslationMessageListFromHTML</string> </value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string></string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
"""Returns the `text_content` that should be set on the translation data script for this RJS website.
"""
import re
import json
portal = context.getPortalObject()
......@@ -11,24 +10,37 @@ Base_translateString = context.Base_translateString
# <span data-i18n="The message">The message</span>
# or in comments, like this:
# <!-- data-i18n="The message" -->
attribute_filter_re = re.compile(r"""(data-i18n)=["']?((?:.(?!["']?\s+(?:\S+)=|[>"']))+.)["']?""")
translatable_message_set = set([])
for web_page in portal.web_page_module.searchFolder(portal_type='Web Page',
reference=context.Base_getTranslationSourceFileList(only_html=1)):
data = attribute_filter_re.findall(web_page.getTextContent())
for attribute in data:
a = re.sub(r'[{|}]', "", attribute[1])
a = re.sub(r'\[.*?\]', "", a)
if a:
translatable_message_set.add(a)
# Web pages can be in web page module ...
web_page_reference_list = context.Base_getTranslationSourceFileList(only_html=1)
not_found_in_web_page_reference_set = set([])
for web_page_reference in web_page_reference_list:
web_page = context.getDocumentValue(web_page_reference)
if web_page is None:
not_found_in_web_page_reference_set.add(web_page_reference)
else:
for message in portal.ERP5Site_extractTranslationMessageListFromHTML(web_page.getTextContent()):
translatable_message_set.add(message)
# ... or in skin folders
for web_page_reference in not_found_in_web_page_reference_set:
if not '/' in web_page_reference:
web_page = context.restrictedTraverse(web_page_reference, None)
if web_page is not None and hasattr(web_page, 'manage_FTPget'):
for message in portal.ERP5Site_extractTranslationMessageListFromHTML(web_page.manage_FTPget()):
translatable_message_set.add(message)
tmp = {}
for language in context.getAvailableLanguageSet():
tmp[language] = {}
for word in translatable_message_set:
tmp[language][word] = Base_translateString(word, lang = language)
tmp[language][word] = unicode(Base_translateString(word, lang = language), 'utf-8')
return """/**
# We pass unicode to json.dumps(ensure_ascii=False), so that it produces
# non-escaped characters. At the end we return a UTF-8 encoded string and
# not a unicode instance, because the text_content property is usually
# a UTF-8 encoded str (not unicode).
return (u"""/**
* This translation data is generated automatically and updated with upgrader in post-upgarde.
* Do not edit manually, but use "Update Translation Data" action on web site to update from
* Localizer and from data-i18n tags on web pages.
......@@ -41,10 +53,10 @@ return """/**
// @ts-ignore
window.translation_data = %s;
}(window));
""" % ("\n ".join(
""" % (u"\n ".join(
json.dumps(
tmp,
sort_keys=True,
indent=2,
ensure_ascii=False,
separators=(',', ': ')).splitlines()))
separators=(',', ': ')).splitlines()))).encode('utf-8')
# coding: utf-8
##############################################################################
#
# Copyright (c) 2018 Nexedi SA and Contributors. All Rights Reserved.
......@@ -27,12 +28,11 @@
import textwrap
import time
from Products.ERP5Type.tests.utils import createZODBPythonScript
from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase
class TestRenderJSUpgrade(ERP5TypeTestCase):
"""Test Upgrader scripts for renderjs UI.
"""
class RenderJSUpgradeTestCase(ERP5TypeTestCase):
def afterSetUp(self):
self.login()
self.web_site = self.portal.web_site_module.newContent(
......@@ -57,6 +57,10 @@ class TestRenderJSUpgrade(ERP5TypeTestCase):
reference='{}.js'.format(self.id()))
self.javascript.publish()
class TestRenderJSUpgrade(RenderJSUpgradeTestCase):
"""Test Upgrader scripts for renderjs UI.
"""
def test_upgrade_empty_site(self):
self.assertEqual([], self.web_site.checkConsistency())
self.assertEqual([], self.web_site.fixConsistency())
......@@ -179,3 +183,178 @@ class TestRenderJSUpgrade(ERP5TypeTestCase):
self.assertGreater(
test_upgrade_site_translation_data_js.getModificationDate(),
test_upgrade_site_translation_data_js_modification_date)
class TestRenderUpdateTranslationData(RenderJSUpgradeTestCase):
  """Tests for "Update Translation Data" utilities on RJS web sites.
  """
  def afterSetUp(self):
    """Configure the test web site for translation data extraction.

    On top of the parent set up, this creates and publishes a translation
    data web script and a translation gadget web page loading that script,
    sets the gadget as `configuration_translation_gadget_url`, makes `en`
    and `fa` the available languages and registers a precache manifest
    script listing the files to extract messages from.
    """
    super(TestRenderUpdateTranslationData, self).afterSetUp()
    # create a translation script for this web site
    self.web_script_translation_data_js = self.portal.web_page_module.newContent(
        portal_type='Web Script',
        # the convention is that this script has suffix translation_data.js
        reference='{}_translation_data.js'.format(self.id()),
        text_content='// will be filled',
    )
    self.web_script_translation_data_js.publish()

    self.web_page_translation_gadget = self.portal.web_page_module.newContent(
        portal_type='Web Page',
        reference='{}.html'.format(self.id()),
        content_type='text/html',
        text_content=textwrap.dedent('''
          <!DOCTYPE html>
          <html>
          <head>
          <meta charset="utf-8" />
          <meta name="viewport" content="width=device-width" />
          <title>Translation Gadget</title>
          <link rel="http://www.renderjs.org/rel/interface" href="interface_translation.html">
          <!-- renderjs -->
          <script src="rsvp.js" type="text/javascript"></script>
          <script src="renderjs.js" type="text/javascript"></script>
          <!-- custom script -->
          <script src="{translation_data_js_reference}" type="text/javascript"></script>
          <script src="gadget_translation.js" type="text/javascript"></script>
          </head>
          <body>
          </body>
          </html>
        ''').format(translation_data_js_reference=self.web_script_translation_data_js.getReference()),
    )
    self.web_page_translation_gadget.publish()

    self.web_site.setProperty(
        'configuration_translation_gadget_url',
        self.web_page_translation_gadget.getReference()
    )
    self.web_site.setAvailableLanguageList(['en', 'fa'])

    # add a manifest to list web pages to extract messages from
    createZODBPythonScript(
        self.portal.portal_skins.custom,
        'WebSection_getTestPrecacheManifestList',
        '',
        textwrap.dedent('''
          return [
          'test_gadget_with_translation.html',
          'test_gadget_with_translation.js',
          'test_portal_skins_gadget.html',
          ]
        '''))
    self.web_site.setProperty(
        'configuration_precache_manifest_script_list',
        'WebSection_getTestPrecacheManifestList')
    self.tic()

  def beforeTearDown(self):
    """Delete the web page with the fixed id that some tests create."""
    super(TestRenderUpdateTranslationData, self).beforeTearDown()
    if 'test_gadget_with_translation_html' in self.portal.web_page_module.objectIds():
      self.portal.web_page_module.manage_delObjects(ids=['test_gadget_with_translation_html'])
    self.tic()

  def test_WebSite_getTranslationDataWebScriptValue(self):
    """The translation data script created in afterSetUp is returned."""
    self.assertEqual(
        self.web_site.WebSite_getTranslationDataWebScriptValue(),
        self.web_script_translation_data_js)

  def test_Base_getTranslationSourceFileList(self):
    """Manifest entries are listed, `only_html` leaves out the `.js` entry."""
    self.assertIn(
        'test_gadget_with_translation.html',
        self.web_site.Base_getTranslationSourceFileList())
    self.assertIn(
        'test_gadget_with_translation.js',
        self.web_site.Base_getTranslationSourceFileList())
    self.assertIn(
        'test_gadget_with_translation.html',
        self.web_site.Base_getTranslationSourceFileList(only_html=True))
    self.assertNotIn(
        'test_gadget_with_translation.js',
        self.web_site.Base_getTranslationSourceFileList(only_html=True))

  def test_WebSite_getTranslationDataTextContent_extract_from_web_page(self):
    """Messages of a published web page are in the translation data.

    The page has messages in comments (bare, quoted, partially quoted and
    empty), in attributes (plain, with a `[target]` prefix, with braces,
    with square brackets, with markup, empty and non ASCII) and in the
    content of a script.
    """
    self.portal.web_page_module.newContent(
        portal_type='Web Page',
        id='test_gadget_with_translation_html',
        reference='test_gadget_with_translation.html',
        text_content=textwrap.dedent('''
          <html>
          <!--
          data-i18n=Message in comments
          data-i18n="Quoted message in comments"
          data-i18n=Message with "some parts" 'quoted' in comments
          data-i18n=
          -->
          <h1 data-i18n="Message in attributes">Message in attributes</h1>
          <input type="submit" data-i18n="[value]Message for attribute" value="Message for attribute"></input>
          <h1 data-i18n="Message with {substitution}">Message with {substitution}</h1>
          <h1 data-i18n="Message with [square brackets]">Message with [square brackets]</h1>
          <div data-i18n="[html]Message in <a href='link'>HTML</a>">
          </div>
          <div data-i18n="">Empty data-i18n</div>
          <script>
          <span data-i18n="Message in script attributes">Message in script attributes</span>
          </script>
          <div data-i18n="メッサージュ"></div>
        ''')
    ).publish()
    self.tic()

    translation_data_text_content = self.web_site.WebSite_getTranslationDataTextContent()
    self.assertIn('"Message in comments":', translation_data_text_content)
    self.assertIn('"Quoted message in comments":', translation_data_text_content)
    self.assertIn('"Message with \\"some parts\\" \'quoted\' in comments":', translation_data_text_content)
    self.assertIn('"Message in attributes":', translation_data_text_content)
    self.assertIn('"Message for attribute":', translation_data_text_content)
    self.assertIn('"Message with {substitution}":', translation_data_text_content)
    self.assertIn('"Message with [square brackets]":', translation_data_text_content)
    self.assertIn('"Message in <a href=\'link\'>HTML</a>":', translation_data_text_content)
    self.assertIn('"Message in script attributes":', translation_data_text_content)
    self.assertIn('"メッサージュ":', translation_data_text_content)

  def test_WebSite_getTranslationDataTextContent_extract_from_page_template(self):
    """Messages of a page template in a skin folder are in the translation data."""
    self.portal.portal_skins.custom.manage_addProduct['PageTemplates'].manage_addPageTemplate(
        'test_portal_skins_gadget.html',
        text=textwrap.dedent('''
          <html>
          <!--
          data-i18n=Message from page template
          -->
          </html>'''))
    self.portal.changeSkin(None) # refresh skin cache

    translation_data_text_content = self.web_site.WebSite_getTranslationDataTextContent()
    self.assertIn('"Message from page template":', translation_data_text_content)

  def test_WebSite_getTranslationDataTextContent_ignore_draft_web_page(self):
    """Messages of a web page which was never published are not used."""
    self.portal.web_page_module.newContent(
        portal_type='Web Page',
        id='test_gadget_with_translation_html',
        reference='test_gadget_with_translation.html',
        text_content=textwrap.dedent('''
          <html>
          <!--
          data-i18n=Message in draft web page
          -->
          </html>'''))
    # Like the other web page tests, process what is pending before
    # extracting. Presumably this makes sure the page is skipped because
    # it is a draft, and not only because it was not indexed yet.
    self.tic()

    translation_data_text_content = self.web_site.WebSite_getTranslationDataTextContent()
    self.assertNotIn('"Message in draft web page":', translation_data_text_content)

  def test_WebSite_getTranslationDataTextContent_ignore_archived_web_page(self):
    """Messages of a web page published and then archived are not used."""
    web_page = self.portal.web_page_module.newContent(
        portal_type='Web Page',
        id='test_gadget_with_translation_html',
        reference='test_gadget_with_translation.html',
        text_content=textwrap.dedent('''
          <html>
          <!--
          data-i18n=Message in archived web page
          -->
          </html>'''))
    web_page.publish()
    web_page.archive()
    self.tic()

    translation_data_text_content = self.web_site.WebSite_getTranslationDataTextContent()
    self.assertNotIn('"Message in archived web page":', translation_data_text_content)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment