Commit e5dbbbca authored by Tristan Cavelier's avatar Tristan Cavelier

erp5_web: fix <a> links not always being made absolute on html embedding

- <a href="c"> was not transformed to an absolute url
- split WebPage_exportAsSingleFile and WebPage_extractReferredObjectDict
each into two scripts, so that html data given as a string can be processed.
+ tests
parent 43435f58
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>Script_magic</string> </key>
<value> <int>3</int> </value>
</item>
<item>
<key> <string>_bind_names</string> </key>
<value>
<object>
<klass>
<global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
</klass>
<tuple/>
<state>
<dictionary>
<item>
<key> <string>_asgns</string> </key>
<value>
<dictionary>
<item>
<key> <string>name_container</string> </key>
<value> <string>container</string> </value>
</item>
<item>
<key> <string>name_context</string> </key>
<value> <string>context</string> </value>
</item>
<item>
<key> <string>name_m_self</string> </key>
<value> <string>script</string> </value>
</item>
<item>
<key> <string>name_subpath</string> </key>
<value> <string>traverse_subpath</string> </value>
</item>
</dictionary>
</value>
</item>
</dictionary>
</state>
</object>
</value>
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>data, allow_script=False, format="embedded_html", base_url=None, site_object_dict=None, title=\'Untitled\'</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>Base_convertHtmlToSingleFile</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
"""
Extract all objects referenced by html components.
`data` is the html to parse.
`allow_tag_list` is the white list of tags to parse.
Default is to allow every tag.
`deny_tag_list` is the black list of tags to parse.
Default is to deny no tag.
`base_url` is the url to use as base url when relative urls are found;
when it is given, the script uses `site_object_dict` for each href.
(Don't forget the ending '/'!)
`site_object_dict` is a dict of (domain, object) used to get the object
corresponding to each absolute url found. By default the dict returned
by `context.ERP5Site_getWebSiteDomainDict()` is used.
"""
from zExceptions import Unauthorized
portal = context.getPortalObject()
# Result of the script, filled while parsing: encoded href -> referred
# object, or None when the href could not be resolved.
href_object_dict = {}
# Only genuine lists/tuples are accepted as tag filters; any other value
# falls back to the permissive default (allow everything, deny nothing).
allow_tag_list = allow_tag_list if isinstance(allow_tag_list, (list, tuple)) else None
deny_tag_list = deny_tag_list if isinstance(deny_tag_list, (list, tuple)) else []
def main(data):
  """Parse the html `data` and return the dict of referred objects.

  A `str` value is decoded as UTF-8 first; any other value is handed to
  `context.Base_parseHtml` unchanged. Every parsed part goes through
  `handleHtmlPart`, which fills `href_object_dict`.
  """
  html = data.decode("utf-8") if isinstance(data, str) else data
  for html_part in context.Base_parseHtml(html):
    handleHtmlPart(html_part)
  return href_object_dict
def handleHtmlTag(tag, attrs):
  """Collect the urls referenced by one html tag.

  `tag` is the tag name and `attrs` a sequence of (name, value) items.
  Tags rejected by `allow_tag_list` / `deny_tag_list` are ignored.
  - <object>: the `data` attribute is passed to `handleHref`
  - <style>: the next data part is scheduled to be parsed by `handleCss`
  - any other tag: `src` and `href` attributes are passed to `handleHref`
  Finally every `style` attribute is parsed by `handleCss`.
  """
  if allow_tag_list is not None and tag not in allow_tag_list:
    return
  if tag in deny_tag_list:
    return
  # NOTE: <base href="..."> is not handled, it should not exist in safe-html
  if tag == "style":
    # for style tags, next data will always be the entire text until </style>
    on_next_data[0] = handleCss
  else:
    url_attribute_name_tuple = ("data",) if tag == "object" else ("src", "href")
    for attribute in attrs:
      if attribute[0] in url_attribute_name_tuple:
        handleHref(attribute[1])
  # inline css
  for attribute in attrs:
    if attribute[0] == "style":
      handleCss(attribute[1])
# One-slot holder for the callback consuming the next "data" part. A list is
# used so that the functions below can replace the callback in place.
on_next_data = [lambda x: x]
def handleHtmlPart(part):
  """Dispatch one parsed html part.

  `part[0]` is the part type.
  - "starttag" / "startendtag": delegate to `handleHtmlTag(part[1], part[2])`
  - "data": when a callback is pending in `on_next_data`, call it with the
    text, clear it and return None; otherwise return the text
  - anything else: return None
  """
  kind = part[0]
  if kind == "starttag" or kind == "startendtag":
    return handleHtmlTag(part[1], part[2])
  if kind != "data":
    return None
  pending_handler = on_next_data[0]
  if pending_handler is None:
    return part[1]
  pending_handler(part[1])
  on_next_data[0] = None
  return None
def handleHref(href):
  """Record in `href_object_dict` the object targeted by `href`.

  Hrefs rejected by `isHrefAUrl` are returned untouched and not recorded.
  Otherwise the href is traversed (methods not allowed) and stored under
  its UTF-8 encoded form; None is stored when traversal raises KeyError
  or Unauthorized.
  """
  # handles "base_url/document_module/id"
  # handles "base_url/R-Document.Reference"
  # handles "base_url/R-Document.Reference/view"
  if isHrefAUrl(href):
    try:
      target = traverseHref(href, allow_method=False)
    except (KeyError, Unauthorized):
      target = None
    href_object_dict[href.encode("utf-8")] = target
    return None
  return href
def handleCss(data):
  """Pass every url found in the css text `data` to `handleHref`.

  Only the parts yielded by `context.Base_parseCssForUrl` whose first item
  is "url" are used; their third item is the href.
  """
  for css_part in context.Base_parseCssForUrl(data):
    if css_part[0] != "url":
      continue
    handleHref(css_part[2])
def isHrefAUrl(href):
  """Tell whether `href` is a url this script should try to resolve.

  Returns True for http(s) urls and for every reference carrying no scheme
  (protocol-relative "//host/...", absolute path "/b", relative path "c",
  empty string). Returns False when `href` starts with any other scheme,
  such as "mailto:", "javascript:" or "svn+ssh:".

  A scheme is recognised following RFC 3986 section 3.1: a letter followed
  by letters, digits, "+", "-" or ".". A letters-only test would wrongly
  treat hrefs like "view-source:..." or "h323:..." as relative urls.
  """
  if href.startswith(("https://", "http://")):
    return True
  scheme, separator, _ = href.partition(":")
  if not separator:
    # no colon at all: plain relative reference
    return True
  if not scheme[:1].isalpha():
    # e.g. "//a.a:8080/" or ":x", the colon does not end a scheme
    return True
  for char in scheme[1:]:
    if not (char.isalnum() or char in "+-."):
      # e.g. "a/b:c", the colon belongs to the path
      return True
  return False
def traverseHref(url, allow_method=True, allow_hash=False):
  """Return the object reached by traversing `url`.

  The starting object and relative path come from `prepareHrefTraverse`.
  When `allow_method` is false and the traversed object has no `getUid`
  attribute, the parent path is traversed instead (the last path segment
  is dropped).
  Exceptions raised by `restrictedTraverse` are not caught here.
  """
  start_object, path = prepareHrefTraverse(url, allow_hash=allow_hash)
  target = start_object.restrictedTraverse(path)
  if allow_method or target is None:
    return target
  try:
    target.getUid()
  except AttributeError:
    # presumably a method or view (e.g. ".../view"): fall back to its parent
    return start_object.restrictedTraverse(path.rpartition("/")[0])
  return target
# Default (domain, object) mapping used to resolve absolute urls.
if site_object_dict is None:
site_object_dict = context.ERP5Site_getWebSiteDomainDict()
# Root object for absolute paths: the value of `context.getWebSiteValue()`
# when that method exists and returns something truthy, else the portal.
# `str` is the getattr fallback because calling it returns "" (falsy).
base_url_root_object = getattr(context, "getWebSiteValue", str)() or portal
base_url_object = context
# Resolve base_url by removing everything after the last slash
# `force_base_url` records that the caller gave an explicit `base_url`.
force_base_url = False
if base_url is not None:
# Only http(s) base urls are accepted.
if base_url.startswith("https://") or base_url.startswith("http://"):
force_base_url = True
else:
raise ValueError("invalid `base_url` argument")
if force_base_url:
# root_url is the scheme and authority part of base_url (its first three
# "/"-separated pieces joined back together).
root_url = "/".join(base_url.split("/", 3)[:3])
if root_url != base_url:
# Drop the last path segment: "https://hel.lo/world/dummy" becomes
# "https://hel.lo/world".
base_url = "/".join(base_url.split("/")[:-1])
else:
# No explicit base url: use the urls of the root object and of `context`.
root_url = base_url_root_object.absolute_url()
base_url = base_url_object.absolute_url()
# base_path is the relative url of `context` with the root object's relative
# url removed from its front; it stays "." when the former does not start
# with the latter.
base_path = "."
if base_url_object.getRelativeUrl().startswith(base_url_root_object.getRelativeUrl()):
base_path = base_url_object.getRelativeUrl()[len(base_url_root_object.getRelativeUrl()):]
if base_path and not base_path.startswith("/"):
base_path = "/" + base_path
# Keyword arguments given to every `context.Base_normalizeUrlPathname` call.
normalize_kw = {"keep_empty": False, "keep_trailing_slash": False}
def prepareHrefTraverse(href, allow_hash=False):
  """Return the (object, relative path) pair to traverse for `href`.

  The query string is always removed, and so is the fragment unless
  `allow_hash` is true.
  - absolute or protocol-relative url: the object is looked up by domain
    in `site_object_dict` (KeyError when the domain is unknown)
  - otherwise, when `force_base_url` is set, the href is first made
    absolute with `root_url` (absolute path) or `base_url` (relative path)
    and processed again
  - otherwise the path is resolved from `base_url_root_object`, relative
    paths being prefixed with `base_path`
  """
  url = href.partition("?")[0]
  if not allow_hash:
    url = url.partition("#")[0]
  if url.startswith(("https://", "http://", "//")):
    # absolute url possibly on other sites
    url_piece_list = url.split("/", 3)
    site_url = "/".join(url_piece_list[:3])
    site_object = site_object_dict[url_piece_list[2]]
    relative_path = url[len(site_url):]
    if relative_path[:1] == "/":
      relative_path = relative_path[1:]
    relative_path = context.Base_normalizeUrlPathname("/" + relative_path, **normalize_kw)[1:]
    return site_object, str(relative_path)
  is_absolute_path = url.startswith("/")
  if force_base_url:
    # make the href absolute, then use site_domain_dict
    prefix = root_url if is_absolute_path else base_url + "/"
    return prepareHrefTraverse(prefix + href, allow_hash=allow_hash)
  if is_absolute_path:
    # absolute path, relative url
    pathname = url
  else:
    # relative path
    pathname = base_path + "/" + url
  return base_url_root_object, str(context.Base_normalizeUrlPathname(pathname, **normalize_kw)[1:])
# Script result: parse `data` and return the filled `href_object_dict`.
return main(data)
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>Script_magic</string> </key>
<value> <int>3</int> </value>
</item>
<item>
<key> <string>_bind_names</string> </key>
<value>
<object>
<klass>
<global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
</klass>
<tuple/>
<state>
<dictionary>
<item>
<key> <string>_asgns</string> </key>
<value>
<dictionary>
<item>
<key> <string>name_container</string> </key>
<value> <string>container</string> </value>
</item>
<item>
<key> <string>name_context</string> </key>
<value> <string>context</string> </value>
</item>
<item>
<key> <string>name_m_self</string> </key>
<value> <string>script</string> </value>
</item>
<item>
<key> <string>name_subpath</string> </key>
<value> <string>traverse_subpath</string> </value>
</item>
</dictionary>
</value>
</item>
</dictionary>
</state>
</object>
</value>
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>data, allow_tag_list=None, deny_tag_list=None, base_url=None, site_object_dict=None</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>Base_extractReferredObjectDictFromHtml</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
......@@ -50,7 +50,7 @@
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>REQUEST=None, allow_script=False, format="embedded_html"</string> </value>
<value> <string>REQUEST=None, allow_script=False, format="embedded_html", base_url=None, site_object_dict=None</string> </value>
</item>
<item>
<key> <string>id</string> </key>
......
from zExceptions import Unauthorized
portal = context.getPortalObject()
"""
Extract all object referenced by html components
href_object_dict = {}
if not isinstance(allow_tag_list, (list, tuple)):
allow_tag_list = None
if not isinstance(deny_tag_list, (list, tuple)):
deny_tag_list = []
see Base_extractReferredObjectDictFromHtml for documentation
"""
def main():
  """Parse the text content of `context` and return the referred objects.

  The text content (fetched with "" as argument) is decoded as UTF-8,
  parsed by `context.Base_parseHtml`, and every part goes through
  `handleHtmlPart`, which fills `href_object_dict`.
  """
  html = context.getTextContent("").decode("utf-8")
  for html_part in context.Base_parseHtml(html):
    handleHtmlPart(html_part)
  return href_object_dict
def handleHtmlTag(tag, attrs):
  """Collect the urls referenced by one html tag.

  `tag` is the tag name and `attrs` a sequence of (name, value) items.
  Tags rejected by `allow_tag_list` / `deny_tag_list` are ignored.
  - <object>: the `data` attribute is passed to `handleHref`
  - <style>: the next data part is scheduled to be parsed by `handleCss`
  - any other tag: `src` and `href` attributes are passed to `handleHref`
  Finally every `style` attribute is parsed by `handleCss`.
  """
  if allow_tag_list is not None and tag not in allow_tag_list:
    return
  if tag in deny_tag_list:
    return
  # NOTE: <base href="..."> is not handled, it should not exist in safe-html
  if tag == "style":
    # for style tags, next data will always be the entire text until </style>
    on_next_data[0] = handleCss
  else:
    url_attribute_name_tuple = ("data",) if tag == "object" else ("src", "href")
    for attribute in attrs:
      if attribute[0] in url_attribute_name_tuple:
        handleHref(attribute[1])
  # inline css
  for attribute in attrs:
    if attribute[0] == "style":
      handleCss(attribute[1])
# One-slot holder for the callback consuming the next "data" part. A list is
# used so that the functions below can replace the callback in place.
on_next_data = [lambda x: x]
def handleHtmlPart(part):
  """Dispatch one parsed html part.

  `part[0]` is the part type.
  - "starttag" / "startendtag": delegate to `handleHtmlTag(part[1], part[2])`
  - "data": when a callback is pending in `on_next_data`, call it with the
    text, clear it and return None; otherwise return the text
  - anything else: return None
  """
  kind = part[0]
  if kind == "starttag" or kind == "startendtag":
    return handleHtmlTag(part[1], part[2])
  if kind != "data":
    return None
  pending_handler = on_next_data[0]
  if pending_handler is None:
    return part[1]
  pending_handler(part[1])
  on_next_data[0] = None
  return None
def handleHref(href):
  """Record in `href_object_dict` the object targeted by `href`.

  Hrefs rejected by `isHrefAUrl` are returned untouched and not recorded.
  Otherwise the href is traversed (methods not allowed) and stored under
  the href itself; None is stored when traversal raises KeyError or
  Unauthorized.
  """
  # handles "base_url/document_module/id"
  # handles "base_url/R-Document.Reference"
  # handles "base_url/R-Document.Reference/view"
  if isHrefAUrl(href):
    try:
      target = traverseHref(href, allow_method=False)
    except (KeyError, Unauthorized):
      target = None
    href_object_dict[href] = target
    return None
  return href
def handleCss(data):
  """Pass every url found in the css text `data` to `handleHref`.

  Only the parts yielded by `context.Base_parseCssForUrl` whose first item
  is "url" are used; their third item is the href.
  """
  for css_part in context.Base_parseCssForUrl(data):
    if css_part[0] != "url":
      continue
    handleHref(css_part[2])
def isHrefAUrl(href):
  """Tell whether `href` is a url this script should try to resolve.

  Returns True for http(s) urls and for every reference carrying no scheme
  (protocol-relative "//host/...", absolute path "/b", relative path "c",
  empty string). Returns False when `href` starts with any other scheme,
  such as "mailto:", "javascript:" or "svn+ssh:".

  The text before the first colon is only a scheme when a colon is
  actually present: testing it without that check rejects plain relative
  hrefs such as "c". A scheme is recognised following RFC 3986 section
  3.1: a letter followed by letters, digits, "+", "-" or ".".
  """
  if href.startswith(("https://", "http://")):
    return True
  scheme, separator, _ = href.partition(":")
  if not separator:
    # no colon at all: plain relative reference
    return True
  if not scheme[:1].isalpha():
    # e.g. "//a.a:8080/" or ":x", the colon does not end a scheme
    return True
  for char in scheme[1:]:
    if not (char.isalnum() or char in "+-."):
      # e.g. "a/b:c", the colon belongs to the path
      return True
  return False
def traverseHref(url, allow_method=True, allow_hash=False):
  """Return the object reached by traversing `url`.

  The starting object and relative path come from `prepareHrefTraverse`.
  When `allow_method` is false and the traversed object has no `getUid`
  attribute, the parent path is traversed instead (the last path segment
  is dropped).
  Exceptions raised by `restrictedTraverse` are not caught here.
  """
  start_object, path = prepareHrefTraverse(url, allow_hash=allow_hash)
  target = start_object.restrictedTraverse(path)
  if allow_method or target is None:
    return target
  try:
    target.getUid()
  except AttributeError:
    # presumably a method or view (e.g. ".../view"): fall back to its parent
    return start_object.restrictedTraverse(path.rpartition("/")[0])
  return target
# (domain, object) mapping used to resolve absolute urls.
site_object_dict = context.ERP5Site_getWebSiteDomainDict()
# Root object for absolute paths: the value of `context.getWebSiteValue()`
# when that method exists and returns something truthy, else the portal.
# `str` is the getattr fallback because calling it returns "" (falsy).
base_url_root_object = getattr(context, "getWebSiteValue", str)() or portal
base_url_object = context
# Despite its name, `base_url` is a path here: the relative url of `context`
# with the root object's relative url removed from its front; it stays "."
# when the former does not start with the latter.
base_url = "."
if base_url_object.getRelativeUrl().startswith(base_url_root_object.getRelativeUrl()):
base_url = base_url_object.getRelativeUrl()[len(base_url_root_object.getRelativeUrl()):]
if base_url and not base_url.startswith("/"):
base_url = "/" + base_url
# Keyword arguments given to every `context.Base_normalizeUrlPathname` call.
normalize_kw = {"keep_empty": False, "keep_trailing_slash": False}
def prepareHrefTraverse(url, allow_hash=False):
  """Return the (object, relative path) pair to traverse for `url`.

  The query string is always removed, and so is the fragment unless
  `allow_hash` is true.
  - absolute or protocol-relative url: the object is looked up by domain
    in `site_object_dict` (KeyError when the domain is unknown)
  - otherwise the path is resolved from `base_url_root_object`, relative
    paths being prefixed with `base_url`
  """
  url = url.partition("?")[0]
  if not allow_hash:
    url = url.partition("#")[0]
  if url.startswith(("https://", "http://", "//")):
    # absolute url possibly on other sites
    url_piece_list = url.split("/", 3)
    site_url = "/".join(url_piece_list[:3])
    site_object = site_object_dict[url_piece_list[2]]
    relative_path = url[len(site_url):]
    if relative_path[:1] == "/":
      relative_path = relative_path[1:]
    relative_path = context.Base_normalizeUrlPathname("/" + relative_path, **normalize_kw)[1:]
    return site_object, str(relative_path)
  if url.startswith("/"):
    # absolute path, relative url
    pathname = url
  else:
    # relative path
    pathname = base_url + "/" + url
  return base_url_root_object, str(context.Base_normalizeUrlPathname(pathname, **normalize_kw)[1:])
# Run the extraction on this document and return the filled `href_object_dict`.
return main()
# Delegate to Base_extractReferredObjectDictFromHtml, giving it the text
# content of this document (fetched with "" as argument, presumably the
# default value) and forwarding the keyword arguments unchanged.
return context.Base_extractReferredObjectDictFromHtml(context.getTextContent(""), **kw)
......@@ -50,7 +50,7 @@
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>allow_tag_list=None, deny_tag_list=None</string> </value>
<value> <string>**kw</string> </value>
</item>
<item>
<key> <string>id</string> </key>
......
......@@ -1162,6 +1162,75 @@ return True
self.assertEqual(htmlmessage.get("Content-Location"), page.absolute_url())
self.assertEqual(quopri.decodestring(htmlmessage.get_payload()), html_data)
def test_WebPageAsEmbeddedHtml_pageWithLink(self):
  """Check the links of a page exported as embedded html are made absolute.

  A protocol-relative link, an absolute-path link and a relative link are
  exported twice: once with the default base url, once with an explicit
  `base_url`.
  """
  # Test init part
  module = self.portal.getDefaultModule(portal_type="Web Page")
  page = module.newContent(portal_type="Web Page")
  source_part_list = [
    "<p>Hello</p>",
    '<a href="//a.a/">aa</a>',
    '<a href="/b">bb</a>',
    '<a href="c">cc</a>',
  ]
  page.edit(text_content="".join(source_part_list))
  # Test part: default base url, links are expanded with portal / page urls
  exported_html = page.WebPage_exportAsSingleFile(format="embedded_html")
  expected_part_list = [
    "<p>Hello</p>",
    '<a href="%s//a.a/">aa</a>' % self.portal.absolute_url().split("/", 1)[0],
    '<a href="%s/b">bb</a>' % self.portal.absolute_url(),
    '<a href="%s/c">cc</a>' % page.absolute_url(),
  ]
  self.assertEqual(exported_html, "".join(expected_part_list))
  # Test part: explicit base url, its last path segment ("dummy") is dropped
  exported_html = page.WebPage_exportAsSingleFile(
    format="embedded_html",
    base_url="https://hel.lo/world/dummy",
  )
  expected_part_list = [
    "<p>Hello</p>",
    '<a href="https://a.a/">aa</a>',
    '<a href="https://hel.lo/b">bb</a>',
    '<a href="https://hel.lo/world/c">cc</a>',
  ]
  self.assertEqual(exported_html, "".join(expected_part_list))
def test_WebPageAsMhtml_pageWithLink(self):
  """Check the links of a page exported as mhtml are made absolute.

  A protocol-relative link, an absolute-path link and a relative link are
  exported twice: once with the default base url, once with an explicit
  `base_url`. The headers of the single html part are checked as well.
  """
  # Test init part
  module = self.portal.getDefaultModule(portal_type="Web Page")
  title = "Hello"
  page = module.newContent(portal_type="Web Page")
  source_part_list = [
    "<p>Hello</p>",
    '<a href="//a.a/">aa</a>',
    '<a href="/b">bb</a>',
    '<a href="c">cc</a>',
  ]
  page.edit(title=title, text_content="".join(source_part_list))
  # Test part: default base url
  mhtml_data = page.WebPage_exportAsSingleFile(format="mhtml")
  message = EmailParser().parsestr(mhtml_data)
  (htmlmessage,) = message.get_payload()
  # should have only one content transfer encoding header
  encoding_header_count = sum(
    1 for header in htmlmessage.keys() if header == "Content-Transfer-Encoding"
  )
  self.assertEqual(encoding_header_count, 1)
  self.assertEqual(
    htmlmessage.get("Content-Transfer-Encoding"),
    "quoted-printable",
  )
  self.assertEqual(htmlmessage.get("Content-Location"), page.absolute_url())
  decoded_html = quopri.decodestring(htmlmessage.get_payload())
  expected_part_list = [
    "<p>Hello</p>",
    '<a href="%s//a.a/">aa</a>' % self.portal.absolute_url().split("/", 1)[0],
    '<a href="%s/b">bb</a>' % self.portal.absolute_url(),
    '<a href="%s/c">cc</a>' % page.absolute_url(),
  ]
  self.assertEqual(decoded_html, "".join(expected_part_list))
  # Test part: explicit base url, its last path segment ("dummy") is dropped
  mhtml_data = page.WebPage_exportAsSingleFile(
    format="mhtml",
    base_url="https://hel.lo/world/dummy",
  )
  message = EmailParser().parsestr(mhtml_data)
  (htmlmessage,) = message.get_payload()
  self.assertEqual(htmlmessage.get("Content-Location"), "https://hel.lo/world")
  decoded_html = quopri.decodestring(htmlmessage.get_payload())
  expected_part_list = [
    "<p>Hello</p>",
    '<a href="https://a.a/">aa</a>',
    '<a href="https://hel.lo/b">bb</a>',
    '<a href="https://hel.lo/world/c">cc</a>',
  ]
  self.assertEqual(decoded_html, "".join(expected_part_list))
def test_WebPageAsEmbeddedHtml_pageWithScript(self):
"""Test convert one html page with script to embedded html file"""
# Test init part
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment