Commit 86af2dbb authored by Kazuhiko Shiozaki

reapply r42160 "try to parse latin-1 encoded url (even though that is invalid...

Reapply r42160 "try to parse latin-1 encoded url (even though that is invalid according to RFC 3986)." with a modification to testWebCrawler.py. Tested on both Zope-2.8 and Zope-2.12.


git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@42362 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 4e0ca62c
......@@ -205,7 +205,7 @@ class TestWebCrawler(ERP5TypeTestCase):
Funny link</a></p>
<p><a href="http://www.example.com/section">Internal link</a></p>
<p><a href="section2">Relative Internal link</a></p>
<p><a href="http://www.example.com/?title=%E9+crit">With Encoding issue
<p><a href="http://www.example.com/?title=%E9crit">With Encoding issue
This link will be discarded</a></p>
<img src="my_image_link"/>
<script src="should_not_be_followed.js"/>
......@@ -217,7 +217,8 @@ class TestWebCrawler(ERP5TypeTestCase):
self.assertEquals(web_page.getContentNormalisedURLList(),
["http://www.example.com/I%20don't%20care%20I%20put%20what/%20I%20want/",
'http://www.example.com/section',
'http://www.example.com/section2',])
'http://www.example.com/section2',
'http://www.example.com/?title=\xc3\xa9crit',])
# relative links without base tag
text_content = """<html>
<head>
......
......@@ -3187,6 +3187,7 @@ class ScalarMaxConflictResolver(persistent.Persistent):
###################
# URL Normaliser #
###################
from Products.PythonScripts.standard import url_unquote
try:
import urlnorm
except ImportError:
......@@ -3258,6 +3259,11 @@ def urlnormNormaliseUrl(url, base_url=None):
"""
try:
url = urlnorm.norm(url)
except UnicodeDecodeError:
try:
url = urlnorm.norm(url_unquote(url).decode('latin1'))
except UnicodeDecodeError:
raise urlnorm.InvalidUrl
except (AttributeError, urlnorm.InvalidUrl):
# This url is not valid, a better Exception will
# be raised
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment