Commit 86af2dbb authored by Kazuhiko Shiozaki

reapply r42160 "try to parse latin-1 encoded url (even though that is invalid...

Reapply r42160 "try to parse latin-1 encoded url (even though that is invalid according to RFC 3986)." with a modification to testWebCrawler.py. Tested on both Zope-2.8 and Zope-2.12.


git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@42362 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 4e0ca62c
...@@ -205,7 +205,7 @@ class TestWebCrawler(ERP5TypeTestCase): ...@@ -205,7 +205,7 @@ class TestWebCrawler(ERP5TypeTestCase):
Funny link</a></p> Funny link</a></p>
<p><a href="http://www.example.com/section">Internal link</a></p> <p><a href="http://www.example.com/section">Internal link</a></p>
<p><a href="section2">Relative Internal link</a></p> <p><a href="section2">Relative Internal link</a></p>
<p><a href="http://www.example.com/?title=%E9+crit">With Encoding issue <p><a href="http://www.example.com/?title=%E9crit">With Encoding issue
This link will be discarded</a></p> This link will be discarded</a></p>
<img src="my_image_link"/> <img src="my_image_link"/>
<script src="should_not_be_followed.js"/> <script src="should_not_be_followed.js"/>
...@@ -217,7 +217,8 @@ class TestWebCrawler(ERP5TypeTestCase): ...@@ -217,7 +217,8 @@ class TestWebCrawler(ERP5TypeTestCase):
self.assertEquals(web_page.getContentNormalisedURLList(), self.assertEquals(web_page.getContentNormalisedURLList(),
["http://www.example.com/I%20don't%20care%20I%20put%20what/%20I%20want/", ["http://www.example.com/I%20don't%20care%20I%20put%20what/%20I%20want/",
'http://www.example.com/section', 'http://www.example.com/section',
'http://www.example.com/section2',]) 'http://www.example.com/section2',
'http://www.example.com/?title=\xc3\xa9crit',])
# relative links without base tag # relative links without base tag
text_content = """<html> text_content = """<html>
<head> <head>
......
...@@ -3187,6 +3187,7 @@ class ScalarMaxConflictResolver(persistent.Persistent): ...@@ -3187,6 +3187,7 @@ class ScalarMaxConflictResolver(persistent.Persistent):
################### ###################
# URL Normaliser # # URL Normaliser #
################### ###################
from Products.PythonScripts.standard import url_unquote
try: try:
import urlnorm import urlnorm
except ImportError: except ImportError:
...@@ -3258,6 +3259,11 @@ def urlnormNormaliseUrl(url, base_url=None): ...@@ -3258,6 +3259,11 @@ def urlnormNormaliseUrl(url, base_url=None):
""" """
try: try:
url = urlnorm.norm(url) url = urlnorm.norm(url)
except UnicodeDecodeError:
try:
url = urlnorm.norm(url_unquote(url).decode('latin1'))
except UnicodeDecodeError:
raise urlnorm.InvalidUrl
except (AttributeError, urlnorm.InvalidUrl): except (AttributeError, urlnorm.InvalidUrl):
# This url is not valid, a better Exception will # This url is not valid, a better Exception will
# be raised # be raised
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment