From 4111f81cb99c8b53b44d0c7fffacb4854cccd58e Mon Sep 17 00:00:00 2001 From: Julien Muchembled <jm@nexedi.com> Date: Fri, 24 Dec 2010 15:52:21 +0000 Subject: [PATCH] Make Contribution Tool accept non-conformant %-escaped URL (or unescaped URL) This fixes TestWebCrawler.test_02_crawlWebSite git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@41759 20353a03-c40f-0410-a6d1-a30d3c3de9de --- product/ERP5/Tool/ContributionTool.py | 6 ++---- product/ERP5Type/Utils.py | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/product/ERP5/Tool/ContributionTool.py b/product/ERP5/Tool/ContributionTool.py index f8f161ec78..c444c0860c 100644 --- a/product/ERP5/Tool/ContributionTool.py +++ b/product/ERP5/Tool/ContributionTool.py @@ -40,6 +40,7 @@ from Products.ERP5Type.Globals import InitializeClass, DTMLFile from Products.CMFCore.utils import _checkPermission from Products.ERP5Type.Tool.BaseTool import BaseTool from Products.ERP5Type import Permissions +from Products.ERP5Type.Utils import reencodeUrlEscapes from Products.ERP5 import _dtmldir from Products.ERP5.Document.Url import no_crawl_protocol_list from AccessControl import Unauthorized @@ -661,10 +662,7 @@ class ContributionTool(BaseTool): return file_object, filename, content_type tuple """ # Quote path part of url - url_tuple = urlparse.urlsplit(url) - quoted_path = urllib.quote(url_tuple[2]) - url = urlparse.urlunsplit((url_tuple[0], url_tuple[1], quoted_path, - url_tuple[3], url_tuple[4])) + url = reencodeUrlEscapes(url) # build a new file from the url url_file = urllib2.urlopen(urllib2.Request(url, headers={'Accept':'*/*'})) diff --git a/product/ERP5Type/Utils.py b/product/ERP5Type/Utils.py index 116dbbd241..8330c09079 100644 --- a/product/ERP5Type/Utils.py +++ b/product/ERP5Type/Utils.py @@ -3304,3 +3304,28 @@ def guessEncodingFromText(data, content_type='text/html'): raise NotImplementedError, 'No encoding detector found.'\ ' You must install chardet and python-magic' +_reencodeUrlEscapes_map = dict((chr(x), chr(x) in (# safe + "!'()*-." "0123456789" "_~" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + # reserved (maybe unsafe) + "#$&+,/:;=?@[]") + and chr(x) or "%%%02X" % x) + for x in xrange(256)) +def reencodeUrlEscapes(url): + """Fix a non-conformant %-escaped URL (or quote an unescaped one) + + This is a Python reimplementation of 'reencode_escapes' function of Wget 1.12 + """ + from string import hexdigits + next_part = iter(url.split('%')).next + url = [_reencodeUrlEscapes_map[c] for c in next_part()] + try: + while True: + part = next_part() + url.append('%') + if len(part) < 2 or not (part[0] in hexdigits and part[1] in hexdigits): + url.append('25') + url += [_reencodeUrlEscapes_map[c] for c in part] + except StopIteration: + return ''.join(url) -- 2.30.9