From 4111f81cb99c8b53b44d0c7fffacb4854cccd58e Mon Sep 17 00:00:00 2001
From: Julien Muchembled <jm@nexedi.com>
Date: Fri, 24 Dec 2010 15:52:21 +0000
Subject: [PATCH] Make Contribution Tool accept non-conformant %-escaped URL
 (or unescaped URL)

This fixes TestWebCrawler.test_02_crawlWebSite

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@41759 20353a03-c40f-0410-a6d1-a30d3c3de9de
---
 product/ERP5/Tool/ContributionTool.py |  6 ++----
 product/ERP5Type/Utils.py             | 25 +++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/product/ERP5/Tool/ContributionTool.py b/product/ERP5/Tool/ContributionTool.py
index f8f161ec78..c444c0860c 100644
--- a/product/ERP5/Tool/ContributionTool.py
+++ b/product/ERP5/Tool/ContributionTool.py
@@ -40,6 +40,7 @@ from Products.ERP5Type.Globals import InitializeClass, DTMLFile
 from Products.CMFCore.utils import _checkPermission
 from Products.ERP5Type.Tool.BaseTool import BaseTool
 from Products.ERP5Type import Permissions
+from Products.ERP5Type.Utils import reencodeUrlEscapes
 from Products.ERP5 import _dtmldir
 from Products.ERP5.Document.Url import no_crawl_protocol_list
 from AccessControl import Unauthorized
@@ -661,10 +662,7 @@ class ContributionTool(BaseTool):
     return file_object, filename, content_type tuple
     """
     # Quote path part of url
-    url_tuple = urlparse.urlsplit(url)
-    quoted_path = urllib.quote(url_tuple[2])
-    url = urlparse.urlunsplit((url_tuple[0], url_tuple[1], quoted_path,
-                               url_tuple[3], url_tuple[4]))
+    url = reencodeUrlEscapes(url)
     # build a new file from the url
     url_file = urllib2.urlopen(urllib2.Request(url,
                                                headers={'Accept':'*/*'}))
diff --git a/product/ERP5Type/Utils.py b/product/ERP5Type/Utils.py
index 116dbbd241..8330c09079 100644
--- a/product/ERP5Type/Utils.py
+++ b/product/ERP5Type/Utils.py
@@ -3304,3 +3304,28 @@ def guessEncodingFromText(data, content_type='text/html'):
     raise NotImplementedError, 'No encoding detector found.'\
                                   ' You must install chardet and python-magic'
 
+_reencodeUrlEscapes_map = dict((chr(x), chr(x) in (# safe: kept as-is
+                                                   "!'()*-." "0123456789" "_~"
+                                                   "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                                                   "abcdefghijklmnopqrstuvwxyz"
+                                                   # reserved (maybe unsafe): also kept as-is
+                                                   "#$&+,/:;=?@[]")
+                                        and chr(x) or "%%%02X" % x)  # any other char -> %XX
+                               for x in xrange(256))  # one entry per 1-char str, chr(0)..chr(255)
+def reencodeUrlEscapes(url):
+  """Fix a non-conformant %-escaped URL (or quote an unescaped one)
+
+  This is a Python reimplementation of 'reencode_escapes' function of Wget 1.12
+  """
+  from string import hexdigits
+  next_part = iter(url.split('%')).next  # every part after the first followed a '%'
+  url = [_reencodeUrlEscapes_map[c] for c in next_part()]  # before first '%'
+  try:
+    while True:
+      part = next_part()
+      url.append('%')  # re-emit the '%' that split() consumed
+      if len(part) < 2 or not (part[0] in hexdigits and part[1] in hexdigits):
+        url.append('25')  # no two hex digits follow: escape the '%' as %25
+      url += [_reencodeUrlEscapes_map[c] for c in part]
+  except StopIteration:  # raised by next_part() once all parts are consumed
+    return ''.join(url)
-- 
2.30.9