Commit 923baea9 authored by Serhiy Storchaka

Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote().

parent 2556c838
...@@ -28,6 +28,7 @@ import os ...@@ -28,6 +28,7 @@ import os
import time import time
import sys import sys
import base64 import base64
import re
from urlparse import urljoin as basejoin from urlparse import urljoin as basejoin
...@@ -1198,22 +1199,35 @@ def splitvalue(attr): ...@@ -1198,22 +1199,35 @@ def splitvalue(attr):
_hexdig = '0123456789ABCDEFabcdef' _hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a + b, chr(int(a + b, 16))) _hextochr = dict((a + b, chr(int(a + b, 16)))
for a in _hexdig for b in _hexdig) for a in _hexdig for b in _hexdig)
_asciire = re.compile('([\x00-\x7f]+)')
def unquote(s): def unquote(s):
"""unquote('abc%20def') -> 'abc def'.""" """unquote('abc%20def') -> 'abc def'."""
res = s.split('%') if _is_unicode(s):
if '%' not in s:
return s
bits = _asciire.split(s)
res = [bits[0]]
append = res.append
for i in range(1, len(bits), 2):
append(unquote(str(bits[i])).decode('latin1'))
append(bits[i + 1])
return ''.join(res)
bits = s.split('%')
# fastpath # fastpath
if len(res) == 1: if len(bits) == 1:
return s return s
s = res[0] res = [bits[0]]
for item in res[1:]: append = res.append
for item in bits[1:]:
try: try:
s += _hextochr[item[:2]] + item[2:] append(_hextochr[item[:2]])
append(item[2:])
except KeyError: except KeyError:
s += '%' + item append('%')
except UnicodeDecodeError: append(item)
s += unichr(int(item[:2], 16)) + item[2:] return ''.join(res)
return s
def unquote_plus(s): def unquote_plus(s):
"""unquote('%7e/abc+def') -> '~/abc def'""" """unquote('%7e/abc+def') -> '~/abc def'"""
......
...@@ -28,6 +28,8 @@ test_urlparse.py provides a good indicator of parsing behavior. ...@@ -28,6 +28,8 @@ test_urlparse.py provides a good indicator of parsing behavior.
""" """
import re
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
"urlsplit", "urlunsplit", "parse_qs", "parse_qsl"] "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
...@@ -311,6 +313,15 @@ def urldefrag(url): ...@@ -311,6 +313,15 @@ def urldefrag(url):
else: else:
return url, '' return url, ''
try:
unicode
except NameError:
def _is_unicode(x):
return 0
else:
def _is_unicode(x):
return isinstance(x, unicode)
# unquote method for parse_qs and parse_qsl # unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference # Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin). If you update this function, # because urllib uses urlparse methods (urljoin). If you update this function,
...@@ -319,22 +330,35 @@ def urldefrag(url): ...@@ -319,22 +330,35 @@ def urldefrag(url):
_hexdig = '0123456789ABCDEFabcdef' _hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a+b, chr(int(a+b,16))) _hextochr = dict((a+b, chr(int(a+b,16)))
for a in _hexdig for b in _hexdig) for a in _hexdig for b in _hexdig)
_asciire = re.compile('([\x00-\x7f]+)')
def unquote(s): def unquote(s):
"""unquote('abc%20def') -> 'abc def'.""" """unquote('abc%20def') -> 'abc def'."""
res = s.split('%') if _is_unicode(s):
if '%' not in s:
return s
bits = _asciire.split(s)
res = [bits[0]]
append = res.append
for i in range(1, len(bits), 2):
append(unquote(str(bits[i])).decode('latin1'))
append(bits[i + 1])
return ''.join(res)
bits = s.split('%')
# fastpath # fastpath
if len(res) == 1: if len(bits) == 1:
return s return s
s = res[0] res = [bits[0]]
for item in res[1:]: append = res.append
for item in bits[1:]:
try: try:
s += _hextochr[item[:2]] + item[2:] append(_hextochr[item[:2]])
append(item[2:])
except KeyError: except KeyError:
s += '%' + item append('%')
except UnicodeDecodeError: append(item)
s += unichr(int(item[:2], 16)) + item[2:] return ''.join(res)
return s
def parse_qs(qs, keep_blank_values=0, strict_parsing=0): def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
"""Parse a query given as a string argument. """Parse a query given as a string argument.
......
...@@ -214,6 +214,8 @@ Core and Builtins ...@@ -214,6 +214,8 @@ Core and Builtins
Library Library
------- -------
- Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote().
- Issue #17368: Fix an off-by-one error in the Python JSON decoder that caused - Issue #17368: Fix an off-by-one error in the Python JSON decoder that caused
a failure while decoding empty object literals when object_pairs_hook was a failure while decoding empty object literals when object_pairs_hook was
specified. specified.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment