Commit 8ea4616f authored by Serhiy Storchaka

Issue #1285086: Get rid of the refcounting hack and speed up
urllib.parse.unquote() and urllib.parse.unquote_to_bytes().
parent 3b220e11
...@@ -27,6 +27,7 @@ parsing quirks from older RFCs are retained. The testcases in ...@@ -27,6 +27,7 @@ parsing quirks from older RFCs are retained. The testcases in
test_urlparse.py provides a good indicator of parsing behavior. test_urlparse.py provides a good indicator of parsing behavior.
""" """
import re
import sys import sys
import collections import collections
...@@ -470,6 +471,10 @@ def urldefrag(url): ...@@ -470,6 +471,10 @@ def urldefrag(url):
defrag = url defrag = url
return _coerce_result(DefragResult(defrag, frag)) return _coerce_result(DefragResult(defrag, frag))
_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])
for a in _hexdig for b in _hexdig}
def unquote_to_bytes(string): def unquote_to_bytes(string):
"""unquote_to_bytes('abc%20def') -> b'abc def'.""" """unquote_to_bytes('abc%20def') -> b'abc def'."""
# Note: strings are encoded as UTF-8. This is only an issue if it contains # Note: strings are encoded as UTF-8. This is only an issue if it contains
...@@ -480,16 +485,21 @@ def unquote_to_bytes(string): ...@@ -480,16 +485,21 @@ def unquote_to_bytes(string):
return b'' return b''
if isinstance(string, str): if isinstance(string, str):
string = string.encode('utf-8') string = string.encode('utf-8')
res = string.split(b'%') bits = string.split(b'%')
if len(res) == 1: if len(bits) == 1:
return string return string
string = res[0] res = [bits[0]]
for item in res[1:]: append = res.append
for item in bits[1:]:
try: try:
string += bytes([int(item[:2], 16)]) + item[2:] append(_hextobyte[item[:2]])
except ValueError: append(item[2:])
string += b'%' + item except KeyError:
return string append(b'%')
append(item)
return b''.join(res)
_asciire = re.compile('([\x00-\x7f]+)')
def unquote(string, encoding='utf-8', errors='replace'): def unquote(string, encoding='utf-8', errors='replace'):
"""Replace %xx escapes by their single-character equivalent. The optional """Replace %xx escapes by their single-character equivalent. The optional
...@@ -501,39 +511,20 @@ def unquote(string, encoding='utf-8', errors='replace'): ...@@ -501,39 +511,20 @@ def unquote(string, encoding='utf-8', errors='replace'):
unquote('abc%20def') -> 'abc def'. unquote('abc%20def') -> 'abc def'.
""" """
if string == '': if '%' not in string:
return string string.split
res = string.split('%')
if len(res) == 1:
return string return string
if encoding is None: if encoding is None:
encoding = 'utf-8' encoding = 'utf-8'
if errors is None: if errors is None:
errors = 'replace' errors = 'replace'
# pct_sequence: contiguous sequence of percent-encoded bytes, decoded bits = _asciire.split(string)
pct_sequence = b'' res = [bits[0]]
string = res[0] append = res.append
for item in res[1:]: for i in range(1, len(bits), 2):
try: append(unquote_to_bytes(bits[i]).decode(encoding, errors))
if not item: append(bits[i + 1])
raise ValueError return ''.join(res)
pct_sequence += bytes.fromhex(item[:2])
rest = item[2:]
if not rest:
# This segment was just a single percent-encoded character.
# May be part of a sequence of code units, so delay decoding.
# (Stored in pct_sequence).
continue
except ValueError:
rest = '%' + item
# Encountered non-percent-encoded characters. Flush the current
# pct_sequence.
string += pct_sequence.decode(encoding, errors) + rest
pct_sequence = b''
if pct_sequence:
# Flush the final pct_sequence
string += pct_sequence.decode(encoding, errors)
return string
def parse_qs(qs, keep_blank_values=False, strict_parsing=False, def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace'): encoding='utf-8', errors='replace'):
......
...@@ -233,6 +233,9 @@ Core and Builtins ...@@ -233,6 +233,9 @@ Core and Builtins
Library Library
------- -------
- Issue #1285086: Get rid of the refcounting hack and speed up
urllib.parse.unquote() and urllib.parse.unquote_to_bytes().
- Issue #17368: Fix an off-by-one error in the Python JSON decoder that caused - Issue #17368: Fix an off-by-one error in the Python JSON decoder that caused
a failure while decoding empty object literals when object_pairs_hook was a failure while decoding empty object literals when object_pairs_hook was
specified. specified.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment