Commit ac751a56 authored by Kirill Smelkov

strconv: Optimize quoting lightly

Add type annotations and use C-level objects instead of Python ones where it
is easy to do. This is not fully optimized yet, but it already brings a noticeable speedup:

    name                 old time/op  new time/op  delta
    quote[a]              786µs ± 1%    10µs ± 0%  -98.76%  (p=0.016 n=4+5)
    quote[\u03b1]        1.12ms ± 0%  0.41ms ± 0%  -63.37%  (p=0.008 n=5+5)
    quote[\u65e5]         738µs ± 2%   258µs ± 0%  -65.07%  (p=0.016 n=4+5)
    quote[\U0001f64f]     920µs ± 1%    78µs ± 0%  -91.46%  (p=0.016 n=5+4)
    stdquote             1.19µs ± 0%  1.19µs ± 0%     ~     (p=0.794 n=5+5)
    unquote[a]           1.08ms ± 0%  1.08ms ± 1%     ~     (p=0.548 n=5+5)
    unquote[\u03b1]       797µs ± 0%   807µs ± 1%   +1.23%  (p=0.008 n=5+5)
    unquote[\u65e5]       522µs ± 0%   520µs ± 1%     ~     (p=0.056 n=5+5)
    unquote[\U0001f64f]  3.21ms ± 0%  3.14ms ± 0%   -2.13%  (p=0.008 n=5+5)
    stdunquote            815ns ± 0%   836ns ± 0%   +2.63%  (p=0.008 n=5+5)
parent 533bd30a
......@@ -1056,19 +1056,15 @@ _bstrustr_remove_unsupported_slots()
#
# NOTE the return type is str type of current python, so that quoted result
# could be directly used in __repr__ or __str__ implementation.
cdef _bpysmartquote_u3b2(s): # -> (unicode(py3)|bytes(py2), nonascii_escape)
# TODO change to `const byte[::1] s` after strconv._quote is moved to pyx
if isinstance(s, bytearray):
s = _bytearray_data(s)
assert isinstance(s, bytes), s
cdef _bpysmartquote_u3b2(const byte[::1] s): # -> (unicode(py3)|bytes(py2), nonascii_escape)
# smartquotes: choose ' or " as quoting character exactly the same way python does
# https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L905-L909
quote = b"'"
if (quote in s) and (b'"' not in s):
quote = b'"'
cdef byte quote = ord("'")
if (quote in s) and (ord('"') not in s):
quote = ord('"')
x, nonascii_escape = strconv._quote(s, quote) # raw bytes
cdef bint nonascii_escape
x = strconv._quote(s, quote, &nonascii_escape) # raw bytes
if PY_MAJOR_VERSION < 3:
return x, nonascii_escape
else:
......
......@@ -23,4 +23,4 @@
from golang cimport byte
cpdef pyquote(s)
cdef _quote(s, quote) # -> (quoted, nonascii_escape)
cdef bytes _quote(const byte[::1] s, char quote, bint* out_nonascii_escape) # -> (quoted, nonascii_escape)
......@@ -23,49 +23,82 @@
from __future__ import print_function, absolute_import
import unicodedata, codecs
from six.moves import range as xrange
from golang cimport pyb
from golang cimport pyb, byte, rune
from golang cimport _utf8_decode_rune, _xunichr
from golang.unicode cimport utf8
from cpython cimport PyObject
cdef extern from "Python.h":
PyObject* PyBytes_FromStringAndSize(char*, Py_ssize_t) except NULL
char* PyBytes_AS_STRING(PyObject*)
int _PyBytes_Resize(PyObject**, Py_ssize_t) except -1
void Py_DECREF(PyObject*)
# quote quotes unicode|bytes string into valid "..." bytestring always quoted with ".
cpdef pyquote(s): # -> bstr
q, _ = _quote(pyb(s), b'"')
cdef bint _
q = _quote(pyb(s), '"', &_)
return pyb(q)
cdef _quote(s, quote): # -> (quoted, nonascii_escape)
assert isinstance(s, bytes), type(s)
assert isinstance(quote, bytes), type(quote)
assert len(quote) == 1, repr(quote)
outv = []
emit = outv.append
nonascii_escape = False
i = 0
cdef char[16] hexdigit # = '0123456789abcdef'
for i, c in enumerate('0123456789abcdef'):
hexdigit[i] = ord(c)
# XXX not possible to use `except (NULL, False)`
# (https://stackoverflow.com/a/66335433/9456786)
cdef bytes _quote(const byte[::1] s, char quote, bint* out_nonascii_escape): # -> (quoted, nonascii_escape)
# 2*" + max(4)*each byte (+ 1 for tail \0 implicitly by PyBytesObject)
cdef Py_ssize_t qmaxsize = 1 + 4*len(s) + 1
cdef PyObject* qout = PyBytes_FromStringAndSize(NULL, qmaxsize)
cdef byte* q = <byte*>PyBytes_AS_STRING(qout)
cdef bint nonascii_escape = False
cdef Py_ssize_t i = 0, j
cdef Py_ssize_t isize
cdef int size
cdef rune r
cdef byte c
q[0] = quote; q += 1
while i < len(s):
c = s[i:i+1]
c = s[i]
# fast path - ASCII only
if ord(c) < 0x80:
if c in (b'\\', quote):
emit(b'\\'+c)
if c < 0x80:
if c in (ord('\\'), quote):
q[0] = ord('\\')
q[1] = c
q += 2
# printable ASCII
elif b' ' <= c <= b'\x7e':
emit(c)
elif 0x20 <= c <= 0x7e:
q[0] = c
q += 1
# non-printable ASCII
elif c == b'\t':
emit(br'\t')
elif c == b'\n':
emit(br'\n')
elif c == b'\r':
emit(br'\r')
elif c == ord('\t'):
q[0] = ord('\\')
q[1] = ord('t')
q += 2
elif c == ord('\n'):
q[0] = ord('\\')
q[1] = ord('n')
q += 2
elif c == ord('\r'):
q[0] = ord('\\')
q[1] = ord('r')
q += 2
# everything else is non-printable
else:
emit(br'\x%02x' % ord(c))
q[0] = ord('\\')
q[1] = ord('x')
q[2] = hexdigit[c >> 4]
q[3] = hexdigit[c & 0xf]
q += 4
i += 1
......@@ -77,21 +110,41 @@ cdef _quote(s, quote): # -> (quoted, nonascii_escape)
# decode error - just emit raw byte as escaped
if r == utf8.RuneError and size == 1:
nonascii_escape = True
emit(br'\x%02x' % ord(c))
q[0] = ord('\\')
q[1] = ord('x')
q[2] = hexdigit[c >> 4]
q[3] = hexdigit[c & 0xf]
q += 4
# printable utf-8 characters go as is
elif unicodedata.category(_xunichr(r))[0] in _printable_cat0:
emit(s[i:isize])
elif _unicodedata_category(_xunichr(r))[0] in 'LNPS': # letters, numbers, punctuation, symbols
for j in range(i, isize):
q[0] = s[j]
q += 1
# everything else goes in numeric byte escapes
else:
nonascii_escape = True
for j in xrange(i, isize):
emit(br'\x%02x' % ord(s[j:j+1]))
for j in range(i, isize):
c = s[j]
q[0] = ord('\\')
q[1] = ord('x')
q[2] = hexdigit[c >> 4]
q[3] = hexdigit[c & 0xf]
q += 4
i = isize
return (quote + b''.join(outv) + quote, nonascii_escape)
q[0] = quote; q += 1
q[0] = 0; # don't q++ at last because size does not include tail \0
cdef Py_ssize_t qsize = (q - <byte*>PyBytes_AS_STRING(qout))
assert qsize <= qmaxsize
_PyBytes_Resize(&qout, qsize)
bqout = <bytes>qout
Py_DECREF(qout)
out_nonascii_escape[0] = nonascii_escape
return bqout
# unquote decodes "-quoted unicode|byte string.
......@@ -181,4 +234,4 @@ cdef _unquote_next(s):
return b''.join(outv), s
_printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols
cdef _unicodedata_category = unicodedata.category
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment