strconv: Optimize quoting lightly

Add type annotations and use C-level objects instead of py-ones where it is easy to do. We are not all-good yet, but this already brings some noticeable speedup:

    name                   old time/op  new time/op  delta
    quote[a]                786µs ± 1%    10µs ± 0%  -98.76%  (p=0.016 n=4+5)
    quote[\u03b1]          1.12ms ± 0%  0.41ms ± 0%  -63.37%  (p=0.008 n=5+5)
    quote[\u65e5]           738µs ± 2%   258µs ± 0%  -65.07%  (p=0.016 n=4+5)
    quote[\U0001f64f]       920µs ± 1%    78µs ± 0%  -91.46%  (p=0.016 n=5+4)
    stdquote               1.19µs ± 0%  1.19µs ± 0%     ~     (p=0.794 n=5+5)
    unquote[a]             1.08ms ± 0%  1.08ms ± 1%     ~     (p=0.548 n=5+5)
    unquote[\u03b1]         797µs ± 0%   807µs ± 1%   +1.23%  (p=0.008 n=5+5)
    unquote[\u65e5]         522µs ± 0%   520µs ± 1%     ~     (p=0.056 n=5+5)
    unquote[\U0001f64f]    3.21ms ± 0%  3.14ms ± 0%   -2.13%  (p=0.008 n=5+5)
    stdunquote              815ns ± 0%   836ns ± 0%   +2.63%  (p=0.008 n=5+5)

strconv: Optimize quoting lightly
Add type annotations and use C-level objects instead of py-ones where it is easy to do. We are not all-good yet, but this already brings some noticeable speedup:

    name                   old time/op  new time/op  delta
    quote[a]                786µs ± 1%    10µs ± 0%  -98.76%  (p=0.016 n=4+5)
    quote[\u03b1]          1.12ms ± 0%  0.41ms ± 0%  -63.37%  (p=0.008 n=5+5)
    quote[\u65e5]           738µs ± 2%   258µs ± 0%  -65.07%  (p=0.016 n=4+5)
    quote[\U0001f64f]       920µs ± 1%    78µs ± 0%  -91.46%  (p=0.016 n=5+4)
    stdquote               1.19µs ± 0%  1.19µs ± 0%     ~     (p=0.794 n=5+5)
    unquote[a]             1.08ms ± 0%  1.08ms ± 1%     ~     (p=0.548 n=5+5)
    unquote[\u03b1]         797µs ± 0%   807µs ± 1%   +1.23%  (p=0.008 n=5+5)
    unquote[\u65e5]         522µs ± 0%   520µs ± 1%     ~     (p=0.056 n=5+5)
    unquote[\U0001f64f]    3.21ms ± 0%  3.14ms ± 0%   -2.13%  (p=0.008 n=5+5)
    stdunquote              815ns ± 0%   836ns ± 0%   +2.63%  (p=0.008 n=5+5)
a11cb5dc · Kirill Smelkov · e5c513bf · a11cb5dc · a11cb5dc · a11cb5dc
Commit a11cb5dc authored Jun 26, 2023 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 91 additions and 43 deletions

golang/_golang_str.pyx golang/_golang_str.pyx +6 -10

golang/_strconv.pxd golang/_strconv.pxd +1 -1

golang/_strconv.pyx golang/_strconv.pyx +84 -32

No files found.
--- a/golang/_golang_str.pyx
+++ b/golang/_golang_str.pyx
@@ -1056,19 +1056,15 @@ _bstrustr_remove_unsupported_slots()
 #
 # NOTE the return type is str type of current python, so that quoted result
 # could be directly used in __repr__ or __str__ implementation.
-cdef _bpysmartquote_u3b2(s): # -> (unicode(py3)|bytes(py2), nonascii_escape)
+cdef _bpysmartquote_u3b2(const byte[::1] s): # -> (unicode(py3)|bytes(py2), nonascii_escape)
-    # TODO change to `const byte[::1] s` after strconv._quote is moved to pyx
-    if isinstance(s, bytearray):
-        s = _bytearray_data(s)
-    assert isinstance(s, bytes), s
    # smartquotes: choose ' or " as quoting character exactly the same way python does
    # https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L905-L909
-    quote = b"'"
+    cdef byte quote = ord("'")
-    if (quote in s) and (b'"' not in s):
+    if (quote in s) and (ord('"') not in s):
-        quote = b'"'
+        quote = ord('"')
-    x, nonascii_escape = strconv._quote(s, quote)     # raw bytes
+    cdef bint nonascii_escape
+    x = strconv._quote(s, quote, &nonascii_escape)              # raw bytes
    if PY_MAJOR_VERSION < 3:
        return x, nonascii_escape
    else:

--- a/golang/_strconv.pxd
+++ b/golang/_strconv.pxd
@@ -23,4 +23,4 @@
 from golang cimport byte
 cpdef pyquote(s)
-cdef _quote(s, quote) # -> (quoted, nonascii_escape)
+cdef bytes _quote(const byte[::1] s, char quote, bint* out_nonascii_escape) # -> (quoted, nonascii_escape)
--- a/golang/_strconv.pyx
+++ b/golang/_strconv.pyx
 # -*- coding: utf-8 -*-
 # cython: language_level=2
-# Copyright (C) 2018-2023  Nexedi SA and Contributors.
+# Copyright (C) 2018-2024  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -23,49 +23,81 @@
 from __future__ import print_function, absolute_import
 import unicodedata, codecs
-from six.moves import range as xrange
-from golang cimport pyb
+from golang cimport pyb, byte, rune
 from golang cimport _utf8_decode_rune, _xunichr
 from golang.unicode cimport utf8
+from cpython cimport PyObject, _PyBytes_Resize
+cdef extern from "Python.h":
+    PyObject* PyBytes_FromStringAndSize(char*, Py_ssize_t) except NULL
+    char* PyBytes_AS_STRING(PyObject*)
+    void Py_DECREF(PyObject*)
 # quote quotes unicode|bytes string into valid "..." bytestring always quoted with ".
 cpdef pyquote(s):  # -> bstr
-    q, _ = _quote(pyb(s), b'"')
+    cdef bint _
+    q = _quote(pyb(s), '"', &_)
    return pyb(q)
-cdef _quote(s, quote): # -> (quoted, nonascii_escape)
-    assert isinstance(s, bytes),     type(s)
-    assert isinstance(quote, bytes), type(quote)
-    assert len(quote) == 1,          repr(quote)
-    outv = []
+cdef char[16] hexdigit # = '0123456789abcdef'
-    emit = outv.append
+for i, c in enumerate('0123456789abcdef'):
-    nonascii_escape = False
+    hexdigit[i] = ord(c)
-    i = 0
+# XXX not possible to use `except (NULL, False)`
+#     (https://stackoverflow.com/a/66335433/9456786)
+cdef bytes _quote(const byte[::1] s, char quote, bint* out_nonascii_escape): # -> (quoted, nonascii_escape)
+    # 2*" + max(4)*each byte (+ 1 for tail \0 implicitly by PyBytesObject)
+    cdef Py_ssize_t qmaxsize = 1 + 4*len(s) + 1
+    cdef PyObject*  qout     = PyBytes_FromStringAndSize(NULL, qmaxsize)
+    cdef byte*      q        = <byte*>PyBytes_AS_STRING(qout)
+    cdef bint nonascii_escape = False
+    cdef Py_ssize_t i = 0, j
+    cdef Py_ssize_t isize
+    cdef int size
+    cdef rune r
+    cdef byte c
+    q[0] = quote;  q += 1
    while i < len(s):
-        c = s[i:i+1]
+        c = s[i]
        # fast path - ASCII only
-        if ord(c) < 0x80:
+        if c < 0x80:
-            if c in (b'\\', quote):
+            if c in (ord('\\'), quote):
-                emit(b'\\'+c)
+                q[0] = ord('\\')
+                q[1] = c
+                q += 2
            # printable ASCII
-            elif b' ' <= c <= b'\x7e':
+            elif 0x20 <= c <= 0x7e:
-                emit(c)
+                q[0] = c
+                q += 1
            # non-printable ASCII
-            elif c == b'\t':
+            elif c == ord('\t'):
-                emit(br'\t')
+                q[0] = ord('\\')
-            elif c == b'\n':
+                q[1] = ord('t')
-                emit(br'\n')
+                q += 2
-            elif c == b'\r':
+            elif c == ord('\n'):
-                emit(br'\r')
+                q[0] = ord('\\')
+                q[1] = ord('n')
+                q += 2
+            elif c == ord('\r'):
+                q[0] = ord('\\')
+                q[1] = ord('r')
+                q += 2
            # everything else is non-printable
            else:
-                emit(br'\x%02x' % ord(c))
+                q[0] = ord('\\')
+                q[1] = ord('x')
+                q[2] = hexdigit[c >> 4]
+                q[3] = hexdigit[c & 0xf]
+                q += 4
            i += 1
@@ -77,21 +109,41 @@ cdef _quote(s, quote): # -> (quoted, nonascii_escape)
            # decode error - just emit raw byte as escaped
            if r == utf8.RuneError  and  size == 1:
                nonascii_escape = True
-                emit(br'\x%02x' % ord(c))
+                q[0] = ord('\\')
+                q[1] = ord('x')
+                q[2] = hexdigit[c >> 4]
+                q[3] = hexdigit[c & 0xf]
+                q += 4
            # printable utf-8 characters go as is
-            elif unicodedata.category(_xunichr(r))[0] in _printable_cat0:
+            elif _unicodedata_category(_xunichr(r))[0] in 'LNPS': # letters, numbers, punctuation, symbols
-                emit(s[i:isize])
+                for j in range(i, isize):
+                    q[0] = s[j]
+                    q += 1
            # everything else goes in numeric byte escapes
            else:
                nonascii_escape = True
-                for j in xrange(i, isize):
+                for j in range(i, isize):
-                    emit(br'\x%02x' % ord(s[j:j+1]))
+                    c = s[j]
+                    q[0] = ord('\\')
+                    q[1] = ord('x')
+                    q[2] = hexdigit[c >> 4]
+                    q[3] = hexdigit[c & 0xf]
+                    q += 4
            i = isize
-    return (quote + b''.join(outv) + quote, nonascii_escape)
+    q[0] = quote;  q += 1
+    q[0] = 0;      # don't q++ at last because size does not include tail \0
+    cdef Py_ssize_t qsize = (q - <byte*>PyBytes_AS_STRING(qout))
+    assert qsize <= qmaxsize
+    _PyBytes_Resize(&qout, qsize)
+    bqout = <bytes>qout
+    Py_DECREF(qout)
+    out_nonascii_escape[0] = nonascii_escape
+    return bqout
 # unquote decodes "-quoted unicode|byte string.
@@ -181,4 +233,4 @@ cdef _unquote_next(s):
    return b''.join(outv), s
-_printable_cat0 = frozenset(['L', 'N', 'P', 'S'])   # letters, numbers, punctuation, symbols
+cdef _unicodedata_category = unicodedata.category