Commit a11cb5dc authored by Kirill Smelkov

strconv: Optimize quoting lightly

Add type annotations and use C-level objects instead of py-ones where it
is easy to do. We are not all-good yet, but this already brings some noticeable speedup:

    name                 old time/op  new time/op  delta
    quote[a]              786µs ± 1%    10µs ± 0%  -98.76%  (p=0.016 n=4+5)
    quote[\u03b1]        1.12ms ± 0%  0.41ms ± 0%  -63.37%  (p=0.008 n=5+5)
    quote[\u65e5]         738µs ± 2%   258µs ± 0%  -65.07%  (p=0.016 n=4+5)
    quote[\U0001f64f]     920µs ± 1%    78µs ± 0%  -91.46%  (p=0.016 n=5+4)
    stdquote             1.19µs ± 0%  1.19µs ± 0%     ~     (p=0.794 n=5+5)
    unquote[a]           1.08ms ± 0%  1.08ms ± 1%     ~     (p=0.548 n=5+5)
    unquote[\u03b1]       797µs ± 0%   807µs ± 1%   +1.23%  (p=0.008 n=5+5)
    unquote[\u65e5]       522µs ± 0%   520µs ± 1%     ~     (p=0.056 n=5+5)
    unquote[\U0001f64f]  3.21ms ± 0%  3.14ms ± 0%   -2.13%  (p=0.008 n=5+5)
    stdunquote            815ns ± 0%   836ns ± 0%   +2.63%  (p=0.008 n=5+5)
parent e5c513bf
...@@ -1056,19 +1056,15 @@ _bstrustr_remove_unsupported_slots() ...@@ -1056,19 +1056,15 @@ _bstrustr_remove_unsupported_slots()
# #
# NOTE the return type is str type of current python, so that quoted result # NOTE the return type is str type of current python, so that quoted result
# could be directly used in __repr__ or __str__ implementation. # could be directly used in __repr__ or __str__ implementation.
cdef _bpysmartquote_u3b2(s): # -> (unicode(py3)|bytes(py2), nonascii_escape) cdef _bpysmartquote_u3b2(const byte[::1] s): # -> (unicode(py3)|bytes(py2), nonascii_escape)
# TODO change to `const byte[::1] s` after strconv._quote is moved to pyx
if isinstance(s, bytearray):
s = _bytearray_data(s)
assert isinstance(s, bytes), s
# smartquotes: choose ' or " as quoting character exactly the same way python does # smartquotes: choose ' or " as quoting character exactly the same way python does
# https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L905-L909 # https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L905-L909
quote = b"'" cdef byte quote = ord("'")
if (quote in s) and (b'"' not in s): if (quote in s) and (ord('"') not in s):
quote = b'"' quote = ord('"')
x, nonascii_escape = strconv._quote(s, quote) # raw bytes cdef bint nonascii_escape
x = strconv._quote(s, quote, &nonascii_escape) # raw bytes
if PY_MAJOR_VERSION < 3: if PY_MAJOR_VERSION < 3:
return x, nonascii_escape return x, nonascii_escape
else: else:
......
...@@ -23,4 +23,4 @@ ...@@ -23,4 +23,4 @@
from golang cimport byte from golang cimport byte
cpdef pyquote(s) cpdef pyquote(s)
cdef _quote(s, quote) # -> (quoted, nonascii_escape) cdef bytes _quote(const byte[::1] s, char quote, bint* out_nonascii_escape) # -> (quoted, nonascii_escape)
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# cython: language_level=2 # cython: language_level=2
# Copyright (C) 2018-2023 Nexedi SA and Contributors. # Copyright (C) 2018-2024 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -23,49 +23,81 @@ ...@@ -23,49 +23,81 @@
from __future__ import print_function, absolute_import from __future__ import print_function, absolute_import
import unicodedata, codecs import unicodedata, codecs
from six.moves import range as xrange
from golang cimport pyb from golang cimport pyb, byte, rune
from golang cimport _utf8_decode_rune, _xunichr from golang cimport _utf8_decode_rune, _xunichr
from golang.unicode cimport utf8 from golang.unicode cimport utf8
from cpython cimport PyObject, _PyBytes_Resize
cdef extern from "Python.h":
PyObject* PyBytes_FromStringAndSize(char*, Py_ssize_t) except NULL
char* PyBytes_AS_STRING(PyObject*)
void Py_DECREF(PyObject*)
# quote quotes unicode|bytes string into valid "..." bytestring always quoted with ". # quote quotes unicode|bytes string into valid "..." bytestring always quoted with ".
cpdef pyquote(s): # -> bstr cpdef pyquote(s): # -> bstr
q, _ = _quote(pyb(s), b'"') cdef bint _
q = _quote(pyb(s), '"', &_)
return pyb(q) return pyb(q)
cdef _quote(s, quote): # -> (quoted, nonascii_escape)
assert isinstance(s, bytes), type(s)
assert isinstance(quote, bytes), type(quote)
assert len(quote) == 1, repr(quote)
outv = [] cdef char[16] hexdigit # = '0123456789abcdef'
emit = outv.append for i, c in enumerate('0123456789abcdef'):
nonascii_escape = False hexdigit[i] = ord(c)
i = 0
# XXX not possible to use `except (NULL, False)`
# (https://stackoverflow.com/a/66335433/9456786)
cdef bytes _quote(const byte[::1] s, char quote, bint* out_nonascii_escape): # -> (quoted, nonascii_escape)
# 2*" + max(4)*each byte (+ 1 for tail \0 implicitly by PyBytesObject)
cdef Py_ssize_t qmaxsize = 1 + 4*len(s) + 1
cdef PyObject* qout = PyBytes_FromStringAndSize(NULL, qmaxsize)
cdef byte* q = <byte*>PyBytes_AS_STRING(qout)
cdef bint nonascii_escape = False
cdef Py_ssize_t i = 0, j
cdef Py_ssize_t isize
cdef int size
cdef rune r
cdef byte c
q[0] = quote; q += 1
while i < len(s): while i < len(s):
c = s[i:i+1] c = s[i]
# fast path - ASCII only # fast path - ASCII only
if ord(c) < 0x80: if c < 0x80:
if c in (b'\\', quote): if c in (ord('\\'), quote):
emit(b'\\'+c) q[0] = ord('\\')
q[1] = c
q += 2
# printable ASCII # printable ASCII
elif b' ' <= c <= b'\x7e': elif 0x20 <= c <= 0x7e:
emit(c) q[0] = c
q += 1
# non-printable ASCII # non-printable ASCII
elif c == b'\t': elif c == ord('\t'):
emit(br'\t') q[0] = ord('\\')
elif c == b'\n': q[1] = ord('t')
emit(br'\n') q += 2
elif c == b'\r': elif c == ord('\n'):
emit(br'\r') q[0] = ord('\\')
q[1] = ord('n')
q += 2
elif c == ord('\r'):
q[0] = ord('\\')
q[1] = ord('r')
q += 2
# everything else is non-printable # everything else is non-printable
else: else:
emit(br'\x%02x' % ord(c)) q[0] = ord('\\')
q[1] = ord('x')
q[2] = hexdigit[c >> 4]
q[3] = hexdigit[c & 0xf]
q += 4
i += 1 i += 1
...@@ -77,21 +109,41 @@ cdef _quote(s, quote): # -> (quoted, nonascii_escape) ...@@ -77,21 +109,41 @@ cdef _quote(s, quote): # -> (quoted, nonascii_escape)
# decode error - just emit raw byte as escaped # decode error - just emit raw byte as escaped
if r == utf8.RuneError and size == 1: if r == utf8.RuneError and size == 1:
nonascii_escape = True nonascii_escape = True
emit(br'\x%02x' % ord(c)) q[0] = ord('\\')
q[1] = ord('x')
q[2] = hexdigit[c >> 4]
q[3] = hexdigit[c & 0xf]
q += 4
# printable utf-8 characters go as is # printable utf-8 characters go as is
elif unicodedata.category(_xunichr(r))[0] in _printable_cat0: elif _unicodedata_category(_xunichr(r))[0] in 'LNPS': # letters, numbers, punctuation, symbols
emit(s[i:isize]) for j in range(i, isize):
q[0] = s[j]
q += 1
# everything else goes in numeric byte escapes # everything else goes in numeric byte escapes
else: else:
nonascii_escape = True nonascii_escape = True
for j in xrange(i, isize): for j in range(i, isize):
emit(br'\x%02x' % ord(s[j:j+1])) c = s[j]
q[0] = ord('\\')
q[1] = ord('x')
q[2] = hexdigit[c >> 4]
q[3] = hexdigit[c & 0xf]
q += 4
i = isize i = isize
return (quote + b''.join(outv) + quote, nonascii_escape) q[0] = quote; q += 1
q[0] = 0; # don't q++ at last because size does not include tail \0
cdef Py_ssize_t qsize = (q - <byte*>PyBytes_AS_STRING(qout))
assert qsize <= qmaxsize
_PyBytes_Resize(&qout, qsize)
bqout = <bytes>qout
Py_DECREF(qout)
out_nonascii_escape[0] = nonascii_escape
return bqout
# unquote decodes "-quoted unicode|byte string. # unquote decodes "-quoted unicode|byte string.
...@@ -181,4 +233,4 @@ cdef _unquote_next(s): ...@@ -181,4 +233,4 @@ cdef _unquote_next(s):
return b''.join(outv), s return b''.join(outv), s
_printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols cdef _unicodedata_category = unicodedata.category
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment