golang_str: Speedup utf-8 decoding a bit on py2

We recently moved our custom UTF-8 encoding/decoding routines to Cython. Now we can start taking advantage of C-level speedups to make our own UTF-8 decoder a bit less horribly slow on py2:

    name       old time/op  new time/op  delta
    stddecode   752ns ± 0%   743ns ± 0%   -1.19%  (p=0.000 n=9+10)
    udecode     216µs ± 0%    75µs ± 0%  -65.19%  (p=0.000 n=9+10)
    stdencode   328ns ± 2%   327ns ± 1%     ~     (p=0.252 n=10+9)
    bencode    34.1µs ± 1%  32.1µs ± 1%   -5.92%  (p=0.000 n=10+10)

So it is a ~3x speedup for u(), but still significantly slower compared to std unicode.decode('utf-8'). Only the low-hanging fruit is picked here: _utf8_decode_rune is made a bit faster, since it sits in the innermost loop. In the future _utf8_decode_surrogateescape might be reworked as well to avoid constructing the resulting unicode via a py-level list of py-unicode character objects. And similarly for _utf8_encode_surrogateescape.

On py3 the performance of std and u/b decode/encode is approximately the same.

/trusted-by @jerome
/reviewed-on nexedi/pygolang!19

golang_str: Speedup utf-8 decoding a bit on py2
We recently moved our custom UTF-8 encoding/decoding routines to Cython. Now we can start taking advantage of C-level speedups to make our own UTF-8 decoder a bit less horribly slow on py2:

    name       old time/op  new time/op  delta
    stddecode   752ns ± 0%   743ns ± 0%   -1.19%  (p=0.000 n=9+10)
    udecode     216µs ± 0%    75µs ± 0%  -65.19%  (p=0.000 n=9+10)
    stdencode   328ns ± 2%   327ns ± 1%     ~     (p=0.252 n=10+9)
    bencode    34.1µs ± 1%  32.1µs ± 1%   -5.92%  (p=0.000 n=10+10)

So it is a ~3x speedup for u(), but still significantly slower compared to std unicode.decode('utf-8'). Only the low-hanging fruit is picked here: _utf8_decode_rune is made a bit faster, since it sits in the innermost loop. In the future _utf8_decode_surrogateescape might be reworked as well to avoid constructing the resulting unicode via a py-level list of py-unicode character objects. And similarly for _utf8_encode_surrogateescape.

On py3 the performance of std and u/b decode/encode is approximately the same.

/trusted-by @jerome
/reviewed-on nexedi/pygolang!19
9cb7b210 · Kirill Smelkov · 598eb479 · 9cb7b210 · 9cb7b210 · 9cb7b210
Commit 9cb7b210 authored Oct 04, 2022 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 52 additions and 17 deletions

golang/_golang_str.pyx golang/_golang_str.pyx +22 -16

golang/golang_str_test.py golang/golang_str_test.py +29 -0

golang/strconv.py golang/strconv.py +1 -1

No files found.
--- a/golang/_golang_str.pyx
+++ b/golang/_golang_str.pyx
@@ -22,6 +22,10 @@
 It is included from _golang.pyx .
 """

+from cpython cimport PyUnicode_DecodeUTF8
+
+from libc.stdint cimport uint8_t
+
 pystrconv = None  # = golang.strconv imported at runtime (see __init__.py)

 def pyb(s): # -> bytes
@@ -187,24 +191,25 @@ def pyqq(obj):
 from six import unichr                      # py2: unichr       py3: chr
 from six import int2byte as bchr            # py2: chr          py3: lambda x: bytes((x,))

-_rune_error = 0xFFFD # unicode replacement character
+cdef int _rune_error = 0xFFFD # unicode replacement character
+_py_rune_error = _rune_error

-_ucs2_build        = (sys.maxunicode ==     0xffff)     #    ucs2
-assert _ucs2_build or sys.maxunicode >= 0x0010ffff      # or ucs4
+cdef bint _ucs2_build = (sys.maxunicode ==     0xffff)      #    ucs2
+assert    _ucs2_build or sys.maxunicode >= 0x0010ffff       # or ucs4

 # _utf8_decode_rune decodes next UTF8-character from byte string s.
 #
 # _utf8_decode_rune(s) -> (r, size)
-def _utf8_decode_rune(s):
-    assert isinstance(s, bytes)
-
+def _py_utf8_decode_rune(const uint8_t[::1] s):
+    return _utf8_decode_rune(s)
+cdef (int, int) _utf8_decode_rune(const uint8_t[::1] s):
    if len(s) == 0:
        return _rune_error, 0

-    l = min(len(s), 4)  # max size of an UTF-8 encoded character
+    cdef int l = min(len(s), 4)  # max size of an UTF-8 encoded character
    while l > 0:
        try:
-            r = s[:l].decode('utf-8', 'strict')
+            r = PyUnicode_DecodeUTF8(<char*>&s[0], l, 'strict')
        except UnicodeDecodeError:
            l -= 1
            continue
@@ -229,10 +234,12 @@ def _utf8_decode_rune(s):


 # _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3.
-def _utf8_decode_surrogateescape(s): # -> unicode
-    assert isinstance(s, bytes)
+def _utf8_decode_surrogateescape(const uint8_t[::1] s): # -> unicode
    if PY_MAJOR_VERSION >= 3:
-        return s.decode('UTF-8', 'surrogateescape')
+        if len(s) == 0:
+            return u''  # avoid out-of-bounds slice access on &s[0]
+        else:
+            return PyUnicode_DecodeUTF8(<char*>&s[0], len(s), 'surrogateescape')

    # py2 does not have surrogateescape error handler, and even if we
    # provide one, builtin bytes.decode() does not treat surrogate
@@ -243,8 +250,8 @@ def _utf8_decode_surrogateescape(s): # -> unicode
    while len(s) > 0:
        r, width = _utf8_decode_rune(s)
        if r == _rune_error  and  width == 1:
-            b = ord(s[0])
-            assert 0x80 <= b <= 0xff
+            b = s[0]
+            assert 0x80 <= b <= 0xff, b
            emit(unichr(0xdc00 + b))

        # python2 "correctly" decodes surrogates - don't allow that as
@@ -253,11 +260,10 @@ def _utf8_decode_surrogateescape(s): # -> unicode
        # (python3 raises UnicodeDecodeError for surrogates)
        elif 0xd800 <= r < 0xdfff:
            for c in s[:width]:
-                b = ord(c)
                if c >= 0x80:
-                    emit(unichr(0xdc00 + b))
+                    emit(unichr(0xdc00 + c))
                else:
-                    emit(unichr(b))
+                    emit(unichr(c))

        else:
            emit(_xunichr(r))

--- a/golang/golang_str_test.py
+++ b/golang/golang_str_test.py
@@ -20,6 +20,7 @@

 from __future__ import print_function, absolute_import

+import golang
 from golang import b, u
 from golang.gcompat import qq
 from golang.strconv_test import byterange
@@ -27,6 +28,7 @@ from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE
 from pytest import raises
 import sys
 from six import text_type as unicode
+from six.moves import range as xrange


 # verify b, u
@@ -137,3 +139,30 @@ def test_qq():
    if not ('PyPy' in sys.version): # https://foss.heptapod.net/pypy/pypy/issues/2763
        with raises(AttributeError):
            x.hello = 1
+
+
+# ---- benchmarks ----
+
+# utf-8 decoding
+def bench_stddecode(b):
+    s = (u'α'*100).encode('utf-8')
+    for i in xrange(b.N):
+        s.decode('utf-8')
+
+def bench_udecode(b):
+    s = (u'α'*100).encode('utf-8')
+    uu = golang.u
+    for i in xrange(b.N):
+        uu(s)
+
+# utf-8 encoding
+def bench_stdencode(b):
+    s = u'α'*100
+    for i in xrange(b.N):
+        s.encode('utf-8')
+
+def bench_bencode(b):
+    s = u'α'*100
+    bb = golang.b
+    for i in xrange(b.N):
+        bb(s)
--- a/golang/strconv.py
+++ b/golang/strconv.py
@@ -26,7 +26,7 @@ from six import text_type as unicode        # py2: unicode      py3: str
 from six.moves import range as xrange

 from golang import b, u
-from golang._golang import _utf8_decode_rune, _rune_error, _xunichr
+from golang._golang import _py_utf8_decode_rune as _utf8_decode_rune, _py_rune_error as _rune_error, _xunichr


 # _bstr is like b but also returns whether input was unicode.