Commit 9cb7b210 authored by Kirill Smelkov

golang_str: Speed up UTF-8 decoding a bit on py2

We recently moved our custom UTF-8 encoding/decoding routines to Cython.
Now we can start taking advantage of C-level speedups to make our own
UTF-8 decoder a bit less horribly slow on py2:

    name       old time/op  new time/op  delta
    stddecode   752ns ± 0%   743ns ± 0%   -1.19%  (p=0.000 n=9+10)
    udecode     216µs ± 0%    75µs ± 0%  -65.19%  (p=0.000 n=9+10)
    stdencode   328ns ± 2%   327ns ± 1%     ~     (p=0.252 n=10+9)
    bencode    34.1µs ± 1%  32.1µs ± 1%   -5.92%  (p=0.000 n=10+10)

So it is a ~3x speedup for u(), but it is still significantly slower than
the std bytes.decode('utf-8').

This picks only the low-hanging fruit: it makes _utf8_decode_rune a bit
faster, since it sits in the innermost loop. In the future
_utf8_decode_surrogateescape might be reworked as well to avoid
constructing the resulting unicode via a py-level list of py-unicode
character objects, and similarly for _utf8_encode_surrogateescape.

On py3 the performance of std and u/b decode/encode is approximately the same.

/trusted-by @jerome
/reviewed-on nexedi/pygolang!19
parent 598eb479
......@@ -22,6 +22,10 @@
It is included from _golang.pyx .
"""
from cpython cimport PyUnicode_DecodeUTF8
from libc.stdint cimport uint8_t
pystrconv = None # = golang.strconv imported at runtime (see __init__.py)
def pyb(s): # -> bytes
......@@ -187,24 +191,25 @@ def pyqq(obj):
from six import unichr # py2: unichr py3: chr
from six import int2byte as bchr # py2: chr py3: lambda x: bytes((x,))
_rune_error = 0xFFFD # unicode replacement character
cdef int _rune_error = 0xFFFD # unicode replacement character
_py_rune_error = _rune_error
_ucs2_build = (sys.maxunicode == 0xffff) # ucs2
assert _ucs2_build or sys.maxunicode >= 0x0010ffff # or ucs4
cdef bint _ucs2_build = (sys.maxunicode == 0xffff) # ucs2
assert _ucs2_build or sys.maxunicode >= 0x0010ffff # or ucs4
# _utf8_decode_rune decodes next UTF8-character from byte string s.
#
# _utf8_decode_rune(s) -> (r, size)
def _utf8_decode_rune(s):
assert isinstance(s, bytes)
# _py_utf8_decode_rune exposes the cdef _utf8_decode_rune to Python-level callers.
def _py_utf8_decode_rune(const uint8_t[::1] s):
return _utf8_decode_rune(s)
cdef (int, int) _utf8_decode_rune(const uint8_t[::1] s):
if len(s) == 0:
return _rune_error, 0
l = min(len(s), 4) # max size of an UTF-8 encoded character
cdef int l = min(len(s), 4) # max size of an UTF-8 encoded character
while l > 0:
try:
r = s[:l].decode('utf-8', 'strict')
r = PyUnicode_DecodeUTF8(<char*>&s[0], l, 'strict')
except UnicodeDecodeError:
l -= 1
continue
......@@ -229,10 +234,12 @@ def _utf8_decode_rune(s):
# _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3.
def _utf8_decode_surrogateescape(s): # -> unicode
assert isinstance(s, bytes)
def _utf8_decode_surrogateescape(const uint8_t[::1] s): # -> unicode
if PY_MAJOR_VERSION >= 3:
return s.decode('UTF-8', 'surrogateescape')
if len(s) == 0:
return u'' # avoid out-of-bounds slice access on &s[0]
else:
return PyUnicode_DecodeUTF8(<char*>&s[0], len(s), 'surrogateescape')
# py2 does not have surrogateescape error handler, and even if we
# provide one, builtin bytes.decode() does not treat surrogate
......@@ -243,8 +250,8 @@ def _utf8_decode_surrogateescape(s): # -> unicode
while len(s) > 0:
r, width = _utf8_decode_rune(s)
if r == _rune_error and width == 1:
b = ord(s[0])
assert 0x80 <= b <= 0xff
b = s[0]
assert 0x80 <= b <= 0xff, b
emit(unichr(0xdc00 + b))
# python2 "correctly" decodes surrogates - don't allow that as
......@@ -253,11 +260,10 @@ def _utf8_decode_surrogateescape(s): # -> unicode
# (python3 raises UnicodeDecodeError for surrogates)
elif 0xd800 <= r < 0xdfff:
for c in s[:width]:
b = ord(c)
if c >= 0x80:
emit(unichr(0xdc00 + b))
emit(unichr(0xdc00 + c))
else:
emit(unichr(b))
emit(unichr(c))
else:
emit(_xunichr(r))
......
......@@ -20,6 +20,7 @@
from __future__ import print_function, absolute_import
import golang
from golang import b, u
from golang.gcompat import qq
from golang.strconv_test import byterange
......@@ -27,6 +28,7 @@ from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE
from pytest import raises
import sys
from six import text_type as unicode
from six.moves import range as xrange
# verify b, u
......@@ -137,3 +139,30 @@ def test_qq():
if not ('PyPy' in sys.version): # https://foss.heptapod.net/pypy/pypy/issues/2763
with raises(AttributeError):
x.hello = 1
# ---- benchmarks ----
# utf-8 decoding
def bench_stddecode(b):
    """Benchmark the builtin UTF-8 decoder on 100 encoded Greek alpha characters.

    The decode call is repeated b.N times; its result is discarded.
    """
    payload = u'α'.encode('utf-8') * 100
    for _ in xrange(b.N):
        payload.decode('utf-8')
def bench_udecode(b):
    """Benchmark golang.u on 100 UTF-8 encoded Greek alpha characters.

    golang.u is called b.N times on the same bytes input; its result is
    discarded.
    """
    decode = golang.u  # bound once, outside the timed loop
    payload = u'α'.encode('utf-8') * 100
    for _ in xrange(b.N):
        decode(payload)
# utf-8 encoding
def bench_stdencode(b):
    """Benchmark the builtin UTF-8 encoder on a 100-character unicode string.

    The encode call is repeated b.N times; its result is discarded.
    """
    text = u'α' * 100
    for _ in xrange(b.N):
        text.encode('utf-8')
def bench_bencode(b):
    """Benchmark golang.b on a 100-character unicode string.

    golang.b is called b.N times on the same unicode input; its result is
    discarded.
    """
    encode = golang.b  # bound once, outside the timed loop
    text = u'α' * 100
    for _ in xrange(b.N):
        encode(text)
......@@ -26,7 +26,7 @@ from six import text_type as unicode # py2: unicode py3: str
from six.moves import range as xrange
from golang import b, u
from golang._golang import _utf8_decode_rune, _rune_error, _xunichr
from golang._golang import _py_utf8_decode_rune as _utf8_decode_rune, _py_rune_error as _rune_error, _xunichr
# _bstr is like b but also returns whether input was unicode.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment