Commit 04be919b authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: bstr/ustr index access

Implement access to bstr/ustr by [index] and by slice. The result of such
[index] access - similarly to standard str - is of the same bstr/ustr
type and holds one character:

  - ustr[i] returns ustr with one unicode character taken from i'th character of original string, while
  - bstr[i] returns bstr with one byte taken from i'th byte of original bytestring.

This follows str/unicode semantics on both py2/py3 and bytes semantics on
py2, but diverges from bytes semantics on py3. I originally tried to
follow bytes/py3 semantics - for bstr to return an integer instead of a
1-byte character - but later found several compatibility breakages due to
it. I contemplated this divergence for a long time and finally
decided to follow string semantics for both ustr and bstr. This
preserves backward compatibility with Python2 and also allows bstr
to be a practical drop-in replacement for the str type.

To get an ordinal corresponding to retrieved character, one can use
standard `ord`, e.g. as in `ord(bstr[i])`. This will always return an
integer for all bstr/ustr/str/unicode. Similarly to standard `chr` and
`unichr`, we also provide two utility functions - `uchr` and `bbyte` to
create 1-character and 1-byte ustr/bstr correspondingly.
parent 105d03d4
......@@ -240,6 +240,10 @@ The conversion, in both encoding and decoding, never fails and never loses
information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
even if bytes data is not valid UTF-8.
Semantically `bstr` is array of bytes, while `ustr` is array of
unicode-characters. Accessing their elements by `[index]` yields byte and
unicode character correspondingly [*]_.
Operations in between `bstr` and `ustr`/`unicode` / `bytes`/`bytearray` coerce to `bstr`, while
operations in between `ustr` and `bstr`/`bytes`/`bytearray` / `unicode` coerce
to `ustr`. When the coercion happens, `bytes` and `bytearray`, similarly to
......@@ -262,6 +266,8 @@ Usage example::
... # (*) the decoding never fails nor loses information.
.. [*] `unicode` on Python2, `str` on Python3.
.. [*] | ordinal of such byte and unicode character can be obtained via regular `ord`.
| For completeness `bbyte` and `uchr` are also provided for constructing 1-byte `bstr` and 1-character `ustr` from ordinal.
.. [*] | data in buffer, similarly to `bytes` and `bytearray`, is treated as UTF8-encoded string.
| Notice that only explicit conversion through `b` and `u` accepts objects with buffer interface. Automatic coercion does not.
......
......@@ -36,7 +36,7 @@ from __future__ import print_function, absolute_import
__version__ = "0.1"
__all__ = ['go', 'chan', 'select', 'default', 'nilchan', 'defer', 'panic',
'recover', 'func', 'error', 'b', 'u', 'bstr', 'ustr', 'gimport']
'recover', 'func', 'error', 'b', 'u', 'bstr', 'ustr', 'bbyte', 'uchr', 'gimport']
from golang._gopath import gimport # make gimport available from golang
import inspect, sys
......@@ -317,8 +317,10 @@ from ._golang import \
pyerror as error, \
pyb as b, \
pybstr as bstr, \
pybbyte as bbyte, \
pyu as u, \
pyustr as ustr
pyustr as ustr, \
pyuchr as uchr
# import golang.strconv into _golang from here to workaround cyclic golang ↔ strconv dependency
def _():
......
......@@ -174,6 +174,15 @@ cdef __pystr(object obj): # -> ~str
return pyb(obj)
def pybbyte(int i): # -> 1-byte bstr
    """bbyte(i) returns 1-byte bstr with ordinal i.

    The byte is first materialized as a 1-element bytearray and then
    converted to bstr via b.
    """
    onebyte = bytearray((i,))
    return pyb(onebyte)
def pyuchr(int i): # -> 1-character ustr
    """uchr(i) returns 1-character ustr with unicode ordinal i.

    The character is produced by standard unichr and then converted to
    ustr via u.
    """
    uc = unichr(i)
    return pyu(uc)
# XXX cannot `cdef class`: github.com/cython/cython/issues/711
class pybstr(bytes):
"""bstr is byte-string.
......@@ -185,6 +194,9 @@ class pybstr(bytes):
is always identity even if bytes data is not valid UTF-8.
Semantically bstr is array of bytes. Accessing its elements by [index]
yields byte character.
Operations in between bstr and ustr/unicode / bytes/bytearray coerce to bstr.
When the coercion happens, bytes and bytearray, similarly to bstr, are also
treated as UTF8-encoded strings.
......@@ -253,6 +265,21 @@ class pybstr(bytes):
# a <= b: the right operand is passed through _pyb_coerce before delegating to bytes.__le__.
def __le__(a, b): return bytes.__le__(a, _pyb_coerce(b))
# a >= b: the right operand is passed through _pyb_coerce before delegating to bytes.__ge__.
def __ge__(a, b): return bytes.__ge__(a, _pyb_coerce(b))
# len - no need to override
# [], [:]
def __getitem__(self, idx):
    """Return bstr for both self[i] and self[i:j].

    self[i] yields 1-byte bstr; self[i:j] yields bstr with the selected bytes.
    """
    item = bytes.__getitem__(self, idx)

    # bytes[i] returns 1-character bytestring(py2) or int(py3);
    # only the int case needs to go through bbyte - slices and py2 items
    # are already bytestrings and are just converted with b.
    if type(idx) is slice or PY_MAJOR_VERSION < 3:
        return pyb(item)
    return pybbyte(item)
# XXX cannot `cdef class` with __new__: https://github.com/cython/cython/issues/799
class pyustr(unicode):
......@@ -265,6 +292,9 @@ class pyustr(unicode):
is always identity even if bytes data is not valid UTF-8.
ustr is similar to standard unicode type - accessing its
elements by [index] yields unicode characters.
Operations in between ustr and bstr/bytes/bytearray / unicode coerce to ustr.
When the coercion happens, bytes and bytearray, similarly to bstr, are also
treated as UTF8-encoded strings.
......@@ -324,6 +354,12 @@ class pyustr(unicode):
# a <= b: the right operand is passed through _pyu_coerce before delegating to unicode.__le__.
def __le__(a, b): return unicode.__le__(a, _pyu_coerce(b))
# a >= b: the right operand is passed through _pyu_coerce before delegating to unicode.__ge__.
def __ge__(a, b): return unicode.__ge__(a, _pyu_coerce(b))
# len - no need to override
# [], [:]
def __getitem__(self, idx):
    """Return ustr for both self[i] and self[i:j]."""
    # delegate the actual lookup to unicode, then convert the result with u
    # so that index/slice access keeps yielding ustr.
    elem = unicode.__getitem__(self, idx)
    return pyu(elem)
# _bdata/_udata retrieve raw data from bytes/unicode.
def _bdata(obj): # -> bytes
......
......@@ -21,7 +21,7 @@
from __future__ import print_function, absolute_import
import golang
from golang import b, u, bstr, ustr
from golang import b, u, bstr, ustr, bbyte, uchr
from golang._golang import _udata, _bdata
from golang.gcompat import qq
from golang.strconv_test import byterange
......@@ -29,7 +29,7 @@ from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE
from pytest import raises, mark, skip
import sys
import six
from six import text_type as unicode
from six import text_type as unicode, unichr
from six.moves import range as xrange
import array
......@@ -271,6 +271,119 @@ def test_strings_memoryview():
assert _(5) == 0x80
# verify that ord on bstr/ustr works as expected.
def test_strings_ord():
    """ord must accept only 1-element bstr/ustr and return that element's ordinal."""
    # empty and 2-element strings are rejected for both bstr and ustr
    for s in ('', 'ab'):
        with raises(TypeError):
            ord(b(s))
        with raises(TypeError):
            ord(u(s))

    assert ord(b('a')) == 97
    assert ord(u('a')) == 97

    # 'м' is one unicode character, but 2 bytes, not 1
    with raises(TypeError):
        ord(b('м'))
    assert ord(u('м')) == 1084

    # every single byte
    for code in range(0x100):
        onebyte = b(bytearray([code]))
        assert len(onebyte) == 1
        assert ord(onebyte) == code

    # every character of the 2-bytes unicode range
    for code in range(0x10000):
        onechar = u(unichr(code))
        assert len(onechar) == 1
        assert ord(onechar) == code
# verify bbyte.
def test_strings_bbyte():
    """bbyte(i) must give 1-byte bstr for i in [0, 0x100) and reject anything else."""
    # ordinals just outside of the byte range are rejected
    for bad in (-1, 0x100):
        with raises(ValueError):
            bbyte(bad)

    for code in range(0x100):
        ch = bbyte(code)
        assert type(ch) is bstr
        assert len(ch) == 1
        assert ord(ch) == code
        assert ch == bytearray([code])
# verify uchr.
def test_strings_uchr():
    """uchr(i) must give 1-character ustr equal to unichr(i) and reject negative ordinals."""
    # negative ordinal is rejected by standard unichr (reference behaviour) ...
    with raises(ValueError):
        unichr(-1)
    # ... and uchr - the function under test - must reject it the same way.
    # (previously only unichr was checked here, so uchr(-1) was never exercised)
    with raises(ValueError):
        uchr(-1)

    # upper limit depends on whether python was built with ucs as 2-bytes or 4-bytes long
    # but at least it all should work for small 2-bytes range
    for i in range(0x10000):
        ui = uchr(i)
        assert type(ui) is ustr
        assert len(ui) == 1
        assert ord(ui) == i
        assert ui == unichr(i)
# verify strings access by index.
def test_strings_index():
    """Verify [index] and [slice] access on bstr/ustr against std bytes/unicode."""
    uwrapped = u("миру мир")
    uplain = u"миру мир"
    bwrapped = b("миру мир")
    bplain = xbytes("миру мир")

    assert len(uwrapped) == 8
    assert len(uplain) == 8
    assert len(bwrapped) == 15
    assert len(bplain) == 15

    # u/unicode [idx] -> unicode character
    def uidx(pos):
        mine = uwrapped[pos]
        assert type(mine) is ustr
        ref = uplain[pos]
        assert type(ref) is unicode
        assert mine == ref
        return mine

    for pos, char in enumerate(['м','и','р','у',' ','м','и','р']):
        assert uidx(pos) == char

    # b/bytes [idx] -> bytechar of byte value @ position idx
    def bidx(pos):
        mine = bwrapped[pos]
        assert type(mine) is bstr
        assert len(mine) == 1
        ref = bplain[pos]
        if six.PY3:
            ref = bytes([ref]) # on py3 bytes[i] returns int instead of 1-byte string
        assert type(ref) is bytes
        assert len(ref) == 1
        assert mine == ref
        return mine

    for pos, byte in enumerate([0xd0,0xbc,0xd0,0xb8,0xd1,0x80,0xd1,0x83,0x20,0xd0,0xbc,0xd0,0xb8,0xd1,0x80]):
        assert bidx(pos) == bytearray([byte])

    # u/unicode [:] -> unicode string
    # USlice forwards the slice it receives to both ustr and unicode and
    # checks that they agree.
    class USlice:
        def __getitem__(self, key):
            mine = uwrapped[key]
            assert type(mine) is ustr
            ref = uplain[key]
            assert type(ref) is unicode
            assert mine == ref
            return mine

        def __len__(self): # py2
            n = len(uwrapped)
            m = len(uplain)
            assert n == m
            return n

    uslice = USlice()
    assert uslice[:] == u"миру мир"
    assert uslice[1:] == u"иру мир"
    assert uslice[:-1] == u"миру ми"
    assert uslice[2:5] == u"ру "
    assert uslice[1:-1:2] == u"иум"

    # b/bytes [:] -> bytestring
    # BSlice does the same for bstr vs bytes.
    class BSlice:
        def __getitem__(self, key):
            mine = bwrapped[key]
            assert type(mine) is bstr
            ref = bplain[key]
            assert type(ref) is bytes
            assert mine == ref
            return mine

        def __len__(self): # py2
            n = len(bwrapped)
            m = len(bplain)
            assert n == m
            return n

    bslice = BSlice()
    assert bslice[:] == "миру мир"
    assert bslice[1:] == b'\xbc\xd0\xb8\xd1\x80\xd1\x83 \xd0\xbc\xd0\xb8\xd1\x80'
    assert bslice[:-1] == b'\xd0\xbc\xd0\xb8\xd1\x80\xd1\x83 \xd0\xbc\xd0\xb8\xd1'
    assert bslice[3:12] == b'\xb8\xd1\x80\xd1\x83 \xd0\xbc\xd0'
    assert bslice[1:-1:2] == b'\xbc\xb8\x80\x83\xd0\xd0\xd1'
# verify string operations like `x + y` for all combinations of pairs from
# bytes, unicode, bstr, ustr and bytearray. Except if both x and y are std
# python types, e.g. (bytes, unicode), because those combinations are handled
......
......@@ -73,6 +73,8 @@ def test_golang_builtins():
assert u is golang.u
assert bstr is golang.bstr
assert ustr is golang.ustr
assert bbyte is golang.bbyte
assert uchr is golang.uchr
# indirectly verify golang.__all__
for k in golang.__all__:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment