Commit 390fd810 authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: bstr/ustr %-formatting

Teach bstr/ustr to do % formatting similarly to how unicode does, but
with treating bytes as UTF8-encoded strings - all in line with
general idea for bstr/ustr to treat bytes as strings.

The following approach is used to implement this:

1. both bstr and ustr format via bytes-based _bprintf.
2. we parse the format string and handle every formatting specifier separately:
3. for formats besides %s/%r we use bytes.__mod__ directly.

4. for %s we stringify corresponding argument specially with all, potentially
   internal, bytes instances treated as UTF8-encoded strings:

      '%s' % b'\xce\xb2'      ->  "β"
      '%s' % [b'\xce\xb2']    ->  "['β']"

5. for %r, similarly to %s, we prepare repr of corresponding argument
   specially with all, potentially internal, bytes instances also treated as
   UTF8-encoded strings:

      '%r' % b'\xce\xb2'      ->  "b'β'"
      '%r' % [b'\xce\xb2']    ->  "[b'β']"

For "2" we implement %-format parsing ourselves. test_strings_mod
has good coverage for this phase to make sure we get it right and behave
exactly the same way as standard Python does.

For "4" we monkey-patch bytes.__repr__ to repr bytes as strings when called
from under bstr.__mod__(). See _bstringify for details.

For "5", similarly to "4", we rely on adjustments to bytes.__repr__ .
See _bstringify_repr for details.

I initially tried to avoid parsing format specification myself and
wanted to reuse original bytes.__mod__ and just adjust its behaviour
a bit somehow. This did not work quite right, as the following comment
explains:

    # Rejected alternative: try to format; if we get "TypeError: %b requires a
    # bytes-like object ..." retry with that argument converted to bstr.
    #
    # Rejected because e.g. for  `'%(x)s %(x)r' % {'x': obj}`  we need to use
    # access number instead of key 'x' to determine which accesses to
    # bstringify. We could do that, but unfortunately on Python2 the access
    # number is not easily predictable because string could be upgraded to
    # unicode in the midst of being formatted and so some keys will be
    # accessed more than once.
    #
    # Another reason for rejection: b'%r' and u'%r' handle arguments
    # differently - on b %r is aliased to %a.

That's why full %-format parsing and handling is implemented in this
patch. Once again to make sure its behaviour is really the same compared
to Python's builtin %-formatting, we have good test coverage for both
%-format parsing itself, and for actual formatting of many various cases.

See test_strings_mod for details.
parent ddf6958b
...@@ -269,6 +269,11 @@ Usage example:: ...@@ -269,6 +269,11 @@ Usage example::
for c in s: # c will iterate through for c in s: # c will iterate through
... # [u(_) for _ in ('п','р','и','в','е','т',' ','м','и','р')] ... # [u(_) for _ in ('п','р','и','в','е','т',' ','м','и','р')]
# the following gives b('привет мир труд май')
b('привет %s %s %s') % (u'мир', # raw unicode
u'труд'.encode('utf-8'), # raw bytes
u('май')) # ustr
def f(s): def f(s):
s = u(s) # make sure s is ustr, decoding as UTF-8(*) if it was bstr, bytes, bytearray or buffer. s = u(s) # make sure s is ustr, decoding as UTF-8(*) if it was bstr, bytes, bytearray or buffer.
... # (*) the decoding never fails nor looses information. ... # (*) the decoding never fails nor looses information.
......
...@@ -58,6 +58,7 @@ from libc.stdint cimport uint8_t ...@@ -58,6 +58,7 @@ from libc.stdint cimport uint8_t
pystrconv = None # = golang.strconv imported at runtime (see __init__.py) pystrconv = None # = golang.strconv imported at runtime (see __init__.py)
import types as pytypes import types as pytypes
import functools as pyfunctools import functools as pyfunctools
import re as pyre
if PY_MAJOR_VERSION >= 3: if PY_MAJOR_VERSION >= 3:
import copyreg as pycopyreg import copyreg as pycopyreg
else: else:
...@@ -278,12 +279,12 @@ class pybstr(bytes): ...@@ -278,12 +279,12 @@ class pybstr(bytes):
def __repr__(self): def __repr__(self):
qself, nonascii_escape = _bpysmartquote_u3b2(self) qself, nonascii_escape = _bpysmartquote_u3b2(self)
bs = _inbstringify_get() bs = _inbstringify_get()
if bs.inbstringify == 0: if bs.inbstringify == 0 or bs.inrepr:
if nonascii_escape: # so that e.g. b(u'\x80') is represented as if nonascii_escape: # so that e.g. b(u'\x80') is represented as
qself = 'b' + qself # b(b'\xc2\x80'), not as b('\xc2\x80') qself = 'b' + qself # b(b'\xc2\x80'), not as b('\xc2\x80')
return "b(" + qself + ")" return "b(" + qself + ")"
else: else:
# [b('β')] goes as ['β'] when under _bstringify # [b('β')] goes as ['β'] when under _bstringify for %s
return qself return qself
...@@ -378,6 +379,19 @@ class pybstr(bytes): ...@@ -378,6 +379,19 @@ class pybstr(bytes):
return b.__mul__(a) return b.__mul__(a)
# %-formatting
def __mod__(a, b):
    # `a % b` with a being bstr: a is the format and b is the argument(s).
    # All the work is delegated to bytes-based _bprintf.
    formatted = _bprintf(a, b)
    return formatted
def __rmod__(b, a):
    # python invokes x.__rmod__() for ("..." % x) when x is a string subtype.
    # The output type is determined the same way as in __radd__ .
    if not isinstance(a, bytearray):
        # coerce the left operand to bstr/ustr and let its __mod__ do the formatting
        return _pybu_rcoerce(a).__mod__(b)

    # on py2 bytearray does not implement % .
    # no need to check for py3 - there our __rmod__ is not invoked
    return NotImplemented
# all other string methods # all other string methods
def capitalize(self): return pyb(pyu(self).capitalize()) def capitalize(self): return pyb(pyu(self).capitalize())
...@@ -530,12 +544,12 @@ class pyustr(unicode): ...@@ -530,12 +544,12 @@ class pyustr(unicode):
def __repr__(self): def __repr__(self):
qself, nonascii_escape = _upysmartquote_u3b2(self) qself, nonascii_escape = _upysmartquote_u3b2(self)
bs = _inbstringify_get() bs = _inbstringify_get()
if bs.inbstringify == 0: if bs.inbstringify == 0 or bs.inrepr:
if nonascii_escape: if nonascii_escape:
qself = 'b'+qself # see bstr.__repr__ qself = 'b'+qself # see bstr.__repr__
return "u(" + qself + ")" return "u(" + qself + ")"
else: else:
# [u('β')] goes as ['β'] when under _bstringify # [u('β')] goes as ['β'] when under _bstringify for %s
return qself return qself
...@@ -621,6 +635,18 @@ class pyustr(unicode): ...@@ -621,6 +635,18 @@ class pyustr(unicode):
return b.__mul__(a) return b.__mul__(a)
# %-formatting
def __mod__(a, b):
    # `a % b` with a being ustr: format via bstr and convert the result back to ustr.
    bfmt = pyb(a)
    bres = bfmt.__mod__(b)
    return pyu(bres)
def __rmod__(b, a):
    # python invokes x.__rmod__() for ("..." % x) when x is a string subtype.
    # The output type is determined the same way as in __radd__ .
    if not isinstance(a, bytearray):
        # coerce the left operand to bstr/ustr and let its __mod__ do the formatting
        return _pybu_rcoerce(a).__mod__(b)

    # bytearray on the left: see bstr.__rmod__
    return NotImplemented
# all other string methods # all other string methods
def capitalize(self): return pyu(unicode.capitalize(self)) def capitalize(self): return pyu(unicode.capitalize(self))
...@@ -916,6 +942,15 @@ cdef _bstringify(object obj): # -> unicode|bytes ...@@ -916,6 +942,15 @@ cdef _bstringify(object obj): # -> unicode|bytes
finally: finally:
_bstringify_leave() _bstringify_leave()
# _bstringify_repr returns repr of obj computed while the "in bstringify repr"
# state is active, so that patched __repr__ implementations present bytes as
# UTF-8 encoded strings. The state is always left again, even if repr raises.
cdef _bstringify_repr(object obj): # -> unicode|bytes
    _bstringify_enter_repr()
    try:
        r = repr(obj)
    finally:
        _bstringify_leave_repr()
    return r
# patch bytes.{__repr__,__str__} and (py2) unicode.{__repr__,__str__}, so that both # patch bytes.{__repr__,__str__} and (py2) unicode.{__repr__,__str__}, so that both
# bytes and unicode are treated as normal strings when under _bstringify. # bytes and unicode are treated as normal strings when under _bstringify.
# #
...@@ -927,7 +962,7 @@ cdef _bstringify(object obj): # -> unicode|bytes ...@@ -927,7 +962,7 @@ cdef _bstringify(object obj): # -> unicode|bytes
# py3: str(['β'.encode()]) -> [b'\\xce\\xb2'] (4) x # py3: str(['β'.encode()]) -> [b'\\xce\\xb2'] (4) x
# #
# for us 3 is ok, while 1,2 and 4 are not. For all 1,2,3,4 we want e.g. # for us 3 is ok, while 1,2 and 4 are not. For all 1,2,3,4 we want e.g.
# `bstr(·)` to give ['β']. This is fixed by patching __repr__. # `bstr(·)` or `b('%s') % ·` to give ['β']. This is fixed by patching __repr__.
# #
# regarding patching __str__ - 6 and 8 in the following examples illustrate the # regarding patching __str__ - 6 and 8 in the following examples illustrate the
# need to do it: # need to do it:
...@@ -936,6 +971,8 @@ cdef _bstringify(object obj): # -> unicode|bytes ...@@ -936,6 +971,8 @@ cdef _bstringify(object obj): # -> unicode|bytes
# py2: str(u'β') -> UnicodeEncodeError (6) x # py2: str(u'β') -> UnicodeEncodeError (6) x
# py3: str( 'β') -> 'β' (7) # py3: str( 'β') -> 'β' (7)
# py3: str('β'.encode()) -> b'\\xce\\xb2' (8) x # py3: str('β'.encode()) -> b'\\xce\\xb2' (8) x
#
# See also overview of %-formatting.
cdef reprfunc _bytes_tp_repr = Py_TYPE(b'').tp_repr cdef reprfunc _bytes_tp_repr = Py_TYPE(b'').tp_repr
cdef reprfunc _bytes_tp_str = Py_TYPE(b'').tp_str cdef reprfunc _bytes_tp_str = Py_TYPE(b'').tp_str
...@@ -947,6 +984,8 @@ cdef object _bytes_tp_xrepr(object s): ...@@ -947,6 +984,8 @@ cdef object _bytes_tp_xrepr(object s):
if bs.inbstringify == 0: if bs.inbstringify == 0:
return _bytes_tp_repr(s) return _bytes_tp_repr(s)
s, _ = _bpysmartquote_u3b2(s) s, _ = _bpysmartquote_u3b2(s)
if PY_MAJOR_VERSION >= 3 and bs.inrepr != 0:
s = 'b'+s
return s return s
cdef object _bytes_tp_xstr(object s): cdef object _bytes_tp_xstr(object s):
...@@ -964,6 +1003,8 @@ cdef object _unicode2_tp_xrepr(object s): ...@@ -964,6 +1003,8 @@ cdef object _unicode2_tp_xrepr(object s):
if bs.inbstringify == 0: if bs.inbstringify == 0:
return _unicode_tp_repr(s) return _unicode_tp_repr(s)
s, _ = _upysmartquote_u3b2(s) s, _ = _upysmartquote_u3b2(s)
if PY_MAJOR_VERSION < 3 and bs.inrepr != 0:
s = 'u'+s
return s return s
cdef object _unicode2_tp_xstr(object s): cdef object _unicode2_tp_xstr(object s):
...@@ -1047,8 +1088,8 @@ if PY_MAJOR_VERSION < 3: ...@@ -1047,8 +1088,8 @@ if PY_MAJOR_VERSION < 3:
# patch bytearray.{__repr__,__str__} similarly to bytes, so that e.g. # patch bytearray.{__repr__,__str__} similarly to bytes, so that e.g.
# bstr( bytearray('β') ) turns into β instead of bytearray(b'\xce\xb2'), and # '%s' % bytearray('β') turns into β instead of bytearray(b'\xce\xb2'), and
# bstr( [bytearray('β'] ) turns into ['β'] instead of [bytearray(b'\xce\xb2')]. # '%s' % [bytearray('β'] turns into ['β'] instead of [bytearray(b'\xce\xb2')].
# #
# also patch: # also patch:
# #
...@@ -1069,6 +1110,8 @@ cdef object _bytearray_tp_xrepr(object a): ...@@ -1069,6 +1110,8 @@ cdef object _bytearray_tp_xrepr(object a):
if bs.inbstringify == 0: if bs.inbstringify == 0:
return _bytearray_tp_repr(a) return _bytearray_tp_repr(a)
s, _ = _bpysmartquote_u3b2(a) s, _ = _bpysmartquote_u3b2(a)
if bs.inrepr != 0:
s = 'bytearray(b' + s + ')'
return s return s
cdef object _bytearray_tp_xstr(object a): cdef object _bytearray_tp_xstr(object a):
...@@ -1148,15 +1191,17 @@ cdef bytes _bytearray_data(object s): ...@@ -1148,15 +1191,17 @@ cdef bytes _bytearray_data(object s):
return _bytearray_tp_str(s) return _bytearray_tp_str(s)
# _bstringify_enter/_bstringify_leave/_inbstringify_get allow _bstringify to # _bstringify_enter*/_bstringify_leave*/_inbstringify_get allow _bstringify* to
# indicate to further invoked code whether it has been invoked from under # indicate to further invoked code whether it has been invoked from under
# _bstringify or not. # _bstringify* or not.
cdef object _inbstringify_key = "golang._inbstringify" cdef object _inbstringify_key = "golang._inbstringify"
@final @final
cdef class _InBStringify: cdef class _InBStringify:
cdef int inbstringify # >0 if we are running under _bstringify cdef int inbstringify # >0 if we are running under _bstringify/_bstringify_repr
cdef int inrepr # >0 if we are running under _bstringify_repr
def __cinit__(self): def __cinit__(self):
self.inbstringify = 0 self.inbstringify = 0
self.inrepr = 0
cdef void _bstringify_enter() except*: cdef void _bstringify_enter() except*:
bs = _inbstringify_get() bs = _inbstringify_get()
...@@ -1166,6 +1211,16 @@ cdef void _bstringify_leave() except*: ...@@ -1166,6 +1211,16 @@ cdef void _bstringify_leave() except*:
bs = _inbstringify_get() bs = _inbstringify_get()
bs.inbstringify -= 1 bs.inbstringify -= 1
cdef void _bstringify_enter_repr() except*:
    # entering _bstringify_repr: bump both counters of the state object
    # returned by _inbstringify_get.
    cdef _InBStringify state = _inbstringify_get()
    state.inrepr       += 1
    state.inbstringify += 1
cdef void _bstringify_leave_repr() except*:
    # leaving _bstringify_repr: undo what _bstringify_enter_repr did.
    cdef _InBStringify state = _inbstringify_get()
    state.inrepr       -= 1
    state.inbstringify -= 1
cdef _InBStringify _inbstringify_get(): cdef _InBStringify _inbstringify_get():
cdef PyObject* _ts_dict = PyThreadState_GetDict() # borrowed cdef PyObject* _ts_dict = PyThreadState_GetDict() # borrowed
if _ts_dict == NULL: if _ts_dict == NULL:
...@@ -1212,6 +1267,272 @@ cdef class _UnboundMethod(object): # they removed unbound methods on py3 ...@@ -1212,6 +1267,272 @@ cdef class _UnboundMethod(object): # they removed unbound methods on py3
return pyfunctools.partial(self.func, obj) return pyfunctools.partial(self.func, obj)
# ---- % formatting ----
# When formatting string is bstr/ustr we treat bytes in all arguments as
# UTF8-encoded bytestrings. The following approach is used to implement this:
#
# 1. both bstr and ustr format via bytes-based _bprintf.
# 2. we parse the format string and handle every formatting specifier separately:
# 3. for formats besides %s/%r we use bytes.__mod__ directly.
#
# 4. for %s we stringify corresponding argument specially with all, potentially
# internal, bytes instances treated as UTF8-encoded strings:
#
# '%s' % b'\xce\xb2' -> "β"
# '%s' % [b'\xce\xb2'] -> "['β']"
#
# 5. for %r, similarly to %s, we prepare repr of corresponding argument
# specially with all, potentially internal, bytes instances also treated as
# UTF8-encoded strings:
#
# '%r' % b'\xce\xb2' -> "b'β'"
# '%r' % [b'\xce\xb2'] -> "[b'β']"
#
#
# For "2" we implement %-format parsing ourselves. test_strings_mod
# has good coverage for this phase to make sure we get it right and behaving
# exactly the same way as standard Python does.
#
# For "4" we monkey-patch bytes.__repr__ to repr bytes as strings when called
# from under bstr.__mod__(). See _bstringify for details.
#
# For "5", similarly to "4", we rely on adjustments to bytes.__repr__ .
# See _bstringify_repr for details.
#
# See also overview of patching bytes.{__repr__,__str__} near _bstringify.
cdef object _missing = object()
cdef object _atidx_re = pyre.compile('.* at index ([0-9]+)$')
# _bprintf implements `fmt % xarg` for bstr/ustr.
#
# fmt is the format given as contiguous bytes; xarg is the right operand of %:
# a tuple of arguments, a mapping for %(name) specifiers, or a sole argument.
#
# The format is parsed specifier by specifier. For %s the argument is first
# passed through _bstringify, and for %r through _bstringify_repr; every
# specifier is then formatted individually via bytes.__mod__ and the results
# are concatenated. The result is returned as pybstr.
#
# Raises, mirroring builtin %-formatting:
#
#   ValueError  - incomplete format, incomplete format key, unsupported format character
#   TypeError   - format requires a mapping, not enough arguments,
#                 not all arguments converted
#   KeyError    - %(name) refers to a key missing in the mapping
cdef _bprintf(const uint8_t[::1] fmt, xarg): # -> pybstr
    cdef bytearray out = bytearray()

    cdef tuple  argv = None  # if xarg is tuple
    cdef object argm = None  # if xarg is mapping

    # https://github.com/python/cpython/blob/2.7-0-g8d21aa21f2c/Objects/stringobject.c#L4298-L4300
    # https://github.com/python/cpython/blob/v3.11.0b1-171-g70aa1b9b912/Objects/unicodeobject.c#L14319-L14320
    if _XPyMapping_Check(xarg)  and \
       (not isinstance(xarg, tuple))  and \
       (not isinstance(xarg, (bytes,unicode))):
        argm = xarg

    if isinstance(xarg, tuple):
        argv = xarg
        xarg = _missing

    #print()
    #print('argv:', argv)
    #print('argm:', argm)
    #print('xarg:', xarg)

    # nextarg returns next positional argument, or raises TypeError if there is none.
    cdef int argv_idx = 0
    def nextarg():
        nonlocal argv_idx, xarg
        # NOTE for `'%s %(x)s' % {'x':1}`  python gives  "{'x': 1} 1"
        # -> so we avoid argm check completely here
        #if argm is not None:
        if 0:
            raise ValueError('mixing dict/tuple')

        elif argv is not None:
            # tuple xarg
            if argv_idx < len(argv):
                arg = argv[argv_idx]
                argv_idx += 1
                return arg

        elif xarg is not _missing:
            # sole xarg
            arg = xarg
            xarg = _missing
            return arg

        raise TypeError('not enough arguments for format string')

    def badf():
        raise ValueError('incomplete format')

    # parse format string locating formatting specifiers
    # if we see %s/%r - use _bstringify
    # else use builtin %-formatting
    #
    #   %[(name)][flags][width|*][.[prec|*]][len](type)
    #
    # https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting
    # https://github.com/python/cpython/blob/2.7-0-g8d21aa21f2c/Objects/stringobject.c#L4266-L4765
    #
    # Rejected alternative: try to format; if we get "TypeError: %b requires a
    # bytes-like object ..." retry with that argument converted to bstr.
    #
    # Rejected because e.g. for  `'%(x)s %(x)r' % {'x': obj}`  we need to use
    # access number instead of key 'x' to determine which accesses to
    # bstringify. We could do that, but unfortunately on Python2 the access
    # number is not easily predictable because string could be upgraded to
    # unicode in the midst of being formatted and so some keys will be
    # accessed more than once.
    #
    # Another reason for rejection: b'%r' and u'%r' handle arguments
    # differently - on b %r is aliased to %a.
    cdef int i = 0
    cdef int l = len(fmt)
    cdef uint8_t c
    while i < l:
        c  = fmt[i]
        i += 1

        # plain character - copy as is
        if c != ord('%'):
            out.append(c)
            continue

        fmt_istart = i-1    # position of '%' that starts current specifier
        nameb = _missing
        width = _missing
        prec  = _missing
        value = _missing

        # `c = fmt_nextchar()`  avoiding https://github.com/cython/cython/issues/4798
        if i >= l: badf()
        c = fmt[i]; i += 1

        # (name)
        if c == ord('('):
            #print('(name)')
            if argm is None:
                raise TypeError('format requires a mapping')
            # the key extends up to the matching ')'; nested balanced
            # parentheses are part of the key, the same way as in builtin
            # %-formatting: `'%(a(b))s' % {'a(b)': 1}`  gives '1'.
            nparen = 1
            nameb = b''
            while 1:
                if i >= l:
                    raise ValueError('incomplete format key')
                c = fmt[i]; i += 1
                if c == ord('('):
                    nparen += 1
                elif c == ord(')'):
                    nparen -= 1
                    if nparen == 0:
                        break
                nameb += bchr(c)
            if i >= l: badf()
            c = fmt[i]; i += 1

        # flags
        while chr(c) in '#0- +':
            #print('flags')
            if i >= l: badf()
            c = fmt[i]; i += 1

        # [width|*]
        if c == ord('*'):
            #print('*width')
            width = nextarg()
            if i >= l: badf()
            c = fmt[i]; i += 1
        else:
            while chr(c).isdigit():
                #print('width')
                if i >= l: badf()
                c = fmt[i]; i += 1

        # [.prec|*]
        if c == ord('.'):
            #print('dot')
            if i >= l: badf()
            c = fmt[i]; i += 1
            if c == ord('*'):
                #print('.*')
                prec = nextarg()
                if i >= l: badf()
                c = fmt[i]; i += 1
            else:
                while chr(c).isdigit():
                    #print('.prec')
                    if i >= l: badf()
                    c = fmt[i]; i += 1

        # [len]
        while chr(c) in 'hlL':
            #print('len')
            if i >= l: badf()
            c = fmt[i]; i += 1

        fmt_type = c
        #print('fmt_type:', repr(chr(fmt_type)))

        if fmt_type == ord('%'):
            if i-2 == fmt_istart: # %%
                out.append(b'%')
                continue

        if nameb is not _missing:
            xarg = _missing # `'%(x)s %s' % {'x':1}`  raises "not enough arguments"
            nameu = _utf8_decode_surrogateescape(nameb)
            try:
                value = argm[nameb]
            except KeyError:
                # retry with changing key via bytes <-> unicode
                # e.g. for `b('%(x)s') % {'x': ...}` builtin bytes.__mod__ will
                # extract b'x' as key and raise KeyError: b'x'. We avoid that via
                # retrying with second string type for key.
                value = argm[nameu]
        else:
            # NOTE for `'%4%' % ()` python raises "not enough arguments ..."
            #if fmt_type != ord('%'):
            if 1:
                value = nextarg()

        if fmt_type == ord('%'):
            raise ValueError("unsupported format character '%s' (0x%x) at index %i" % (chr(c), c, i-1))

        # fmt1 is the text of current specifier alone
        fmt1 = memoryview(fmt[fmt_istart:i]).tobytes()
        #print('fmt_istart:', fmt_istart)
        #print('i:         ', i)
        #print(' ~> __mod__ ', repr(fmt1))

        # bytes %r is an alias of %a (ASCII), but we want unicode-like %r
        # -> handle it ourselves
        if fmt_type == ord('r'):
            value = pyb(_bstringify_repr(value))
            fmt_type = ord('s')
            fmt1 = fmt1[:-1] + b's'

        elif fmt_type == ord('s'):
            # %s -> feed value through _bstringify
            # this also converts e.g. int to bstr, else e.g. on `b'%s' % 123` python
            # complains '%b requires a bytes-like object ...'
            value = pyb(_bstringify(value))

        if nameb is not _missing:
            # provide the value under both bytes and unicode form of the key
            arg = {nameb: value, nameu: value}
        else:
            t = []
            if width is not _missing:   t.append(width)
            if prec  is not _missing:   t.append(prec)
            if value is not _missing:   t.append(value)
            t = tuple(t)
            arg = t

        #print('--> __mod__ ', repr(fmt1), ' % ', repr(arg))
        try:
            s = bytes.__mod__(fmt1, arg)
        except ValueError as e:
            # adjust position in '... at index <idx>' from fmt1 to fmt
            if len(e.args) == 1:
                a = e.args[0]
                m = _atidx_re.match(a)
                if m is not None:
                    a = a[:m.start(1)] + str(i-1)
                    e.args = (a,)
            raise
        out.extend(s)

    if argm is None:
        #print('END')
        #print('argv:', argv, 'argv_idx:', argv_idx, 'xarg:', xarg)
        if (argv is not None  and  argv_idx != len(argv))  or  (xarg is not _missing):
            raise TypeError("not all arguments converted during string formatting")

    return pybstr(out)
# ---- misc ---- # ---- misc ----
# _strhas returns whether unicode string type has specified method. # _strhas returns whether unicode string type has specified method.
...@@ -1254,6 +1575,21 @@ cdef extern from "Python.h": ...@@ -1254,6 +1575,21 @@ cdef extern from "Python.h":
""" """
bint _XPyObject_CheckOldBuffer(object o) bint _XPyObject_CheckOldBuffer(object o)
# _XPyMapping_Check reports whether o should be treated as a mapping.
#
# On py3 it is plain PyMapping_Check. On py2 it checks only that
# tp_as_mapping->mp_subscript is present and does not additionally require
# absence of tp_as_sequence->sq_slice (see the comment in the C code below).
cdef extern from "Python.h":
"""
static int _XPyMapping_Check(PyObject *o) {
#if PY_MAJOR_VERSION >= 3
return PyMapping_Check(o);
#else
// on py2 PyMapping_Check besides checking tp_as_mapping->mp_subscript
// also verifies !tp_as_sequence->sq_slice. We want to avoid that
// because PyString_Format checks only tp_as_mapping->mp_subscript.
return Py_TYPE(o)->tp_as_mapping && Py_TYPE(o)->tp_as_mapping->mp_subscript;
#endif
}
"""
bint _XPyMapping_Check(object o)
# ---- UTF-8 encode/decode ---- # ---- UTF-8 encode/decode ----
......
...@@ -32,7 +32,7 @@ import six ...@@ -32,7 +32,7 @@ import six
from six import text_type as unicode, unichr from six import text_type as unicode, unichr
from six.moves import range as xrange from six.moves import range as xrange
import pickle, copy, types import pickle, copy, types
import array import array, collections
# buftypes lists types with buffer interface that we will test against. # buftypes lists types with buffer interface that we will test against.
...@@ -718,6 +718,83 @@ def test_strings_ops2(tx, ty): ...@@ -718,6 +718,83 @@ def test_strings_ops2(tx, ty):
_ is not x _ is not x
# x % y (not tuple at right)
# ideally same typing rules as for +, but for `x=u'' y=b()` and `x=b'' y=u()`
# we can't make python call y.__rmod__ .
# see https://bugs.python.org/issue28598 for references where python implements this.
#
# NOTE python 3.11 reworked % handling to be generic - there we could
# probably make y.__rmod__ to be called via tweaking __subclasscheck__
# https://github.com/python/cpython/commit/ec382fac0db6
if tx in (bstr, ustr):
tmod = tx
elif tx in (unicode, bytes):
if ty in (unicode, bytes, bytearray):
tmod = tx
else:
assert ty in (bstr, ustr)
# on py2 str % (unicode|ustr) gives unicode
if six.PY2 and ty is ustr:
if tx is bytes:
tmod = unicode
else:
assert tx is unicode
tmod = ustr # ustr is subclass of unicode -> __rmod__ is called
else:
tmod = tx if tbu(tx) is not ty else \
tbu(tx)
else:
assert tx is bytearray
tmod = tx
x = xstr(u'hello %s', tx)
if six.PY2 and tx is bytearray: # bytearray/py2 does not support %
_ = xbytearray(bytes(x) % y)
else:
_ = x % y
assert type(_) is tmod
assert _ == xstr(u'hello мир', tmod)
assert _ is not x
# x %= y (not tuple at right; same as in corresponding %)
_ = x
if six.PY2 and tx is bytearray: # bytearray/py2 does not support %=
_ = xbytearray(bytes(x) % y)
else:
_ %= y
assert type(_) is tmod
assert _ == xstr(u'hello мир', tmod)
assert _ is not x # even bytearray('%s') %= y creates new object
# x % (y,)
# py3: result type is type(x) because y.__rmod__ is never called
# py2: similar, but b'' % u'' gives u
if six.PY2 and tx is bytearray: # bytearray/py2 does not support %
_ = xbytearray(bytes(x) % (y,))
else:
_ = x % (y,)
ttmod = tx
if six.PY2:
if tx in (bytes, unicode):
if tx is unicode or ty in (unicode, ustr):
ttmod = unicode
else:
ttmod = bytes
assert type(_) is ttmod
assert _ == xstr(u'hello мир', ttmod)
assert _ is not x
# x %= (y,)
_ = x
if six.PY2 and tx is bytearray: # bytearray/py2 does not support %=
_ = xbytearray(bytes(x) % (y,))
else:
_ %= (y,)
assert type(_) is ttmod
assert _ == xstr(u'hello мир', ttmod)
assert _ is not x # even bytearray('%s') %= y creates new object
# verify string operations like `x + y` for x being bstr/ustr and y being a # verify string operations like `x + y` for x being bstr/ustr and y being a
# type unsupported for coercion. # type unsupported for coercion.
# #
...@@ -802,6 +879,433 @@ def test_strings_ops2_eq_any(tx): ...@@ -802,6 +879,433 @@ def test_strings_ops2_eq_any(tx):
with raises(TypeError): hash(l) with raises(TypeError): hash(l)
_(l) _(l)
# verify logic in `bstr % ...` .
def test_strings_mod():
# verify_fmt_all_types verifies f(fmt, args) for all combinations of
#
# · fmt being unicode, bstr, ustr
# · args being/containing unicode, bytes, bytearray, bstr, ustr
#
# it checks that all results are the same for the case when both fmt and
# args contain only standard unicode.
def verify_fmt_all_types(f, fmt, args, *okv, **kw):
excok = kw.pop('excok', False)
assert not kw
rok = None
#print()
def xfmt(fmt, args):
exc = False
try:
r = f(fmt, args) # e.g. fmt % args
except Exception as e:
if not excok:
raise
exc = True
r = repr(e) # because e.g. ValueError('x') == ValueError('x') is false
#print(repr(fmt), "%", repr(args), "->", repr(r))
if not exc:
assert type(r) is type(fmt)
if len(okv) != 0:
for ok in okv:
if isinstance(ok, Exception):
ok = repr(ok)
else:
ok = xunicode(ok)
if r == ok:
break
else:
raise AssertionError("result (%r) not in any of %r" % (r, okv))
elif rok is not None:
assert r == rok
return r
fmt_ustd = deepReplaceStr(fmt, xunicode)
fmt_u = deepReplaceStr(fmt, u)
fmt_b = deepReplaceStr(fmt, b)
args_ustd = deepReplaceStr(args, xunicode)
args_bstd = deepReplaceStr(args, xbytes)
args_barr = deepReplaceStr2Bytearray(args)
args_u = deepReplaceStr(args, u)
args_b = deepReplaceStr(args, b)
# see if args_ustd could be used for stringification.
# e.g. on py2 both str() and unicode() on UserString(u'β') raises
# "UnicodeEncodeError: 'ascii' codec can't encode characters ..."
args_ustd_ok = True
if six.PY2:
try:
unicode(args_ustd) # e.g. UserString
try:
it = iter(args_ustd) # e.g. (UserString,)
# on py2 UserDict is not really iterable - iter succeeds but
# going through it raises KeyError because of
# https://github.com/python/cpython/blob/2.7-0-g8d21aa21f2c/Lib/UserDict.py#L112-L114
# -> work it around
if six.PY2 and not hasattr(args_ustd, '__iter__'):
raise TypeError
except TypeError:
pass
else:
for _ in it:
unicode(_)
except UnicodeEncodeError:
args_ustd_ok = False
# initialize rok from u'' % u''.
# Skip errors on py2 because e.g. `u'α %s' % [u'β']` gives u"α [u'\\u03b2']",
# not u"α ['β']". This way we cannot use u'' % u'' as a reference.
# We cannot use b'' % b'' as a reference neither because e.g.
# `'α %s' % ['β']` gives "α ['\\xce\\xb2']", not "α ['β']"
if args_ustd_ok:
good4rok = True
try:
_ = xfmt(fmt_ustd, args_ustd) # u'' % (u'', ...)
except AssertionError as e:
if six.PY2 and len(e.args) == 1 and "not in any of" in e.args[0]:
good4rok = False
else:
raise
if good4rok:
rok = _
# if rok computation was skipped we insist on being explicitly called with ok=...
assert (rok is not None) or (len(okv) != 0)
if args_ustd_ok:
xfmt(fmt_b, args_ustd) # b() % (u'', ...)
xfmt(fmt_u, args_ustd) # u() % (u'', ...)
xfmt(fmt_b, args_bstd) # b() % (b'', ...)
xfmt(fmt_u, args_bstd) # u() % (b'', ...)
xfmt(fmt_b, args_barr) # b() % (bytearray, ...)
xfmt(fmt_u, args_barr) # u() % (bytearray, ...)
xfmt(fmt_b, args_b) # b() % (b(), ...)
xfmt(fmt_u, args_b) # b() % (b(), ...)
xfmt(fmt_b, args_u) # b() % (u(), ...)
xfmt(fmt_u, args_u) # b() % (u(), ...)
# NOTE we don't check e.g. `u'' % u()` and `u'' % b()` because for e.g.
# `u'α %s' % [u('β')]` the output is u"α [u("β")]" - not u"α ['β']".
# _bprintf parses %-format ourselves. Verify that parsing first
# NOTE here all strings are plain ASCII.
def _(fmt, args):
fmt = '*str '+fmt
for l in range(len(fmt), -1, -1):
# [:len(fmt)] verifies original case
# [:l<len] should verify "incomplete format" parsing
verify_fmt_all_types(lambda fmt, args: fmt % args,
fmt[:l], args, excok=True)
_('%(name)s', {'name': 123})
_('%x', 123) # flags
_('%#x', 123)
_('%05d', 123)
_('%-5d', 123)
_('% d', 123)
_('% d', -123)
_('%+d', -123)
_('%5d', 123) # width
_('%*d', (5,123))
_('%f', 1.234) # .prec
_('%.f', 1.234)
_('%.1f', 1.234)
_('%.2f', 1.234)
_('%*f', (2,1.234))
_('%hi', 123) # len
_('%li', 123)
_('%Li', 123)
_('%%', ()) # %%
_('%10.4f', 1.234) # multiple features
_('%(x)10.4f', {'y':0, 'x':1.234})
_('%*.*f', (10,4,1.234))
_('', {}) # not all arguments converted
_('', [])
_('', 123)
_('', '123')
_('%s', ()) # not enough arguments to format
_('%s %s', 123)
_('%s %s', (123,))
_('%(x)s', 123) # format requires a mapping
_('%(x)s', (123,))
_('%s %(x)s', (123,4))
_('%(x)s %s', (123,4))
_('%(x)s %s', {'x':1}) # mixing tuple/dict
_('%s %(x)s', {'x':1})
_('abc %z', 1) # unsupported format character
_('abc %44z', 1)
# for `'%4%' % ()` py2 gives ' %', but we stick to more reasonable py3 semantic
def _(fmt, args, ok):
return verify_fmt_all_types(lambda fmt, args: fmt % args,
fmt, args, ok, excok=True)
_('*str %4%', (), TypeError("not enough arguments for format string"))
_('*str %4%', 1, ValueError("unsupported format character '%' (0x25) at index 7"))
_('*str %4%', (1,), ValueError("unsupported format character '%' (0x25) at index 7"))
_('*str %(x)%', {'x':1}, ValueError("unsupported format character '%' (0x25) at index 9"))
# parse checking complete. now verify actual %-formatting
# _ verifies `fmt % args`
# if fmt has no '%' only .format(args) is verified.
def _(fmt, args, *okv):
verify_fmt_all_types(lambda fmt, args: fmt % args,
fmt, args, *okv)
_("*str a %s z", 123) # NOTE *str to force str -> bstr/ustr even for ASCII string
_("*str a %s z", '*str \'"\x7f')
_("*str a %s z", 'β')
_("*str a %s z", ('β',))
_("*str a %s z", ['β'] , "*str a ['β'] z")
_("a %s π", 123)
_("a %s π", '*str \'"\x7f')
_("a %s π", 'β')
_("a %s π", ('β',))
_("a %s π", ['β'] , "a ['β'] π")
_("α %s z", 123)
_("α %s z", '*str \'"\x7f')
_("α %s z", 'β')
_("α %s z", ('β',))
_("α %s z", ['β'] , "α ['β'] z")
_("α %s π", 123)
_("α %s π", '*str \'"\x7f')
_("α %s π", 'β')
_("α %s π", ('β',))
_("α %s π", ('β',))
_("α %s %s π", ('β', 'γ'))
_("α %s %s %s π", ('β', 'γ', 'δ'))
_("α %s %s %s %s %s %s %s π", (1, 'β', 2, 'γ', 3, 'δ', 4))
_("α %s π", [])
_("α %s π", ([],))
_("α %s π", ((),))
_("α %s π", set())
_("α %s π", (set(),))
_("α %s π", frozenset())
_("α %s π", (frozenset(),))
_("α %s π", ({},))
_("α %s π", ['β'] , "α ['β'] π")
_("α %s π", (['β'],) , "α ['β'] π")
_("α %s π", (('β',),) , "α ('β',) π")
_("α %s π", {'β'} , x32("α {'β'} π", "α set(['β']) π"))
_("α %s π", ({'β'},) , x32("α {'β'} π", "α set(['β']) π"))
_("α %s π", frozenset({'β'}) , x32("α frozenset({'β'}) π", "α frozenset(['β']) π"))
_("α %s π", (frozenset({'β'}),) , x32("α frozenset({'β'}) π", "α frozenset(['β']) π"))
_("α %s π", ({'β':'γ'},) , "α {'β': 'γ'} π")
_("α %s %s π", ([1, 'β', 2], 345) , "α [1, 'β', 2] 345 π")
_("α %s %s π", ((1, 'β', 2), 345) , "α (1, 'β', 2) 345 π")
# NOTE set/frozenset/dict: print order is "random"
_("α %s %s π", ({1, 'β'}, 345) , *x32(("α {1, 'β'} 345 π", "α {'β', 1} 345 π"),
("α set([1, 'β']) 345 π", "α set(['β', 1]) 345 π")))
_("α %s %s π", (frozenset({1, 'β'}), 345) , *x32(("α frozenset({1, 'β'}) 345 π", "α frozenset({'β', 1}) 345 π"),
("α frozenset([1, 'β']) 345 π", "α frozenset(['β', 1]) 345 π"))),
_("α %s %s π", ({1:'мир', 'β':'труд'}, 345) , *x32(("α {1: 'мир', 'β': 'труд'} 345 π",), # py3: dict is insert-order
("α {1: 'мир', 'β': 'труд'} 345 π", "α {'β': 'труд', 1: 'мир'} 345 π")))
# recursive list
l = [1,]; l += [l, 'мир']
_('α %s π', (l,) , "α [1, [...], 'мир'] π")
# recursive tuple
t = (1, []); t[1].append((t, 'мир'))
_('α %s π', (t,) , "α (1, [((...), 'мир')]) π")
# recursive set
s = {1}; s.add(hlist([s]))
_('α %s π', (s,) , x32("α {[set(...)], 1} π", "α set([[set(...)], 1]) π"))
# recursive frozenset
l = hlist()
f = frozenset({1, l}); l.append(f)
_('α %s π', (f,))
# recursive dict (via value)
d = {1:'мир'}; d.update({2:d})
_('α %s π', (d,) , *x32(("α {1: 'мир', 2: {...}} π",),
("α {1: 'мир', 2: {...}} π", "α {2: {...}, 1: 'мир'} π")))
# recursive dict (via key)
l = hlist([1])
d = {l:'мир'}; l.append(d)
_('α %s π', (d,) , "α {[1, {...}]: 'мир'} π")
# old-style class with __str__
class Cold:
def __repr__(self): return "Cold()"
def __str__(self): return u"Класс (old)"
_('α %s π', Cold())
_('α %s π', (Cold(),))
# new-style class with __str__
class Cnew(object):
def __repr__(self): return "Cnew()"
def __str__(self): return u"Класс (new)"
_('α %s π', Cnew())
_('α %s π', (Cnew(),))
# custom classes inheriting from set/list/tuple/dict/frozenset
# Empty subclasses of the builtin containers: they add nothing of their own
# and only differ from the base types by class name.
class L(list):
    pass


class T(tuple):
    pass


class S(set):
    pass


class F(frozenset):
    pass


class D(dict):
    pass
_('α %s π', L(['β',3]) , "α ['β', 3] π")
_('α %s π', (L(['β',3]),) , "α ['β', 3] π")
_('α %s π', (T(['β',3]),) , "α ('β', 3) π")
# NOTE set/frozenset/dict: print order is "random"
_('α %s π', S(['β',3]) , *x32(("α S({'β', 3}) π", "α S({3, 'β'}) π"),
("α S(['β', 3]) π", "α S([3, 'β']) π")))
_('α %s π', (S(['β',3]),) , *x32(("α S({'β', 3}) π", "α S({3, 'β'}) π"),
("α S(['β', 3]) π", "α S([3, 'β']) π")))
_('α %s π', F(['β',3]) , *x32(("α F({'β', 3}) π", "α F({3, 'β'}) π"),
("α F(['β', 3]) π", "α F([3, 'β']) π")))
_('α %s π', (F(['β',3]),) , *x32(("α F({'β', 3}) π", "α F({3, 'β'}) π"),
("α F(['β', 3]) π", "α F([3, 'β']) π")))
_('α %s π', (D([('β','γ'), (3,4)]),)
, *x32(("α {'β': 'γ', 3: 4} π",),
("α {'β': 'γ', 3: 4} π", "α {3: 4, 'β': 'γ'} π")))
# well-known classes
# namedtuple
cc = collections; xcc = six.moves
Point = cc.namedtuple('Point', ['x', 'y'])
_('α %s π', (Point('β','γ'),) , "α Point(x='β', y='γ') π")
# deque
_('α %s π', cc.deque(['β','γ']) , "α deque(['β', 'γ']) π")
_('α %s π', (cc.deque(['β','γ']),) , "α deque(['β', 'γ']) π")
# Counter (inherits from dict)
_('α %s π', (cc.Counter({'β':1}),) , "α Counter({'β': 1}) π")
# OrderedDict
_('α %s π', (cc.OrderedDict([(1,'мир'), ('β','труд')]),)
, "α OrderedDict([(1, 'мир'), ('β', 'труд')]) π")
# defaultdict
_('α %s π', (cc.defaultdict(int, {'β':1}),)
, x32("α defaultdict(<class 'int'>, {'β': 1}) π",
"α defaultdict(<type 'int'>, {'β': 1}) π"))
# UserDict
_('α %s π', (xcc.UserDict({'β':1}),) , "α {'β': 1} π")
# UserList
_('α %s π', xcc.UserList(['β','γ']) , "α ['β', 'γ'] π")
_('α %s π', (xcc.UserList(['β','γ']),) , "α ['β', 'γ'] π")
# UserString
_('α %s π', xcc.UserString('βγ') , "α βγ π")
_('α %s π', (xcc.UserString('βγ'),) , "α βγ π")
# custom classes inheriting from bytes/unicode/bytearray
class B(bytes):
    # plain bytes subclass without any overrides
    pass


class BB(bytes):
    # bytes subclass whose str() and repr() return fixed texts that are
    # unrelated to the wrapped data

    def __str__(self):
        return "байты"

    def __repr__(self):
        return "BB(байты)"
class U(unicode):
    # plain unicode subclass without any overrides
    pass


class UU(unicode):
    # unicode subclass whose str() and repr() return fixed texts that are
    # unrelated to the wrapped data

    def __str__(self):
        return "юникод"

    # alias: __unicode__ yields exactly the same text as __str__
    __unicode__ = __str__

    def __repr__(self):
        return "UU(юникод)"
class A(bytearray):
    # plain bytearray subclass without any overrides
    pass


class AA(bytearray):
    # bytearray subclass whose str() and repr() return fixed texts that are
    # unrelated to the wrapped data

    def __str__(self):
        return "байтмассив"

    def __repr__(self):
        return "AA(байтмассив)"
def M(fmt, args, ok):
    # Check %-formatting of `fmt` with `args` for both the b() and the u()
    # flavour of the format string.
    #
    # Only `b() % args` and `u() % args` are verified, since for e.g.
    # `u'' % b''` the result is different.
    #
    # fmt  - format string; converted via b() and via u() before formatting
    # args - right-hand operand of %
    # ok   - value both results must compare equal to
    bfmt, ufmt = b(fmt), u(fmt)

    # Both results are computed before anything is asserted.
    # (for debugging: print repr of the format, args and result here)
    br = bfmt % args
    ur = ufmt % args

    # The result must be exactly bstr / ustr - checked by identity of the
    # type on purpose, not via isinstance.
    for res, typ in ((br, bstr), (ur, ustr)):
        assert type(res) is typ

    for res in (br, ur):
        assert res == ok
M("α %s π", U ( u'май') , "α май π")
M("α %s π", (U ( u'май'),) , "α май π")
M("α %s π", [U ( u'май')] , "α ['май'] π")
M("α %s π", UU( u'май2') , "α юникод π") # not май2
M("α %s π", (UU( u'май2'),) , "α юникод π") # not май2
M("α %s π", [UU( u'май2')] , "α [UU(юникод)] π") # not [май2]
M("α %s π", B (xbytes('мир')) , "α мир π")
M("α %s π", (B (xbytes('мир')),) , "α мир π")
M("α %s π", [B (xbytes('мир'))] , "α ['мир'] π")
M("α %s π", BB(xbytes('мир2')) , "α байты π") # not мир2
# vvv does not work on py3 as b'' % b'' does not consult __str__ nor __bytes__ of the argument
# even though it is not 100% we are ok here, because customizing bytes or unicode is very exotic
if six.PY2:
M("α %s π", (BB(xbytes('мир2')),) , "α байты π") # not мир2
M("α %s π", [BB(xbytes('мир2'))] , "α [BB(байты)] π") # not [мир2]
M("α %s π", A (xbytes('труд')) , "α труд π")
M("α %s π", (A (xbytes('труд')),) , "α труд π")
M("α %s π", [A (xbytes('труд'))] , "α ['труд'] π")
M("α %s π", AA(xbytes('труд2')) , "α байтмассив π") # not труд2
M("α %s π", (AA(xbytes('труд2')),) , "α байтмассив π") # not труд2
M("α %s π", [AA(xbytes('труд2'))] , "α [AA(байтмассив)] π") # not [труд2]
# dict at right
# less tests because stringification of arguments is already thoroughly
# verified with "tuple at right" tests above.
_("*str a %(x)s z", {'x': 123})
_("*str a %(x)s z", {'x': '*str \'"\x7f'})
_("*str a %(x)s z", {'x': 'β'})
_("*str a %(x)s z", {'x': ['β']} , "*str a ['β'] z")
_("*str a %(x)s %(y)s z", {'x':'β', 'y':'γ'})
_("*str a %(x)s %(y)s %(z)s z", {'x':'β', 'y':'γ', 'z':'δ'})
_("a %(x)s π", {'x': 123})
_("a %(x)s π", {'x': '*str \'"\x7f'})
_("a %(x)s π", {'x': 'β'})
_("a %(x)s π", {'x': ['β']} , "a ['β'] π")
_("a %(x)s %(y)s π", {'x': 'β', 'y':'γ'})
_("a %(x)s %(y)s %(z)s π", {'x': 'β', 'y':'γ', 'z':'δ'})
_("α %(x)s z", {'x': 123})
_("α %(x)s z", {'x': '*str \'"\x7f'})
_("α %(x)s z", {'x': 'β'})
_("α %(x)s z", {'x': ['β']} , "α ['β'] z")
_("α %(x)s %(y)s z", {'x': 'β', 'y':'γ'})
_("α %(x)s %(y)s %(z)s z", {'x': 'β', 'y':'γ', 'z':'δ'})
_("α %(x)s π", {'x': 123})
_("α %(x)s π", {'x': '*str \'"\x7f'})
_("α %(x)s π", {'x': 'β'})
_("α %(x)s π", {'x': ['β']} , "α ['β'] π")
_("α %(x)s %(y)s π", {'x':'β', 'y':'γ'})
_("α %(x)s %(y)s %(z)s π", {'x':'β', 'y':'γ', 'z':'δ'})
_("*str a %(x)s z", xcc.UserDict({'x': 'β'}))
_("α %(x)s π", xcc.UserDict({'x': 'β'}))
# %r (and !r)
M("α %r", u'z' , x32("α 'z'", "α u'z'"))
M("α %r", u'β' , x32("α 'β'", "α u'β'"))
M("α %r", b'z' , x32("α b'z'", "α 'z'"))
M("α %r", xbytes('β') , x32("α b'β'", "α 'β'"))
M("α %r", xbytearray('β') , "α bytearray(b'β')")
M("α %r", b('β') , "α b('β')")
M("α %r", u('β') , "α u('β')")
M("α %r", [u'z'] , x32("α ['z']", "α [u'z']"))
M("α %r", [u'β'] , x32("α ['β']", "α [u'β']"))
M("α %r", [b'z'] , x32("α [b'z']", "α ['z']"))
M("α %r", [xbytes('β')] , x32("α [b'β']", "α ['β']"))
M("α %r", [xbytearray('β')] , "α [bytearray(b'β')]")
M("α %r", [b('β')] , "α [b('β')]")
M("α %r", [u('β')] , "α [u('β')]")
# verify print for bstr/ustr.
def test_strings_print(): def test_strings_print():
outok = readfile(dir_testprog + "/golang_test_str.txt") outok = readfile(dir_testprog + "/golang_test_str.txt")
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment