Commit c9648c44 authored by Kirill Smelkov's avatar Kirill Smelkov

X on bstr/ustr ; Almost ready

parent 7b72d418
......@@ -10,7 +10,7 @@ Package `golang` provides Go-like features for Python:
- `func` allows to define methods separate from class.
- `defer` allows to schedule a cleanup from the main control flow.
- `error` and package `errors` provide error chaining.
- `b` and `u` provide way to make sure an object is either bytes or unicode.
- `b`, `u` and `bstr`/`ustr` provide a uniform UTF8-based approach to strings.
- `gimport` allows to import python modules by full path in a Go workspace.
Package `golang.pyx` provides__ similar features for Cython/nogil.
......@@ -229,19 +229,60 @@ __ https://www.python.org/dev/peps/pep-3134/
Strings
-------
`b` and `u` provide way to make sure an object is either bytes or unicode.
`b(obj)` converts str/unicode/bytes obj to UTF-8 encoded bytestring, while
`u(obj)` converts str/unicode/bytes obj to unicode string. For example::
Pygolang, similarly to Go, provides a uniform UTF8-based approach to strings, with
the aim of making byte- and unicode-strings easy to work with and transparently
interoperable:
b("привет мир") # -> gives bytes corresponding to UTF-8 encoding of "привет мир".
- `bstr` is byte-string: it is based on `bytes` and can automatically convert to/from `unicode` [*]_.
- `ustr` is unicode-string: it is based on `unicode` and can automatically convert to/from `bytes`.
def f(s):
s = u(s) # make sure s is unicode, decoding as UTF-8(*) if it was bytes.
... # (*) but see below about lack of decode errors.
The conversion, in both the encoding and decoding directions, never fails and never loses
information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
even if the bytes data is not valid UTF-8.
Semantically `bstr` is an array of bytes, while `ustr` is an array of
unicode characters. Accessing their elements by `[index]` yields a byte and a
unicode character correspondingly [*]_. Iterating over them, however, yields unicode
characters for both `bstr` and `ustr`. In practice `bstr` is enough 99% of the
time, and `ustr` only needs to be used for random access to string characters.
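For illustration, the difference can be sketched as follows, using the text 'мир'
as input (outputs shown informally)::

    s = b('мир')          # 3 unicode characters, 6 UTF-8 bytes
    len(s), len(u(s))     # 6, 3  - bstr counts bytes, ustr counts characters
    ord(s[0])             # 0xd0  - indexing bstr yields a byte
    list(s)               # iterating yields unicode characters 'м', 'и', 'р'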
See `Strings, bytes, runes and characters in Go`__ for an overview of this approach.
__ https://blog.golang.org/strings
Operations between `bstr` and `ustr`/`unicode` / `bytes`/`bytearray` coerce to `bstr`, while
operations between `ustr` and `bstr`/`bytes`/`bytearray` / `unicode` coerce
to `ustr`. When the coercion happens, `bytes` and `bytearray`, similarly to
`bstr`, are also treated as UTF8-encoded strings.
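For example, assuming the data is UTF-8 as above, the coercion behaves
approximately as follows (outputs shown informally)::

    b('мир') + u'!'             # -> b('мир!')   coerces to bstr
    b('мир') + b'!'             # -> b('мир!')
    u('мир') + b'!'             # -> u('мир!')   coerces to ustr
    u('мир') + bytearray(b'!')  # -> u('мир!')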
`bstr` and `ustr` are meant to be drop-in replacements for the standard
`str`/`unicode` classes. They support all methods of `str`/`unicode` and, in
particular, their constructors accept arbitrary objects and either convert or stringify them. For
cases when no stringification is desired, and one only wants to convert
`bstr`/`ustr` / `unicode`/`bytes`/`bytearray`, or an object with a `buffer`
interface [*]_, to a Pygolang string, `b` and `u` provide a way to make sure an
object is either `bstr` or `ustr` correspondingly.
The conversion, in both the encoding and decoding directions, never fails and never loses
information: `b(u(·))` and `u(b(·))` are always identity for bytes and unicode
correspondingly, even if the bytes input is not valid UTF-8.
Usage example::
s = b('привет') # s is bstr corresponding to UTF-8 encoding of 'привет'.
s += ' мир' # s is b('привет мир')
for c in s: # c will iterate through
... # [u(_) for _ in ('п','р','и','в','е','т',' ','м','и','р')]
# the following gives b('привет мир труд май')
b('привет %s %s %s') % (u'мир', # raw unicode
u'труд'.encode('utf-8'), # raw bytes
u('май')) # ustr
def f(s):
s = u(s) # make sure s is ustr, decoding as UTF-8(*) if it was bstr, bytes, bytearray or buffer.
... # (*) the decoding never fails nor loses information.
.. [*] `unicode` on Python2, `str` on Python3.
.. [*] | The ordinal of such a byte or unicode character can be obtained via the regular `ord`.
| For completeness `bbyte` and `uchr` are also provided for constructing 1-byte `bstr` and 1-character `ustr` from an ordinal (see the short example below).
.. [*] | Data in a buffer, similarly to `bytes` and `bytearray`, is treated as a UTF8-encoded string.
| Notice that only explicit conversion through `b`/`u` and `bstr`/`ustr` accepts objects with a buffer interface. Automatic coercion does not.
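For illustration, a minimal sketch of the constructors and buffer handling
described in the notes above (outputs shown informally)::

    bbyte(0xd0)                   # 1-byte bstr with ordinal 0xd0
    uchr(0x43c)                   # 1-character ustr u('м')
    b(memoryview(b'\xd0\xbc'))    # explicit conversion of a buffer -> b('м')
    b('м') + memoryview(b'!')     # TypeError - buffers are not coerced automatically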
Import
......
......@@ -24,7 +24,7 @@
- `func` allows to define methods separate from class.
- `defer` allows to schedule a cleanup from the main control flow.
- `error` and package `errors` provide error chaining.
- `b` and `u` provide way to make sure an object is either bytes or unicode.
- `b`, `u` and `bstr`/`ustr` provide a uniform UTF8-based approach to strings.
- `gimport` allows to import python modules by full path in a Go workspace.
See README for thorough overview.
......@@ -36,7 +36,7 @@ from __future__ import print_function, absolute_import
__version__ = "0.1"
__all__ = ['go', 'chan', 'select', 'default', 'nilchan', 'defer', 'panic',
'recover', 'func', 'error', 'b', 'u', 'gimport']
'recover', 'func', 'error', 'b', 'u', 'bstr', 'ustr', 'bbyte', 'uchr', 'gimport']
from golang._gopath import gimport # make gimport available from golang
import inspect, sys
......@@ -316,4 +316,16 @@ from ._golang import \
pypanic as panic, \
pyerror as error, \
pyb as b, \
pyu as u
pybstr as bstr, \
pybbyte as bbyte, \
pyu as u, \
pyustr as ustr, \
pyuchr as uchr
# import golang.strconv into _golang from here to work around the cyclic golang ↔ strconv dependency
def _():
from . import _golang
from . import strconv
_golang.pystrconv = strconv
_()
del _
......@@ -2,8 +2,9 @@
# cython: language_level=2
# cython: binding=False
# cython: c_string_type=str, c_string_encoding=utf8
# cython: auto_pickle=False
# distutils: language = c++
# distutils: depends = libgolang.h os/signal.h
# distutils: depends = libgolang.h os/signal.h _golang_str.pyx
#
# Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
......@@ -808,151 +809,7 @@ cdef DType parse_dtype(dtype) except <DType>-1:
# ---- strings ----
from golang import strconv as pystrconv
def pyb(s): # -> bytes
"""b converts str/unicode/bytes s to UTF-8 encoded bytestring.
Bytes input is preserved as-is:
b(bytes_input) == bytes_input
Unicode input is UTF-8 encoded. The encoding always succeeds.
b is reverse operation to u - the following invariant is always true:
b(u(bytes_input)) == bytes_input
TypeError is raised if type(s) is not one of the above.
See also: u.
"""
bs, _ = pystrconv._bstr(s)
return bs
def pyu(s): # -> unicode
"""u converts str/unicode/bytes s to unicode string.
Unicode input is preserved as-is:
u(unicode_input) == unicode_input
Bytes input is UTF-8 decoded. The decoding always succeeds and input
information is not lost: non-valid UTF-8 bytes are decoded into
surrogate codes ranging from U+DC80 to U+DCFF.
u is reverse operation to b - the following invariant is always true:
u(b(unicode_input)) == unicode_input
TypeError is raised if type(s) is not one of the above.
See also: b.
"""
us, _ = pystrconv._ustr(s)
return us
# qq is substitute for %q, which is missing in python.
#
# (python's automatic escape uses smartquotes quoting with either ' or ").
#
# like %s, %q automatically converts its argument to string.
def pyqq(obj):
# make sure obj is text | bytes
# py2: unicode | str
# py3: str | bytes
if not isinstance(obj, (unicode, bytes)):
obj = str(obj)
qobj = pystrconv.quote(obj)
# `printf('%s', qq(obj))` should work. For this make sure qobj is always
# a-la str type (unicode on py3, bytes on py2), that can be transparently
# converted to unicode or bytes as needed.
if PY_MAJOR_VERSION >= 3:
qobj = _pyunicode(pyu(qobj))
else:
qobj = _pystr(pyb(qobj))
return qobj
# XXX cannot `cdef class`: github.com/cython/cython/issues/711
class _pystr(bytes):
"""_str is like bytes but can be automatically converted to Python unicode
string via UTF-8 decoding.
The decoding never fails nor looses information - see u for details.
"""
# don't allow to set arbitrary attributes.
# won't be needed after switch to -> `cdef class`
__slots__ = ()
# __bytes__ - no need
def __unicode__(self): return pyu(self)
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return pyu(self)
else:
return self
cdef class _pyunicode(unicode):
"""_unicode is like unicode(py2)|str(py3) but can be automatically converted
to bytes via UTF-8 encoding.
The encoding always succeeds - see b for details.
"""
def __bytes__(self): return pyb(self)
# __unicode__ - no need
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return self
else:
return pyb(self)
# initialize .tp_print for _pystr so that this type could be printed.
# If we don't - printing it will result in `RuntimeError: print recursion`
# because str of this type never reaches real bytes or unicode.
# Do it only on python2, because python3 does not use tp_print at all.
# NOTE _pyunicode does not need this because on py2 str(_pyunicode) returns _pystr.
IF PY2:
# NOTE Cython does not define tp_print for PyTypeObject - do it ourselves
from libc.stdio cimport FILE
cdef extern from "Python.h":
ctypedef int (*printfunc)(PyObject *, FILE *, int) except -1
ctypedef struct PyTypeObject:
printfunc tp_print
cdef PyTypeObject *Py_TYPE(object)
cdef int _pystr_tp_print(PyObject *obj, FILE *f, int nesting) except -1:
o = <bytes>obj
o = bytes(buffer(o)) # change tp_type to bytes instead of _pystr
return Py_TYPE(o).tp_print(<PyObject*>o, f, nesting)
Py_TYPE(_pystr()).tp_print = _pystr_tp_print
# __pystr converts obj to str of current python:
#
# - to bytes, via b, if running on py2, or
# - to unicode, via u, if running on py3.
#
# It is handy to use __pystr when implementing __str__ methods.
#
# NOTE __pystr is currently considered to be internal function and should not
# be used by code outside of pygolang.
#
# XXX we should be able to use _pystr, but py3's str verify that it must have
# Py_TPFLAGS_UNICODE_SUBCLASS in its type flags.
cdef __pystr(object obj):
if PY_MAJOR_VERSION >= 3:
return pyu(obj)
else:
return pyb(obj)
include "_golang_str.pyx"
# ---- error ----
......
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""_golang_str.pyx complements _golang.pyx and keeps everything related to strings.
It is included from _golang.pyx .
"""
# XXX overview of approach, what we patch and why
from cpython cimport PyUnicode_AsUnicode, PyUnicode_GetSize, PyUnicode_FromUnicode
from cpython cimport PyTypeObject, Py_TYPE, reprfunc, richcmpfunc, binaryfunc
from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE
from cpython.iterobject cimport PySeqIter_New
from cpython cimport PyThreadState_GetDict, PyDict_SetItem
from cpython cimport PyObject_CheckBuffer
cdef extern from "Python.h":
"""
#if PY_MAJOR_VERSION < 3
// on py2, PyDict_GetItemWithError is called _PyDict_GetItemWithError
// NOTE Cython3 provides PyDict_GetItemWithError out of the box
# define PyDict_GetItemWithError _PyDict_GetItemWithError
#endif
"""
PyObject* PyDict_GetItemWithError(object, object) except? NULL # borrowed ref
Py_ssize_t PY_SSIZE_T_MAX
void PyType_Modified(PyTypeObject *)
cdef extern from "Python.h":
"""
static int _XPyMapping_Check(PyObject *o) {
#if PY_MAJOR_VERSION >= 3
return PyMapping_Check(o);
#else
// on py2 PyMapping_Check besides checking tp_as_mapping->mp_subscript
// also verifies !tp_as_sequence->sq_slice. We want to avoid that
// because PyString_Format checks only tp_as_mapping->mp_subscript.
return Py_TYPE(o)->tp_as_mapping && Py_TYPE(o)->tp_as_mapping->mp_subscript;
#endif
}
"""
bint _XPyMapping_Check(object o)
cdef extern from "Python.h":
"""
static int _XPyObject_CheckOldBuffer(PyObject *o) {
#if PY_MAJOR_VERSION >= 3
// no old-style buffers on py3
return 0;
#else
return PyObject_CheckReadBuffer(o);
#endif
}
"""
bint _XPyObject_CheckOldBuffer(object o)
cdef extern from "Python.h":
ctypedef int (*initproc)(object, PyObject *, PyObject *) except -1
ctypedef struct _XPyTypeObject "PyTypeObject":
initproc tp_init
PySequenceMethods *tp_as_sequence
ctypedef struct PySequenceMethods:
binaryfunc sq_concat
binaryfunc sq_inplace_concat
from libc.stdint cimport uint8_t
from libc.stdio cimport FILE
pystrconv = None # = golang.strconv imported at runtime (see __init__.py)
import string as pystring
import types as pytypes
import functools as pyfunctools
import re as pyre
if PY_MAJOR_VERSION >= 3:
import copyreg as pycopyreg
else:
import copy_reg as pycopyreg
def pyb(s): # -> bstr
"""b converts object to bstr.
- For bstr the same object is returned.
- For bytes, bytearray, or an object with a buffer interface, the data is
preserved as-is and only the result type is changed to bstr.
- For ustr/unicode the data is UTF-8 encoded. The encoding always succeeds.
TypeError is raised if type(s) is not one of the above.
b is the reverse operation to u - the following invariant is always true:
b(u(bytes_input)) is bstr with the same data as bytes_input.
See also: u, bstr/ustr.
"""
bs = _pyb(pybstr, s)
if bs is None:
raise TypeError("b: invalid type %s" % type(s))
return bs
def pyu(s): # -> ustr
"""u converts object to ustr.
- For ustr the same object is returned.
- For unicode the data is preserved as-is and only result type is changed to ustr.
- For bstr, bytes, bytearray, or an object with a buffer interface, the data is UTF-8 decoded.
The decoding always succeeds and input
information is not lost: non-valid UTF-8 bytes are decoded into
surrogate codes ranging from U+DC80 to U+DCFF.
TypeError is raised if type(s) is not one of the above.
u is the reverse operation to b - the following invariant is always true:
u(b(unicode_input)) is ustr with the same data as unicode_input.
See also: b, bstr/ustr.
"""
us = _pyu(pyustr, s)
if us is None:
raise TypeError("u: invalid type %s" % type(s))
return us
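# _pyb converts s to bcls (pybstr or its subclass) following the rules of b().
# It returns None, instead of raising, when the type of s is not supported, so
# that callers can report the error in their own way.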
cdef _pyb(bcls, s): # -> ~bstr | None
if type(s) is bcls:
return s
if isinstance(s, bytes):
if type(s) is not bytes:
s = _bdata(s)
elif isinstance(s, unicode):
s = _utf8_encode_surrogateescape(s)
else:
s = _ifbuffer_data(s) # bytearray and buffer
if s is None:
return None
assert type(s) is bytes
return bytes.__new__(bcls, s)
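# _pyu converts s to ucls (pyustr or its subclass) following the rules of u().
# It returns None, instead of raising, when the type of s is not supported.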
cdef _pyu(ucls, s): # -> ~ustr | None
if type(s) is ucls:
return s
if isinstance(s, unicode):
if type(s) is not unicode:
s = _udata(s)
else:
_ = _ifbuffer_data(s) # bytearray and buffer
if _ is not None:
s = _
if isinstance(s, bytes):
s = _utf8_decode_surrogateescape(s)
else:
return None
assert type(s) is unicode
return unicode.__new__(ucls, s)
# _ifbuffer_data returns contained data if obj provides buffer interface.
cdef _ifbuffer_data(obj): # -> bytes|None
if PyObject_CheckBuffer(obj):
if PY_MAJOR_VERSION >= 3:
return bytes(obj)
else:
# py2: bytes(memoryview) returns '<memory at ...>'
return bytes(bytearray(obj))
elif _XPyObject_CheckOldBuffer(obj): # old-style buffer, py2-only
return bytes(_buffer_py2(obj))
else:
return None
# _pyb_coerce coerces x from `b op x` to be used in operation with pyb.
cdef _pyb_coerce(x): # -> bstr|bytes
if isinstance(x, bytes):
return x
elif isinstance(x, (unicode, bytearray)):
return pyb(x)
else:
raise TypeError("b: coerce: invalid type %s" % type(x))
# _pyu_coerce coerces x from `u op x` to be used in operation with pyu.
cdef _pyu_coerce(x): # -> ustr|unicode
if isinstance(x, unicode):
return x
elif isinstance(x, (bytes, bytearray)):
return pyu(x)
else:
raise TypeError("u: coerce: invalid type %s" % type(x))
# _pybu_rcoerce coerces x from `x op b|u` to either bstr or ustr.
# NOTE bytearray is handled outside of this function.
cdef _pybu_rcoerce(x): # -> bstr|ustr
if isinstance(x, bytes):
return pyb(x)
elif isinstance(x, unicode):
return pyu(x)
else:
raise TypeError('b/u: coerce: invalid type %s' % type(x))
# __pystr converts obj to ~str of current python:
#
# - to ~bytes, via b, if running on py2, or
# - to ~unicode, via u, if running on py3.
#
# It is handy to use __pystr when implementing __str__ methods.
#
# NOTE __pystr is currently considered to be internal function and should not
# be used by code outside of pygolang.
#
# XXX we should be able to use pybstr, but py3's str verifies that it must have
# Py_TPFLAGS_UNICODE_SUBCLASS in its type flags.
cdef __pystr(object obj): # -> ~str
if PY_MAJOR_VERSION >= 3:
return pyu(obj)
else:
return pyb(obj)
# pybbyte(i) returns 1-byte bstr with byte ordinal i.  XXX place
def pybbyte(int i): # -> 1-byte bstr
return pyb(bytearray([i]))
# pyuchr(i) returns 1-character ustr with unicode ordinal i.  XXX place
def pyuchr(int i):
return pyu(unichr(i))
# XXX cannot `cdef class`: github.com/cython/cython/issues/711
class pybstr(bytes):
"""bstr is byte-string.
It is based on bytes and can automatically convert to/from unicode.
The conversion never fails and never loses information:
bstr → ustr → bstr
is always identity even if bytes data is not valid UTF-8.
Semantically bstr is an array of bytes. Accessing its elements by [index]
yields a 1-byte bstr. Iterating through bstr, however, yields unicode characters.
In practice bstr is enough 99% of the time, and ustr only needs to be used
for random access to string characters. See https://blog.golang.org/strings
for an overview of this approach.
Operations in between bstr and ustr/unicode / bytes/bytearray coerce to bstr.
When the coercion happens, bytes and bytearray, similarly to bstr, are also
treated as UTF8-encoded strings.
The bstr constructor accepts arbitrary objects and stringifies them:
- if encoding and/or errors is specified, the object must provide buffer
interface. The data in the buffer is decoded according to provided
encoding/errors and further encoded via UTF-8 into bstr.
- if the object is bstr/ustr / unicode/bytes/bytearray - it is converted
to bstr. See b for details.
- otherwise bstr will have string representation of the object.
See also: b, ustr/u.
"""
# don't allow to set arbitrary attributes.
# won't be needed after switch to -> `cdef class`
__slots__ = ()
def __new__(cls, object='', encoding=None, errors=None):
# encoding or errors -> object must expose buffer interface
if not (encoding is None and errors is None):
object = _buffer_decode(object, encoding, errors)
# _bstringify. Note: it handles bstr/ustr / unicode/bytes/bytearray as documented
object = _bstringify(object)
assert isinstance(object, (unicode, bytes)), object
bobj = _pyb(cls, object)
assert bobj is not None
return bobj
def __bytes__(self): return self
def __unicode__(self): return pyu(self)
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return pyu(self)
else:
return self
def __repr__(self):
qself, nonascii_escape = _bpysmartquote_u3b2(self)
bs = _inbstringify_get()
if bs.inbstringify == 0 or bs.inrepr:
if nonascii_escape: # so that e.g. b(u'\x80') is represented as
qself = 'b' + qself # b(b'\xc2\x80'), not as b('\xc2\x80')
return "b(" + qself + ")"
else:
# [b('β')] goes as ['β'] when under _bstringify for %s
return qself
# override reduce for protocols < 2. Builtin handler for that goes through
# copyreg._reduce_ex which eventually calls bytes(bstr-instance) to
# retrieve state, which gives bstr, not bytes. Fix state to be bytes ourselves.
def __reduce_ex__(self, protocol):
if protocol >= 2:
return bytes.__reduce_ex__(self, protocol)
return (
pycopyreg._reconstructor,
(self.__class__, self.__class__, _bdata(self))
)
def __hash__(self):
# hash of the same unicode and UTF-8 encoded bytes is generally different
# -> we can't make hash(bstr) == both hash(bytes) and hash(unicode) at the same time.
# -> make hash(bstr) == hash(str type of current python) so that bstr
# could be used as keys in dictionary interchangeably with native str type.
if PY_MAJOR_VERSION >= 3:
return hash(pyu(self))
else:
return bytes.__hash__(self)
# == != < > <= >=
# NOTE == and != are special: they must succeed against any type so that
# bstr could be used as dict key.
def __eq__(a, b):
try:
b = _pyb_coerce(b)
except TypeError:
return False
return bytes.__eq__(a, b)
def __ne__(a, b): return not a.__eq__(b)
def __lt__(a, b): return bytes.__lt__(a, _pyb_coerce(b))
def __gt__(a, b): return bytes.__gt__(a, _pyb_coerce(b))
def __le__(a, b): return bytes.__le__(a, _pyb_coerce(b))
def __ge__(a, b): return bytes.__ge__(a, _pyb_coerce(b))
# len - no need to override
# [], [:]
def __getitem__(self, idx):
x = bytes.__getitem__(self, idx)
if type(idx) is slice:
return pyb(x)
else:
# bytes[i] returns 1-character bytestring(py2) or int(py3)
# we always return 1-character bytestring
if PY_MAJOR_VERSION >= 3:
return pybbyte(x)
else:
return pyb(x)
# __iter__ - yields unicode characters
def __iter__(self):
# TODO iterate without converting self to u
return pyu(self).__iter__()
# __contains__
def __contains__(self, key):
# NOTE on py3 bytes.__contains__ accepts numbers and buffers. We don't want to
# automatically coerce any of them to bytestrings
return bytes.__contains__(self, _pyb_coerce(key))
# __add__, __radd__ (no need to override __iadd__)
def __add__(a, b):
# NOTE Cython < 3 does not automatically support __radd__ for cdef class
# https://cython.readthedocs.io/en/latest/src/userguide/migrating_to_cy30.html#arithmetic-special-methods
# but pybstr is currently _not_ cdef'ed class
# see also https://github.com/cython/cython/issues/4750
return pyb(bytes.__add__(a, _pyb_coerce(b)))
def __radd__(b, a):
# a.__add__(b) returned NotImplemented, e.g. for unicode.__add__(bstr)
# u'' + b() -> u() ; same as u() + b() -> u()
# b'' + b() -> b() ; same as b() + b() -> b()
# barr + b() -> barr
if isinstance(a, bytearray):
# force `bytearray +=` to go via bytearray.sq_inplace_concat - see PyNumber_InPlaceAdd
return NotImplemented
a = _pybu_rcoerce(a)
return a.__add__(b)
# __mul__, __rmul__ (no need to override __imul__)
def __mul__(a, b):
return pyb(bytes.__mul__(a, b))
def __rmul__(b, a):
return b.__mul__(a)
# %-formatting
def __mod__(a, b):
return _bprintf(a, b)
def __rmod__(b, a):
# ("..." % x) calls "x.__rmod__()" for string subtypes
# determine output type as in __radd__
if isinstance(a, bytearray):
# on py2 bytearray does not implement %
return NotImplemented # no need to check for py3 - there our __rmod__ is not invoked
a = _pybu_rcoerce(a)
return a.__mod__(b)
# format
def format(self, *args, **kwargs): return pyb(pyu(self).format(*args, **kwargs))
def format_map(self, mapping): return pyb(pyu(self).format_map(mapping))
def __format__(self, format_spec):
# NOTE don't convert to b due to "TypeError: __format__ must return a str, not pybstr"
# we are ok to return ustr even for format(bstr, ...) because in
# practice format builtin is never used and it is only s.format()
# that is used in programs. This way __format__ will be invoked
# only internally.
#
# NOTE we are ok to use ustr.__format__ because the only format code
# supported by bstr/ustr/unicode __format__ is 's', not e.g. 'r'.
return pyu(self).__format__(format_spec)
# encode/decode XXX place
def decode(self, encoding=None, errors=None):
if encoding is None and errors is None:
encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding
errors = 'surrogateescape'
else:
if encoding is None: encoding = 'utf-8'
if errors is None: errors = 'strict'
if encoding == 'utf-8' and errors == 'surrogateescape':
x = _utf8_decode_surrogateescape(self)
else:
x = bytes.decode(self, encoding, errors)
return pyu(x)
if PY_MAJOR_VERSION < 3:
# whiteout encode inherited from bytes
# TODO ideally whiteout it in such a way that bstr.encode also raises AttributeError
encode = property(doc='bstr has no encode')
# all other string methods
def capitalize(self): return pyb(pyu(self).capitalize())
if _strhas('casefold'): # py3.3 TODO provide py2 implementation
def casefold(self): return pyb(pyu(self).casefold())
def center(self, width, fillchar=' '): return pyb(pyu(self).center(width, fillchar))
def count(self, sub, start=None, end=None): return bytes.count(self, _pyb_coerce(sub), start, end)
def endswith(self, suffix, start=None, end=None):
if isinstance(suffix, tuple):
for _ in suffix:
if self.endswith(_pyb_coerce(_), start, end):
return True
return False
if start is None: start = 0
if end is None: end = PY_SSIZE_T_MAX
return bytes.endswith(self, _pyb_coerce(suffix), start, end)
def expandtabs(self, tabsize=8): return pyb(pyu(self).expandtabs(tabsize))
# NOTE find/index & friends should return byte-position, not unicode-position
def find(self, sub, start=None, end=None): return bytes.find(self, _pyb_coerce(sub), start, end)
def index(self, sub, start=None, end=None): return bytes.index(self, _pyb_coerce(sub), start, end)
def isalnum(self): return pyu(self).isalnum()
def isalpha(self): return pyu(self).isalpha()
# isascii(self) no need to override
def isdecimal(self): return pyu(self).isdecimal()
def isdigit(self): return pyu(self).isdigit()
if _strhas('isidentifier'): # py3 TODO provide fallback implementation
def isidentifier(self): return pyu(self).isidentifier()
def islower(self): return pyu(self).islower()
def isnumeric(self): return pyu(self).isnumeric()
if _strhas('isprintable'): # py3 TODO provide fallback implementation
def isprintable(self): return pyu(self).isprintable()
def isspace(self): return pyu(self).isspace()
def istitle(self): return pyu(self).istitle()
def join(self, iterable): return pyb(bytes.join(self, (_pyb_coerce(_) for _ in iterable)))
def ljust(self, width, fillchar=' '): return pyb(pyu(self).ljust(width, fillchar))
def lower(self): return pyb(pyu(self).lower())
def lstrip(self, chars=None): return pyb(pyu(self).lstrip(chars))
def partition(self, sep): return tuple(pyb(_) for _ in bytes.partition(self, _pyb_coerce(sep)))
if _strhas('removeprefix'): # py3.9 TODO provide fallback implementation
def removeprefix(self, prefix): return pyb(pyu(self).removeprefix(prefix))
if _strhas('removesuffix'): # py3.9 TODO provide fallback implementation
def removesuffix(self, suffix): return pyb(pyu(self).removesuffix(suffix))
def replace(self, old, new, count=-1): return pyb(bytes.replace(self, _pyb_coerce(old), _pyb_coerce(new), count))
# NOTE rfind/rindex & friends should return byte-position, not unicode-position
def rfind(self, sub, start=None, end=None): return bytes.rfind(self, _pyb_coerce(sub), start, end)
def rindex(self, sub, start=None, end=None): return bytes.rindex(self, _pyb_coerce(sub), start, end)
def rjust(self, width, fillchar=' '): return pyb(pyu(self).rjust(width, fillchar))
def rpartition(self, sep): return tuple(pyb(_) for _ in bytes.rpartition(self, _pyb_coerce(sep)))
def rsplit(self, sep=None, maxsplit=-1):
v = pyu(self).rsplit(sep, maxsplit)
return list([pyb(_) for _ in v])
def rstrip(self, chars=None): return pyb(pyu(self).rstrip(chars))
def split(self, sep=None, maxsplit=-1):
v = pyu(self).split(sep, maxsplit)
return list([pyb(_) for _ in v])
def splitlines(self, keepends=False): return list(pyb(_) for _ in pyu(self).splitlines(keepends))
def startswith(self, prefix, start=None, end=None):
if isinstance(prefix, tuple):
for _ in prefix:
if self.startswith(_pyb_coerce(_), start, end):
return True
return False
if start is None: start = 0
if end is None: end = PY_SSIZE_T_MAX
return bytes.startswith(self, _pyb_coerce(prefix), start, end)
def strip(self, chars=None): return pyb(pyu(self).strip(chars))
def swapcase(self): return pyb(pyu(self).swapcase())
def title(self): return pyb(pyu(self).title())
def translate(self, table, delete=None):
# bytes mode (compatibility with str/py2)
if table is None or isinstance(table, bytes) or delete is not None:
if delete is None: delete = b''
return pyb(bytes.translate(self, table, delete))
# unicode mode
else:
return pyb(pyu(self).translate(table))
def upper(self): return pyb(pyu(self).upper())
def zfill(self, width): return pyb(pyu(self).zfill(width))
@staticmethod
def maketrans(x=None, y=None, z=None):
return pyustr.maketrans(x, y, z)
# XXX cannot `cdef class` with __new__: https://github.com/cython/cython/issues/799
class pyustr(unicode):
"""ustr is unicode-string.
It is based on unicode and can automatically convert to/from bytes.
The conversion never fails and never loses information:
ustr → bstr → ustr
is always identity even if bytes data is not valid UTF-8.
ustr is similar to standard unicode type - iterating and accessing its
elements by [index] yields unicode characters.
ustr complements bstr and is meant to be used only in situations when
random access to string characters is needed. Otherwise bstr is preferable
and should be enough 99% of the time.
Operations in between ustr and bstr/bytes/bytearray / unicode coerce to ustr.
When the coercion happens, bytes and bytearray, similarly to bstr, are also
treated as UTF8-encoded strings.
The ustr constructor, similarly to the one in bstr, accepts arbitrary objects
and stringifies them. Please refer to the bstr and u documentation for details.
See also: u, bstr/b.
"""
# don't allow to set arbitrary attributes.
# won't be needed after switch to -> `cdef class`
__slots__ = ()
def __new__(cls, object='', encoding=None, errors=None):
# encoding or errors -> object must expose buffer interface
if not (encoding is None and errors is None):
object = _buffer_decode(object, encoding, errors)
# _bstringify. Note: it handles bstr/ustr / unicode/bytes/bytearray as documented
object = _bstringify(object)
assert isinstance(object, (unicode, bytes)), object
uobj = _pyu(cls, object)
assert uobj is not None
return uobj
def __bytes__(self): return pyb(self)
def __unicode__(self): return self
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return self
else:
return pyb(self)
def __repr__(self):
qself, nonascii_escape = _upysmartquote_u3b2(self)
bs = _inbstringify_get()
if bs.inbstringify == 0 or bs.inrepr:
if nonascii_escape:
qself = 'b'+qself # see bstr.__repr__
return "u(" + qself + ")"
else:
# [u('β')] goes as ['β'] when under _bstringify for %s
return qself
# override reduce for protocols < 2. Builtin handler for that goes through
# copyreg._reduce_ex which eventually calls unicode(ustr-instance) to
# retrieve state, which gives ustr, not unicode. Fix state to be unicode ourselves.
def __reduce_ex__(self, protocol):
if protocol >= 2:
return unicode.__reduce_ex__(self, protocol)
return (
pycopyreg._reconstructor,
(self.__class__, self.__class__, _udata(self))
)
def __hash__(self):
# see pybstr.__hash__ for why we stick to hash of current str
if PY_MAJOR_VERSION >= 3:
return unicode.__hash__(self)
else:
return hash(pyb(self))
# == != < > <= >=
# NOTE == and != are special: they must succeed against any type so that
# ustr could be used as dict key.
def __eq__(a, b):
try:
b = _pyu_coerce(b)
except TypeError:
return False
return unicode.__eq__(a, b)
def __ne__(a, b): return not a.__eq__(b)
def __lt__(a, b): return unicode.__lt__(a, _pyu_coerce(b))
def __gt__(a, b): return unicode.__gt__(a, _pyu_coerce(b))
def __le__(a, b): return unicode.__le__(a, _pyu_coerce(b))
def __ge__(a, b): return unicode.__ge__(a, _pyu_coerce(b))
# len - no need to override
# [], [:]
def __getitem__(self, idx):
return pyu(unicode.__getitem__(self, idx))
# __iter__
def __iter__(self):
if PY_MAJOR_VERSION >= 3:
return _pyustrIter(unicode.__iter__(self))
else:
# on python 2 unicode does not have .__iter__
return PySeqIter_New(self)
# __contains__
def __contains__(self, key):
return unicode.__contains__(self, _pyu_coerce(key))
# __add__, __radd__ (no need to override __iadd__)
def __add__(a, b):
# NOTE Cython < 3 does not automatically support __radd__ for cdef class
# https://cython.readthedocs.io/en/latest/src/userguide/migrating_to_cy30.html#arithmetic-special-methods
# but pyustr is currently _not_ cdef'ed class
# see also https://github.com/cython/cython/issues/4750
return pyu(unicode.__add__(a, _pyu_coerce(b)))
def __radd__(b, a):
# a.__add__(b) returned NotImplemented, e.g. for unicode.__add__(bstr)
# u'' + u() -> u() ; same as u() + u() -> u()
# b'' + u() -> b() ; same as b() + u() -> b()
# barr + u() -> barr
if isinstance(a, bytearray):
# force `bytearray +=` to go via bytearray.sq_inplace_concat - see PyNumber_InPlaceAdd
# for pyustr this relies on patch to bytearray.sq_inplace_concat to accept ustr as bstr
return NotImplemented
a = _pybu_rcoerce(a)
return a.__add__(b)
# __mul__, __rmul__ (no need to override __imul__)
def __mul__(a, b):
return pyu(unicode.__mul__(a, b))
def __rmul__(b, a):
return b.__mul__(a)
# %-formatting
def __mod__(a, b):
return pyu(pyb(a).__mod__(b))
def __rmod__(b, a):
# ("..." % x) calls "x.__rmod__()" for string subtypes
# determine output type as in __radd__
if isinstance(a, bytearray):
return NotImplemented # see bstr.__rmod__
a = _pybu_rcoerce(a)
return a.__mod__(b)
# format
def format(self, *args, **kwargs):
return pyu(_bvformat(self, args, kwargs))
def format_map(self, mapping):
return pyu(_bvformat(self, (), mapping))
def __format__(self, format_spec):
# NOTE not e.g. `_bvformat(_pyu_coerce(format_spec), (self,))` because
# the only format code that string.__format__ should support is
# 's', not e.g. 'r'.
return pyu(unicode.__format__(self, format_spec))
# encode/decode XXX place
def encode(self, encoding=None, errors=None):
if encoding is None and errors is None:
encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding
errors = 'surrogateescape'
else:
if encoding is None: encoding = 'utf-8'
if errors is None: errors = 'strict'
if encoding == 'utf-8' and errors == 'surrogateescape':
x = _utf8_encode_surrogateescape(self)
else:
x = unicode.encode(self, encoding, errors)
return pyb(x)
if PY_MAJOR_VERSION < 3:
# whiteout decode inherited from unicode
# TODO ideally whiteout it in such a way that ustr.decode also raises AttributeError
decode = property(doc='ustr has no decode')
# all other string methods
def capitalize(self): return pyu(unicode.capitalize(self))
if _strhas('casefold'): # py3.3 TODO provide fallback implementation
def casefold(self): return pyu(unicode.casefold(self))
def center(self, width, fillchar=' '): return pyu(unicode.center(self, width, _pyu_coerce(fillchar)))
def count(self, sub, start=None, end=None):
# cython optimizes unicode.count to directly call PyUnicode_Count -
# - cannot use None for start/stop https://github.com/cython/cython/issues/4737
if start is None: start = 0
if end is None: end = PY_SSIZE_T_MAX
return unicode.count(self, _pyu_coerce(sub), start, end)
def endswith(self, suffix, start=None, end=None):
if isinstance(suffix, tuple):
for _ in suffix:
if self.endswith(_pyu_coerce(_), start, end):
return True
return False
if start is None: start = 0
if end is None: end = PY_SSIZE_T_MAX
return unicode.endswith(self, _pyu_coerce(suffix), start, end)
def expandtabs(self, tabsize=8): return pyu(unicode.expandtabs(self, tabsize))
def find(self, sub, start=None, end=None):
if start is None: start = 0
if end is None: end = PY_SSIZE_T_MAX
return unicode.find(self, _pyu_coerce(sub), start, end)
def index(self, sub, start=None, end=None):
if start is None: start = 0
if end is None: end = PY_SSIZE_T_MAX
return unicode.index(self, _pyu_coerce(sub), start, end)
# isalnum(self) no need to override
# isalpha(self) no need to override
# isascii(self) no need to override
# isdecimal(self) no need to override
# isdigit(self) no need to override
# isidentifier(self) no need to override
# islower(self) no need to override
# isnumeric(self) no need to override
# isprintable(self) no need to override
# isspace(self) no need to override
# istitle(self) no need to override
def join(self, iterable): return pyu(unicode.join(self, (_pyu_coerce(_) for _ in iterable)))
def ljust(self, width, fillchar=' '): return pyu(unicode.ljust(self, width, _pyu_coerce(fillchar)))
def lower(self): return pyu(unicode.lower(self))
def lstrip(self, chars=None): return pyu(unicode.lstrip(self, _xpyu_coerce(chars)))
def partition(self, sep): return tuple(pyu(_) for _ in unicode.partition(self, _pyu_coerce(sep)))
if _strhas('removeprefix'): # py3.9 TODO provide fallback implementation
def removeprefix(self, prefix): return pyu(unicode.removeprefix(self, _pyu_coerce(prefix)))
if _strhas('removesuffix'): # py3.9 TODO provide fallback implementation
def removesuffix(self, suffix): return pyu(unicode.removesuffix(self, _pyu_coerce(suffix)))
def replace(self, old, new, count=-1): return pyu(unicode.replace(self, _pyu_coerce(old), _pyu_coerce(new), count))
def rfind(self, sub, start=None, end=None):
if start is None: start = 0
if end is None: end = PY_SSIZE_T_MAX
return unicode.rfind(self, _pyu_coerce(sub), start, end)
def rindex(self, sub, start=None, end=None):
if start is None: start = 0
if end is None: end = PY_SSIZE_T_MAX
return unicode.rindex(self, _pyu_coerce(sub), start, end)
def rjust(self, width, fillchar=' '): return pyu(unicode.rjust(self, width, _pyu_coerce(fillchar)))
def rpartition(self, sep): return tuple(pyu(_) for _ in unicode.rpartition(self, _pyu_coerce(sep)))
def rsplit(self, sep=None, maxsplit=-1):
v = unicode.rsplit(self, _xpyu_coerce(sep), maxsplit)
return list([pyu(_) for _ in v])
def rstrip(self, chars=None): return pyu(unicode.rstrip(self, _xpyu_coerce(chars)))
def split(self, sep=None, maxsplit=-1):
# cython optimizes unicode.split to directly call PyUnicode_Split - cannot use None for sep
# and cannot also use object=NULL https://github.com/cython/cython/issues/4737
if sep is None:
if PY_MAJOR_VERSION >= 3:
v = unicode.split(self, maxsplit=maxsplit)
else:
# on py2 unicode.split does not accept keyword arguments
v = _udata(self).split(None, maxsplit)
else:
v = unicode.split(self, _pyu_coerce(sep), maxsplit)
return list([pyu(_) for _ in v])
def splitlines(self, keepends=False): return list(pyu(_) for _ in unicode.splitlines(self, keepends))
def startswith(self, prefix, start=None, end=None):
if isinstance(prefix, tuple):
for _ in prefix:
if self.startswith(_pyu_coerce(_), start, end):
return True
return False
if start is None: start = 0
if end is None: end = PY_SSIZE_T_MAX
return unicode.startswith(self, _pyu_coerce(prefix), start, end)
def strip(self, chars=None): return pyu(unicode.strip(self, _xpyu_coerce(chars)))
def swapcase(self): return pyu(unicode.swapcase(self))
def title(self): return pyu(unicode.title(self))
def translate(self, table):
# unicode.translate does not accept bstr values
t = {}
for k,v in table.items():
if not isinstance(v, int): # either unicode ordinal,
v = _xpyu_coerce(v) # character or None
t[k] = v
return pyu(unicode.translate(self, t))
def upper(self): return pyu(unicode.upper(self))
def zfill(self, width): return pyu(unicode.zfill(self, width))
@staticmethod
def maketrans(x=None, y=None, z=None):
if PY_MAJOR_VERSION >= 3:
if y is None:
# std maketrans(x) accepts only int|unicode keys
_ = {}
for k,v in x.items():
if not isinstance(k, int):
k = pyu(k)
_[k] = v
return unicode.maketrans(_)
elif z is None:
return unicode.maketrans(pyu(x), pyu(y)) # std maketrans does not accept b
else:
return unicode.maketrans(pyu(x), pyu(y), pyu(z)) # ----//----
# hand-made on py2
t = {}
if y is not None:
x = pyu(x)
y = pyu(y)
if len(x) != len(y):
raise ValueError("len(x) must be == len(y)")
for (xi,yi) in zip(x,y):
t[ord(xi)] = ord(yi)
if z is not None:
z = pyu(z)
for _ in z:
t[ord(_)] = None
else:
if type(x) is not dict:
raise TypeError("sole x must be dict")
for k,v in x.iteritems():
if not isinstance(k, (int,long)):
k = ord(pyu(k))
t[k] = pyu(v)
return t
# _pyustrIter wraps unicode iterator to return pyustr for each yielded character.
cdef class _pyustrIter:
cdef object uiter
def __init__(self, uiter):
self.uiter = uiter
def __iter__(self):
return self
def __next__(self):
x = next(self.uiter)
return pyu(x)
# _bdata/_udata retrieve raw data from bytes/unicode.
def _bdata(obj): # -> bytes
assert isinstance(obj, bytes)
_ = obj.__getnewargs__()[0] # (`bytes-data`,)
assert type(_) is bytes
return _
"""
bcopy = bytes(memoryview(obj))
assert type(bcopy) is bytes
return bcopy
"""
def _udata(obj): # -> unicode
assert isinstance(obj, unicode)
_ = obj.__getnewargs__()[0] # (`unicode-data`,)
assert type(_) is unicode
return _
"""
cdef Py_UNICODE* u = PyUnicode_AsUnicode(obj)
cdef Py_ssize_t size = PyUnicode_GetSize(obj)
cdef unicode ucopy = PyUnicode_FromUnicode(u, size)
assert type(ucopy) is unicode
return ucopy
"""
# XXX place
# _buffer_decode decodes buf to unicode according to encoding and errors.
#
# buf must expose buffer interface.
# encoding/errors can be None meaning to use default utf-8/strict.
cdef unicode _buffer_decode(buf, encoding, errors):
if encoding is None: encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding
if errors is None: errors = 'strict'
if _XPyObject_CheckOldBuffer(buf):
buf = _buffer_py2(buf)
else:
buf = memoryview(buf)
return bytearray(buf).decode(encoding, errors)
# XXX place
# initialize .tp_print for pybstr so that this type could be printed.
# If we don't - printing it will result in `RuntimeError: print recursion`
# because str of this type never reaches real bytes or unicode.
# Do it only on python2, because python3 does not use tp_print at all.
# NOTE pyustr does not need this because on py2 str(pyustr) returns pybstr.
IF PY2:
# Cython does not define tp_print for PyTypeObject - do it ourselves
cdef extern from "Python.h":
ctypedef int (*printfunc)(PyObject *, FILE *, int) except -1
ctypedef struct _PyTypeObject_Print "PyTypeObject":
printfunc tp_print
int Py_PRINT_RAW
cdef int _pybstr_tp_print(PyObject *obj, FILE *f, int flags) except -1:
o = <object>obj
if flags & Py_PRINT_RAW:
# emit str of the object instead of repr
# https://docs.python.org/2.7/c-api/object.html#c.PyObject_Print
pass
else:
# emit repr
o = repr(o)
assert isinstance(o, bytes)
o = <bytes>o
o = bytes(buffer(o)) # change tp_type to bytes instead of pybstr
return (<_PyTypeObject_Print*>Py_TYPE(o)) .tp_print(<PyObject*>o, f, Py_PRINT_RAW)
(<_PyTypeObject_Print*>Py_TYPE(pybstr())) .tp_print = _pybstr_tp_print
# _bpysmartquote_u3b2 quotes bytes/bytearray s the same way python would do for string.
#
# nonascii_escape indicates whether \xNN with NN >= 0x80 is present in the output.
#
# NOTE the return type is str type of current python, so that quoted result
# could be directly used in __repr__ or __str__ implementation.
cdef _bpysmartquote_u3b2(s): # -> (unicode(py3)|bytes(py2), nonascii_escape)
# TODO change to `const uint8_t[::1] s` after strconv._quote is moved to pyx
if isinstance(s, bytearray):
s = _bytearray_data(s)
assert isinstance(s, bytes), s
# smartquotes: choose ' or " as quoting character exactly the same way python does
# https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L905-L909
quote = b"'"
if (quote in s) and (b'"' not in s):
quote = b'"'
x, nonascii_escape = pystrconv._quote(s, quote) # raw bytes
if PY_MAJOR_VERSION < 3:
return x, nonascii_escape
else:
return _utf8_decode_surrogateescape(x), nonascii_escape # raw unicode
# _upysmartquote_u3b2 is similar to _bpysmartquote_u3b2 but accepts unicode argument.
#
# NOTE the return type is str type of current python - see _bpysmartquote_u3b2 for details.
cdef _upysmartquote_u3b2(s): # -> (unicode(py3)|bytes(py2), nonascii_escape)
assert isinstance(s, unicode), s
return _bpysmartquote_u3b2(_utf8_encode_surrogateescape(s))
# qq is substitute for %q, which is missing in python.
#
# (python's automatic escape uses smartquotes quoting with either ' or ").
#
# like %s, %q automatically converts its argument to string.
def pyqq(obj):
# make sure obj is text | bytes
# py2: unicode | str
# py3: str | bytes
if not isinstance(obj, (unicode, bytes)):
obj = _bstringify(obj)
return pystrconv.quote(obj)
# ---- _bstringify ----
# _bstringify returns string representation of obj.
# it is similar to unicode(obj), but handles bytes as UTF-8 encoded strings.
cdef _bstringify(object obj): # -> unicode|bytes
if type(obj) in (pybstr, pyustr):
return obj
# indicate to e.g. patched bytes.__repr__ that it is being called from under _bstringify
_bstringify_enter()
try:
if PY_MAJOR_VERSION >= 3:
# NOTE this depends on patches to bytes.{__repr__,__str__} below
return unicode(obj)
else:
# on py2 mimic manually what unicode(·) does on py3
# the reason we do it manually is because if we try just
# unicode(obj), and obj's __str__ returns UTF-8 bytestring, it will
# fail with UnicodeDecodeError. Similarly if we unconditionally do
# str(obj), it will fail if obj's __str__ returns unicode.
#
# NOTE this depends on patches to bytes.{__repr__,__str__} and
# unicode.{__repr__,__str__} below.
if hasattr(obj, '__unicode__'):
return obj.__unicode__()
elif hasattr(obj, '__str__'):
return obj.__str__()
else:
return repr(obj)
finally:
_bstringify_leave()
# _bstringify_repr returns repr of obj.
# it is similar to repr(obj), but handles bytes as UTF-8 encoded strings.
cdef _bstringify_repr(object obj): # -> unicode|bytes
_bstringify_enter_repr()
try:
return repr(obj)
finally:
_bstringify_leave_repr()
# patch bytes.{__repr__,__str__} and (py2) unicode.{__repr__,__str__}, so that both
# bytes and unicode are treated as normal strings when under _bstringify.
#
# Why:
#
# py2: str([ 'β']) -> ['\\xce\\xb2'] (1) x
# py2: str([u'β']) -> [u'\\u03b2'] (2) x
# py3: str([ 'β']) -> ['β'] (3)
# py3: str(['β'.encode()]) -> [b'\\xce\\xb2'] (4) x
#
# for us 3 is ok, while 1,2 and 4 are not. For all 1,2,3,4 we want e.g.
# `bstr(·)` or `b('%s') % ·` to give ['β']. This is fixed by patching __repr__.
#
# regarding patching __str__ - 6 and 8 in the following examples illustrate the
# need to do it:
#
# py2: str( 'β') -> 'β' (5)
# py2: str(u'β') -> UnicodeEncodeError (6) x
# py3: str( 'β') -> 'β' (7)
# py3: str('β'.encode()) -> b'\\xce\\xb2' (8) x
#
# See also overview of %-formatting.
cdef reprfunc _bytes_tp_repr = Py_TYPE(b'').tp_repr
cdef reprfunc _bytes_tp_str = Py_TYPE(b'').tp_str
cdef reprfunc _unicode_tp_repr = Py_TYPE(u'').tp_repr
cdef reprfunc _unicode_tp_str = Py_TYPE(u'').tp_str
cdef object _bytes_tp_xrepr(object s):
bs = _inbstringify_get()
if bs.inbstringify == 0:
return _bytes_tp_repr(s)
s, _ = _bpysmartquote_u3b2(s)
if PY_MAJOR_VERSION >= 3 and bs.inrepr != 0:
s = 'b'+s
return s
cdef object _bytes_tp_xstr(object s):
bs = _inbstringify_get()
if bs.inbstringify == 0:
return _bytes_tp_str(s)
else:
if PY_MAJOR_VERSION >= 3:
return _utf8_decode_surrogateescape(s)
else:
return s
cdef object _unicode2_tp_xrepr(object s):
bs = _inbstringify_get()
if bs.inbstringify == 0:
return _unicode_tp_repr(s)
s, _ = _upysmartquote_u3b2(s)
if PY_MAJOR_VERSION < 3 and bs.inrepr != 0:
s = 'u'+s
return s
cdef object _unicode2_tp_xstr(object s):
bs = _inbstringify_get()
if bs.inbstringify == 0:
return _unicode_tp_str(s)
else:
return s
def _bytes_x__repr__(s): return _bytes_tp_xrepr(s)
def _bytes_x__str__(s): return _bytes_tp_xstr(s)
def _unicode2_x__repr__(s): return _unicode2_tp_xrepr(s)
def _unicode2_x__str__(s): return _unicode2_tp_xstr(s)
def _():
cdef PyTypeObject* t
# NOTE patching bytes and its already-created subclasses that did not override .tp_repr/.tp_str
# NOTE if we don't also patch __dict__ - e.g. x.__repr__() won't go through patched .tp_repr
for pyt in [bytes] + bytes.__subclasses__():
assert isinstance(pyt, type)
t = <PyTypeObject*>pyt
if t.tp_repr == _bytes_tp_repr:
t.tp_repr = _bytes_tp_xrepr
_patch_slot(t, '__repr__', _bytes_x__repr__)
if t.tp_str == _bytes_tp_str:
t.tp_str = _bytes_tp_xstr
_patch_slot(t, '__str__', _bytes_x__str__)
_()
if PY_MAJOR_VERSION < 3:
def _():
cdef PyTypeObject* t
for pyt in [unicode] + unicode.__subclasses__():
assert isinstance(pyt, type)
t = <PyTypeObject*>pyt
if t.tp_repr == _unicode_tp_repr:
t.tp_repr = _unicode2_tp_xrepr
_patch_slot(t, '__repr__', _unicode2_x__repr__)
if t.tp_str == _unicode_tp_str:
t.tp_str = _unicode2_tp_xstr
_patch_slot(t, '__str__', _unicode2_x__str__)
_()
# py2: adjust unicode.tp_richcompare(a,b) to return NotImplemented if b is bstr.
# This way we avoid `UnicodeWarning: Unicode equal comparison failed to convert
# both arguments to Unicode - interpreting them as being unequal`, and we also
# avoid the situation where `a == b` returns False even though `b == a` gives True.
#
# NOTE there is no need to do the same for ustr, because ustr inherits from
# unicode and can be always natively converted to unicode by python itself.
cdef richcmpfunc _unicode_tp_richcompare = Py_TYPE(u'').tp_richcompare
cdef object _unicode_tp_xrichcompare(object a, object b, int op):
if isinstance(b, pybstr):
return NotImplemented
return _unicode_tp_richcompare(a, b, op)
cdef object _unicode_x__eq__(object a, object b): return _unicode_tp_richcompare(a, b, Py_EQ)
cdef object _unicode_x__ne__(object a, object b): return _unicode_tp_richcompare(a, b, Py_NE)
cdef object _unicode_x__lt__(object a, object b): return _unicode_tp_richcompare(a, b, Py_LT)
cdef object _unicode_x__gt__(object a, object b): return _unicode_tp_richcompare(a, b, Py_GT)
cdef object _unicode_x__le__(object a, object b): return _unicode_tp_richcompare(a, b, Py_LE)
cdef object _unicode_x__ge__(object a, object b): return _unicode_tp_richcompare(a, b, Py_GE)
if PY_MAJOR_VERSION < 3:
def _():
cdef PyTypeObject* t
for pyt in [unicode] + unicode.__subclasses__():
assert isinstance(pyt, type)
t = <PyTypeObject*>pyt
if t.tp_richcompare == _unicode_tp_richcompare:
t.tp_richcompare = _unicode_tp_xrichcompare
_patch_slot(t, "__eq__", _unicode_x__eq__)
_patch_slot(t, "__ne__", _unicode_x__ne__)
_patch_slot(t, "__lt__", _unicode_x__lt__)
_patch_slot(t, "__gt__", _unicode_x__gt__)
_patch_slot(t, "__le__", _unicode_x__le__)
_patch_slot(t, "__ge__", _unicode_x__ge__)
_()
# patch bytearray.{__repr__,__str__} similarly to bytes, so that e.g.
# '%s' % bytearray('β') turns into β instead of bytearray(b'\xce\xb2'), and
# '%s' % [bytearray('β')] turns into ['β'] instead of [bytearray(b'\xce\xb2')].
#
# also patch:
#
# - bytearray.__init__ to accept ustr instead of raising 'TypeError:
# string argument without an encoding' (pybug: bytearray() should respect
# __bytes__ similarly to bytes)
#
# - bytearray.{sq_concat,sq_inplace_concat} to accept ustr instead of raising
# TypeError. (pybug: bytearray + and += should respect __bytes__)
cdef reprfunc _bytearray_tp_repr = (<PyTypeObject*>bytearray) .tp_repr
cdef reprfunc _bytearray_tp_str = (<PyTypeObject*>bytearray) .tp_str
cdef initproc _bytearray_tp_init = (<_XPyTypeObject*>bytearray) .tp_init
cdef binaryfunc _bytearray_sq_concat = (<_XPyTypeObject*>bytearray) .tp_as_sequence.sq_concat
cdef binaryfunc _bytearray_sq_iconcat = (<_XPyTypeObject*>bytearray) .tp_as_sequence.sq_inplace_concat
cdef object _bytearray_tp_xrepr(object a):
bs = _inbstringify_get()
if bs.inbstringify == 0:
return _bytearray_tp_repr(a)
s, _ = _bpysmartquote_u3b2(a)
if bs.inrepr != 0:
s = 'bytearray(b' + s + ')'
return s
cdef object _bytearray_tp_xstr(object a):
bs = _inbstringify_get()
if bs.inbstringify == 0:
return _bytearray_tp_str(a)
else:
if PY_MAJOR_VERSION >= 3:
return _utf8_decode_surrogateescape(a)
else:
return _bytearray_data(a)
cdef int _bytearray_tp_xinit(object self, PyObject* args, PyObject* kw) except -1:
if args != NULL and (kw == NULL or (not <object>kw)):
argv = <object>args
if isinstance(argv, tuple) and len(argv) == 1:
arg = argv[0]
if isinstance(arg, pyustr):
argv = (pyb(arg),) # NOTE argv is kept alive till end of function
args = <PyObject*>argv # no need to incref it
return _bytearray_tp_init(self, args, kw)
cdef object _bytearray_sq_xconcat(object a, object b):
if isinstance(b, pyustr):
b = pyb(b)
return _bytearray_sq_concat(a, b)
cdef object _bytearray_sq_xiconcat(object a, object b):
if isinstance(b, pyustr):
b = pyb(b)
return _bytearray_sq_iconcat(a, b)
def _bytearray_x__repr__(a): return _bytearray_tp_xrepr(a)
def _bytearray_x__str__ (a): return _bytearray_tp_xstr(a)
def _bytearray_x__init__(self, *argv, **kw):
# NOTE don't return - just call: __init__ should return None
_bytearray_tp_xinit(self, <PyObject*>argv, <PyObject*>kw)
def _bytearray_x__add__ (a, b): return _bytearray_sq_xconcat(a, b)
def _bytearray_x__iadd__(a, b): return _bytearray_sq_xiconcat(a, b)
def _():
cdef PyTypeObject* t
for pyt in [bytearray] + bytearray.__subclasses__():
assert isinstance(pyt, type)
t = <PyTypeObject*>pyt
if t.tp_repr == _bytearray_tp_repr:
t.tp_repr = _bytearray_tp_xrepr
_patch_slot(t, '__repr__', _bytearray_x__repr__)
if t.tp_str == _bytearray_tp_str:
t.tp_str = _bytearray_tp_xstr
_patch_slot(t, '__str__', _bytearray_x__str__)
t_ = <_XPyTypeObject*>t
if t_.tp_init == _bytearray_tp_init:
t_.tp_init = _bytearray_tp_xinit
_patch_slot(t, '__init__', _bytearray_x__init__)
t_sq = t_.tp_as_sequence
if t_sq.sq_concat == _bytearray_sq_concat:
t_sq.sq_concat = _bytearray_sq_xconcat
_patch_slot(t, '__add__', _bytearray_x__add__)
if t_sq.sq_inplace_concat == _bytearray_sq_iconcat:
t_sq.sq_inplace_concat = _bytearray_sq_xiconcat
_patch_slot(t, '__iadd__', _bytearray_x__iadd__)
_()
# _bytearray_data return raw data in bytearray as bytes.
# XXX `bytearray s` leads to `TypeError: Expected bytearray, got hbytearray`
cdef bytes _bytearray_data(object s):
if PY_MAJOR_VERSION >= 3:
return bytes(s)
else:
# on py2 bytes(s) is str(s) which invokes patched bytearray.__str__
# we want to get raw bytearray data, which is provided by unpatched bytearray.__str__
return _bytearray_tp_str(s)
# _bstringify_enter*/_bstringify_leave*/_inbstringify_get allow _bstringify* to
# indicate to further invoked code whether it has been invoked from under
# _bstringify* or not.
cdef object _inbstringify_key = "golang._inbstringify"
@final
cdef class _InBStringify:
cdef int inbstringify # >0 if we are running under _bstringify/_bstringify_repr
cdef int inrepr # >0 if we are running under _bstringify_repr
def __cinit__(self):
self.inbstringify = 0
cdef void _bstringify_enter() except*:
bs = _inbstringify_get()
bs.inbstringify += 1
cdef void _bstringify_leave() except*:
bs = _inbstringify_get()
bs.inbstringify -= 1
cdef void _bstringify_enter_repr() except*:
bs = _inbstringify_get()
bs.inbstringify += 1
bs.inrepr += 1
cdef void _bstringify_leave_repr() except*:
bs = _inbstringify_get()
bs.inbstringify -= 1
bs.inrepr -= 1
cdef _InBStringify _inbstringify_get():
cdef PyObject* _ts_dict = PyThreadState_GetDict() # borrowed
if _ts_dict == NULL:
raise RuntimeError("no thread state")
cdef _InBStringify ts_inbstringify
cdef PyObject* _ts_inbstringify = PyDict_GetItemWithError(<object>_ts_dict, _inbstringify_key) # raises on error
if _ts_inbstringify == NULL:
# key not present
ts_inbstringify = _InBStringify()
PyDict_SetItem(<object>_ts_dict, _inbstringify_key, ts_inbstringify)
else:
ts_inbstringify = <_InBStringify>_ts_inbstringify
return ts_inbstringify
# _patch_slot installs func_or_descr into typ's __dict__ as name.
#
# if func_or_descr is descriptor (has __get__), it is installed as is.
# otherwise it is wrapped with "unbound method" descriptor.
cdef _patch_slot(PyTypeObject* typ, str name, object func_or_descr):
typdict = <dict>(typ.tp_dict)
#print("\npatching %s.%s with %r" % (typ.tp_name, name, func_or_descr))
#print("old: %r" % typdict.get(name))
if hasattr(func_or_descr, '__get__'):
descr = func_or_descr
else:
func = func_or_descr
if PY_MAJOR_VERSION < 3:
descr = pytypes.MethodType(func, None, <object>typ)
else:
descr = _UnboundMethod(func)
typdict[name] = descr
#print("new: %r" % typdict.get(name))
PyType_Modified(typ)
cdef class _UnboundMethod(object): # they removed unbound methods on py3
cdef object func
def __init__(self, func):
self.func = func
def __get__(self, obj, objtype):
return pyfunctools.partial(self.func, obj)
# ---- _bstringify fallback for pypy where patching bytes.tp_repr has no effect ----
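# _repr selects between str-style and repr-style stringification; inside keeps
# ids of the containers currently being stringified, so that self-referencing
# containers are rendered as [...], (...), {...} the way the builtin repr does.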
cdef __bstringify(object obj, bint _repr, set inside):
# if obj has its own __str__ - we must use it
# check __str__ only on class because e.g. for `f = Foo(); f.__str__ = lambda: 'sneaky'`
# str(f) does _not_ call f.__str__ at all.
klass = obj.__class__ # NOTE type(obj) gives wrong result for old-style classes
if not _repr:
kstr = getattr(klass, '__str__', None)
if isinstance(obj, bytes): # bytes has its own __str__
kstrdefault = kstr is bytes.__str__
else:
kstrdefault = ( \
(kstr is None) or # old-style classes do not have default __str__
(kstr is object.__str__)) # new-style classes have default __str__ from object
if not kstrdefault:
return unicode(obj)
# obj does not define non-default __str__
# let's __bstringify it the same way as std __str__ would do XXX modulo bytes -> b
if isinstance(obj, unicode):
return obj if not _repr else repr(obj)
if isinstance(obj, bytes):
obj = pyu(obj)
return obj if not _repr else unicode.__repr__(obj)
if isinstance(obj, list):
# XXX check klass.__repr__ == list.__repr__
if id(obj) in inside:
return "[...]"
inside.add(id(obj))
r = u"[" + u", ".join(__bstringify(_, True, inside) for _ in obj) + "]"
inside.remove(id(obj))
return r
elif isinstance(obj, tuple):
# XXX check klass.__repr__ == tuple.__repr__
if id(obj) in inside:
return "(...)"
inside.add(id(obj))
r = u"(" + u", ".join(__bstringify(_, True, inside) for _ in obj)
if len(obj) == 1:
r += u"," # single-element tuple formats as (x,)
r += ")"
inside.remove(id(obj))
return r
if isinstance(obj, (set, frozenset)):
# XXX check klass.__repr__ == (set|frozenset).__repr__
if len(obj) == 0:
return klass.__name__ + "()"
if id(obj) in inside:
return klass.__name__ + "(...)"
inside.add(id(obj))
r = u"{" + u", ".join(__bstringify(_, True, inside) for _ in obj) + "}"
if klass is not set:
r = klass.__name__ + "(" + r + ")"
inside.remove(id(obj))
return r
elif isinstance(obj, dict):
# XXX check klass.__repr__ == dict.__repr__
if id(obj) in inside:
return "{...}"
inside.add(id(obj))
r = u"{" + u", ".join(__bstringify(k, True, inside) + u": " +
__bstringify(v, True, inside)
for (k,v) in obj.items()) + \
u"}"
inside.remove(id(obj))
return r
# nothing we can do except to stringify obj standard way
# (e.g. int has to be stringified this way)
return unicode(obj) if not _repr else repr(obj)
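# Illustrative examples (approximate; exact py2/py3 quoting of nested elements differs):
#
#   __bstringify(b'\xce\xb2', False, set())   -> u'β'          (bytes shown as UTF-8 text)
#   __bstringify(u'β',        True,  set())   -> repr(u'β')
#   __bstringify({1: b'x'},   False, set())   -> u"{1: 'x'}"   (containers rebuilt recursively)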
# ---- % formatting ----
# When formatting string is bstr/ustr we treat bytes in all arguments as
# UTF8-encoded bytestrings. The following approach is used to implement this:
#
# 1. both bstr and ustr format via bytes-based _bprintf.
# 2. we parse the format string and handle every formatting specifier separately:
# 3. for formats besides %s/%r we use bytes.__mod__ directly.
#
# 4. for %s we stringify corresponding argument specially with all, potentially
# internal, bytes instances treated as UTF8-encoded strings:
#
# '%s' % b'\xce\xb2' -> "β"
# '%s' % [b'\xce\xb2'] -> "['β']"
#
# 5. for %r, similarly to %s, we prepare repr of corresponding argument
# specially with all, potentially internal, bytes instances also treated as
# UTF8-encoded strings:
#
# '%r' % b'\xce\xb2' -> "b'β'"
# '%r' % [b'\xce\xb2'] -> "[b'β']"
#
#
# For "2" we implement %-format parsing ourselves. test_strings_mod_and_format
# has good coverage for this phase to make sure we get it right and behave
# exactly the same way as standard Python does.
#
# For "4" we monkey-patch bytes.__repr__ to repr bytes as strings when called
# from under bstr.__mod__(). See _bstringify for details. And as a fallback on
# PyPy, where patching bytes.__repr__ is not possible, we stringify the object
# ourselves via __bstringify with all its, potentially internal, bytes
# instances treated as unicode (XXX pypy: not clone).
#
# For "5", similarly to "4", we rely on adjustments to bytes.__repr__ .
# See _bstringify_repr for details.
#
# See also overview of patching bytes.{__repr__,__str__} near _bstringify.
cdef object _missing = object()
cdef object _atidx_re = pyre.compile('.* at index ([0-9]+)$')
cdef _bprintf(const uint8_t[::1] fmt, xarg): # -> pybstr
cdef bytearray out = bytearray()
cdef tuple argv = None # if xarg is tuple
cdef object argm = None # if xarg is mapping
# https://github.com/python/cpython/blob/2.7-0-g8d21aa21f2c/Objects/stringobject.c#L4298-L4300
# https://github.com/python/cpython/blob/v3.11.0b1-171-g70aa1b9b912/Objects/unicodeobject.c#L14319-L14320
if _XPyMapping_Check(xarg) and \
(not isinstance(xarg, tuple)) and \
(not isinstance(xarg, (bytes,unicode))):
argm = xarg
if isinstance(xarg, tuple):
argv = xarg
xarg = _missing
#print()
#print('argv:', argv)
#print('argm:', argm)
#print('xarg:', xarg)
cdef int argv_idx = 0
def nextarg():
nonlocal argv_idx, xarg
# NOTE for `'%s %(x)s' % {'x':1}` python gives "{'x': 1} 1"
# -> so we avoid argm check completely here
#if argm is not None:
if 0:
raise ValueError('mixing dict/tuple')
elif argv is not None:
# tuple xarg
if argv_idx < len(argv):
arg = argv[argv_idx]
argv_idx += 1
return arg
elif xarg is not _missing:
# sole xarg
arg = xarg
xarg = _missing
return arg
raise TypeError('not enough arguments for format string')
def badf():
raise ValueError('incomplete format')
# parse format string locating formatting specifiers
# if we see %s/%r - use _bstringify
# else use builtin %-formatting
#
# %[(name)][flags][width|*][.[prec|*]][len](type)
#
# https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting
# https://github.com/python/cpython/blob/2.7-0-g8d21aa21f2c/Objects/stringobject.c#L4266-L4765
#
# Rejected alternative: try to format; if we get "TypeError: %b requires a
# bytes-like object ..." retry with that argument converted to bstr.
#
# Rejected because e.g. for `'%(x)s %(x)r' % {'x': obj}` we need to use the
# access number instead of the key 'x' to determine which accesses to
# bstringify. We could do that, but unfortunately on Python2 the access
# number is not easily predictable because the string could be upgraded to
# unicode in the midst of being formatted and so some access keys will be
# accessed more than once.
#
# Another reason for rejection: b'%r' and u'%r' handle arguments
# differently - on b %r is aliased to %a.
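# Illustrative parse examples (a sketch of how the loop below decomposes specifiers):
#
#   b'%(x)-10.4f' % {'x': 1.5}  -> name='x' flags='-' width=10 prec=4 type='f' -> bytes.__mod__
#   b'%s'         % (obj,)      -> type='s'  -> _bstringify(obj), then bytes.__mod__
#   b'%%'         % ()          -> emitted verbatim as '%'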
cdef int i = 0
cdef int l = len(fmt)
cdef uint8_t c
while i < l:
c = fmt[i]
i += 1
if c != ord('%'):
out.append(c)
continue
fmt_istart = i-1
nameb = _missing
width = _missing
prec = _missing
value = _missing
# `c = fmt_nextchar()` avoiding https://github.com/cython/cython/issues/4798
if i >= l: badf()
c = fmt[i]; i += 1
# (name)
if c == ord('('):
#print('(name)')
if argm is None:
raise TypeError('format requires a mapping')
nparen = 1
nameb = b''
while 1:
if i >= l:
raise ValueError('incomplete format key')
c = fmt[i]; i += 1
if c == ord('('):
nparen += 1
elif c == ord(')'):
nparen -= 1
if i >= l: badf()
c = fmt[i]; i += 1
break
else:
nameb += bchr(c)
# flags
while chr(c) in '#0- +':
#print('flags')
if i >= l: badf()
c = fmt[i]; i += 1
# [width|*]
if c == ord('*'):
#print('*width')
width = nextarg()
if i >= l: badf()
c = fmt[i]; i += 1
else:
while chr(c).isdigit():
#print('width')
if i >= l: badf()
c = fmt[i]; i += 1
# [.prec|*]
if c == ord('.'):
#print('dot')
if i >= l: badf()
c = fmt[i]; i += 1
if c == ord('*'):
#print('.*')
prec = nextarg()
if i >= l: badf()
c = fmt[i]; i += 1
else:
while chr(c).isdigit():
#print('.prec')
if i >= l: badf()
c = fmt[i]; i += 1
# [len]
while chr(c) in 'hlL':
#print('len')
if i >= l: badf()
c = fmt[i]; i += 1
fmt_type = c
#print('fmt_type:', repr(chr(fmt_type)))
if fmt_type == ord('%'):
if i-2 == fmt_istart: # %%
out.append(ord('%')) # NOTE ord - bytearray.append needs an int on py3
continue
if nameb is not _missing:
xarg = _missing # `'%(x)s %s' % {'x':1}` raises "not enough arguments"
nameu = _utf8_decode_surrogateescape(nameb)
try:
value = argm[nameb]
except KeyError:
# retry with changing key via bytes <-> unicode
# e.g. for `b('%(x)s') % {'x': ...}` builtin bytes.__mod__ will
# extract b'x' as key and raise KeyError: b'x'. We avoid that via
# retrying with second string type for key.
value = argm[nameu]
else:
# NOTE for `'%4%' % ()` python raises "not enough arguments ..."
#if fmt_type != ord('%'):
if 1:
value = nextarg()
if fmt_type == ord('%'):
raise ValueError("unsupported format character '%s' (0x%x) at index %i" % (chr(c), c, i-1))
fmt1 = memoryview(fmt[fmt_istart:i]).tobytes()
#print('fmt_istart:', fmt_istart)
#print('i: ', i)
#print(' ~> __mod__ ', repr(fmt1))
# bytes %r is an alias of %a (ASCII), but we want unicode-like %r
# -> handle it ourselves
if fmt_type == ord('r'):
value = pyb(_bstringify_repr(value))
fmt_type = ord('s')
fmt1 = fmt1[:-1] + b's'
elif fmt_type == ord('s'):
# %s -> feed value through _bstringify
# this also converts e.g. int to bstr, else e.g. on `b'%s' % 123` python
# complains '%b requires a bytes-like object ...'
value = pyb(_bstringify(value))
if nameb is not _missing:
arg = {nameb: value, nameu: value}
else:
t = []
if width is not _missing: t.append(width)
if prec is not _missing: t.append(prec)
if value is not _missing: t.append(value)
t = tuple(t)
arg = t
#print('--> __mod__ ', repr(fmt1), ' % ', repr(arg))
try:
s = bytes.__mod__(fmt1, arg)
except ValueError as e:
# adjust position in '... at index <idx>' from fmt1 to fmt
if len(e.args) == 1:
a = e.args[0]
m = _atidx_re.match(a)
if m is not None:
a = a[:m.start(1)] + str(i-1)
e.args = (a,)
raise
out.extend(s)
if argm is None:
#print('END')
#print('argv:', argv, 'argv_idx:', argv_idx, 'xarg:', xarg)
if (argv is not None and argv_idx != len(argv)) or (xarg is not _missing):
raise TypeError("not all arguments converted during string formatting")
return pybstr(out)
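# Illustrative end-to-end examples (cf. the overview above and test_strings_mod_and_format):
#
#   b('α %s')    % (b'\xce\xb2',)       -> b('α β')     # %s argument stringified via _bstringify
#   b('%(x)s')   % {'x': b'\xce\xb2'}   -> b('β')       # mapping key retried as both bytes and unicode
#   b('%d byte') % 1                    -> b('1 byte')  # non-%s/%r specifiers go through bytes.__mod__ directly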
# ---- .format formatting ----
# Handling .format is easier than, and similar to, %-formatting: we detect fields to
# format as strings by using a custom string.Formatter (see _BFormatter), and
# further treat objects to stringify similarly to how %-formatting does for %s and %r.
#
# We do not need to implement format parsing ourselves, because
# string.Formatter provides it.
# _bvformat implements .format for pybstr/pyustr.
cdef _bvformat(fmt, args, kw):
return _BFormatter().vformat(fmt, args, kw)
class _BFormatter(pystring.Formatter):
def format_field(self, v, fmtspec):
#print('format_field', repr(v), repr(fmtspec))
# {} on bytes/bytearray -> treat it as bytestring
if type(v) in (bytes, bytearray):
v = pyb(v)
#print(' ~ ', repr(v))
# if the object contains bytes inside, e.g. as in [b'β'] - treat those
# internal bytes also as bytestrings
_bstringify_enter()
try:
#return super(_BFormatter, self).format_field(v, fmtspec)
x = super(_BFormatter, self).format_field(v, fmtspec)
finally:
_bstringify_leave()
#print(' ->', repr(x))
if PY_MAJOR_VERSION < 3: # py2 Formatter._vformat does ''.join(result)
x = pyu(x) # -> we want everything in result to be unicode to avoid
# UnicodeDecodeError
return x
def convert_field(self, v, conv):
#print('convert_field', repr(v), repr(conv))
if conv == 's':
# string.Formatter does str(v) for 's'. we don't want that:
# py3: stringify, and especially treat bytes as bytestring
# py2: stringify, avoiding e.g. UnicodeEncodeError for str(unicode)
x = pyb(_bstringify(v))
elif conv == 'r':
# for bytes {!r} produces ASCII-only, but we want unicode-like !r for e.g. b'β'
# -> handle it ourselves
x = pyb(_bstringify_repr(v))
else:
x = super(_BFormatter, self).convert_field(v, conv)
#print(' ->', repr(x))
return x
# on py2 string.Formatter does not handle field autonumbering
# -> do it ourselves
if PY_MAJOR_VERSION < 3:
_autoidx = 0
_had_digit = False
def get_field(self, field_name, args, kwargs):
if field_name == '':
if self._had_digit:
raise ValueError("mixing explicit and auto numbered fields is forbidden")
field_name = str(self._autoidx)
self._autoidx += 1
elif field_name.isdigit():
self._had_digit = True
if self._autoidx != 0:
raise ValueError("mixing explicit and auto numbered fields is forbidden")
return super(_BFormatter, self).get_field(field_name, args, kwargs)
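# Illustrative examples (approximate, cf. test_strings_mod_and_format):
#
#   b('hello {}').format(b'\xce\xb2')   -> b('hello β')    # bytes argument treated as UTF-8 text
#   u('{!r}').format(b'\xce\xb2')       -> u("b'β'")       # py3; py2 gives u("'β'")
#   b('{x}').format(x=123)              -> b('123')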
# ---- misc ----
# _strhas returns whether unicode string type has specified method.
cdef bint _strhas(str meth) except *:
return hasattr(unicode, meth)
cdef object _xpyu_coerce(obj):
return _pyu_coerce(obj) if obj is not None else None
# _buffer_py2 returns buffer(obj) on py2 / fails on py3
cdef object _buffer_py2(object obj):
IF PY2: # cannot `if PY_MAJOR_VERSION < 3` because then cython errors
return buffer(obj) # "undeclared name not builtin: buffer"
ELSE:
raise AssertionError("must be called only on py2")
# ---- UTF-8 encode/decode ----
from six import unichr # py2: unichr py3: chr
from six import int2byte as bchr # py2: chr py3: lambda x: bytes((x,))
_rune_error = 0xFFFD # unicode replacement character
_ucs2_build = (sys.maxunicode == 0xffff) # ucs2
assert _ucs2_build or sys.maxunicode >= 0x0010ffff # or ucs4
# _utf8_decode_rune decodes next UTF8-character from byte string s.
#
# _utf8_decode_rune(s) -> (r, size)
def _utf8_decode_rune(const uint8_t[::1] s):
if len(s) == 0:
return _rune_error, 0
l = min(len(s), 4) # max size of a UTF-8 encoded character
while l > 0:
_ = memoryview(s[:l]).tobytes()
try:
r = _.decode('utf-8', 'strict')
except UnicodeDecodeError:
l -= 1
continue
if len(r) == 1:
return ord(r), l
# see comment in _utf8_encode_surrogateescape
if _ucs2_build and len(r) == 2:
try:
return _xuniord(r), l
# e.g. TypeError: ord() expected a character, but string of length 2 found
# ValueError: only single character unicode strings can be converted to Py_UCS4, got length 2
except (TypeError, ValueError):
l -= 1
continue
l -= 1
continue
# invalid UTF-8
return _rune_error, 1
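# Illustrative examples:
#
#   _utf8_decode_rune(b'\xd0\xbc\xd0\xb8\xd1\x80')  -> (0x043c, 2)    # 'м' decoded from the leading 2 bytes
#   _utf8_decode_rune(b'\xff')                      -> (0xfffd, 1)    # invalid UTF-8 -> replacement rune, size 1
#   _utf8_decode_rune(b'')                          -> (0xfffd, 0)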
# _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3.
def _utf8_decode_surrogateescape(const uint8_t[::1] s): # -> unicode
if PY_MAJOR_VERSION >= 3:
return bytearray(s).decode('UTF-8', 'surrogateescape')
# py2 does not have surrogateescape error handler, and even if we
# provide one, builtin bytes.decode() does not treat surrogate
# sequences as error. -> Do the decoding ourselves.
outv = []
emit = outv.append
while len(s) > 0:
r, width = _utf8_decode_rune(s)
if r == _rune_error:
b = s[0]
assert 0x80 <= b <= 0xff, b
emit(unichr(0xdc00 + b))
# python2 "correctly" decodes surrogates - don't allow that as
# surrogates are not valid UTF-8:
# https://github.com/python/cpython/blob/v3.8.1-118-gdbb37aac142/Objects/stringlib/codecs.h#L153-L157
# (python3 raises UnicodeDecodeError for surrogates)
elif 0xd800 <= r <= 0xdfff:
for c in s[:width]:
if c >= 0x80:
emit(unichr(0xdc00 + c))
else:
emit(unichr(c))
else:
emit(_xunichr(r))
s = s[width:]
return u''.join(outv)
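# Illustrative example: KOI8-R encoded 'мир' is not valid UTF-8, but decoding still
# never fails - the bad bytes are mapped into the U+DCxx surrogate range:
#
#   _utf8_decode_surrogateescape(b'\xcd\xc9\xd2')  -> u'\udccd\udcc9\udcd2'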
# _utf8_encode_surrogateescape mimics s.encode('utf-8', 'surrogateescape') from py3.
def _utf8_encode_surrogateescape(s): # -> bytes
assert isinstance(s, unicode)
if PY_MAJOR_VERSION >= 3:
return unicode.encode(s, 'UTF-8', 'surrogateescape')
# py2 does not have surrogateescape error handler, and even if we
# provide one, builtin unicode.encode() does not treat
# \udc80-\udcff as error. -> Do the encoding ourselves.
outv = []
emit = outv.append
while len(s) > 0:
uc = s[0]; s = s[1:]
c = ord(uc)
if 0xdc80 <= c <= 0xdcff:
# surrogate - emit unescaped byte
emit(bchr(c & 0xff))
continue
# in builds with --enable-unicode=ucs2 (default for py2 on macos and windows)
# python represents unicode points > 0xffff as _two_ unicode characters:
#
# uh = u - 0x10000
# c1 = 0xd800 + (uh >> 10) ; [d800, dbff]
# c2 = 0xdc00 + (uh & 0x3ff) ; [dc00, dfff]
#
# if detected - merge those two unicode characters for .encode('utf-8') below
#
# this should be only relevant for python2, as python3 switched to "flexible"
# internal unicode representation: https://www.python.org/dev/peps/pep-0393
if _ucs2_build and (0xd800 <= c <= 0xdbff):
if len(s) > 0:
uc2 = s[0]
c2 = ord(uc2)
if 0xdc00 <= c2 <= 0xdfff:
uc = uc + uc2
s = s[1:]
emit(uc.encode('utf-8', 'strict'))
return b''.join(outv)
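# Illustrative example: encoding is the exact inverse of the decoding above, so the
# original non-UTF-8 bytes are recovered bit-to-bit:
#
#   _utf8_encode_surrogateescape(u'\udccd\udcc9\udcd2')  -> b'\xcd\xc9\xd2'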
# _xuniord returns ordinal for a unicode character u.
#
# it works correctly even if u is represented as 2 unicode surrogate points on
# ucs2 python build.
if not _ucs2_build:
_xuniord = ord
else:
def _xuniord(u):
assert isinstance(u, unicode)
if len(u) == 1:
return ord(u)
# see _utf8_encode_surrogateescape for details
if len(u) == 2:
c1 = ord(u[0])
c2 = ord(u[1])
if (0xd800 <= c1 <= 0xdbff) and (0xdc00 <= c2 <= 0xdfff):
return 0x10000 | ((c1 - 0xd800) << 10) | (c2 - 0xdc00)
# let it crash
return ord(u)
# _xunichr returns unicode character for an ordinal i.
#
# it works correctly even on ucs2 python builds, where ordinals >= 0x10000 are
# represented as 2 unicode points.
if not _ucs2_build:
_xunichr = unichr
else:
def _xunichr(i):
if i < 0x10000:
return unichr(i)
# see _utf8_encode_surrogateescape for details
uh = i - 0x10000
return unichr(0xd800 + (uh >> 10)) + \
unichr(0xdc00 + (uh & 0x3ff))
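# Illustrative example: U+1F680 (🚀) is represented as a surrogate pair on UCS-2
# builds, and _xunichr/_xuniord round-trip it correctly there:
#
#   _xunichr(0x1f680)          -> u'\ud83d\ude80'   # ucs2 build; u'\U0001f680' on ucs4
#   _xuniord(u'\ud83d\ude80')  -> 0x1f680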
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
from __future__ import print_function, absolute_import
from golang import b, u, bstr, ustr, bbyte, uchr, func, defer, panic
from golang._golang import _udata, _bdata
from golang.gcompat import qq
from golang.strconv_test import byterange
from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE
from pytest import raises, mark, skip
import sys
import six
from six import text_type as unicode, unichr
import re, pickle, copy, types
import array, collections
# buftypes lists types with buffer interface that we will test against.
#
# NOTE bytearray is not included here - being bytes-like object it is handled
# and tested explicitly in tests that exercise interaction of bstr/ustr with
# bytes/unicode and bytearray.
buftypes = [
memoryview,
lambda x: array.array('B', x),
]
if six.PY2:
buftypes.append(buffer) # no buffer on py3
# verify b/u and bstr/ustr basics.
def test_strings_basic():
# UTF-8 encode/decode
testv = (
# bytes <-> unicode
(b'', u''),
(b'hello', u'hello'),
(b'hello\nworld', u'hello\nworld'),
(b'\xd0\xbc\xd0\xb8\xd1\x80', u'мир'),
# invalid utf-8
(b'\xd0', u'\udcd0'),
(b'a\xd0b', u'a\udcd0b'),
# invalid utf-8 with byte < 0x80
(b'\xe2\x28\xa1', u'\udce2(\udca1'),
# more invalid utf-8
# https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
(b"\xc3\x28", u'\udcc3('), # Invalid 2 Octet Sequence
(b"\xa0\xa1", u'\udca0\udca1'), # Invalid Sequence Identifier
(b"\xe2\x82\xa1", u'\u20a1'), # Valid 3 Octet Sequence '₡'
(b"\xe2\x28\xa1", u'\udce2(\udca1'), # Invalid 3 Octet Sequence (in 2nd Octet)
(b"\xe2\x82\x28", u'\udce2\udc82('), # Invalid 3 Octet Sequence (in 3rd Octet)
(b"\xf0\x90\x8c\xbc", u'\U0001033c'), # Valid 4 Octet Sequence '𐌼'
(b"\xf0\x28\x8c\xbc", u'\udcf0(\udc8c\udcbc'), # Invalid 4 Octet Sequence (in 2nd Octet)
(b"\xf0\x90\x28\xbc", u'\udcf0\udc90(\udcbc'), # Invalid 4 Octet Sequence (in 3rd Octet)
(b"\xf0\x28\x8c\x28", u'\udcf0(\udc8c('), # Invalid 4 Octet Sequence (in 4th Octet)
(b"\xf8\xa1\xa1\xa1\xa1", # Valid 5 Octet Sequence (but not Unicode!)
u'\udcf8\udca1\udca1\udca1\udca1'),
(b"\xfc\xa1\xa1\xa1\xa1\xa1", # Valid 6 Octet Sequence (but not Unicode!)
u'\udcfc\udca1\udca1\udca1\udca1\udca1'),
# surrogate
(b'\xed\xa0\x80', u'\udced\udca0\udc80'),
# x00 - x1f
(byterange(0,32),
u"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
u"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),
# non-printable utf-8
(b'\x7f\xc2\x80\xc2\x81\xc2\x82\xc2\x83\xc2\x84\xc2\x85\xc2\x86\xc2\x87',
u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"),
# some characters with U >= 0x10000
(b'\xf0\x9f\x99\x8f', u'\U0001f64f'), # 🙏
(b'\xf0\x9f\x9a\x80', u'\U0001f680'), # 🚀
)
for tbytes, tunicode in testv:
assert type(tbytes) is bytes
assert type(tunicode) is unicode
# b(bytes), u(unicode)
b_tbytes = b(tbytes); assert type(b_tbytes) is bstr
b_tbytes_ = _bdata(b_tbytes); assert type(b_tbytes_) is bytes
u_tunicode = u(tunicode); assert type(u_tunicode) is ustr
u_tunicode_ = _udata(u_tunicode); assert type(u_tunicode_) is unicode
assert b_tbytes_ == tbytes
assert u_tunicode_ == tunicode
# b(unicode), u(bytes)
b_tunicode = b(tunicode); assert type(b_tunicode) is bstr
b_tunicode_ = _bdata(b_tunicode); assert type(b_tunicode_) is bytes
u_tbytes = u(tbytes); assert type(u_tbytes) is ustr
u_tbytes_ = _udata(u_tbytes); assert type(u_tbytes_) is unicode
assert b_tunicode_ == tbytes
assert u_tbytes_ == tunicode
# b(u(bytes)), u(b(unicode))
bu_tbytes = b(u(tbytes)); assert type(bu_tbytes) is bstr
bu_tbytes_ = _bdata(bu_tbytes); assert type(bu_tbytes_) is bytes
ub_tunicode = u(b(tunicode)); assert type(ub_tunicode) is ustr
ub_tunicode_= _udata(ub_tunicode); assert type(ub_tunicode_)is unicode
assert bu_tbytes_ == tbytes
assert ub_tunicode_ == tunicode
# b/u accept only ~bytes/~unicode/bytearray/buffer
with raises(TypeError): b()
with raises(TypeError): u()
with raises(TypeError): b(123)
with raises(TypeError): u(123)
with raises(TypeError): b([1,'β'])
with raises(TypeError): u([1,'β'])
with raises(TypeError): b(object())
with raises(TypeError): u(object())
# bstr/ustr - similarly to str - accept arbitrary objects
_ = bstr(); assert type(_) is bstr; assert _ == ''
_ = ustr(); assert type(_) is ustr; assert _ == ''
_ = bstr(123); assert type(_) is bstr; assert _ == '123'
_ = ustr(123); assert type(_) is ustr; assert _ == '123'
_ = bstr([1,'β']); assert type(_) is bstr; assert _ == "[1, 'β']"
_ = ustr([1,'β']); assert type(_) is ustr; assert _ == "[1, 'β']"
obj = object()
_ = bstr(obj); assert type(_) is bstr; assert _ == str(obj) # <object ...>
_ = ustr(obj); assert type(_) is ustr; assert _ == str(obj) # <object ...>
# when stringifying they also handle bytes/bytearray inside containers as UTF-8 strings
_ = bstr([xunicode( 'β')]); assert type(_) is bstr; assert _ == "['β']"
_ = ustr([xunicode( 'β')]); assert type(_) is ustr; assert _ == "['β']"
_ = bstr([xbytes( 'β')]); assert type(_) is bstr; assert _ == "['β']"
_ = ustr([xbytes( 'β')]); assert type(_) is ustr; assert _ == "['β']"
_ = bstr([xbytearray('β')]); assert type(_) is bstr; assert _ == "['β']"
_ = ustr([xbytearray('β')]); assert type(_) is ustr; assert _ == "['β']"
b_ = xbytes ("мир"); assert type(b_) is bytes
u_ = xunicode ("мир"); assert type(u_) is unicode
ba_ = xbytearray("мир"); assert type(ba_) is bytearray
# b/u from unicode
bs = b(u_); assert isinstance(bs, bytes); assert type(bs) is bstr
us = u(u_); assert isinstance(us, unicode); assert type(us) is ustr
_ = bstr(u_); assert type(_) is bstr; assert _ == "мир"
_ = ustr(u_); assert type(_) is ustr; assert _ == "мир"
# b/u from bytes
_ = b(b_); assert type(_) is bstr; assert _ == "мир"
_ = u(b_); assert type(_) is ustr; assert _ == "мир"
_ = bstr(b_); assert type(_) is bstr; assert _ == "мир"
_ = ustr(b_); assert type(_) is ustr; assert _ == "мир"
# b/u from bytearray
_ = b(ba_); assert type(_) is bstr; assert _ == "мир"
_ = u(ba_); assert type(_) is ustr; assert _ == "мир"
_ = bstr(ba_); assert type(_) is bstr; assert _ == "мир"
_ = ustr(ba_); assert type(_) is ustr; assert _ == "мир"
# b/u from buffer
for tbuf in buftypes:
bbuf_ = tbuf(b_)
bbuf_std_str = str(bbuf_) # e.g. '<memory at ...>' for memoryview
_ = b(bbuf_); assert type(_) is bstr; assert _ == "мир"
_ = u(bbuf_); assert type(_) is ustr; assert _ == "мир"
_ = bstr(bbuf_); assert type(_) is bstr; assert _ == bbuf_std_str # NOTE not 'мир'
_ = ustr(bbuf_); assert type(_) is ustr; assert _ == bbuf_std_str
# bstr/ustr from bytes/bytearray/buffer with encoding
k8mir_bytes = u"мир".encode('koi8-r')
for tbuf in [bytes, bytearray] + buftypes:
k8mir = tbuf(k8mir_bytes)
_ = bstr(k8mir, 'koi8-r'); assert type(_) is bstr; assert _ == "мир"
_ = ustr(k8mir, 'koi8-r'); assert type(_) is ustr; assert _ == "мир"
with raises(UnicodeDecodeError): bstr(k8mir, 'ascii')
with raises(UnicodeDecodeError): ustr(k8mir, 'ascii')
_ = bstr(k8mir, 'ascii', 'replace'); assert type(_) is bstr; assert _ == u'\ufffd\ufffd\ufffd'
_ = ustr(k8mir, 'ascii', 'replace'); assert type(_) is ustr; assert _ == u'\ufffd\ufffd\ufffd'
# no encoding -> utf8 with surrogateescape for bytes/bytearray, stringify for the rest
k8mir_usurrogateescape = u'\udccd\udcc9\udcd2'
k8mir_strok = k8mir_usurrogateescape
if not tbuf in (bytes, bytearray):
k8mir_strok = str(k8mir) # e.g. '<memory at ...>' for memoryview
_ = bstr(k8mir); assert type(_) is bstr; assert _ == k8mir_strok
_ = ustr(k8mir); assert type(_) is ustr; assert _ == k8mir_strok
_ = b (k8mir); assert type(_) is bstr; assert _ == k8mir_usurrogateescape # always surrogateescape
_ = u (k8mir); assert type(_) is ustr; assert _ == k8mir_usurrogateescape
# encoding specified -> treat it precisely
with raises(UnicodeDecodeError): bstr(k8mir, 'utf-8')
with raises(UnicodeDecodeError): ustr(k8mir, 'utf-8')
with raises(UnicodeDecodeError): bstr(k8mir, encoding='utf-8')
with raises(UnicodeDecodeError): ustr(k8mir, encoding='utf-8')
with raises(UnicodeDecodeError): bstr(k8mir, errors='strict')
with raises(UnicodeDecodeError): ustr(k8mir, errors='strict')
# b(b(·)) = identity, u(u(·)) = identity
assert b(bs) is bs; assert bstr(bs) is bs
assert u(us) is us; assert ustr(us) is us
# bytes(b(·)) = identity, unicode(u(·)) = identity
assert bytes (bs) is bs
assert unicode(us) is us
# unicode(b) -> u, bytes(u) -> b
_ = unicode(bs); assert type(_) is ustr; assert _ == "мир"
_ = bytes (us); assert type(_) is bstr; assert _ == "мир"
# bytearray(b|u) -> bytearray
_ = bytearray(bs); assert type(_) is bytearray; assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
_ = bytearray(us); assert type(_) is bytearray; assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
# b(u(·)), u(b(·))
_ = b(us); assert type(_) is bstr; assert _ == "мир"
_ = u(bs); assert type(_) is ustr; assert _ == "мир"
_ = bstr(us); assert type(_) is bstr; assert _ == "мир"
_ = ustr(bs); assert type(_) is ustr; assert _ == "мир"
# hash of b/u is made to be equal to hash of current str
# (it cannot be equal to hash(b'мир') and hash(u'мир') at the same time as those hashes differ)
assert hash(us) == hash("мир"); assert us == "мир"
assert hash(bs) == hash("мир"); assert bs == "мир"
# str/repr
_ = str(us); assert isinstance(_, str); assert _ == "мир"
_ = str(bs); assert isinstance(_, str); assert _ == "мир"
_ = repr(us); assert isinstance(_, str); assert _ == "u('мир')"
_ = repr(bs); assert isinstance(_, str); assert _ == "b('мир')"
# str/repr of non-valid utf8
b_hik8 = xbytes ('привет ')+b(k8mir_bytes); assert type(b_hik8) is bstr
u_hik8 = xunicode('привет ')+u(k8mir_bytes); assert type(u_hik8) is ustr
assert _bdata(b_hik8) == b'\xd0\xbf\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82 \xcd\xc9\xd2'
assert _udata(u_hik8) == u'привет \udccd\udcc9\udcd2'
_ = str(u_hik8); assert isinstance(_, str); assert _ == xbytes('привет ')+b'\xcd\xc9\xd2'
_ = str(b_hik8); assert isinstance(_, str); assert _ == xbytes('привет ')+b'\xcd\xc9\xd2'
_ = repr(u_hik8); assert isinstance(_, str); assert _ == r"u(b'привет \xcd\xc9\xd2')"
_ = repr(b_hik8); assert isinstance(_, str); assert _ == r"b(b'привет \xcd\xc9\xd2')"
# str/repr of quotes
def _(text, breprok, ureprok):
bt = b(text); assert type(bt) is bstr
ut = u(text); assert type(ut) is ustr
_ = str(bt); assert isinstance(_, str); assert _ == text
_ = str(ut); assert isinstance(_, str); assert _ == text
_ = repr(bt); assert isinstance(_, str); assert _ == breprok
_ = repr(ut); assert isinstance(_, str); assert _ == ureprok
_('', "b('')", "u('')")
_('"', "b('\"')", "u('\"')")
_("'", 'b("\'")', 'u("\'")')
_('"\'', "b('\"\\'')", "u('\"\\'')")
_('"α" \'β\'', "b('\"α\" \\\\'')", "u('\"α\" \\\\'')")
# custom attributes cannot be injected to bstr/ustr
if not ('PyPy' in sys.version): # https://foss.heptapod.net/pypy/pypy/issues/2763
with raises(AttributeError):
us.hello = 1
with raises(AttributeError):
bs.hello = 1
# verify memoryview(bstr|ustr).
def test_strings_memoryview():
bs = b('мир')
us = u('май')
with raises(TypeError):
memoryview(us)
m = memoryview(bs)
assert len(m) == 6
def _(i): # returns m[i] as int
x = m[i]
if six.PY2: # on py2 memoryview[i] returns bytechar
x = ord(x)
return x
assert _(0) == 0xd0
assert _(1) == 0xbc
assert _(2) == 0xd0
assert _(3) == 0xb8
assert _(4) == 0xd1
assert _(5) == 0x80
# verify that bstr/ustr can be pickled/unpickled correctly.
def test_strings_pickle():
bs = b("мир")
us = u("май")
#from pickletools import dis
for proto in range(0, pickle.HIGHEST_PROTOCOL+1):
p_bs = pickle.dumps(bs, proto)
#dis(p_bs)
bs_ = pickle.loads(p_bs)
assert type(bs_) is bstr
assert bs_ == bs
p_us = pickle.dumps(us, proto)
#dis(p_us)
us_ = pickle.loads(p_us)
assert type(us_) is ustr
assert us_ == us
# verify that ord on bstr/ustr works as expected.
# XXX place
def test_strings_ord():
with raises(TypeError): ord(b(''))
with raises(TypeError): ord(u(''))
with raises(TypeError): ord(b('ab'))
with raises(TypeError): ord(u('ab'))
assert ord(b('a')) == 97
assert ord(u('a')) == 97
with raises(TypeError): ord(b('м')) # 2 bytes, not 1
assert ord(u('м')) == 1084
for i in range(0x100):
bc = b(bytearray([i]))
assert len(bc) == 1
assert ord(bc) == i
for i in range(0x10000):
uc = u(unichr(i))
assert len(uc) == 1
assert ord(uc) == i
# verify bbyte.
# XXX place
def test_strings_bbyte():
with raises(ValueError): bbyte(-1)
with raises(ValueError): bbyte(0x100)
for i in range(0x100):
bi = bbyte(i)
assert type(bi) is bstr
assert len(bi) == 1
assert ord(bi) == i
assert bi == bytearray([i])
# verify uchr.
# XXX place
def test_strings_uchr():
with raises(ValueError): unichr(-1)
# the upper limit depends on whether python was built with UCS-2 or UCS-4 unicode,
# but at least it should all work for the small 2-byte range
for i in range(0x10000):
ui = uchr(i)
assert type(ui) is ustr
assert len(ui) == 1
assert ord(ui) == i
assert ui == unichr(i)
# verify strings access by index.
def test_strings_index():
us = u("миру мир"); u_ = u"миру мир"
bs = b("миру мир"); b_ = xbytes("миру мир")
assert len(us) == 8; assert len(u_) == 8
assert len(bs) == 15; assert len(b_) == 15
# u/unicode [idx] -> unicode character
def uidx(i):
x = us[i]; assert type(x) is ustr
y = u_[i]; assert type(y) is unicode
assert x == y
return x
for i, x in enumerate(['м','и','р','у',' ','м','и','р']):
assert uidx(i) == x
# b/bytes [idx] -> bytechar of byte value @ position idx
def bidx(i):
x = bs[i]; assert type(x) is bstr; assert len(x) == 1
y = b_[i]
if six.PY3:
y = bytes([y]) # on py3 bytes[i] returns int instead of 1-byte string
assert type(y) is bytes; assert len(y) == 1
assert x == y
return x
for i, x in enumerate([0xd0,0xbc,0xd0,0xb8,0xd1,0x80,0xd1,0x83,0x20,0xd0,0xbc,0xd0,0xb8,0xd1,0x80]):
assert bidx(i) == bytearray([x])
# u/unicode [:] -> unicode string
class USlice:
def __getitem__(self, key):
x = us[key]; assert type(x) is ustr
y = u_[key]; assert type(y) is unicode
assert x == y
return x
def __len__(self): # py2
x = len(us)
y = len(u_)
assert x == y
return x
_ = USlice()
assert _[:] == u"миру мир"
assert _[1:] == u"иру мир"
assert _[:-1] == u"миру ми"
assert _[2:5] == u"ру "
assert _[1:-1:2]== u"иум"
# b/bytes [:] -> bytestring
class BSlice:
def __getitem__(self, key):
x = bs[key]; assert type(x) is bstr
y = b_[key]; assert type(y) is bytes
assert x == y
return x
def __len__(self): # py2
x = len(bs)
y = len(b_)
assert x == y
return x
_ = BSlice()
assert _[:] == "миру мир"
assert _[1:] == b'\xbc\xd0\xb8\xd1\x80\xd1\x83 \xd0\xbc\xd0\xb8\xd1\x80'
assert _[:-1] == b'\xd0\xbc\xd0\xb8\xd1\x80\xd1\x83 \xd0\xbc\xd0\xb8\xd1'
assert _[3:12] == b'\xb8\xd1\x80\xd1\x83 \xd0\xbc\xd0'
assert _[1:-1:2]== b'\xbc\xb8\x80\x83\xd0\xd0\xd1'
# u/unicode: index/rindex/find/rfind return character-position
# methods that accept start/stop also treat them as character position
#
# b/bytes: index/rindex/find/rfind return byte-position
# methods that accept start/stop also treat them as byte-position
#
# b/u: methods do not automatically coerce buffers to strings
class CheckOp:
def __init__(self, xs, x_, str2std):
self.xs = xs
self.x_ = x_
self.str2std = str2std
def __getattr__(self, meth):
def _(*argv):
argv_ = deepReplaceStr(argv, self.str2std)
x = xcall(self.xs, meth, *argv)
y = xcall(self.x_, meth, *argv_)
assert type(x) is type(y)
if isinstance(x, Exception):
assert str(x) == str(y) # ValueError('x') == ValueError('x') is false
else:
assert x == y
# also test xs.meth(unicode|bytes|bytearray | bstr|ustr)
for zt in [xunicode, xbytes, xbytearray, b, u]:
argv_z = deepReplaceStr(argv, zt)
z = xcall(self.xs, meth, *argv_z)
assert type(z) is type(x)
if isinstance(x, Exception):
assert str(z) == str(x)
else:
assert z == x
# buffers should not be accepted
for tbuf in buftypes:
have_m = [False]
def _(s):
have_m[0] = True
return tbuf(xbytes(s))
argv_m = deepReplaceStr(argv, _)
if have_m[0]:
with raises(TypeError):
getattr(self.xs, meth)(*argv_m)
return x
return _
U = CheckOp(us, u_, xunicode)
B = CheckOp(bs, b_, xbytes)
assert U.count("α") == 0
assert B.count("α") == 0
assert U.count("и") == 2
assert B.count("и") == 2
assert U.count("ир") == 2
assert B.count("ир") == 2
assert U.count("ир", 2) == 1
assert B.count("ир", 2) == 2
assert U.count("ир", 2, 7) == 0
assert B.count("ир", 2, 7) == 1
assert U.find("α") == -1
assert B.find("α") == -1
assert U.find("ир") == 1
assert B.find("ир") == 2
assert U.find("ир", 2) == 6
assert B.find("ир", 2) == 2
assert U.find("ир", 2, 7) == -1
assert B.find("ир", 2, 7) == 2
assert U.rfind("α") == -1
assert B.rfind("α") == -1
assert U.rfind("ир") == 6
assert B.rfind("ир") == 11
assert U.rfind("ир", 2) == 6
assert B.rfind("ир", 2) == 11
assert U.rfind("ир", 2, 7) == -1
assert B.rfind("ир", 2, 7) == 2
_ = U.index("α"); assert isinstance(_, ValueError)
_ = B.index("α"); assert isinstance(_, ValueError)
assert U.index("ир") == 1
assert B.index("ир") == 2
assert U.index("ир", 2) == 6
assert B.index("ир", 2) == 2
_ = U.index("ир", 2, 7); assert isinstance(_, ValueError)
assert B.index("ир", 2, 7) == 2
_ = U.rindex("α"); assert isinstance(_, ValueError)
_ = B.rindex("α"); assert isinstance(_, ValueError)
assert U.rindex("ир") == 6
assert B.rindex("ир") == 11
assert U.rindex("ир", 2) == 6
assert B.rindex("ир", 2) == 11
_ = U.rindex("ир", 2, 7); assert isinstance(_, ValueError)
assert B.rindex("ир", 2, 7) == 2
assert U.startswith("α") == False
assert B.startswith("α") == False
assert U.startswith("мир") == True
assert B.startswith("мир") == True
assert U.startswith("мир", 5) == True
assert B.startswith("мир", 5) == False
assert U.startswith("мир", 5, 7)== False
assert B.startswith("мир", 5, 7)== False
assert U.startswith(()) == False
assert B.startswith(()) == False
assert U.startswith(("α",)) == False
assert B.startswith(("α",)) == False
assert U.startswith(("α","β")) == False
assert B.startswith(("α","β")) == False
assert U.startswith(("α","β","ир")) == False
assert B.startswith(("α","β","ир")) == False
assert U.startswith(("α","β","мир")) == True
assert B.startswith(("α","β","мир")) == True
assert U.endswith("α") == False
assert B.endswith("α") == False
assert U.endswith("мир") == True
assert B.endswith("мир") == True
assert U.endswith("мир", 2) == True
assert B.endswith("мир", 2) == True
assert U.endswith("мир", 2, 7) == False
assert B.endswith("мир", 2, 7) == False
assert U.endswith("мир", None, 3) == True
assert B.endswith("мир", None, 3) == False
assert U.endswith("мир", None, 6) == False
assert B.endswith("мир", None, 6) == True
assert U.endswith(()) == False
assert B.endswith(()) == False
assert U.endswith(("α",)) == False
assert B.endswith(("α",)) == False
assert U.endswith(("α","β")) == False
assert B.endswith(("α","β")) == False
assert U.endswith(("α","β","ир")) == True
assert B.endswith(("α","β","ир")) == True
assert U.endswith(("α","β","мир")) == True
assert B.endswith(("α","β","мир")) == True
# verify strings iteration.
def test_strings_iter():
us = u("миру мир"); u_ = u"миру мир"
bs = b("миру мир")
# iter( b/u/unicode ) -> iterate unicode characters
# NOTE that iter(b) too yields unicode characters - not integers
bi = iter(bs)
ui = iter(us)
ui_ = iter(u_)
class XIter:
def __iter__(self):
return self
def __next__(self, missing=object):
x = next(bi, missing)
y = next(ui, missing)
z = next(ui_, missing)
assert type(x) is type(y)
if x is not missing:
assert type(x) is ustr
if z is not missing:
assert type(z) is unicode
assert x == y
assert y == z
if x is missing:
raise StopIteration
return x
next = __next__ # py2
assert list(XIter()) == ['м','и','р','у',' ','м','и','р']
# verify .encode/.decode .
def test_strings_encodedecode():
us = u('мир')
bs = b('май')
# TODO also raise AttributeError on .encode/.decode lookup on classes
assert hasattr(us, 'encode') ; assert hasattr(ustr, 'encode')
assert not hasattr(bs, 'encode') #; assert not hasattr(bstr, 'encode')
assert not hasattr(us, 'decode') #; assert not hasattr(ustr, 'decode')
assert hasattr(bs, 'decode') ; assert hasattr(bstr, 'decode')
_ = us.encode(); assert type(_) is bstr; assert _bdata(_) == xbytes('мир')
_ = us.encode('utf-8'); assert type(_) is bstr; assert _bdata(_) == xbytes('мир')
_ = bs.decode(); assert type(_) is ustr; assert _udata(_) == u'май'
_ = bs.decode('utf-8'); assert type(_) is ustr; assert _udata(_) == u'май'
# !utf-8
k8mir = u'мир'.encode('koi8-r')
b_k8mir = b(k8mir)
assert type(b_k8mir) is bstr
assert _bdata(b_k8mir) == k8mir
assert _bdata(b_k8mir) == b'\xcd\xc9\xd2'
_ = b_k8mir.decode('koi8-r')
assert type(_) is ustr
assert _udata(_) == u'мир'
b_cpmir = us.encode('cp1251')
assert type(b_cpmir) is bstr
assert _bdata(b_cpmir) == u'мир'.encode('cp1251')
assert _bdata(b_cpmir) == b'\xec\xe8\xf0'
# decode/encode errors
u_k8mir = b_k8mir.decode() # no decode error with
assert type(u_k8mir) is ustr # default parameters
assert _udata(u_k8mir) == u'\udccd\udcc9\udcd2'
_ = b_k8mir.decode('utf-8', 'surrogateescape') # no decode error with
assert type(_) is ustr # explicit utf-8/surrogateescape
assert _udata(_) == _udata(u_k8mir)
with raises(UnicodeDecodeError): # decode error if encoding is explicitly specified
b_k8mir.decode('utf-8')
with raises(UnicodeDecodeError):
b_k8mir.decode('utf-8', 'strict')
with raises(UnicodeDecodeError):
b_k8mir.decode('ascii')
with raises(UnicodeEncodeError):
us.encode('ascii')
_ = u_k8mir.encode() # no encode error with
assert type(_) is bstr # default parameters
assert _bdata(_) == k8mir
_ = u_k8mir.encode('utf-8', 'surrogateescape') # no encode error with
assert type(_) is bstr # explicit utf-8/surrogateescape
assert _bdata(_) == k8mir
# on py2 unicode.encode accepts surrogate pairs and does not complain
# TODO(?) manually implement encode/py2 and reject surrogate pairs by default
if six.PY3:
with raises(UnicodeEncodeError): # encode error if encoding is explicit specified
u_k8mir.encode('utf-8')
with raises(UnicodeEncodeError):
u_k8mir.encode('utf-8', 'strict')
with raises(UnicodeEncodeError):
u_k8mir.encode('ascii')
# verify string operations like `x * 3` for all cases from bytes, bytearray, unicode, bstr and ustr.
@mark.parametrize('tx', (bytes, unicode, bytearray, bstr, ustr))
def test_strings_ops1(tx):
x = xstr(u'мир', tx)
assert type(x) is tx
# *
_ = x * 3
assert type(_) is tx
assert xudata(_) == u'мирмирмир'
_ = 3 * x
assert type(_) is tx
assert xudata(_) == u'мирмирмир'
# *=
_ = x
_ *= 3
assert type(_) is tx
assert xudata(_) == u'мирмирмир'
assert _ is x if tx is bytearray else \
_ is not x
# verify string operations like `x + y` for all combinations of pairs from
# bytes, unicode, bstr, ustr and bytearray. Except if both x and y are std
# python types, e.g. (bytes, unicode), because those combinations are handled
# only by builtin python code and might be rejected.
@mark.parametrize('tx', (bytes, unicode, bstr, ustr, bytearray))
@mark.parametrize('ty', (bytes, unicode, bstr, ustr, bytearray))
def test_strings_ops2(tx, ty):
# skip e.g. regular bytes vs regular unicode
tstd = {bytes, unicode, bytearray}
if tx in tstd and ty in tstd and tx is not ty:
skip()
# == != <= >= < > for ~equal
x = xstr(u'мир', tx); assert type(x) is tx
y = xstr(u'мир', ty); assert type(y) is ty
assert x == y
assert y == x
assert not (x != y)
assert not (y != x)
assert x >= y
assert y >= x
assert x <= y
assert y <= x
assert not (x > y)
assert not (y > x)
assert not (x < y)
assert not (y < x)
# now not equal
x = xstr(u'hello ', tx)
y = xstr(u'мир', ty)
# == != <= >= < >
assert not (x == y)
assert not (y == x)
assert x != y
assert y != x
assert not (x >= y)
assert y >= x
assert x <= y
assert not (y <= x)
assert x < y
assert not (y < x)
assert not (x > y)
assert y > x
# +
#
# type(x + y) is determined by type(x):
# u() + * -> u()
# b() + * -> b()
# u'' + u()/b() -> u()
# u'' + u'' -> u''
# b'' + u()/b() -> b()
# b'' + b'' -> b''
# barr + u()/b() -> barr
if tx in (bstr, ustr):
tadd = tx
elif tx in (unicode, bytes):
if ty in (unicode, bytes, bytearray):
tadd = tx # we are skipping e.g. bytes + unicode
else:
assert ty in (bstr, ustr)
tadd = tbu(tx)
else:
assert tx is bytearray
tadd = tx
_ = x + y
assert type(_) is tadd
assert _ is not x; assert _ is not y
assert _ == xstr(u'hello мир', tadd)
# += (same typing rules as for +)
_ = x
_ += y
assert type(_) is tadd
assert _ == xstr(u'hello мир', tadd)
assert _ is x if tx is bytearray else \
_ is not x
# x % y (not tuple at right)
# ideally same typing rules as for +, but for `x=u'' y=b()` and `x=b'' y=u()`
# we can't make python call y.__rmod__ .
# see https://bugs.python.org/issue28598 for references where python implements this.
#
# NOTE python 3.11 reworked % handling to be generic - there we could
# probably make y.__rmod__ to be called via tweaking __subclasscheck__
# https://github.com/python/cpython/commit/ec382fac0db6
if tx in (bstr, ustr):
tmod = tx
elif tx in (unicode, bytes):
if ty in (unicode, bytes, bytearray):
tmod = tx
else:
assert ty in (bstr, ustr)
# on py2 str % (unicode|ustr) gives unicode
if six.PY2 and ty is ustr:
if tx is bytes:
tmod = unicode
else:
assert tx is unicode
tmod = ustr # ustr is subclass of unicode -> __rmod__ is called
else:
tmod = tx if tbu(tx) is not ty else \
tbu(tx)
else:
assert tx is bytearray
tmod = tx
x = xstr(u'hello %s', tx)
if six.PY2 and tx is bytearray: # bytearray/py2 does not support %
_ = xbytearray(bytes(x) % y)
else:
_ = x % y
assert type(_) is tmod
assert _ == xstr(u'hello мир', tmod)
assert _ is not x
# x %= y (not tuple at right; same as in corresponding %)
_ = x
if six.PY2 and tx is bytearray: # bytearray/py2 does not support %=
_ = xbytearray(bytes(x) % y)
else:
_ %= y
assert type(_) is tmod
assert _ == xstr(u'hello мир', tmod)
assert _ is not x # even bytearray('%s') %= y creates new object
# x % (y,)
# py3: result type is type(x) because y.__rmod__ is never called
# py2: similar, but b'' % u'' gives u
if six.PY2 and tx is bytearray: # bytearray/py2 does not support %
_ = xbytearray(bytes(x) % (y,))
else:
_ = x % (y,)
ttmod = tx
if six.PY2:
if tx in (bytes, unicode):
if tx is unicode or ty in (unicode, ustr):
ttmod = unicode
else:
ttmod = bytes
assert type(_) is ttmod
assert _ == xstr(u'hello мир', ttmod)
assert _ is not x
# x %= (y,)
_ = x
if six.PY2 and tx is bytearray: # bytearray/py2 does not support %=
_ = xbytearray(bytes(x) % (y,))
else:
_ %= (y,)
assert type(_) is ttmod
assert _ == xstr(u'hello мир', ttmod)
assert _ is not x # even bytearray('%s') %= y creates new object
# verify string operations like `x + y` for x being bstr/ustr and y being a
# type unsupported for coercion.
#
# NOTE string methods, like .join and .startswith, are verified to reject
# buffers in test_strings_methods and test_strings_index.
@mark.parametrize('tx', (bstr, ustr))
@mark.parametrize('ty', buftypes)
def test_strings_ops2_bufreject(tx, ty):
x = xstr(u'мир', tx)
y = ty(b'123')
with raises(TypeError): x + y
with raises(TypeError): x * y
with raises(TypeError): y in x
assert (x == y) is False # see test_strings_ops2_eq_any
assert (x != y) is True
with raises(TypeError): x >= y
with raises(TypeError): x <= y
with raises(TypeError): x > y
with raises(TypeError): x < y
# reverse operations, e.g. memoryview + bstr
with raises(TypeError): y + x
with raises(TypeError): y * x
# `x in y` does not raise: y is considered to be generic sequence without
# __contains__, and so python transforms `x in y` into `x in list(y)`.
#with raises(TypeError): x in y
# `y > x` does not raise when x is bstr (= provides buffer):
y == x # not raises TypeError - see test_strings_ops2_eq_any
y != x #
if tx is not bstr:
with raises(TypeError): y >= x
with raises(TypeError): y <= x
with raises(TypeError): y > x
with raises(TypeError): y < x
# verify string operations like `x == *` for x being bstr/ustr.
# Those operations must succeed for any hashable type or else bstr/ustr could
# not be used as dict keys.
@mark.parametrize('tx', (bstr, ustr))
def test_strings_ops2_eq_any(tx):
x = xstr(u'мир', tx)
while 1:
hx = hash(x)
if hash(hx) == hx: # positive int32 will have this property
break
x += '!'
# assertNE asserts that (x==y) is False and (x!=y) is True.
# it also asserts that e.g. x < y raises TypeError
def assertNE(y):
assert (x == y) is False
assert (x != y) is True
with raises(TypeError): x >= y
with raises(TypeError): x <= y
with raises(TypeError): x > y
with raises(TypeError): x < y
_ = assertNE
_(None)
_(0)
_(1)
_(2)
assert hash(x) == hx
assert hash(hx) == hx
_(hx)
d = {x: 1, hx: 2} # creating dict will fail if `x == hx` raises TypeError
assert d[x] == 1
assert d[hx] == 2
_(())
_((1,))
_((x,))
# == wrt non-hashable type also succeeds following std python where e.g. 's' == [1] gives False
l = [1]
with raises(TypeError): hash(l)
_(l)
# verify logic in `bstr % ...` and `bstr.format(...)` .
def test_strings_mod_and_format():
# verify_fmt_all_types verifies f(fmt, args) for all combinations of
#
# · fmt being unicode, bstr, ustr
# · args being/containing unicode, bytes, bytearray, bstr, ustr
#
# it checks that all results are the same for the case when both fmt and
# args contain only standard unicode.
def verify_fmt_all_types(f, fmt, args, *okv, **kw):
excok = kw.pop('excok', False)
assert not kw
rok = None
#print()
def xfmt(fmt, args):
exc = False
try:
r = f(fmt, args) # e.g. fmt % args
except Exception as e:
if not excok:
raise
exc = True
r = repr(e) # because e.g. ValueError('x') == ValueError('x') is false
#print(repr(fmt), "%", repr(args), "->", repr(r))
if not exc:
assert type(r) is type(fmt)
if len(okv) != 0:
for ok in okv:
if isinstance(ok, Exception):
ok = repr(ok)
else:
ok = xunicode(ok)
if r == ok:
break
else:
raise AssertionError("result (%r) not in any of %r" % (r, okv))
elif rok is not None:
assert r == rok
return r
fmt_ustd = deepReplaceStr(fmt, xunicode)
fmt_u = deepReplaceStr(fmt, u)
fmt_b = deepReplaceStr(fmt, b)
args_ustd = deepReplaceStr(args, xunicode)
args_bstd = deepReplaceStr(args, xbytes)
args_barr = deepReplaceStr2Bytearray(args)
args_u = deepReplaceStr(args, u)
args_b = deepReplaceStr(args, b)
# see if args_ustd could be used for stringification.
# e.g. on py2 both str() and unicode() on UserString(u'β') raise
# "UnicodeEncodeError: 'ascii' codec can't encode characters ..."
args_ustd_ok = True
if six.PY2:
try:
unicode(args_ustd) # e.g. UserString
try:
it = iter(args_ustd) # e.g. (UserString,)
# on py2 UserDict is not really iterable - iter succeeds but
# going through it raises KeyError because of
# https://github.com/python/cpython/blob/2.7-0-g8d21aa21f2c/Lib/UserDict.py#L112-L114
# -> work it around
if six.PY2 and not hasattr(args_ustd, '__iter__'):
raise TypeError
except TypeError:
pass
else:
for _ in it:
unicode(_)
except UnicodeEncodeError:
args_ustd_ok = False
# initialize rok from u'' % u''.
# Skip errors on py2 because e.g. `u'α %s' % [u'β']` gives u"α [u'\\u03b2']",
# not u"α ['β']". In that case we cannot use u'' % u'' as a reference.
# We cannot use b'' % b'' as a reference either because e.g.
# `'α %s' % ['β']` gives "α ['\\xce\\xb2']", not "α ['β']"
if args_ustd_ok:
good4rok = True
try:
_ = xfmt(fmt_ustd, args_ustd) # u'' % (u'', ...)
except AssertionError as e:
if six.PY2 and len(e.args) == 1 and "not in any of" in e.args[0]:
good4rok = False
else:
raise
if good4rok:
rok = _
# if rok computation was skipped we insist on being explicitly called with ok=...
assert (rok is not None) or (len(okv) != 0)
if args_ustd_ok:
xfmt(fmt_b, args_ustd) # b() % (u'', ...)
xfmt(fmt_u, args_ustd) # u() % (u'', ...)
xfmt(fmt_b, args_bstd) # b() % (b'', ...)
xfmt(fmt_u, args_bstd) # u() % (b'', ...)
xfmt(fmt_b, args_barr) # b() % (bytearray, ...)
xfmt(fmt_u, args_barr) # u() % (bytearray, ...)
xfmt(fmt_b, args_b) # b() % (b(), ...)
xfmt(fmt_u, args_b) # u() % (b(), ...)
xfmt(fmt_b, args_u) # b() % (u(), ...)
xfmt(fmt_u, args_u) # u() % (u(), ...)
# NOTE we don't check e.g. `u'' % u()` and `u'' % b()` because for e.g.
# `u'α %s' % [u('β')]` the output is u"α [u("β")]" - not u"α ['β']".
# _bprintf implements %-format parsing itself. Verify that parsing first
# NOTE here all strings are plain ASCII.
def _(fmt, args):
fmt = '*str '+fmt
for l in range(len(fmt), -1, -1):
# [:len(fmt)] verifies original case
# [:l<len] should verify "incomplete format" parsing
verify_fmt_all_types(lambda fmt, args: fmt % args,
fmt[:l], args, excok=True)
_('%(name)s', {'name': 123})
_('%x', 123) # flags
_('%#x', 123)
_('%05d', 123)
_('%-5d', 123)
_('% d', 123)
_('% d', -123)
_('%+d', -123)
_('%5d', 123) # width
_('%*d', (5,123))
_('%f', 1.234) # .prec
_('%.f', 1.234)
_('%.1f', 1.234)
_('%.2f', 1.234)
_('%*f', (2,1.234))
_('%hi', 123) # len
_('%li', 123)
_('%Li', 123)
_('%%', ()) # %%
_('%10.4f', 1.234) # multiple features
_('%(x)10.4f', {'y':0, 'x':1.234})
_('%*.*f', (10,4,1.234))
_('', {}) # not all arguments converted
_('', [])
_('', 123)
_('', '123')
_('%s', ()) # not enough arguments to format
_('%s %s', 123)
_('%s %s', (123,))
_('%(x)s', 123) # format requires a mapping
_('%(x)s', (123,))
_('%s %(x)s', (123,4))
_('%(x)s %s', (123,4))
_('%(x)s %s', {'x':1}) # mixing tuple/dict
_('%s %(x)s', {'x':1})
_('abc %z', 1) # unsupported format character
_('abc %44z', 1)
# for `'%4%' % ()` py2 gives ' %', but we stick to the more reasonable py3 semantics
def _(fmt, args, ok):
return verify_fmt_all_types(lambda fmt, args: fmt % args,
fmt, args, ok, excok=True)
_('*str %4%', (), TypeError("not enough arguments for format string"))
_('*str %4%', 1, ValueError("unsupported format character '%' (0x25) at index 7"))
_('*str %4%', (1,), ValueError("unsupported format character '%' (0x25) at index 7"))
_('*str %(x)%', {'x':1}, ValueError("unsupported format character '%' (0x25) at index 9"))
# parse checking complete. now verify actual %- and format- formatting
# fmt_percent_to_bracket converts %-style format to .format-style format string.
def fmt_percent_to_bracket(fmt):
# replace %<x> with corresponding {} style
# be dumb and explicit in replacement to make sure there is no chance
# we get this logic wrong
def _(m):
r = {
'%s': '{!s}',
'%r': '{!r}',
'%(x)s': '{x!s}',
'%(y)s': '{y!s}',
'%(z)s': '{z!s}',
}
return r[m.group()]
fmt_ = re.sub('%[^ ]*[a-z]', _, fmt)
assert '%' not in fmt_
return fmt_
# xformat calls fmt.format with *args or **args appropriately.
def xformat(fmt, args):
if isinstance(args, (dict, six.moves.UserDict)):
a = fmt.format(**args)
if not (six.PY2 and type(fmt) is unicode):
b = fmt.format_map(args) # py2: no unicode.format_map()
assert a == b
return a
elif isinstance(args, tuple):
return fmt.format(*args)
else:
return fmt.format(args) # it was e.g. `'%s' % 123`
# _ verifies `fmt % args` and `fmt'.format(args)`
# if fmt has no '%' only .format(args) is verified.
def _(fmt, args, *okv):
if '%' in fmt:
verify_fmt_all_types(lambda fmt, args: fmt % args,
fmt, args, *okv)
# compute fmt' for .format verification
fmt_ = fmt_percent_to_bracket(fmt)
# and assert that .format result is the same as for %
# compare to b() formatting because else on py2 we hit unicode % issues
# we, anyway, just verified b() % above.
if len(okv) == 0:
okv = [b(fmt) % args]
else:
fmt_ = fmt
verify_fmt_all_types(xformat, fmt_, args, *okv)
_("*str a %s z", 123) # NOTE *str to force str -> bstr/ustr even for ASCII string
_("*str a %s z", '*str \'"\x7f')
_("*str a %s z", 'β')
_("*str a %s z", ('β',))
_("*str a %s z", ['β'] , "*str a ['β'] z")
_("a %s π", 123)
_("a %s π", '*str \'"\x7f')
_("a %s π", 'β')
_("a %s π", ('β',))
_("a %s π", ['β'] , "a ['β'] π")
_("α %s z", 123)
_("α %s z", '*str \'"\x7f')
_("α %s z", 'β')
_("α %s z", ('β',))
_("α %s z", ['β'] , "α ['β'] z")
_("α %s π", 123)
_("α %s π", '*str \'"\x7f')
_("α %s π", 'β')
_("α %s π", ('β',))
_("α %s π", ('β',))
_("α %s %s π", ('β', 'γ'))
_("α %s %s %s π", ('β', 'γ', 'δ'))
_("α %s %s %s %s %s %s %s π", (1, 'β', 2, 'γ', 3, 'δ', 4))
_("α %s π", [])
_("α %s π", ([],))
_("α %s π", ((),))
_("α %s π", set())
_("α %s π", (set(),))
_("α %s π", frozenset())
_("α %s π", (frozenset(),))
_("α %s π", ({},))
_("α %s π", ['β'] , "α ['β'] π")
_("α %s π", (['β'],) , "α ['β'] π")
_("α %s π", (('β',),) , "α ('β',) π")
_("α %s π", {'β'} , x32("α {'β'} π", "α set(['β']) π"))
_("α %s π", ({'β'},) , x32("α {'β'} π", "α set(['β']) π"))
_("α %s π", frozenset({'β'}) , x32("α frozenset({'β'}) π", "α frozenset(['β']) π"))
_("α %s π", (frozenset({'β'}),) , x32("α frozenset({'β'}) π", "α frozenset(['β']) π"))
_("α %s π", ({'β':'γ'},) , "α {'β': 'γ'} π")
_("α %s %s π", ([1, 'β', 2], 345) , "α [1, 'β', 2] 345 π")
_("α %s %s π", ((1, 'β', 2), 345) , "α (1, 'β', 2) 345 π")
# NOTE set/frozenset/dict: print order is "random"
_("α %s %s π", ({1, 'β'}, 345) , *x32(("α {1, 'β'} 345 π", "α {'β', 1} 345 π"),
("α set([1, 'β']) 345 π", "α set(['β', 1]) 345 π")))
_("α %s %s π", (frozenset({1, 'β'}), 345) , *x32(("α frozenset({1, 'β'}) 345 π", "α frozenset({'β', 1}) 345 π"),
("α frozenset([1, 'β']) 345 π", "α frozenset(['β', 1]) 345 π"))),
_("α %s %s π", ({1:'мир', 'β':'труд'}, 345) , *x32(("α {1: 'мир', 'β': 'труд'} 345 π",), # py3: dict is insert-order
("α {1: 'мир', 'β': 'труд'} 345 π", "α {'β': 'труд', 1: 'мир'} 345 π")))
# recursive list
l = [1,]; l += [l, 'мир']
_('α %s π', (l,) , "α [1, [...], 'мир'] π")
# recursive tuple
t = (1, []); t[1].append((t, 'мир'))
_('α %s π', (t,) , "α (1, [((...), 'мир')]) π")
# recursive set
s = {1}; s.add(hlist([s]))
_('α %s π', (s,) , x32("α {[set(...)], 1} π", "α set([[set(...)], 1]) π"))
# recursive frozenset
l = hlist()
f = frozenset({1, l}); l.append(f)
_('α %s π', (f,))
# recursive dict (via value)
d = {1:'мир'}; d.update({2:d})
_('α %s π', (d,) , *x32(("α {1: 'мир', 2: {...}} π",),
("α {1: 'мир', 2: {...}} π", "α {2: {...}, 1: 'мир'} π")))
# recursive dict (via key)
l = hlist([1])
d = {l:'мир'}; l.append(d)
_('α %s π', (d,) , "α {[1, {...}]: 'мир'} π")
# old-style class with __str__
class Cold:
def __repr__(self): return "Cold()"
def __str__(self): return u"Класс (old)"
_('α %s π', Cold())
_('α %s π', (Cold(),))
# new-style class with __str__
class Cnew(object):
def __repr__(self): return "Cnew()"
def __str__(self): return u"Класс (new)"
_('α %s π', Cnew())
_('α %s π', (Cnew(),))
# custom classes inheriting from set/list/tuple/dict/frozenset
class L(list): pass
class T(tuple): pass
class S(set): pass
class F(frozenset): pass
class D(dict): pass
_('α %s π', L(['β',3]) , "α ['β', 3] π")
_('α %s π', (L(['β',3]),) , "α ['β', 3] π")
_('α %s π', (T(['β',3]),) , "α ('β', 3) π")
# NOTE set/frozenset/dict: print order is "random"
_('α %s π', S(['β',3]) , *x32(("α S({'β', 3}) π", "α S({3, 'β'}) π"),
("α S(['β', 3]) π", "α S([3, 'β']) π")))
_('α %s π', (S(['β',3]),) , *x32(("α S({'β', 3}) π", "α S({3, 'β'}) π"),
("α S(['β', 3]) π", "α S([3, 'β']) π")))
_('α %s π', F(['β',3]) , *x32(("α F({'β', 3}) π", "α F({3, 'β'}) π"),
("α F(['β', 3]) π", "α F([3, 'β']) π")))
_('α %s π', (F(['β',3]),) , *x32(("α F({'β', 3}) π", "α F({3, 'β'}) π"),
("α F(['β', 3]) π", "α F([3, 'β']) π")))
_('α %s π', (D([('β','γ'), (3,4)]),)
, *x32(("α {'β': 'γ', 3: 4} π",),
("α {'β': 'γ', 3: 4} π", "α {3: 4, 'β': 'γ'} π")))
# well-known classes
# namedtuple
cc = collections; xcc = six.moves
Point = cc.namedtuple('Point', ['x', 'y'])
_('α %s π', (Point('β','γ'),) , "α Point(x='β', y='γ') π")
# deque
_('α %s π', cc.deque(['β','γ']) , "α deque(['β', 'γ']) π")
_('α %s π', (cc.deque(['β','γ']),) , "α deque(['β', 'γ']) π")
# Counter (inherits from dict)
_('α %s π', (cc.Counter({'β':1}),) , "α Counter({'β': 1}) π")
# OrderedDict
_('α %s π', (cc.OrderedDict([(1,'мир'), ('β','труд')]),)
, "α OrderedDict([(1, 'мир'), ('β', 'труд')]) π")
# defaultdict
_('α %s π', (cc.defaultdict(int, {'β':1}),)
, x32("α defaultdict(<class 'int'>, {'β': 1}) π",
"α defaultdict(<type 'int'>, {'β': 1}) π"))
# UserDict
_('α %s π', (xcc.UserDict({'β':1}),) , "α {'β': 1} π")
# UserList
_('α %s π', xcc.UserList(['β','γ']) , "α ['β', 'γ'] π")
_('α %s π', (xcc.UserList(['β','γ']),) , "α ['β', 'γ'] π")
# UserString
_('α %s π', xcc.UserString('βγ') , "α βγ π")
_('α %s π', (xcc.UserString('βγ'),) , "α βγ π")
# custom classes inheriting from bytes/unicode/bytearray
class B(bytes): pass
class BB(bytes):
def __repr__(self): return "BB(байты)"
def __str__(self): return "байты"
class U(unicode): pass
class UU(unicode):
def __repr__(self): return "UU(юникод)"
def __str__(self): return "юникод"
__unicode__ = __str__
class A(bytearray): pass
class AA(bytearray):
def __repr__(self): return "AA(байтмассив)"
def __str__(self): return "байтмассив"
def M(fmt, args, ok):
# verify only `b() % args` and `u() % args` since for e.g. `u'' % b''` the result is different
bfmt = b(fmt)
ufmt = u(fmt)
br = bfmt % args #;print(repr(bfmt), " % ", repr(args), " -> ", repr(br))
ur = ufmt % args #;print(repr(ufmt), " % ", repr(args), " -> ", repr(ur))
assert type(br) is bstr
assert type(ur) is ustr
assert br == ok
assert ur == ok
# verify b().format(args) and u().format(args)
fmt_ = fmt_percent_to_bracket(fmt)
bfmt_ = b(fmt_)
ufmt_ = u(fmt_)
br_ = xformat(bfmt_, args) #;print(repr(bfmt), " .format ", repr(args), " -> ", repr(br))
ur_ = xformat(ufmt_, args) #;print(repr(ufmt), " .format ", repr(args), " -> ", repr(ur))
assert type(br_) is bstr
assert type(ur_) is ustr
assert br_ == ok
assert ur_ == ok
M("α %s π", U ( u'май') , "α май π")
M("α %s π", (U ( u'май'),) , "α май π")
M("α %s π", [U ( u'май')] , "α ['май'] π")
M("α %s π", UU( u'май2') , "α юникод π") # not май2
M("α %s π", (UU( u'май2'),) , "α юникод π") # not май2
M("α %s π", [UU( u'май2')] , "α [UU(юникод)] π") # not [май2]
M("α %s π", B (xbytes('мир')) , "α мир π")
M("α %s π", (B (xbytes('мир')),) , "α мир π")
M("α %s π", [B (xbytes('мир'))] , "α ['мир'] π")
M("α %s π", BB(xbytes('мир2')) , "α байты π") # not мир2
# vvv does not work on py3 as b'' % b'' does not consult __str__ nor __bytes__ of the argument
# even though it is not 100% certain that we are ok here, customizing bytes or unicode is very exotic
if six.PY2:
M("α %s π", (BB(xbytes('мир2')),) , "α байты π") # not мир2
M("α %s π", [BB(xbytes('мир2'))] , "α [BB(байты)] π") # not [мир2]
M("α %s π", A (xbytes('труд')) , "α труд π")
M("α %s π", (A (xbytes('труд')),) , "α труд π")
M("α %s π", [A (xbytes('труд'))] , "α ['труд'] π")
M("α %s π", AA(xbytes('труд2')) , "α байтмассив π") # not труд2
M("α %s π", (AA(xbytes('труд2')),) , "α байтмассив π") # not труд2
M("α %s π", [AA(xbytes('труд2'))] , "α [AA(байтмассив)] π") # not [труд2]
# dict at right
# fewer tests because stringification of arguments is already thoroughly
# verified with "tuple at right" tests above.
_("*str a %(x)s z", {'x': 123})
_("*str a %(x)s z", {'x': '*str \'"\x7f'})
_("*str a %(x)s z", {'x': 'β'})
_("*str a %(x)s z", {'x': ['β']} , "*str a ['β'] z")
_("*str a %(x)s %(y)s z", {'x':'β', 'y':'γ'})
_("*str a %(x)s %(y)s %(z)s z", {'x':'β', 'y':'γ', 'z':'δ'})
_("a %(x)s π", {'x': 123})
_("a %(x)s π", {'x': '*str \'"\x7f'})
_("a %(x)s π", {'x': 'β'})
_("a %(x)s π", {'x': ['β']} , "a ['β'] π")
_("a %(x)s %(y)s π", {'x': 'β', 'y':'γ'})
_("a %(x)s %(y)s %(z)s π", {'x': 'β', 'y':'γ', 'z':'δ'})
_("α %(x)s z", {'x': 123})
_("α %(x)s z", {'x': '*str \'"\x7f'})
_("α %(x)s z", {'x': 'β'})
_("α %(x)s z", {'x': ['β']} , "α ['β'] z")
_("α %(x)s %(y)s z", {'x': 'β', 'y':'γ'})
_("α %(x)s %(y)s %(z)s z", {'x': 'β', 'y':'γ', 'z':'δ'})
_("α %(x)s π", {'x': 123})
_("α %(x)s π", {'x': '*str \'"\x7f'})
_("α %(x)s π", {'x': 'β'})
_("α %(x)s π", {'x': ['β']} , "α ['β'] π")
_("α %(x)s %(y)s π", {'x':'β', 'y':'γ'})
_("α %(x)s %(y)s %(z)s π", {'x':'β', 'y':'γ', 'z':'δ'})
_("*str a %(x)s z", xcc.UserDict({'x': 'β'}))
_("α %(x)s π", xcc.UserDict({'x': 'β'}))
# %r (and !r)
M("α %r", u'z' , x32("α 'z'", "α u'z'"))
M("α %r", u'β' , x32("α 'β'", "α u'β'"))
M("α %r", b'z' , x32("α b'z'", "α 'z'"))
M("α %r", xbytes('β') , x32("α b'β'", "α 'β'"))
M("α %r", xbytearray('β') , "α bytearray(b'β')")
M("α %r", b('β') , "α b('β')")
M("α %r", u('β') , "α u('β')")
M("α %r", [u'z'] , x32("α ['z']", "α [u'z']"))
M("α %r", [u'β'] , x32("α ['β']", "α [u'β']"))
M("α %r", [b'z'] , x32("α [b'z']", "α ['z']"))
M("α %r", [xbytes('β')] , x32("α [b'β']", "α ['β']"))
M("α %r", [xbytearray('β')] , "α [bytearray(b'β')]")
M("α %r", [b('β')] , "α [b('β')]")
M("α %r", [u('β')] , "α [u('β')]")
# some explicit verifications for .format()
_("*str hello {}", ("world",))
_("*str hello {}", (["world"],))
_("*str hello {}", ("мир",))
_("*str hello {}", (["мир"],) , "*str hello ['мир']")
_("привет {}", ("мир",))
_("привет {}", (["мир"],) , "привет ['мир']")
_("привет {0}, {1}", ("Петя", "Вася"))
_("привет {name}", {'name': "Ваня"})
_("привет {name}", {"name": "Тигра"} , "привет Тигра")
_("привет {name!s}", {"name": "Винни"} , "привет Винни")
_("привет {name:>10}", {"name": "Пух"} , "привет Пух")
_("привет {!s}", ("мир",))
_("привет {!s}", (["мир"],) , "привет ['мир']")
_("привет {:>10}", ("мир",))
_("привет {:>{}} {}", ("мир", 10, "α"))
_("привет {:02x}", (23,))
# verify __format__ + format() builtin
def test_strings__format__():
assert "привет {}".format("мир") == "привет мир"
assert "привет {}".format(b("мир")) == "привет мир"
assert "привет {}".format(u("мир")) == "привет мир"
assert format(u"мир") == u"мир"
assert format(u"мир", "") == u"мир"
assert format(u"мир", "s") == u"мир"
assert format(u"мир", ">5") == u" мир"
fb = format(b("мир"))
fb_ = format(b("мир"), "")
fbs = format(b("мир"), "s")
fb5 = format(b("мир"), ">5")
assert type(fb) is ustr # NOTE ustr, not bstr due to b.__format__ returning u
assert type(fb_) is ustr
assert type(fbs) is ustr
assert type(fb5) is ustr
assert fb == "мир"
assert fb_ == "мир"
assert fbs == "мир"
assert fb5 == " мир"
fu = format(u("мир"))
fu_ = format(u("мир"), "")
fus = format(u("мир"), "s")
fu5 = format(u("мир"), ">5")
assert type(fu) is ustr
assert type(fu_) is ustr
assert type(fus) is ustr
assert type(fu5) is ustr
assert fu == "мир"
assert fu_ == "мир"
assert fus == "мир"
assert fu5 == " мир"
# string.__format__ accepts only '' and 's' format codes
for fmt_spec in "abcdefghijklmnopqrstuvwxyz":
if fmt_spec == 's':
continue
with raises(ValueError): format( u"мир", fmt_spec)
with raises(ValueError): format(b("мир"), fmt_spec)
with raises(ValueError): format(u("мир"), fmt_spec)
# verify print for b, u
def test_strings_print():
outok = readfile(dir_testprog + "/golang_test_str.txt")
retcode, stdout, stderr = _pyrun(["golang_test_str.py"],
cwd=dir_testprog, stdout=PIPE, stderr=PIPE)
assert retcode == 0, (stdout, stderr)
assert stderr == b""
assertDoc(outok, stdout)
# verify methods of bstr/ustr
def test_strings_methods():
# checkop verifies that `s.meth(*argv, **kw)` gives the same result for s,
# argv and kw being various combinations of unicode,bstr,ustr, bytes/bytearray.
def checkop(s, meth, *argv, **kw):
assert type(s) is str
ok = kw.pop('ok', None)
bs = b(s)
us = u(s)
# verify {str,bstr,ustr}.meth with str arguments
# on py2 use unicode(s/args) because e.g. 'мир'.capitalize()
# gives correct result only on unicode, not regular str.
argv_unicode = deepReplaceStr(argv, xunicode)
kw_unicode = deepReplaceStr(kw, xunicode)
if six.PY3:
r = xcall(s, meth, *argv, **kw)
else:
s = xunicode(s)
r = xcall(s, meth, *argv_unicode, **kw_unicode)
# we provide fallback implementations on e.g. py2
if ok is not None:
if six.PY2:
ok = xunicode(ok)
if isinstance(r, NotImplementedError):
r = ok
else:
assert r == ok
assert type(s) is unicode
br = xcall(bs, meth, *argv, **kw)
ur = xcall(us, meth, *argv, **kw)
def assertDeepEQ(a, b, bstrtype):
assert not isinstance(a, (bstr, ustr))
if type(a) is unicode:
assert type(b) is bstrtype
assert a == b
return
assert type(b) is type(a)
if isinstance(a, (list, tuple)):
assert len(a) == len(b)
for i in range(len(a)):
assertDeepEQ(a[i], b[i], bstrtype)
elif isinstance(a, dict):
assert len(a) == len(b)
for k, v in a.items():
v_ = b[k]
assertDeepEQ(v, v_, bstrtype)
elif isinstance(a, Exception):
assertDeepEQ(a.args, b.args, type('')) # NOTE bstr is not raised in exceptions
else:
assert a == b
assertDeepEQ(r, br, bstr)
assertDeepEQ(r, ur, ustr)
# verify {bstr,ustr}.meth with arguments being b/u instead of str
#
# NOTE str.meth does not work with b - on py3 e.g. unicode.center
# checks fillchar to be instance of unicode.
argv_b = deepReplaceStr(argv, b)
argv_u = deepReplaceStr(argv, u)
kw_b = deepReplaceStr(kw, b)
kw_u = deepReplaceStr(kw, u)
br_b = xcall(bs, meth, *argv_b, **kw_b)
br_u = xcall(bs, meth, *argv_u, **kw_u)
ur_b = xcall(us, meth, *argv_b, **kw_b)
ur_u = xcall(us, meth, *argv_u, **kw_u)
assertDeepEQ(r, br_b, bstr)
assertDeepEQ(r, br_u, bstr)
assertDeepEQ(r, ur_b, ustr)
assertDeepEQ(r, ur_u, ustr)
# verify {bstr,ustr}.meth with arguments being bytes/unicode/bytearray instead of str
argv_bytes = deepReplaceStr(argv, xbytes)
argv_barr = deepReplaceStr2Bytearray(argv)
kw_bytes = deepReplaceStr(kw, xbytes)
kw_barr = deepReplaceStr2Bytearray(kw)
br_bytes = xcall(bs, meth, *argv_bytes, **kw_bytes)
br_unicode = xcall(bs, meth, *argv_unicode, **kw_unicode)
br_barr = xcall(bs, meth, *argv_barr, **kw_barr)
ur_bytes = xcall(us, meth, *argv_bytes, **kw_bytes)
ur_unicode = xcall(us, meth, *argv_unicode, **kw_unicode)
ur_barr = xcall(us, meth, *argv_barr, **kw_barr)
assertDeepEQ(r, br_bytes, bstr) # everything is converted to bstr, not bytes
assertDeepEQ(r, br_unicode, bstr) # ----//---- not unicode
assertDeepEQ(r, br_barr, bstr) # ----//---- not bytearray
assertDeepEQ(r, ur_bytes, ustr) # ----//---- to ustr
assertDeepEQ(r, ur_unicode, ustr)
assertDeepEQ(r, ur_barr, ustr)
# verify that {bstr,ustr}.meth does not implicitly convert buffer to string
if not hasattr(bs, meth): # e.g. bstr.removeprefix on py2
assert not hasattr(us, meth)
return
for tbuf in buftypes:
_bufview = [False]
def bufview(s):
_bufview[0] = True
return tbuf(xbytes(s))
argv_buf = deepReplaceStr(argv, bufview)
argv_hasbuf = _bufview[0]
_bufview[0] = False
kw_buf = deepReplaceStr(kw, bufview)
kw_hasbuf = _bufview[0]
if argv_hasbuf:
with raises(TypeError):
getattr(bs, meth)(*argv_buf, **kw)
with raises(TypeError):
getattr(us, meth)(*argv_buf, **kw)
if kw_hasbuf:
with raises(TypeError):
getattr(bs, meth)(*argv, **kw_buf)
with raises(TypeError):
getattr(us, meth)(*argv, **kw_buf)
# Verifier provides syntactic sugar for checkop: V.attr returns wrapper around checkop(V.text, attr).
class Verifier:
def __init__(self, text):
self.text = text
def __getattr__(self, meth):
def _(*argv, **kw):
checkop(self.text, meth, *argv, **kw)
return _
_ = Verifier
_("миру мир").__contains__("ру")
_("миру мир").__contains__("α")
_("мир").capitalize()
_("МиР").casefold()
_("мир").center(10)
_("мир").center(10, "ж")
# count, endswith - tested in test_strings_index
_("миру\tмир").expandtabs()
_("миру\tмир").expandtabs(4)
# find, index - tested in test_strings_index
_("мир").isalnum()
_("мир!").isalnum()
_("мир").isalpha()
_("мир!").isalpha()
_("мир").isascii()
_("hello").isascii()
_("hellЫ").isascii()
_("123 мир").isdecimal()
_("123 q").isdecimal()
_("123").isdecimal()
_("мир").isdigit()
_("123 мир").isdigit()
_("123 q").isdigit()
_("123").isdigit()
_("٤").isdigit() # arabic 4
_("мир").isidentifier()
_("мир$").isidentifier()
_("мир").islower()
_("Мир").islower()
_("мир").isnumeric()
_("123").isnumeric()
_("0x123").isnumeric()
_("мир").isprintable()
_("\u2009").isspace() # thin space
_(" ").isspace()
_("мир").isspace()
_("мир").istitle()
_("Мир").istitle()
_(" мир ").join(["да", "май", "труд"])
_("мир").ljust(10)
_("мир").ljust(10, 'ж')
_("МиР").lower()
_("\u2009 мир").lstrip()
_("\u2009 мир\u2009 ").lstrip()
_("мммир").lstrip('ми')
_("миру мир").partition('ру')
_("миру мир").partition('ж')
_("миру мир").removeprefix("мир")
_("миру мир").removesuffix("мир")
_("миру мир").replace("ир", "ж")
_("миру мир").replace("ир", "ж", 1)
# rfind, rindex - tested in test_strings_index
_("мир").rjust(10)
_("мир").rjust(10, 'ж')
_("миру мир").rpartition('ру')
_("миру мир").rpartition('ж')
_("мир").rsplit()
_("привет мир").rsplit()
_("привет\u2009мир").rsplit()
_("привет мир").rsplit("и")
_("привет мир").rsplit("и", 1)
_("мир \u2009").rstrip()
_(" мир \u2009").rstrip()
_("мируу").rstrip('ру')
_("мир").split()
_("привет мир").split()
_("привет\u2009мир").split()
_("привет мир").split("и")
_("привет мир").split("и", 1)
_("мир").splitlines()
_("миру\nмир").splitlines()
_("миру\nмир").splitlines(True)
_("миру\nмир\n").splitlines(True)
_("мир\nтруд\nмай\n").splitlines()
_("мир\nтруд\nмай\n").splitlines(True)
# startswith - tested in test_strings_index
_("\u2009 мир \u2009").strip()
_("миру мир").strip('мир')
_("МиР").swapcase()
_("МиР").title()
_("мир").translate({ord(u'м'):ord(u'и'), ord(u'и'):'я', ord(u'р'):None})
_("МиР").upper()
_("мир").zfill(10)
_("123").zfill(10)
# verify bstr.translate in bytes mode
def test_strings_bstr_translate_bytemode():
bs = b('мир')
b_ = xbytes('мир')
def _(*argv):
rb = bs.translate(*argv)
rok = b_.translate(*argv)
assert rb == rok
_(None)
_(None, b'')
_(None, b'\xd1')
_(None, b'\x80\xd1')
t = bytearray(range(0x100))
t[0x80] = 0x81
t[0xbc] = 0xbd
t = bytes(t)
_(t)
_(t, b'')
_(t, b'\xd1')
_(t, b'\x80\xd1')
# verify bstr/ustr maketrans
def test_strings_maketrans():
def _(argv, ok):
rok = xcall(unicode, 'maketrans', *argv)
# py2 unicode does not have maketrans
if six.PY2 and isinstance(rok, NotImplementedError):
rok = ok
assert rok == ok
rb = xcall(bstr, 'maketrans', *argv)
ru = xcall(ustr, 'maketrans', *argv)
argv_b = deepReplaceStr(argv, b)
argv_u = deepReplaceStr(argv, u)
rb_b = xcall(bstr, 'maketrans', *argv_b)
rb_u = xcall(bstr, 'maketrans', *argv_u)
ru_b = xcall(ustr, 'maketrans', *argv_b)
ru_u = xcall(ustr, 'maketrans', *argv_u)
assert rok == rb
assert rok == ru
assert rok == rb_b
assert rok == rb_u
assert rok == ru_b
assert rok == ru_u
_( ({100:'ы', 200:'я'},) , {100:u'ы', 200:u'я'} )
_( ({'α':'ы', 'β':'я'},) , {ord(u'α'):u'ы', ord(u'β'):u'я'} )
_( ('αβ', 'ыя') , {ord(u'α'):ord(u'ы'), ord(u'β'):ord(u'я')} )
_( ('αβ', 'ыя', 'πρ') , {ord(u'α'):ord(u'ы'), ord(u'β'):ord(u'я'),
ord(u'π'):None, ord(u'ρ'):None} )
# verify behaviour of bstr|ustr subclasses.
@mark.parametrize('tx', (unicode, bstr, ustr)) # XXX + str(=bytes) on py2 XXX + bytes on py3 ?
def test_strings_subclasses(tx):
x = xstr(u'мир', tx); assert type(x) is tx
# subclass without __str__
class MyStr(tx):
pass
xx = MyStr(x); assert type(xx) is MyStr
_ = tx(xx); assert type(_) is tx ; assert _ == x # e.g. unicode(MyStr) -> unicode, not MyStr
_ = bstr(xx); assert type(_) is bstr ; assert _ == 'мир'
_ = ustr(xx); assert type(_) is ustr ; assert _ == 'мир'
_ = b(xx); assert type(_) is bstr ; assert _ == 'мир'
_ = u(xx); assert type(_) is ustr ; assert _ == 'мир'
# subclass with __str__
class MyStr(tx):
def __str__(self): return u'αβγ'
__unicode__ = __str__
xx = MyStr(x); assert type(xx) is MyStr
_ = tx(xx); assert type(_) is tx ; assert _ == u'αβγ' # unicode(MyStr) -> u'αβγ', not 'мир'
_ = bstr(xx); assert type(_) is bstr ; assert _ == u'αβγ'
_ = ustr(xx); assert type(_) is ustr ; assert _ == u'αβγ'
_ = b(xx); assert type(_) is bstr ; assert _ == u'мир' # b(MyStr) -> 'мир', not 'αβγ'
_ = u(xx); assert type(_) is ustr ; assert _ == u'мир'
# non-subclass with __str__ (for completeness)
class MyObj(object):
def __str__(self):
return 'myobj'
xx = MyObj(); assert type(xx) is MyObj
_ = tx(xx); assert type(_) is tx ; assert _ == 'myobj'
_ = bstr(xx); assert type(_) is bstr ; assert _ == 'myobj'
_ = ustr(xx); assert type(_) is ustr ; assert _ == 'myobj'
with raises(TypeError): b(xx) # NOTE b/u reports "conversion failure"
with raises(TypeError): u(xx)
def test_qq():
# NOTE qq is also tested as part of strconv.quote
# qq(any) -> bstr
def _(s, qqok):
_ = qq(s)
assert type(_) is bstr
assert _ == qqok
_( xbytes('мир'), '"мир"') # b''
_( u'мир', '"мир"') # u''
_( xbytearray('мир'), '"мир"') # bytearray()
_( b('мир'), '"мир"') # b()
_( u('мир'), '"мир"') # u()
_( 1, '"1"') # int
_( [xbytes('мир')], '"[\'мир\']"') # [b'']
_( [u'мир'], '"[\'мир\']"') # [u'']
_([xbytearray('мир')], '"[\'мир\']"') # [b'']
_( [b('мир')], '"[\'мир\']"') # [b()]
_( [u('мир')], '"[\'мир\']"') # [u()]
# what qq returns - bstr - can be mixed with unicode, bytes and bytearray
# it is tested e.g. in test_strings_ops2 and test_strings_mod_and_format
# ---- deep replace ----
# deepReplace returns object's clone with replacing all internal objects selected by predicate.
#
# Specifically: for every object x - obj or its internal object - if
# fpred(x) is true, it is replaced by what freplace(x) returns.
def deepReplace(obj, fpred, freplace):
r = _DeepReplacer(fpred, freplace)
return r.replace(obj)
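# Illustrative usage sketch (not part of the original tests): replace every int
# in a nested structure with its string form.
def _example_deepreplace_usage():
    obj  = {'a': [1, (2, {'b': 3})]}
    obj_ = deepReplace(obj, lambda x: isinstance(x, int), str)
    assert obj_ == {'a': ['1', ('2', {'b': '3'})]}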
_pickleproto = min(4, pickle.HIGHEST_PROTOCOL) # py2 does not define pickle.DEFAULT_PROTOCOL
_OldClassInstance = None
if six.PY2:
_OldClassInstance = types.InstanceType
# _DeepReplacer serves deepReplace.
#
# It works by recursively going through objects, unassembling them, doing
# replacements in the unassembled parts, and then rebuilding objects back.
#
# The unassemble/rebuild is implemented via using pickle-related machinery
# (__reduce__ and friends).
class _DeepReplacer:
def __init__(r, fpred, freplace):
r.fpred = fpred
r.freplace = freplace
r.memo = {}
r.keepalive = []
r.rlevel = 0 # recursion level
def _debug(self, fmt='', *argv):
if 0:
print(' '*(self.rlevel-1) + (fmt % argv))
@func
def replace(r, obj):
r.rlevel += 1
def _(): r.rlevel -= 1
defer(_)
r._debug()
r._debug('_replace %r @%s', obj, id(obj))
r._debug(' memo:\t%r', r.memo)
if id(obj) in r.memo:
r._debug(' (in memo)')
return r.memo[id(obj)]
obj_ = r._replace(obj)
r._debug('-> %r @%s', obj_, id(obj_))
if id(obj) in r.memo:
assert r.memo[id(obj)] is obj_
else:
r.memo[id(obj)] = obj_
# keep obj alive while we keep its amended version in memo referenced by id(obj).
#
# some of the objects we are processing might be temporary (e.g. created by ilist/idict)
# and if we don't keep them alive, other temporary objects could later be
# created with the same id, which would break our memo accounting.
if obj_ is not obj:
r.keepalive.append(obj)
return obj_
def _replace(r, obj):
if r.fpred(obj):
return r.freplace(obj)
cls = type(obj)
if issubclass(cls, type): # a class, e.g. 'tuple'
return obj
# fast path for atomic objects (int, float, bool, bytes, unicode, ... but not e.g. tuple)
if copy._deepcopy_dispatch.get(cls) is copy._deepcopy_atomic:
return obj
# obj is non-atomic - it contains references to other objects
return r._replace_nonatomic(obj)
def _replace_nonatomic(r, obj): # -> obj*
# unassemble and rebuild obj after doing the replacement in its state
cls = type(obj)
# handle tuples specially
# if we don't - we won't get into replacing tuple items, because
# `tup.__getnewargs__()` returns `(tup,)` and that same tup object would be present in newargv.
if cls is tuple: # NOTE plain tuple only, no subclasses here
v = []
for x in obj:
x_ = r.replace(x)
v.append(x_)
# for self-referencing cases, recursing through the items
# might have already replaced the tuple itself
if id(obj) in r.memo:
return r.memo[id(obj)]
return tuple(v)
if cls is _OldClassInstance: # obj is instance of old-style class
return r._replace_oldstyle(obj)
else:
return r._replace_newstyle(obj)
def _replace_oldstyle(r, obj): # -> obj*
# old-style classes are pickled not via __reduce__ - see copy._copy_inst / _deepcopy_inst
initargv = None
if hasattr(obj, '__getinitargs__'):
initargv = obj.__getinitargs__()
if hasattr(obj, '__getstate__'):
state = obj.__getstate__()
else:
state = obj.__dict__
# initargv is empty - instantiate the class via _EmptyClass and .__class__ patch
# https://github.com/python/cpython/blob/2.7-0-g8d21aa21f2c/Lib/pickle.py#L1057-L1059
if not initargv:
obj_ = copy._EmptyClass()
obj_.__class__ = obj.__class__
assert id(obj) not in r.memo
r.memo[id(obj)] = obj_
# initargv
if initargv is not None:
initargv_ = []
for x in initargv:
x_, n = r.replace(x)
initargv_.append(x_)
initargv = tuple(initargv_)
# state
state = r.replace(state)
if initargv is not None:
obj_ = obj.__class__(*initargv)
else:
obj_ = r.memo[id(obj)]
if hasattr(obj_, '__setstate__'):
obj_.__setstate__(state)
else:
obj_.__dict__.update(state)
return obj_
def _replace_newstyle(r, obj): # -> obj*
# new-style classes are pickled via __reduce__
# see copy and pickle documentation for details
# https://docs.python.org/3/library/pickle.html#pickling-class-instances
state = None
ilist = None
idict = None
setstate = None
# TODO copy_reg.reduce should have priority ?
_ = obj.__reduce_ex__(_pickleproto)
new = _[0]
newargv = _[1]
if len(_) >= 3:
state = _[2]
if len(_) >= 4:
ilist = _[3]
if len(_) >= 5:
idict = _[4]
if len(_) >= 6:
setstate = _[5]
r._debug()
r._debug(' obj:\t%r @%s', obj, id(obj))
r._debug(' new:\t%r', new)
r._debug(' newargv: %r', newargv)
r._debug(' state:\t%r', state)
r._debug(' ilist:\t%r', ilist)
r._debug(' idict:\t%r', idict)
r._debug(' setstate:\t%r', setstate)
# __newobj__ function is treated specially meaning __newobj__(cls) should call cls.__new__()
# https://github.com/python/cpython/blob/v3.11.0a7-248-g4153f2cbcb4/Lib/pickle.py#L652-L689
new_name = getattr(new, "__name__", "")
if new_name == "__newobj__" and len(newargv) == 1:
cls = newargv[0]
if hasattr(cls, "__new__"):
assert id(obj) not in r.memo
r.memo[id(obj)] = cls.__new__(cls)
# newargv
newargv_ = []
for x in newargv:
x_ = r.replace(x)
newargv_.append(x_)
newargv = tuple(newargv_)
# state
if state is not None:
state = r.replace(state)
# ilist
if ilist is not None:
ilist_ = []
for x in ilist:
x_ = r.replace(x)
ilist_.append(x_)
ilist = ilist_ # NOTE unconditionally (we consumed the iterator)
# idict
if idict is not None:
idict_ = []
for x in idict:
x_ = r.replace(x)
idict_.append(x_)
idict = idict_ # NOTE unconditionally (----//----)
# for self-referencing cases, recursing through arguments/state
# might have already replaced the object itself
if id(obj) in r.memo:
obj_ = r.memo[id(obj)]
else:
obj_ = new(*newargv)
if state is not None:
if setstate is not None:
setstate(obj_, state)
elif hasattr(obj_, '__setstate__'):
obj_.__setstate__(state)
else:
obj_.__dict__.update(state)
if ilist is not None:
for _ in ilist:
obj_.append(_)
if idict is not None:
for k,v in idict:
obj_[k] = v
return obj_
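# Illustrative sketch (not part of the original tests) of the pickle-style
# decomposition that _DeepReplacer builds upon: __reduce_ex__ splits an instance
# into (callable, args, state, ...), and the object can be rebuilt from those parts.
# The exact tuple layout varies with Python version and pickle protocol.
def _example_reduce_decomposition():
    class P(object):
        def __init__(self, x):
            self.x = x
    p  = P(1)
    rv = p.__reduce_ex__(_pickleproto)
    new, newargv = rv[0], rv[1]
    state = rv[2] if len(rv) >= 3 else None
    p2 = new(*newargv)                  # create a fresh, uninitialized instance
    if state is not None:
        p2.__dict__.update(state)       # then restore its state
    assert type(p2) is P and p2.x == 1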
# deepReplaceBytes returns obj's clone with bytes instances replaced with
# unicode via UTF-8 decoding.
def _isxbytes(x):
if not isinstance(x, bytes):
return False
return True
def _bdecode(x):
return _udata(u(x))
def deepReplaceBytes(obj):
return deepReplace(obj, _isxbytes, _bdecode)
def test_deepreplace_bytes():
def f(): pass
g = lambda: None # non-picklable func
with raises((pickle.PicklingError, AttributeError), match="Can't pickle "):
pickle.dumps(g, pickle.HIGHEST_PROTOCOL)
class L(list): pass
class T(tuple): pass
class S(set): pass
class F(frozenset): pass
class D(dict): pass
class Cold: pass
class Cnew(object): pass
# TODO class without reduce, but that can be reduced via copy_reg
# TODO class completely without reduce support
cold = Cold(); cold.x = u"α"
cnew = Cnew(); cnew.x = u"β"
nochangev = [
1001, 123.4, complex(1,2), None, True, False,
u"α",
f, g,
type,
unicode, bytes,
tuple, list, int, dict,
Cold, Cnew,
NotImplementedError,
[], (), {}, set(), frozenset(),
[1, u"α", f], L([1, u"α", f]),
(1, u"α", g), T([1, u"α", g]),
{1, u"α", f}, S({1, u"α", f}),
frozenset({1, u"α", f}), F({1, u"α", f}),
{1:2, u"α":u"β", f:g}, D({1:2, u"α":u"β", f:g}),
#cold, cnew,
[(u"α", {u"β":2, 3:[4, {u"γ"}]})],
]
for x in nochangev:
assert deepReplaceBytes(x) == x
bs = xbytes("мир") ; assert type(bs) is bytes
us = xunicode("мир") ; assert type(us) is unicode
_ = deepReplaceBytes(bs)
assert type(_) is unicode
assert _ == us
x = 123
def R(obj):
obj_ = deepReplaceBytes(obj)
assert type(obj_) is type(obj)
return obj_
# list
for typ in (list, L):
assert R(typ([bs])) == typ([us])
_ = R(typ([x, bs, f]))
assert _ == typ([x, us, f])
assert _[0] is x
assert _[2] is f
_ = R(typ([bs, [bs]])) # verify that last bs is not missed to be converted due to memoization
assert _ == typ([us, [us]])
l = typ(); l += [l] # self-reference, nochange
_ = R(l)
assert len(_) == 1
assert _[0] is _
l = typ([bs]); l += [l, bs] # self-reference
_ = R(l)
assert len(_) == 3
assert _[0] == us
assert _[1] is _
assert _[2] == us
# tuple
for typ in (tuple, T):
assert R(typ((bs,))) == typ((us,))
_ = R(typ((x, bs, f)))
assert _ == typ((x, us, f))
assert _[0] is x
assert _[2] is f
t = typ(([],)); t[0].append(t) # self-reference, nochange
_ = R(t)
assert len(_) == 1
assert len(_[0]) == 1
assert _[0][0] is _
t = typ(([bs], bs)); t[0].append(t) # self-reference
_ = R(t)
assert len(_) == 2
assert len(_[0]) == 2
assert _[0][0] == us
assert _[0][1] is _
assert _[1] == us
# set
for typ in (set, frozenset, S, F):
assert R(typ({bs})) == typ({us})
_ = R(typ({x, bs, f}))
assert _ == typ({x, us, f})
_ = set(_) # e.g. frozenset -> set
_.remove(us)
while _:
obj = _.pop()
if obj == x: assert obj is x
elif obj == f: assert obj is f
else: panic(obj)
l = hlist(); s = typ({l}); l.append(s) # self-reference, nochange
s_ = R(s)
assert len(s_) == 1
l_ = list(s_)[0]
assert type(l_) is hlist
assert len(l_) == 1
assert l_[0] is s_
l = hlist(); s = typ({bs, l}); l.append(s) # self-reference
s_ = R(s)
assert len(s_) == 2
_ = list(s_)
assert us in _
obj = _.pop(_.index(us))
assert type(obj) is unicode
assert obj == us
assert len(_) == 1
assert type(_[0]) is hlist
assert len(_[0]) == 1
assert _[0][0] is s_
# dict
for typ in (dict, D):
_ = R(typ({x:bs, bs:12, f:g}))
assert _ == typ({x:us, us:12, f:g})
l = hlist([x]); d = typ({l:12}); l.append(d) # self-reference(value), nochange
d_ = R(d)
_ = list(d_.items())
assert len(_) == 1
l_, v = _[0]
assert v == 12
assert type(l_) is hlist
assert len(l_) == 2
assert l_[0] == x
assert l_[1] is d_
l = hlist([x]); d = typ({12:l}); l.append(d) # self-reference(value), nochange
d_ = R(d)
_ = list(d_.items())
assert len(_) == 1
k, l_ = _[0]
assert k == 12
assert type(l_) is hlist
assert len(l_) == 2
assert l_[0] == x
assert l_[1] is d_
lk = hlist([x]); lv = hlist([12]); d = typ({lk:lv}) # self-ref(key,value), nochange
lk.append(d); lv.append(d)
d_ = R(d)
_ = list(d_.items())
assert len(_) == 1
lk_, lv_ = _[0]
assert type(lk_) is hlist
assert type(lv_) is hlist
assert len(lk_) == 2
assert len(lv_) == 2
assert lk_[0] == x
assert lv_[0] == 12
assert lk_[1] is d_
assert lv_[1] is d_
lk = hlist([xbytes('key')]); lv = hlist([xbytes('value')]); d = typ({lk:lv}) # self-ref(k,v)
lk.append(d); lv.append(d)
d_ = R(d)
_ = list(d_.items())
assert len(_) == 1
lk_, lv_ = _[0]
assert type(lk_) is hlist
assert type(lv_) is hlist
assert len(lk_) == 2
assert len(lv_) == 2
assert type(lk_[0]) is unicode
assert type(lv_[0]) is unicode
assert lk_[0] == xunicode('key')
assert lv_[0] == xunicode('value')
assert lk_[1] is d_
assert lv_[1] is d_
# class instances
cold = Cold(); cold.x = x; cold.y = bs; cold.me = cold
cnew = Cnew(); cnew.f = f; cnew.y = bs; cnew.me = cnew
_ = R(cold)
assert _ is not cold
assert _.x is x
assert _.y == us
assert _.me is _
_ = R(cnew)
assert _ is not cnew
assert _.f is f
assert _.y == us
assert _.me is _
# combining example
cnew = Cnew()
cnew.a = [cnew, {bs}]
cnew.b = {(bs,f): g}
_ = R(cnew)
assert _ is not cnew
assert type(_.a) is list
assert len(_.a) == 2
assert _.a[0] is _
assert type(_.a[1]) is set
assert _.a[1] == {us}
assert type(_.b) is dict
assert len(_.b) == 1
k, v = list(_.b.items())[0]
assert type(k) is tuple
assert len(k) == 2
assert type(k[0]) is unicode
assert k[0] == us
assert k[1] is f
assert v is g
# deepReplaceStr returns x with all instances of str replaced with bstrmk(·)
#
# except, as an ad-hoc rule, we don't change ASCII strings, to avoid changing
# e.g. __dict__ keys in classes from str to bytes. However a string can be
# forced to be processed as a string and changed - even if it is all ASCII - by
# starting it with the "*str " prefix.
def _isstr(x):
return (type(x) is str) and (x.startswith("*str ") or not isascii(x))
def deepReplaceStr(x, bstrmk):
return deepReplace(x, _isstr, bstrmk)
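# Illustrative sketch (not part of the original tests) of the ad-hoc ASCII rule:
# plain ASCII strings pass through unchanged unless forced with the "*str " prefix,
# while non-ASCII strings are always converted.
def _example_deepreplacestr_ascii_rule():
    assert type(deepReplaceStr('hello', b))      is str   # ASCII left alone
    assert type(deepReplaceStr('*str hello', b)) is bstr  # forced by the prefix
    assert type(deepReplaceStr('мир', b))        is bstr  # non-ASCII converted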
def test_deepreplace_str():
# verify deepReplaceStr only lightly because underlying deepReplace
# functionality is verified thoroughly via test_deepreplace_bytes
_ = deepReplaceStr('α', b)
assert type(_) is bstr
assert _ == 'α'
_ = deepReplaceStr('β', u)
assert type(_) is ustr
assert _ == 'β'
def R(x):
x_ = deepReplaceStr(x, b)
assert type(x_) is type(x)
return x_
x = 123
assert R(x) is x
_ = R([1, 'α', 2])
assert _ == [1, 'α', 2]
assert type(_[1]) is bstr
# ----------------------------------------
# verify that what we patched - e.g. bytes.__repr__ - stay unaffected when
# called outside of bstr/ustr context.
def test_strings_patched_transparently():
b_ = xbytes ("мир"); assert type(b_) is bytes
u_ = xunicode ("мир"); assert type(u_) is unicode
ba_ = xbytearray("мир"); assert type(ba_) is bytearray
# standard {repr,str}(bytes|unicode|bytearray) stay unaffected
assert repr(b_) == x32(r"b'\xd0\xbc\xd0\xb8\xd1\x80'",
r"'\xd0\xbc\xd0\xb8\xd1\x80'")
assert repr(u_) == x32(r"'мир'",
r"u'\u043c\u0438\u0440'")
assert repr(ba_) == r"bytearray(b'\xd0\xbc\xd0\xb8\xd1\x80')"
assert str(b_) == x32(r"b'\xd0\xbc\xd0\xb8\xd1\x80'",
"\xd0\xbc\xd0\xb8\xd1\x80")
if six.PY3 or sys.getdefaultencoding() == 'utf-8': # py3 or gpython/py2
assert str(u_) == "мир"
else:
# python/py2
with raises(UnicodeEncodeError): str(u_) # 'ascii' codec can't encode ...
assert str(u'abc') == "abc"
assert str(ba_) == x32(r"bytearray(b'\xd0\xbc\xd0\xb8\xd1\x80')",
b'\xd0\xbc\xd0\xb8\xd1\x80')
# unicode comparison stays unaffected
assert (u_ == u_) is True
assert (u_ != u_) is False
assert (u_ < u_) is False
assert (u_ > u_) is False
assert (u_ <= u_) is True
assert (u_ >= u_) is True
u2 = xunicode("май"); assert type(u2) is unicode
assert (u_ == u2) is False ; assert (u2 == u_) is False
assert (u_ != u2) is True ; assert (u2 != u_) is True
assert (u_ < u2) is False ; assert (u2 < u_) is True
assert (u_ > u2) is True ; assert (u2 > u_) is False
assert (u_ <= u2) is False ; assert (u2 <= u_) is True
assert (u_ >= u2) is True ; assert (u2 >= u_) is False
# bytearray.__init__ stays unaffected
with raises(TypeError): bytearray(u'мир')
a = bytearray()
with raises(TypeError): a.__init__(u'мир')
def _(*argv):
a = bytearray(*argv)
b = bytearray(); _ = b.__init__(*argv); assert _ is None
ra = repr(a)
rb = repr(b)
assert ra == rb
return ra
assert _() == r"bytearray(b'')"
assert _(b_) == r"bytearray(b'\xd0\xbc\xd0\xb8\xd1\x80')"
assert _(u_, 'utf-8') == r"bytearray(b'\xd0\xbc\xd0\xb8\xd1\x80')"
assert _(3) == r"bytearray(b'\x00\x00\x00')"
assert _((1,2,3)) == r"bytearray(b'\x01\x02\x03')"
# bytearray.{sq_concat,sq_inplace_concat} stay unaffected
a = bytearray()
def _(delta):
aa = a + delta
aa_ = a.__add__(delta)
assert aa is not a
assert aa_ is not a
aclone = bytearray(a)
a_ = a
a_ += delta
aclone_ = aclone
aclone_.__iadd__(delta)
assert a_ is a
assert a_ == aa
assert aclone_ is aclone
assert aclone_ == a_
return a_
assert _(b'') == b''
assert _(b'a') == b'a'
assert _(b'b') == b'ab'
assert _(b'cde') == b'abcde'
# ---- misc ----
# xbytes/xunicode/xbytearray convert provided bytes/unicode object to bytes,
# unicode or bytearray according to the function name.
def xbytes(x): return x.encode('utf-8') if type(x) is unicode else x
def xunicode(x): return x.decode('utf-8') if type(x) is bytes else x
def xbytearray(x): return bytearray(xbytes(x))
# deepReplaceStr2Bytearray replaces str with bytearray, or with a hashable
# version of bytearray if str objects are detected to be present inside sets or dict keys.
class hbytearray(bytearray):
def __hash__(self):
return hash(bytes(self))
def xhbytearray(x): return hbytearray(xbytes(x))
def deepReplaceStr2Bytearray(x):
try:
return deepReplaceStr(x, xbytearray)
except TypeError as e:
if e.args != ("unhashable type: 'bytearray'",):
raise
return deepReplaceStr(x, xhbytearray)
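# Illustrative sketch (not part of the original tests) of why the hashable
# wrapper is needed: a plain bytearray cannot be used as a set element or dict
# key, which is what triggers the retry with hbytearray above.
def _example_bytearray_hashability():
    try:
        {xbytearray('β')}                           # plain bytearray is unhashable
    except TypeError:
        pass
    else:
        assert False, "expected TypeError"
    assert xhbytearray('β') in {xhbytearray('β')}   # hbytearray is hashable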
# xstr returns string corresponding to specified type and data.
def xstr(text, typ):
def _():
t = {
bytes: xbytes,
unicode: xunicode,
bytearray: xbytearray,
bstr: b,
ustr: u,
}
return t[typ](text)
s = _()
assert type(s) is typ
return s
# xudata returns data of x converted to unicode string.
# x can be bytes/unicode/bytearray / bstr/ustr.
def xudata(x):
def _():
if type(x) in (bytes, bytearray):
return x.decode('utf-8')
elif type(x) is unicode:
return x
elif type(x) is ustr:
return _udata(x)
elif type(x) is bstr:
return _bdata(x).decode('utf-8')
else:
raise TypeError(x)
xu = _()
assert type(xu) is unicode
return xu
# tbu maps specified type to b/u:
# b/bytes/bytearray -> b; u/unicode -> u.
def tbu(typ):
if typ in (bytes, bytearray, bstr):
return bstr
if typ in (unicode, ustr):
return ustr
raise AssertionError("invalid type %r" % typ)
# xcall returns result of the call to `obj.meth(*argv, **kw)`.
# exceptions are also converted to plain returns.
def xcall(obj, meth, *argv, **kw):
if not hasattr(obj, meth):
return NotImplementedError(meth)
meth = getattr(obj, meth)
try:
return meth(*argv, **kw)
except Exception as e:
#traceback.print_exc()
return e
# isascii returns whether bytes/unicode x consists of only ASCII characters.
def isascii(x):
if isinstance(x, unicode):
x = x.encode('utf-8')
assert isinstance(x, bytes)
# hand-made isascii (there is no bytes.isascii on py2)
try:
bytes.decode(x, 'ascii', 'strict')
except UnicodeDecodeError:
return False # non-ascii
else:
return True # ascii
# hlist is hashable list.
class hlist(list):
def __hash__(self):
return 0 # always hashable
# x32(a,b) returns a on py3, or b on py2
def x32(a, b):
return a if six.PY3 else b
......@@ -21,17 +21,14 @@
from __future__ import print_function, absolute_import
from golang import go, chan, select, default, nilchan, _PanicError, func, panic, \
defer, recover, u, b
from golang.gcompat import qq
defer, recover, u
from golang import sync
from golang.strconv_test import byterange
from pytest import raises, mark, fail
from _pytest._code import Traceback
from os.path import dirname
import os, sys, inspect, importlib, traceback, doctest
from subprocess import Popen, PIPE
import six
from six import text_type as unicode
from six.moves import range as xrange
import gc, weakref, warnings
import re
......@@ -1705,114 +1702,7 @@ def bench_defer(b):
# test_error lives in errors_test.py
# verify b, u
def test_strings():
testv = (
# bytes <-> unicode
(b'', u''),
(b'hello', u'hello'),
(b'hello\nworld', u'hello\nworld'),
(b'\xd0\xbc\xd0\xb8\xd1\x80', u'мир'),
# invalid utf-8
(b'\xd0', u'\udcd0'),
(b'a\xd0b', u'a\udcd0b'),
# invalid utf-8 with byte < 0x80
(b'\xe2\x28\xa1', u'\udce2(\udca1'),
# more invalid utf-8
# https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
(b"\xc3\x28", u'\udcc3('), # Invalid 2 Octet Sequence
(b"\xa0\xa1", u'\udca0\udca1'), # Invalid Sequence Identifier
(b"\xe2\x82\xa1", u'\u20a1'), # Valid 3 Octet Sequence '₡'
(b"\xe2\x28\xa1", u'\udce2(\udca1'), # Invalid 3 Octet Sequence (in 2nd Octet)
(b"\xe2\x82\x28", u'\udce2\udc82('), # Invalid 3 Octet Sequence (in 3rd Octet)
(b"\xf0\x90\x8c\xbc", u'\U0001033c'), # Valid 4 Octet Sequence '𐌼'
(b"\xf0\x28\x8c\xbc", u'\udcf0(\udc8c\udcbc'), # Invalid 4 Octet Sequence (in 2nd Octet)
(b"\xf0\x90\x28\xbc", u'\udcf0\udc90(\udcbc'), # Invalid 4 Octet Sequence (in 3rd Octet)
(b"\xf0\x28\x8c\x28", u'\udcf0(\udc8c('), # Invalid 4 Octet Sequence (in 4th Octet)
(b"\xf8\xa1\xa1\xa1\xa1", # Valid 5 Octet Sequence (but not Unicode!)
u'\udcf8\udca1\udca1\udca1\udca1'),
(b"\xfc\xa1\xa1\xa1\xa1\xa1", # Valid 6 Octet Sequence (but not Unicode!)
u'\udcfc\udca1\udca1\udca1\udca1\udca1'),
# surrogate
(b'\xed\xa0\x80', u'\udced\udca0\udc80'),
# x00 - x1f
(byterange(0,32),
u"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
u"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),
# non-printable utf-8
(b'\x7f\xc2\x80\xc2\x81\xc2\x82\xc2\x83\xc2\x84\xc2\x85\xc2\x86\xc2\x87',
u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"),
# some characters with U >= 0x10000
(b'\xf0\x9f\x99\x8f', u'\U0001f64f'), # 🙏
(b'\xf0\x9f\x9a\x80', u'\U0001f680'), # 🚀
)
for tbytes, tunicode in testv:
assert b(tbytes) == tbytes
assert u(tunicode) == tunicode
assert b(tunicode) == tbytes
assert u(tbytes) == tunicode
assert b(u(tbytes)) == tbytes
assert u(b(tunicode)) == tunicode
# invalid types
with raises(TypeError): b(1)
with raises(TypeError): u(1)
with raises(TypeError): b(object())
with raises(TypeError): u(object())
# TODO also handle bytearray?
# b(b(·)) = identity
_ = b(u'миру мир 123')
assert isinstance(_, bytes)
assert b(_) is _
# u(u(·)) = identity
_ = u(u'мир труд май')
assert isinstance(_, unicode)
assert u(_) is _
# verify print for _pystr and _pyunicode
def test_strings_print():
outok = readfile(dir_testprog + "/golang_test_str.txt")
retcode, stdout, stderr = _pyrun(["golang_test_str.py"],
cwd=dir_testprog, stdout=PIPE, stderr=PIPE)
assert retcode == 0, (stdout, stderr)
assert stderr == b""
assertDoc(outok, stdout)
def test_qq():
# NOTE qq is also tested as part of strconv.quote
# qq(any) returns string type
assert isinstance(qq(b('мир')), str) # qq(b) -> str (bytes·py2, unicode·py3)
assert isinstance(qq( u'мир'), str) # qq(u) -> str (bytes·py2, unicode·py3)
# however what qq returns can be mixed with both unicode and bytes
assert b'hello %s !' % qq(b('мир')) == b('hello "мир" !') # b % qq(b)
assert b'hello %s !' % qq(u('мир')) == b('hello "мир" !') # b % qq(u) -> b
assert u'hello %s !' % qq(u('мир')) == u('hello "мир" !') # u % qq(u)
assert u'hello %s !' % qq(b('мир')) == u'hello "мир" !' # u % qq(b) -> u
# custom attributes cannot be injected into what qq returns
x = qq('мир')
if not ('PyPy' in sys.version): # https://foss.heptapod.net/pypy/pypy/issues/2763
with raises(AttributeError):
x.hello = 1
# strings tests live in golang_str_test.py
# ---- misc ----
......
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2021 Nexedi SA and Contributors.
# Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
......@@ -21,76 +21,32 @@
from __future__ import print_function, absolute_import
import sys
import six, unicodedata, codecs
from six import text_type as unicode # py2: unicode py3: str
from six import unichr # py2: unichr py3: chr
from six import int2byte as bchr # py2: chr py3: lambda x: bytes((x,))
import unicodedata, codecs
from six.moves import range as xrange
from golang import b
from golang._golang import _utf8_decode_rune, _rune_error, _xunichr
# _bstr is like b but also returns whether input was unicode.
def _bstr(s): # -> sbytes, wasunicode
wasunicode = False
if isinstance(s, bytes): # py2: str py3: bytes
pass
elif isinstance(s, unicode): # py2: unicode py3: str
wasunicode = True
else:
raise TypeError("b: invalid type %s" % type(s))
if wasunicode: # py2: unicode py3: str
if six.PY3:
s = s.encode('UTF-8', 'surrogateescape')
else:
# py2 does not have surrogateescape error handler, and even if we
# provide one, builtin unicode.encode() does not treat
# \udc80-\udcff as error. -> Do the encoding ourselves.
s = _utf8_encode_surrogateescape(s)
return s, wasunicode
# _ustr is like u but also returns whether input was bytes.
def _ustr(s): # -> sunicode, wasbytes
wasbytes = True
if isinstance(s, bytes): # py2: str py3: bytes
pass
elif isinstance(s, unicode): # py2: unicode py3: str
wasbytes = False
else:
raise TypeError("u: invalid type %s" % type(s))
if wasbytes:
if six.PY3:
s = s.decode('UTF-8', 'surrogateescape')
else:
# py2 does not have surrogateescape error handler, and even if we
# provide one, builtin bytes.decode() does not treat surrogate
# sequences as error. -> Do the decoding ourselves.
s = _utf8_decode_surrogateescape(s)
return s, wasbytes
# quote quotes unicode|bytes string into valid "..." unicode|bytes string always quoted with ".
def quote(s):
s, wasunicode = _bstr(s)
qs = _quote(s)
if wasunicode:
qs, _ = _ustr(qs)
return qs
# quote quotes unicode|bytes string into valid "..." bytestring always quoted with ".
def quote(s): # -> bstr
q, _ = _quote(b(s), b'"')
return b(q)
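# Illustrative examples (not part of the original module; they follow from the
# escaping rules in _quote below):
#
#   quote('мир')   -> b('"мир"')      # printable UTF-8 passes through, result is bstr
#   quote('a"b')   -> b(r'"a\"b"')    # the quote character itself is escaped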
def _quote(s):
assert isinstance(s, bytes)
def _quote(s, quote): # -> (quoted, nonascii_escape)
assert isinstance(s, bytes), type(s)
assert isinstance(quote, bytes), type(quote)
assert len(quote) == 1, repr(quote)
outv = []
emit = outv.append
nonascii_escape = False
i = 0
while i < len(s):
c = s[i:i+1]
# fast path - ASCII only
if ord(c) < 0x80:
if c in b'\\"':
if c in (b'\\', quote):
emit(b'\\'+c)
# printable ASCII
......@@ -117,7 +73,8 @@ def _quote(s):
isize = i + size
# decode error - just emit raw byte as escaped
if r == _rune_error:
if r == _rune_error and size == 1:
nonascii_escape = True
emit(br'\x%02x' % ord(c))
# printable utf-8 characters go as is
......@@ -126,18 +83,19 @@ def _quote(s):
# everything else goes in numeric byte escapes
else:
nonascii_escape = True
for j in xrange(i, isize):
emit(br'\x%02x' % ord(s[j:j+1]))
i = isize
return b'"' + b''.join(outv) + b'"'
return (quote + b''.join(outv) + quote, nonascii_escape)
# unquote decodes "-quoted unicode|byte string.
#
# ValueError is raised if there are quoting syntax errors.
def unquote(s):
def unquote(s): # -> bstr
us, tail = unquote_next(s)
if len(tail) != 0:
raise ValueError('non-empty tail after closing "')
......@@ -148,13 +106,9 @@ def unquote(s):
# it returns -> (unquoted(s), tail-after-")
#
# ValueError is raised if there are quoting syntax errors.
def unquote_next(s):
s, wasunicode = _bstr(s)
us, tail = _unquote_next(s)
if wasunicode:
us, _ = _ustr(us)
tail, _ = _ustr(tail)
return us, tail
def unquote_next(s): # -> (bstr, bstr)
us, tail = _unquote_next(b(s))
return b(us), b(tail)
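# Illustrative examples (not part of the original module):
#
#   unquote(b('"мир"'))          -> b('мир')
#   unquote_next(b('"мир"123'))  -> (b('мир'), b('123'))
#   unquote(b('"мир"123'))       raises ValueError (non-empty tail after closing ")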
def _unquote_next(s):
assert isinstance(s, bytes)
......@@ -226,155 +180,3 @@ def _unquote_next(s):
_printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols
_rune_error = 0xFFFD # unicode replacement character
_ucs2_build = (sys.maxunicode == 0xffff) # ucs2
assert _ucs2_build or sys.maxunicode >= 0x0010ffff # or ucs4
# _utf8_decode_rune decodes next UTF8-character from byte string s.
#
# _utf8_decode_rune(s) -> (r, size)
def _utf8_decode_rune(s):
assert isinstance(s, bytes)
if len(s) == 0:
return _rune_error, 0
l = min(len(s), 4) # max size of an UTF-8 encoded character
while l > 0:
try:
r = s[:l].decode('utf-8', 'strict')
except UnicodeDecodeError:
l -= 1
continue
if len(r) == 1:
return ord(r), l
# see comment in _utf8_encode_surrogateescape
if _ucs2_build and len(r) == 2:
try:
return _xuniord(r), l
# e.g. TypeError: ord() expected a character, but string of length 2 found
except TypeError:
l -= 1
continue
l -= 1
continue
# invalid UTF-8
return _rune_error, 1
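# Illustrative examples (not part of the original module; 'м' is U+043C,
# encoded in UTF-8 as b'\xd0\xbc'):
#
#   _utf8_decode_rune(b'\xd0\xbc\xd0\xb8')  -> (0x43c, 2)    # first rune 'м'
#   _utf8_decode_rune(b'\xd0')              -> (0xfffd, 1)   # truncated sequence -> error, size 1
#   _utf8_decode_rune(b'')                  -> (0xfffd, 0)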
# _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3.
def _utf8_decode_surrogateescape(s): # -> unicode
assert isinstance(s, bytes)
outv = []
emit = outv.append
while len(s) > 0:
r, width = _utf8_decode_rune(s)
if r == _rune_error:
b = ord(s[0])
assert 0x80 <= b <= 0xff
emit(unichr(0xdc00 + b))
# python2 "correctly" decodes surrogates - don't allow that as
# surrogates are not valid UTF-8:
# https://github.com/python/cpython/blob/v3.8.1-118-gdbb37aac142/Objects/stringlib/codecs.h#L153-L157
# (python3 raises UnicodeDecodeError for surrogates)
elif 0xd800 <= r < 0xdfff:
for c in s[:width]:
b = ord(c)
if b >= 0x80:
emit(unichr(0xdc00 + b))
else:
emit(unichr(b))
else:
emit(_xunichr(r))
s = s[width:]
return u''.join(outv)
# _utf8_encode_surrogateescape mimics s.encode('utf-8', 'surrogateescape') from py3.
def _utf8_encode_surrogateescape(s): # -> bytes
assert isinstance(s, unicode)
outv = []
emit = outv.append
while len(s) > 0:
uc = s[0]; s = s[1:]
c = ord(uc)
if 0xdc80 <= c <= 0xdcff:
# surrogate - emit unescaped byte
emit(bchr(c & 0xff))
continue
# in builds with --enable-unicode=ucs2 (default for py2 on macos and windows)
# python represents unicode points > 0xffff as _two_ unicode characters:
#
# uh = u - 0x10000
# c1 = 0xd800 + (uh >> 10) ; [d800, dbff]
# c2 = 0xdc00 + (uh & 0x3ff) ; [dc00, dfff]
#
# if detected - merge those two unicode characters for .encode('utf-8') below
#
# this should be only relevant for python2, as python3 switched to "flexible"
# internal unicode representation: https://www.python.org/dev/peps/pep-0393
if _ucs2_build and (0xd800 <= c <= 0xdbff):
if len(s) > 0:
uc2 = s[0]
c2 = ord(uc2)
if 0xdc00 <= c2 <= 0xdfff:
uc = uc + uc2
s = s[1:]
emit(uc.encode('utf-8', 'strict'))
return b''.join(outv)
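# Illustrative examples (not part of the original module):
#
#   _utf8_encode_surrogateescape(u'мир')       -> b'\xd0\xbc\xd0\xb8\xd1\x80'
#   _utf8_encode_surrogateescape(u'a\udcd0b')  -> b'a\xd0b'   # lone surrogate \udcd0 -> raw byte 0xd0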
# _xuniord returns ordinal for a unicode character u.
#
# it works correctly even if u is represented as 2 unicode surrogate points on
# ucs2 python build.
if not _ucs2_build:
_xuniord = ord
else:
def _xuniord(u):
assert isinstance(u, unicode)
if len(u) == 1:
return ord(u)
# see _utf8_encode_surrogateescape for details
if len(u) == 2:
c1 = ord(u[0])
c2 = ord(u[1])
if (0xd800 <= c1 <= 0xdbff) and (0xdc00 <= c2 <= 0xdfff):
return 0x10000 | ((c1 - 0xd800) << 10) | (c2 - 0xdc00)
# let it crash
return ord(u)
# _xunichr returns unicode character for an ordinal i.
#
# it works correctly even on ucs2 python builds, where ordinals >= 0x10000 are
# represented as 2 unicode points.
if not _ucs2_build:
_xunichr = unichr
else:
def _xunichr(i):
if i < 0x10000:
return unichr(i)
# see _utf8_encode_surrogateescape for details
uh = i - 0x10000
return unichr(0xd800 + (uh >> 10)) + \
unichr(0xdc00 + (uh & 0x3ff))
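# Worked example (illustrative) of the ucs2 surrogate-pair formula, for U+1F680 (🚀):
#
#   uh = 0x1f680 - 0x10000     = 0xf680
#   c1 = 0xd800 + (uh >> 10)   = 0xd83d
#   c2 = 0xdc00 + (uh & 0x3ff) = 0xde80
#
# so on a ucs2 build _xunichr(0x1f680) == u'\ud83d\ude80' and
# _xuniord(u'\ud83d\ude80') == 0x1f680, while on ucs4/py3 builds both sides are
# the plain single-character forms.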
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2021 Nexedi SA and Contributors.
# Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
......@@ -20,6 +20,7 @@
from __future__ import print_function, absolute_import
from golang import bstr
from golang.strconv import quote, unquote, unquote_next
from golang.gcompat import qq
......@@ -34,16 +35,9 @@ def byterange(start, stop):
return b
# asstr converts unicode|bytes to str type of current python.
def asstr(s):
if PY3:
if isinstance(s, bytes):
s = s.decode('utf-8')
# PY2
else:
if isinstance(s, unicode):
s = s.encode('utf-8')
return s
def assert_bstreq(x, y):
assert type(x) is bstr
assert x == y
def test_quote():
testv = (
......@@ -67,8 +61,14 @@ def test_quote():
# non-printable utf-8
(u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087", u"\\x7f\\xc2\\x80\\xc2\\x81\\xc2\\x82\\xc2\\x83\\xc2\\x84\\xc2\\x85\\xc2\\x86\\xc2\\x87"),
# invalid rune
(u'\ufffd', u'�'),
)
# quote/unquote* always give bstr
BEQ = assert_bstreq
for tin, tquoted in testv:
# quote(in) == quoted
# in = unquote(quoted)
......@@ -76,14 +76,13 @@ def test_quote():
tail = b'123' if isinstance(tquoted, bytes) else '123'
tquoted = q + tquoted + q # add lead/trail "
assert quote(tin) == tquoted
assert unquote(tquoted) == tin
assert unquote_next(tquoted) == (tin, type(tin)())
assert unquote_next(tquoted + tail) == (tin, tail)
BEQ(quote(tin), tquoted)
BEQ(unquote(tquoted), tin)
_, __ = unquote_next(tquoted); BEQ(_, tin); BEQ(__, "")
_, __ = unquote_next(tquoted + tail); BEQ(_, tin); BEQ(__, tail)
with raises(ValueError): unquote(tquoted + tail)
# qq always gives str
assert qq(tin) == asstr(tquoted)
BEQ(qq(tin), tquoted)
# also check how it works on complementary unicode/bytes input type
if isinstance(tin, bytes):
......@@ -100,14 +99,13 @@ def test_quote():
tquoted = tquoted.encode('utf-8')
tail = tail.encode('utf-8')
assert quote(tin) == tquoted
assert unquote(tquoted) == tin
assert unquote_next(tquoted) == (tin, type(tin)())
assert unquote_next(tquoted + tail) == (tin, tail)
BEQ(quote(tin), tquoted)
BEQ(unquote(tquoted), tin)
_, __ = unquote_next(tquoted); BEQ(_, tin); BEQ(__, "")
_, __ = unquote_next(tquoted + tail); BEQ(_, tin); BEQ(__, tail)
with raises(ValueError): unquote(tquoted + tail)
# qq always gives str
assert qq(tin) == asstr(tquoted)
BEQ(qq(tin), tquoted)
# verify that non-canonical quotation can be unquoted too.
......
......@@ -18,9 +18,9 @@
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""This program helps to verify _pystr and _pyunicode.
"""This program helps to verify b, u and underlying bstr and ustr.
It complements golang_test.test_strings.
It complements golang_str_test.test_strings_print.
"""
from __future__ import print_function, absolute_import
......@@ -31,8 +31,17 @@ from golang.gcompat import qq
def main():
sb = b("привет b")
su = u("привет u")
print("print(b):", sb)
print("print(u):", su)
print("print(qq(b)):", qq(sb))
print("print(qq(u)):", qq(su))
print("print(repr(b)):", repr(sb))
print("print(repr(u)):", repr(su))
# py2: print(dict) calls PyObject_Print(flags=0) for both keys and values,
# not with flags=Py_PRINT_RAW used by default almost everywhere else.
# this way we can verify whether bstr.tp_print handles flags correctly.
print("print({b: u}):", {sb: su})
if __name__ == '__main__':
......
print(b): привет b
print(u): привет u
print(qq(b)): "привет b"
print(qq(u)): "привет u"
print(repr(b)): b('привет b')
print(repr(u)): u('привет u')
print({b: u}): {b('привет b'): u('привет u')}
# -*- coding: utf-8 -*-
# Copyright (C) 2019-2021 Nexedi SA and Contributors.
# Copyright (C) 2019-2022 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
......@@ -71,6 +71,10 @@ def test_golang_builtins():
assert error is golang.error
assert b is golang.b
assert u is golang.u
assert bstr is golang.bstr
assert ustr is golang.ustr
assert bbyte is golang.bbyte
assert uchr is golang.uchr
# indirectly verify golang.__all__
for k in golang.__all__:
......
......@@ -239,7 +239,8 @@ setup(
ext_modules = [
Ext('golang._golang',
['golang/_golang.pyx']),
['golang/_golang.pyx'],
depends = ['golang/_golang_str.pyx']),
Ext('golang.runtime._runtime_thread',
['golang/runtime/_runtime_thread.pyx']),
......