Commit 386844d3 authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: bstr/ustr repr

Teach bstr/ustr to provide a repr of themselves: it is rendered as b(...)
and u(...), where ... is a human-readable repr of the contained data.
Human-readable means that printable non-ASCII unicode characters are
shown as-is instead of being escaped, for example:

    >>> x = u'αβγ'
    >>> x
    'αβγ'
    >>> y = b(x)
    >>> y
    b('αβγ')				<-- NOTE not b(b'\xce\xb1\xce\xb2\xce\xb3')
    >>> x.encode('utf-8')
    b'\xce\xb1\xce\xb2\xce\xb3'
parent 604a7765
...@@ -264,6 +264,12 @@ class pybstr(bytes): ...@@ -264,6 +264,12 @@ class pybstr(bytes):
else: else:
return self return self
def __repr__(self):
    # repr of bstr is b(...) where ... is the smart-quoted content of self.
    quoted, has_nonascii_escape = _bpysmartquote_u3b2(self)
    # When the quoted form contains \xNN escapes with NN >= 0x80, prefix it
    # with 'b' so that e.g. b(u'\x80') is represented as b(b'\xc2\x80'),
    # not as b('\xc2\x80').
    prefix = 'b' if has_nonascii_escape else ''
    return "b(" + prefix + quoted + ")"
# override reduce for protocols < 2. Builtin handler for that goes through # override reduce for protocols < 2. Builtin handler for that goes through
# copyreg._reduce_ex which eventually calls bytes(bstr-instance) to # copyreg._reduce_ex which eventually calls bytes(bstr-instance) to
...@@ -403,6 +409,12 @@ class pyustr(unicode): ...@@ -403,6 +409,12 @@ class pyustr(unicode):
else: else:
return pyb(self) return pyb(self)
def __repr__(self):
    # repr of ustr is u(...) where ... is the smart-quoted content of self.
    quoted, has_nonascii_escape = _upysmartquote_u3b2(self)
    # When the quoted form contains \xNN escapes with NN >= 0x80, mark it
    # with a leading 'b', giving u(b'...') rather than u('...').
    prefix = 'b' if has_nonascii_escape else ''
    return "u(" + prefix + quoted + ")"
# override reduce for protocols < 2. Builtin handler for that goes through # override reduce for protocols < 2. Builtin handler for that goes through
# copyreg._reduce_ex which eventually calls unicode(ustr-instance) to # copyreg._reduce_ex which eventually calls unicode(ustr-instance) to
...@@ -539,6 +551,36 @@ IF PY2: ...@@ -539,6 +551,36 @@ IF PY2:
(<_PyTypeObject_Print*>Py_TYPE(pybstr())) .tp_print = _pybstr_tp_print (<_PyTypeObject_Print*>Py_TYPE(pybstr())) .tp_print = _pybstr_tp_print
# _bpysmartquote_u3b2 quotes bytes s, picking ' or " as the quoting character
# the same way python does for its own strings.
#
# It returns (quoted, nonascii_escape), where nonascii_escape indicates
# whether \xNN with NN >= 0x80 is present in the quoted output.
#
# NOTE quoted is returned as the str type of current python (bytes on py2,
# unicode on py3), so that it can be used directly in __repr__ or __str__
# implementation.
cdef _bpysmartquote_u3b2(s): # -> (unicode(py3)|bytes(py2), nonascii_escape)
    # TODO change to `const uint8_t[::1] s` after strconv._quote is moved to pyx
    assert isinstance(s, bytes), s

    # smartquotes: choose ' or " as quoting character exactly the same way python does
    # https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L905-L909
    # i.e. use " only when s contains ' and does not contain "; otherwise use '.
    if (b"'" in s) and (b'"' not in s):
        quote = b'"'
    else:
        quote = b"'"

    quoted, nonascii_escape = pystrconv._quote(s, quote)    # raw bytes
    if PY_MAJOR_VERSION >= 3:
        quoted = _utf8_decode_surrogateescape(quoted)       # raw unicode
    return quoted, nonascii_escape
# _upysmartquote_u3b2 is the unicode counterpart of _bpysmartquote_u3b2:
# it encodes s via _utf8_encode_surrogateescape and quotes the resulting bytes.
#
# NOTE the result is (quoted, nonascii_escape) with quoted being of the str
# type of current python - see _bpysmartquote_u3b2 for details.
cdef _upysmartquote_u3b2(s): # -> (unicode(py3)|bytes(py2), nonascii_escape)
    assert isinstance(s, unicode), s
    raw = _utf8_encode_surrogateescape(s)
    return _bpysmartquote_u3b2(raw)
# qq is substitute for %q, which is missing in python. # qq is substitute for %q, which is missing in python.
# #
# (python's automatic escape uses smartquotes quoting with either ' or "). # (python's automatic escape uses smartquotes quoting with either ' or ").
......
...@@ -237,9 +237,36 @@ def test_strings_basic(): ...@@ -237,9 +237,36 @@ def test_strings_basic():
assert hash(us) == hash("мир"); assert us == "мир" assert hash(us) == hash("мир"); assert us == "мир"
assert hash(bs) == hash("мир"); assert bs == "мир" assert hash(bs) == hash("мир"); assert bs == "мир"
# str # str/repr
_ = str(us); assert isinstance(_, str); assert _ == "мир" _ = str(us); assert isinstance(_, str); assert _ == "мир"
_ = str(bs); assert isinstance(_, str); assert _ == "мир" _ = str(bs); assert isinstance(_, str); assert _ == "мир"
_ = repr(us); assert isinstance(_, str); assert _ == "u('мир')"
_ = repr(bs); assert isinstance(_, str); assert _ == "b('мир')"
# str/repr of non-valid utf8
b_hik8 = xbytes ('привет ')+b(k8mir_bytes); assert type(b_hik8) is bstr
u_hik8 = xunicode('привет ')+u(k8mir_bytes); assert type(u_hik8) is ustr
assert _bdata(b_hik8) == b'\xd0\xbf\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82 \xcd\xc9\xd2'
assert _udata(u_hik8) == u'привет \udccd\udcc9\udcd2'
_ = str(u_hik8); assert isinstance(_, str); assert _ == xbytes('привет ')+b'\xcd\xc9\xd2'
_ = str(b_hik8); assert isinstance(_, str); assert _ == xbytes('привет ')+b'\xcd\xc9\xd2'
_ = repr(u_hik8); assert isinstance(_, str); assert _ == r"u(b'привет \xcd\xc9\xd2')"
_ = repr(b_hik8); assert isinstance(_, str); assert _ == r"b(b'привет \xcd\xc9\xd2')"
# str/repr of quotes
def _(text, breprok, ureprok):
bt = b(text); assert type(bt) is bstr
ut = u(text); assert type(ut) is ustr
_ = str(bt); assert isinstance(_, str); assert _ == text
_ = str(ut); assert isinstance(_, str); assert _ == text
_ = repr(bt); assert isinstance(_, str); assert _ == breprok
_ = repr(ut); assert isinstance(_, str); assert _ == ureprok
_('', "b('')", "u('')")
_('"', "b('\"')", "u('\"')")
_("'", 'b("\'")', 'u("\'")')
_('"\'', "b('\"\\'')", "u('\"\\'')")
_('"α" \'β\'', "b('\"α\" \\\\'')", "u('\"α\" \\\\'')")
# custom attributes cannot be injected to bstr/ustr # custom attributes cannot be injected to bstr/ustr
if not ('PyPy' in sys.version): # https://foss.heptapod.net/pypy/pypy/issues/2763 if not ('PyPy' in sys.version): # https://foss.heptapod.net/pypy/pypy/issues/2763
......
...@@ -30,20 +30,23 @@ from golang._golang import _py_utf8_decode_rune as _utf8_decode_rune, _py_rune_e ...@@ -30,20 +30,23 @@ from golang._golang import _py_utf8_decode_rune as _utf8_decode_rune, _py_rune_e
# quote quotes unicode|bytes string into valid "..." bytestring always quoted with ". # quote quotes unicode|bytes string into valid "..." bytestring always quoted with ".
def quote(s): # -> bstr def quote(s): # -> bstr
q = _quote(b(s)) q, _ = _quote(b(s), b'"')
return b(q) return b(q)
def _quote(s): def _quote(s, quote): # -> (quoted, nonascii_escape)
assert isinstance(s, bytes) assert isinstance(s, bytes), type(s)
assert isinstance(quote, bytes), type(quote)
assert len(quote) == 1, repr(quote)
outv = [] outv = []
emit = outv.append emit = outv.append
nonascii_escape = False
i = 0 i = 0
while i < len(s): while i < len(s):
c = s[i:i+1] c = s[i:i+1]
# fast path - ASCII only # fast path - ASCII only
if ord(c) < 0x80: if ord(c) < 0x80:
if c in b'\\"': if c in (b'\\', quote):
emit(b'\\'+c) emit(b'\\'+c)
# printable ASCII # printable ASCII
...@@ -71,6 +74,7 @@ def _quote(s): ...@@ -71,6 +74,7 @@ def _quote(s):
# decode error - just emit raw byte as escaped # decode error - just emit raw byte as escaped
if r == _rune_error and size == 1: if r == _rune_error and size == 1:
nonascii_escape = True
emit(br'\x%02x' % ord(c)) emit(br'\x%02x' % ord(c))
# printable utf-8 characters go as is # printable utf-8 characters go as is
...@@ -79,12 +83,13 @@ def _quote(s): ...@@ -79,12 +83,13 @@ def _quote(s):
# everything else goes in numeric byte escapes # everything else goes in numeric byte escapes
else: else:
nonascii_escape = True
for j in xrange(i, isize): for j in xrange(i, isize):
emit(br'\x%02x' % ord(s[j:j+1])) emit(br'\x%02x' % ord(s[j:j+1]))
i = isize i = isize
return b'"' + b''.join(outv) + b'"' return (quote + b''.join(outv) + quote, nonascii_escape)
# unquote decodes "-quoted unicode|byte string. # unquote decodes "-quoted unicode|byte string.
......
...@@ -35,6 +35,8 @@ def main(): ...@@ -35,6 +35,8 @@ def main():
print("print(u):", su) print("print(u):", su)
print("print(qq(b)):", qq(sb)) print("print(qq(b)):", qq(sb))
print("print(qq(u)):", qq(su)) print("print(qq(u)):", qq(su))
print("print(repr(b)):", repr(sb))
print("print(repr(u)):", repr(su))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -2,3 +2,5 @@ print(b): привет b ...@@ -2,3 +2,5 @@ print(b): привет b
print(u): привет u print(u): привет u
print(qq(b)): "привет b" print(qq(b)): "привет b"
print(qq(u)): "привет u" print(qq(u)): "привет u"
print(repr(b)): b('привет b')
print(repr(u)): u('привет u')
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment