golang: Provide b, u for strings

With Python3 I've got tired of constantly using .encode() and .decode(); getting an exception if the original argument was unicode on e.g. b.decode(); getting an exception on raw bytes that are invalid UTF-8, not being able to use a bytes literal with non-ASCII characters, etc. So instead of this pain provide two functions that make sure an object is either bytes or unicode: - b converts str/unicode/bytes s to UTF-8 encoded bytestring. Bytes input is preserved as-is: b(bytes_input) == bytes_input Unicode input is UTF-8 encoded. The encoding always succeeds. b is the reverse operation to u - the following invariant is always true: b(u(bytes_input)) == bytes_input - u converts str/unicode/bytes s to unicode string. Unicode input is preserved as-is: u(unicode_input) == unicode_input Bytes input is UTF-8 decoded. The decoding always succeeds and input information is not lost: non-valid UTF-8 bytes are decoded into surrogate codes ranging from U+DC80 to U+DCFF. u is the reverse operation to b - the following invariant is always true: u(b(unicode_input)) == unicode_input NOTE: encoding _and_ decoding *never* fail nor lose information. This is achieved by using the 'surrogateescape' error handler on Python3, and providing a manual fallback that behaves the same way on Python2. The naming is chosen with the idea that b(something) resembles b"something", and u(something) resembles u"something". This, even being only a part of the strings solution discussed in [1], should help handle byte- and unicode- strings in a more robust and distraction-free way. Top-level documentation is TODO. [1] nexedi/zodbtools!13

golang: Provide b, u for strings
With Python3 I've got tired of constantly using .encode() and .decode(); getting an exception if the original argument was unicode on e.g. b.decode(); getting an exception on raw bytes that are invalid UTF-8, not being able to use a bytes literal with non-ASCII characters, etc. So instead of this pain provide two functions that make sure an object is either bytes or unicode: - b converts str/unicode/bytes s to UTF-8 encoded bytestring. Bytes input is preserved as-is: b(bytes_input) == bytes_input Unicode input is UTF-8 encoded. The encoding always succeeds. b is the reverse operation to u - the following invariant is always true: b(u(bytes_input)) == bytes_input - u converts str/unicode/bytes s to unicode string. Unicode input is preserved as-is: u(unicode_input) == unicode_input Bytes input is UTF-8 decoded. The decoding always succeeds and input information is not lost: non-valid UTF-8 bytes are decoded into surrogate codes ranging from U+DC80 to U+DCFF. u is the reverse operation to b - the following invariant is always true: u(b(unicode_input)) == unicode_input NOTE: encoding _and_ decoding *never* fail nor lose information. This is achieved by using the 'surrogateescape' error handler on Python3, and providing a manual fallback that behaves the same way on Python2. The naming is chosen with the idea that b(something) resembles b"something", and u(something) resembles u"something". This, even being only a part of the strings solution discussed in [1], should help handle byte- and unicode- strings in a more robust and distraction-free way. Top-level documentation is TODO. [1] nexedi/zodbtools!13
bcb95cd5 · Kirill Smelkov · 230c81c4 · bcb95cd5 · bcb95cd5 · bcb95cd5
Commit bcb95cd5 authored Jan 29, 2020 by Kirill Smelkov
6 changed files
--- a/golang/__init__.py
+++ b/golang/__init__.py
 # -*- coding: utf-8 -*-
-# Copyright (C) 2018-2019  Nexedi SA and Contributors.
+# Copyright (C) 2018-2020  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -33,7 +33,8 @@ from __future__ import print_function, absolute_import

 __version__ = "0.0.5"

-__all__ = ['go', 'chan', 'select', 'default', 'nilchan', 'defer', 'panic', 'recover', 'func', 'gimport']
+__all__ = ['go', 'chan', 'select', 'default', 'nilchan', 'defer', 'panic',
+           'recover', 'func', 'b', 'u', 'gimport']

 from golang._gopath import gimport  # make gimport available from golang
 import inspect, sys
@@ -294,7 +295,7 @@ if six.PY2:
    import golang._patch.ipython_py2


-# ---- go + channels ----
+# ---- go + channels, panic, etc... ----

 from ._golang import    \
    pygo        as go,      \
@@ -303,4 +304,6 @@ from ._golang import    \
    pydefault   as default, \
    pynilchan   as nilchan, \
    _PanicError,            \
-    pypanic     as panic
+    pypanic     as panic,   \
+    pyb         as b,       \
+    pyu         as u
--- a/golang/_golang.pyx
+++ b/golang/_golang.pyx
@@ -69,7 +69,7 @@ cdef void topyexc() except *:
    if arg != nil:
        pyarg = <bytes>arg
        if PY_MAJOR_VERSION >= 3:
-            pyarg = pyarg.decode("utf-8")
+            pyarg = pyu(pyarg)
        pypanic(pyarg)

 cdef extern from "golang/libgolang.h" nogil:
@@ -519,7 +519,7 @@ cdef void _init_libgolang() except*:
    # process of importing golang (it tries to access "X" attribute of half-created
    # golang module). -> preimport runtimemod via regular import first.
    __import__(runtimemod)
-    runtimecaps = (runtimemod + ".libgolang_runtime_ops").encode("utf-8") # py3
+    runtimecaps = (runtimemod + ".libgolang_runtime_ops").encode("utf-8") # py3, cannot use pyb yet
    cdef const _libgolang_runtime_ops *runtime_ops = \
        <const _libgolang_runtime_ops*>PyCapsule_Import(runtimecaps, 0)
    if runtime_ops == nil:
@@ -765,3 +765,48 @@ cdef DType parse_dtype(dtype) except <DType>-1:
    if _ is None:
        raise TypeError("pychan: invalid dtype: %r" % (dtype,))
    return _
+
+
+# ---- strings ----
+
+from golang import strconv as pystrconv
+
+def pyb(s): # -> bytes
+    """b returns s as UTF-8 encoded bytestring; s may be str/unicode/bytes.
+
+       If s is already bytes it is returned unchanged:
+
+          b(bytes_input) == bytes_input
+
+       If s is unicode it is encoded to UTF-8; this never fails.
+       b undoes u, so for any bytes the following holds:
+
+          b(u(bytes_input)) == bytes_input
+
+       Any other type of s results in TypeError.
+
+       See also: u.
+    """
+    # _bstr returns (bytes, wasunicode); only the converted value is needed here.
+    return pystrconv._bstr(s)[0]
+
+def pyu(s): # -> unicode
+    """u returns s as unicode string; s may be str/unicode/bytes.
+
+       If s is already unicode it is returned unchanged:
+
+          u(unicode_input) == unicode_input
+
+       If s is bytes it is decoded as UTF-8. This never fails and never
+       drops data: bytes that do not form valid UTF-8 are represented by
+       surrogate codes in the U+DC80 .. U+DCFF range.
+       u undoes b, so for any unicode the following holds:
+
+          u(b(unicode_input)) == unicode_input
+
+       Any other type of s results in TypeError.
+
+       See also: b.
+    """
+    # _ustr returns (unicode, wasbytes); only the converted value is needed here.
+    return pystrconv._ustr(s)[0]
--- a/golang/gcompat.py
+++ b/golang/gcompat.py
 # -*- coding: utf-8 -*-
-# Copyright (C) 2018-2019  Nexedi SA and Contributors.
+# Copyright (C) 2018-2020  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -41,6 +41,6 @@ def qq(obj):
    # `printf('%s', qq(obj))` should work. For this make sure qobj is always a
    # str - not bytes under py3 (if it was bytes it will print e.g. as b'...')
    if six.PY3 and isinstance(qobj, bytes):
-        qobj = qobj.decode('UTF-8')
+        qobj = qobj.decode('UTF-8')     # TODO use u

    return qobj
--- a/golang/golang_test.py
+++ b/golang/golang_test.py
 # -*- coding: utf-8 -*-
-# Copyright (C) 2018-2019  Nexedi SA and Contributors.
+# Copyright (C) 2018-2020  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -20,8 +20,10 @@

 from __future__ import print_function, absolute_import

-from golang import go, chan, select, default, nilchan, _PanicError, func, panic, defer, recover
+from golang import go, chan, select, default, nilchan, _PanicError, func, panic, \
+        defer, recover, u, b
 from golang import sync
+from golang.strconv_test import byterange
 from pytest import raises, mark, fail
 from _pytest._code import Traceback
 from os.path import dirname
@@ -1551,6 +1553,70 @@ def bench_defer(b):
        _()


+# verify b, u: as-is preservation, conversion both ways and round-trip on valid and invalid UTF-8; TypeError on other types
+def test_strings():
+    testv = (
+        # bytes          <->            unicode
+        (b'',                           u''),
+        (b'hello',                      u'hello'),
+        (b'hello\nworld',               u'hello\nworld'),
+        (b'\xd0\xbc\xd0\xb8\xd1\x80',   u'мир'),
+
+        # invalid utf-8
+        (b'\xd0',                       u'\udcd0'),
+        (b'a\xd0b',                     u'a\udcd0b'),
+        # invalid utf-8 with byte < 0x80
+        (b'\xe2\x28\xa1',               u'\udce2(\udca1'),
+
+        # more invalid utf-8
+        # https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
+        (b"\xc3\x28",                   u'\udcc3('),        # Invalid 2 Octet Sequence
+        (b"\xa0\xa1",                   u'\udca0\udca1'),   # Invalid Sequence Identifier
+        (b"\xe2\x82\xa1",               u'\u20a1'),         # Valid 3 Octet Sequence '₡'
+        (b"\xe2\x28\xa1",               u'\udce2(\udca1'),  # Invalid 3 Octet Sequence (in 2nd Octet)
+        (b"\xe2\x82\x28",               u'\udce2\udc82('),  # Invalid 3 Octet Sequence (in 3rd Octet)
+        (b"\xf0\x90\x8c\xbc",           u'\U0001033c'),     # Valid 4 Octet Sequence '𐌼'
+        (b"\xf0\x28\x8c\xbc",           u'\udcf0(\udc8c\udcbc'), # Invalid 4 Octet Sequence (in 2nd Octet)
+        (b"\xf0\x90\x28\xbc",           u'\udcf0\udc90(\udcbc'), # Invalid 4 Octet Sequence (in 3rd Octet)
+        (b"\xf0\x28\x8c\x28",           u'\udcf0(\udc8c('), # Invalid 4 Octet Sequence (in 4th Octet)
+        (b"\xf8\xa1\xa1\xa1\xa1",                           # Valid 5 Octet Sequence (but not Unicode!)
+                                        u'\udcf8\udca1\udca1\udca1\udca1'),
+        (b"\xfc\xa1\xa1\xa1\xa1\xa1",                       # Valid 6 Octet Sequence (but not Unicode!)
+                                        u'\udcfc\udca1\udca1\udca1\udca1\udca1'),
+
+        # surrogate
+        (b'\xed\xa0\x80',               u'\udced\udca0\udc80'),
+
+        # x00 - x1f
+        (byterange(0,32),
+         u"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
+         u"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),
+
+        # non-printable utf-8
+        (b'\x7f\xc2\x80\xc2\x81\xc2\x82\xc2\x83\xc2\x84\xc2\x85\xc2\x86\xc2\x87',
+                                        u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"),
+    )
+    # each (bytes, unicode) pair: same-type input is kept as-is, b/u convert into each other, and both round-trips hold
+    for tbytes, tunicode in testv:
+        assert b(tbytes)   == tbytes
+        assert u(tunicode) == tunicode
+
+        assert b(tunicode) == tbytes
+        assert u(tbytes)   == tunicode
+
+        assert b(u(tbytes))     == tbytes
+        assert u(b(tunicode))   == tunicode
+
+
+    # invalid types: non-string inputs (an int, a plain object) must raise TypeError
+    with raises(TypeError): b(1)
+    with raises(TypeError): u(1)
+    with raises(TypeError): b(object())
+    with raises(TypeError): u(object())
+
+    # TODO also handle bytearray?
+
+
 # ---- misc ----

 # _pyrun runs `sys.executable argv... <stdin`.
@@ -1630,10 +1696,8 @@ def test_panics():
 # - PYGOLANG means real pygolang prefix
 # - empty lines are changed to <BLANKLINE>
 def assertDoc(want, got):
-    if isinstance(want, bytes):
-        want = want.decode('utf-8')
-    if isinstance(got, bytes):
-        got  = got .decode('utf-8')
+    want = u(want)
+    got  = u(got)

    # normalize got to PYGOLANG
    dir_pygolang = dirname((dirname(__file__))) # pygolang/golang/golang_test.py -> pygolang

--- a/golang/strconv.py
+++ b/golang/strconv.py
 # -*- coding: utf-8 -*-
-# Copyright (C) 2018-2019  Nexedi SA and Contributors.
+# Copyright (C) 2018-2020  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -22,32 +22,61 @@
 from __future__ import print_function, absolute_import

 import six, unicodedata, codecs
+from six import text_type as unicode        # py2: unicode      py3: str
+from six import unichr                      # py2: unichr       py3: chr
+from six import int2byte as bchr            # py2: chr          py3: lambda x: bytes((x,))
 from six.moves import range as xrange


-# _bstr converts str/unicode/bytes s to UTF-8 encoded bytestring.
-#
-# TypeError is raised if type(s) is not one of the above.
+# _bstr is like b but also returns whether input was unicode.
 def _bstr(s):   # -> sbytes, wasunicode
    wasunicode = False
    if isinstance(s, bytes):                    # py2: str      py3: bytes
        pass
-    elif isinstance(s, six.text_type):          # py2: unicode  py3: str
+    elif isinstance(s, unicode):                # py2: unicode  py3: str
        wasunicode = True
    else:
-        raise TypeError("_bstr: invalid type %s" % type(s))
+        raise TypeError("b: invalid type %s" % type(s))

-    if wasunicode:                              # py2: unicode  py3: str    -> bytes
-        s = s.encode('UTF-8')
+    if wasunicode:                              # py2: unicode  py3: str
+        if six.PY3:
+            s = s.encode('UTF-8', 'surrogateescape')
+        else:
+            # py2 does not have surrogateescape error handler, and even if we
+            # provide one, builtin unicode.encode() does not treat
+            # \udc80-\udcff as error. -> Do the encoding ourselves.
+            s = _utf8_encode_surrogateescape(s)

    return s, wasunicode

+# _ustr is like u but also returns whether input was bytes.
+def _ustr(s):   # -> sunicode, wasbytes
+    wasbytes = True
+    if isinstance(s, bytes):                    # py2: str      py3: bytes
+        pass
+    elif isinstance(s, unicode):                # py2: unicode  py3: str
+        wasbytes = False
+    else:
+        raise TypeError("u: invalid type %s" % type(s))
+
+    if wasbytes:
+        if six.PY3:
+            s = s.decode('UTF-8', 'surrogateescape')
+        else:
+            # py2 does not have surrogateescape error handler, and even if we
+            # provide one, builtin bytes.decode() does not treat surrogate
+            # sequences as error. -> Do the decoding ourselves.
+            s = _utf8_decode_surrogateescape(s)
+
+    return s, wasbytes
+
+
 # quote quotes unicode|bytes string into valid "..." unicode|bytes string always quoted with ".
 def quote(s):
    s, wasunicode = _bstr(s)
    qs = _quote(s)
    if wasunicode:
-        qs = qs.decode('UTF-8')
+        qs, _ = _ustr(qs)
    return qs

 def _quote(s):
@@ -122,8 +151,8 @@ def unquote_next(s):
    s, wasunicode = _bstr(s)
    us, tail = _unquote_next(s)
    if wasunicode:
-        us = us.decode('UTF-8')
-        tail = tail.decode('UTF-8')
+        us, _   = _ustr(us)
+        tail, _ = _ustr(tail)
    return us, tail

 def _unquote_next(s):
@@ -220,3 +249,53 @@ def _utf8_decode_rune(s):

    # invalid UTF-8
    return _rune_error, 1
+
+
+# _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3.
+def _utf8_decode_surrogateescape(s): # -> unicode
+    assert isinstance(s, bytes)
+    outv = []
+    emit = outv.append
+
+    while len(s) > 0:
+        r, width = _utf8_decode_rune(s)
+        if r == _rune_error:
+            b = ord(s[0])
+            assert 0x80 <= b <= 0xff
+            emit(unichr(0xdc00 + b))
+
+        # python2 "correctly" decodes surrogates - don't allow that as
+        # surrogates are not valid UTF-8:
+        # https://github.com/python/cpython/blob/v3.8.1-118-gdbb37aac142/Objects/stringlib/codecs.h#L153-L157
+        # (python3 raises UnicodeDecodeError for surrogates)
+        elif 0xd800 <= ord(r) < 0xdfff:
+            for c in s[:width]:
+                b = ord(c)
+                if c >= 0x80:
+                    emit(unichr(0xdc00 + b))
+                else:
+                    emit(unichr(b))
+
+        else:
+            emit(r)
+
+        s = s[width:]
+
+    return u''.join(outv)
+
+
+# _utf8_encode_surrogateescape mimics s.encode('utf-8', 'surrogateescape') from py3.
+def _utf8_encode_surrogateescape(s): # -> bytes
+    assert isinstance(s, unicode)
+    outv = []
+    emit = outv.append
+
+    for uc in s:
+        c = ord(uc)
+        if 0xdc80 <= c <= 0xdcff:
+            # surrogate - emit unescaped byte
+            emit(bchr(c & 0xff))
+        else:
+            emit(uc.encode('utf-8', 'strict'))
+
+    return b''.join(outv)
--- a/gpython/gpython_test.py
+++ b/gpython/gpython_test.py
 # -*- coding: utf-8 -*-
-# Copyright (C) 2019  Nexedi SA and Contributors.
-#                     Kirill Smelkov <kirr@nexedi.com>
+# Copyright (C) 2019-2020  Nexedi SA and Contributors.
+#                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
 # it under the terms of the GNU General Public License version 3, or (at your
@@ -40,6 +40,8 @@ def test_golang_builtins():
    assert go     is golang.go
    assert chan   is golang.chan
    assert select is golang.select
+    assert b      is golang.b
+    assert u      is golang.u

    # indirectly verify golang.__all__
    for k in golang.__all__:
@@ -92,17 +94,12 @@ def test_executable():
    out = pyout(['-c', 'import sys; print(sys.version)'])
    assert ('[GPython %s]' % golang.__version__) in str(out)

-# b converts s to UTF-8 encoded bytes.
-def b(s):
-    from golang.strconv import _bstr
-    s, _ = _bstr(s)
-    return s
-
 # verify pymain.
 #
 # !gpython_only to make sure we get the same output when run via pymain (under
 # gpython) and plain python (!gpython).
 def test_pymain():
+    from golang import b
    from os.path import join, dirname, realpath
    here     = dirname(__file__)
    testdata = join(dirname(__file__), 'testdata')