Commit bcb95cd5 authored by Kirill Smelkov's avatar Kirill Smelkov

golang: Provide b, u for strings

With Python3 I've got tired of constantly using .encode() and .decode();
of getting an exception if the original argument was unicode on e.g. b.decode();
of getting an exception on raw bytes that are invalid UTF-8; of not being able to
use bytes literals with non-ASCII characters, etc.

So instead of this pain provide two functions that make sure an object
is either bytes or unicode:

- b converts str/unicode/bytes s to UTF-8 encoded bytestring.

	Bytes input is preserved as-is:

	   b(bytes_input) == bytes_input

	Unicode input is UTF-8 encoded. The encoding always succeeds.
	b is reverse operation to u - the following invariant is always true:

	   b(u(bytes_input)) == bytes_input

- u converts str/unicode/bytes s to unicode string.

	Unicode input is preserved as-is:

	   u(unicode_input) == unicode_input

	Bytes input is UTF-8 decoded. The decoding always succeeds and input
	information is not lost: non-valid UTF-8 bytes are decoded into
	surrogate codes ranging from U+DC80 to U+DCFF.
	u is reverse operation to b - the following invariant is always true:

	   u(b(unicode_input)) == unicode_input

NOTE: encoding _and_ decoding *never* fail nor lose information. This
is achieved by using the 'surrogateescape' error handler on Python3, and
by providing a manual fallback that behaves the same way on Python2.

The naming is chosen so that b(something) resembles
b"something", and u(something) resembles u"something".

This, even being only a part of the strings solution discussed in [1],
should help to handle byte- and unicode- strings in a more robust and
distraction-free way.

Top-level documentation is TODO.

[1] nexedi/zodbtools!13
parent 230c81c4
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2018-2019 Nexedi SA and Contributors. # Copyright (C) 2018-2020 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -33,7 +33,8 @@ from __future__ import print_function, absolute_import ...@@ -33,7 +33,8 @@ from __future__ import print_function, absolute_import
__version__ = "0.0.5" __version__ = "0.0.5"
__all__ = ['go', 'chan', 'select', 'default', 'nilchan', 'defer', 'panic', 'recover', 'func', 'gimport'] __all__ = ['go', 'chan', 'select', 'default', 'nilchan', 'defer', 'panic',
'recover', 'func', 'b', 'u', 'gimport']
from golang._gopath import gimport # make gimport available from golang from golang._gopath import gimport # make gimport available from golang
import inspect, sys import inspect, sys
...@@ -294,7 +295,7 @@ if six.PY2: ...@@ -294,7 +295,7 @@ if six.PY2:
import golang._patch.ipython_py2 import golang._patch.ipython_py2
# ---- go + channels ---- # ---- go + channels, panic, etc... ----
from ._golang import \ from ._golang import \
pygo as go, \ pygo as go, \
...@@ -303,4 +304,6 @@ from ._golang import \ ...@@ -303,4 +304,6 @@ from ._golang import \
pydefault as default, \ pydefault as default, \
pynilchan as nilchan, \ pynilchan as nilchan, \
_PanicError, \ _PanicError, \
pypanic as panic pypanic as panic, \
pyb as b, \
pyu as u
...@@ -69,7 +69,7 @@ cdef void topyexc() except *: ...@@ -69,7 +69,7 @@ cdef void topyexc() except *:
if arg != nil: if arg != nil:
pyarg = <bytes>arg pyarg = <bytes>arg
if PY_MAJOR_VERSION >= 3: if PY_MAJOR_VERSION >= 3:
pyarg = pyarg.decode("utf-8") pyarg = pyu(pyarg)
pypanic(pyarg) pypanic(pyarg)
cdef extern from "golang/libgolang.h" nogil: cdef extern from "golang/libgolang.h" nogil:
...@@ -519,7 +519,7 @@ cdef void _init_libgolang() except*: ...@@ -519,7 +519,7 @@ cdef void _init_libgolang() except*:
# process of importing golang (it tries to access "X" attribute of half-created # process of importing golang (it tries to access "X" attribute of half-created
# golang module). -> preimport runtimemod via regular import first. # golang module). -> preimport runtimemod via regular import first.
__import__(runtimemod) __import__(runtimemod)
runtimecaps = (runtimemod + ".libgolang_runtime_ops").encode("utf-8") # py3 runtimecaps = (runtimemod + ".libgolang_runtime_ops").encode("utf-8") # py3, cannot use pyb yet
cdef const _libgolang_runtime_ops *runtime_ops = \ cdef const _libgolang_runtime_ops *runtime_ops = \
<const _libgolang_runtime_ops*>PyCapsule_Import(runtimecaps, 0) <const _libgolang_runtime_ops*>PyCapsule_Import(runtimecaps, 0)
if runtime_ops == nil: if runtime_ops == nil:
...@@ -765,3 +765,48 @@ cdef DType parse_dtype(dtype) except <DType>-1: ...@@ -765,3 +765,48 @@ cdef DType parse_dtype(dtype) except <DType>-1:
if _ is None: if _ is None:
raise TypeError("pychan: invalid dtype: %r" % (dtype,)) raise TypeError("pychan: invalid dtype: %r" % (dtype,))
return _ return _
# ---- strings ----
from golang import strconv as pystrconv
def pyb(s): # -> bytes
    """b converts str/unicode/bytes s to UTF-8 encoded bytestring.

        Bytes input is preserved as-is:

           b(bytes_input) == bytes_input

        Unicode input is UTF-8 encoded. The encoding always succeeds.
        b is reverse operation to u - the following invariant is always true:

           b(u(bytes_input)) == bytes_input

    TypeError is raised if type(s) is not one of the above.

    See also: u.
    """
    # _bstr returns (bytes, wasunicode); only the converted bytes are needed here.
    return pystrconv._bstr(s)[0]
def pyu(s): # -> unicode
    """u converts str/unicode/bytes s to unicode string.

        Unicode input is preserved as-is:

           u(unicode_input) == unicode_input

        Bytes input is UTF-8 decoded. The decoding always succeeds and input
        information is not lost: non-valid UTF-8 bytes are decoded into
        surrogate codes ranging from U+DC80 to U+DCFF.
        u is reverse operation to b - the following invariant is always true:

           u(b(unicode_input)) == unicode_input

    TypeError is raised if type(s) is not one of the above.

    See also: b.
    """
    # _ustr returns (unicode, wasbytes); only the converted string is needed here.
    return pystrconv._ustr(s)[0]
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2018-2019 Nexedi SA and Contributors. # Copyright (C) 2018-2020 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -41,6 +41,6 @@ def qq(obj): ...@@ -41,6 +41,6 @@ def qq(obj):
# `printf('%s', qq(obj))` should work. For this make sure qobj is always a # `printf('%s', qq(obj))` should work. For this make sure qobj is always a
# str - not bytes under py3 (if it was bytes it will print e.g. as b'...') # str - not bytes under py3 (if it was bytes it will print e.g. as b'...')
if six.PY3 and isinstance(qobj, bytes): if six.PY3 and isinstance(qobj, bytes):
qobj = qobj.decode('UTF-8') qobj = qobj.decode('UTF-8') # TODO use u
return qobj return qobj
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2018-2019 Nexedi SA and Contributors. # Copyright (C) 2018-2020 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -20,8 +20,10 @@ ...@@ -20,8 +20,10 @@
from __future__ import print_function, absolute_import from __future__ import print_function, absolute_import
from golang import go, chan, select, default, nilchan, _PanicError, func, panic, defer, recover from golang import go, chan, select, default, nilchan, _PanicError, func, panic, \
defer, recover, u, b
from golang import sync from golang import sync
from golang.strconv_test import byterange
from pytest import raises, mark, fail from pytest import raises, mark, fail
from _pytest._code import Traceback from _pytest._code import Traceback
from os.path import dirname from os.path import dirname
...@@ -1551,6 +1553,70 @@ def bench_defer(b): ...@@ -1551,6 +1553,70 @@ def bench_defer(b):
_() _()
# verify b, u
def test_strings():
    # Every case is a (bytes, unicode) pair that b and u must convert into each
    # other. Bytes that are not valid UTF-8 correspond to surrogate codes
    # U+DC80..U+DCFF on the unicode side.
    cases = [
        # bytes          <->            unicode
        (b'',                           u''),
        (b'hello',                      u'hello'),
        (b'hello\nworld',               u'hello\nworld'),
        (b'\xd0\xbc\xd0\xb8\xd1\x80',   u'мир'),

        # invalid utf-8
        (b'\xd0',                       u'\udcd0'),
        (b'a\xd0b',                     u'a\udcd0b'),
        # invalid utf-8 with byte < 0x80
        (b'\xe2\x28\xa1',               u'\udce2(\udca1'),

        # more invalid utf-8
        # https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
        (b"\xc3\x28",                   u'\udcc3('),        # Invalid 2 Octet Sequence
        (b"\xa0\xa1",                   u'\udca0\udca1'),   # Invalid Sequence Identifier
        (b"\xe2\x82\xa1",               u'\u20a1'),         # Valid 3 Octet Sequence '₡'
        (b"\xe2\x28\xa1",               u'\udce2(\udca1'),  # Invalid 3 Octet Sequence (in 2nd Octet)
        (b"\xe2\x82\x28",               u'\udce2\udc82('),  # Invalid 3 Octet Sequence (in 3rd Octet)
        (b"\xf0\x90\x8c\xbc",           u'\U0001033c'),     # Valid 4 Octet Sequence '𐌼'
        (b"\xf0\x28\x8c\xbc",           u'\udcf0(\udc8c\udcbc'), # Invalid 4 Octet Sequence (in 2nd Octet)
        (b"\xf0\x90\x28\xbc",           u'\udcf0\udc90(\udcbc'), # Invalid 4 Octet Sequence (in 3rd Octet)
        (b"\xf0\x28\x8c\x28",           u'\udcf0(\udc8c('), # Invalid 4 Octet Sequence (in 4th Octet)
        (b"\xf8\xa1\xa1\xa1\xa1",                           # Valid 5 Octet Sequence (but not Unicode!)
                                        u'\udcf8\udca1\udca1\udca1\udca1'),
        (b"\xfc\xa1\xa1\xa1\xa1\xa1",                       # Valid 6 Octet Sequence (but not Unicode!)
                                        u'\udcfc\udca1\udca1\udca1\udca1\udca1'),

        # surrogate
        (b'\xed\xa0\x80',               u'\udced\udca0\udc80'),

        # x00 - x1f
        (byterange(0,32),
         u"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
         u"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),

        # non-printable utf-8
        (b'\x7f\xc2\x80\xc2\x81\xc2\x82\xc2\x83\xc2\x84\xc2\x85\xc2\x86\xc2\x87',
         u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"),
    ]

    for raw, text in cases:
        # input that already has the target type is preserved as-is
        assert b(raw)  == raw
        assert u(text) == text

        # conversion works in both directions
        assert b(text) == raw
        assert u(raw)  == text

        # b and u are reverse operations of each other
        assert b(u(raw))  == raw
        assert u(b(text)) == text

    # invalid types
    for bad in (1, object()):
        with raises(TypeError):
            b(bad)
        with raises(TypeError):
            u(bad)

    # TODO also handle bytearray?
# ---- misc ---- # ---- misc ----
# _pyrun runs `sys.executable argv... <stdin`. # _pyrun runs `sys.executable argv... <stdin`.
...@@ -1630,10 +1696,8 @@ def test_panics(): ...@@ -1630,10 +1696,8 @@ def test_panics():
# - PYGOLANG means real pygolang prefix # - PYGOLANG means real pygolang prefix
# - empty lines are changed to <BLANKLINE> # - empty lines are changed to <BLANKLINE>
def assertDoc(want, got): def assertDoc(want, got):
if isinstance(want, bytes): want = u(want)
want = want.decode('utf-8') got = u(got)
if isinstance(got, bytes):
got = got .decode('utf-8')
# normalize got to PYGOLANG # normalize got to PYGOLANG
dir_pygolang = dirname((dirname(__file__))) # pygolang/golang/golang_test.py -> pygolang dir_pygolang = dirname((dirname(__file__))) # pygolang/golang/golang_test.py -> pygolang
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2018-2019 Nexedi SA and Contributors. # Copyright (C) 2018-2020 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -22,32 +22,61 @@ ...@@ -22,32 +22,61 @@
from __future__ import print_function, absolute_import from __future__ import print_function, absolute_import
import six, unicodedata, codecs import six, unicodedata, codecs
from six import text_type as unicode # py2: unicode py3: str
from six import unichr # py2: unichr py3: chr
from six import int2byte as bchr # py2: chr py3: lambda x: bytes((x,))
from six.moves import range as xrange from six.moves import range as xrange
# _bstr converts str/unicode/bytes s to UTF-8 encoded bytestring. # _bstr is like b but also returns whether input was unicode.
#
# TypeError is raised if type(s) is not one of the above.
def _bstr(s): # -> sbytes, wasunicode def _bstr(s): # -> sbytes, wasunicode
wasunicode = False wasunicode = False
if isinstance(s, bytes): # py2: str py3: bytes if isinstance(s, bytes): # py2: str py3: bytes
pass pass
elif isinstance(s, six.text_type): # py2: unicode py3: str elif isinstance(s, unicode): # py2: unicode py3: str
wasunicode = True wasunicode = True
else: else:
raise TypeError("_bstr: invalid type %s" % type(s)) raise TypeError("b: invalid type %s" % type(s))
if wasunicode: # py2: unicode py3: str -> bytes if wasunicode: # py2: unicode py3: str
s = s.encode('UTF-8') if six.PY3:
s = s.encode('UTF-8', 'surrogateescape')
else:
# py2 does not have surrogateescape error handler, and even if we
# provide one, builtin unicode.encode() does not treat
# \udc80-\udcff as error. -> Do the encoding ourselves.
s = _utf8_encode_surrogateescape(s)
return s, wasunicode return s, wasunicode
# _ustr is like u but also returns whether input was bytes.
def _ustr(s): # -> sunicode, wasbytes
    # unicode input is returned unchanged.
    if isinstance(s, unicode):          # py2: unicode  py3: str
        return s, False

    # anything that is neither unicode nor bytes is rejected.
    if not isinstance(s, bytes):        # py2: str      py3: bytes
        raise TypeError("u: invalid type %s" % type(s))

    # bytes -> unicode
    if six.PY3:
        decoded = s.decode('UTF-8', 'surrogateescape')
    else:
        # py2 does not have surrogateescape error handler, and even if we
        # provide one, builtin bytes.decode() does not treat surrogate
        # sequences as error. -> Do the decoding ourselves.
        decoded = _utf8_decode_surrogateescape(s)
    return decoded, True
# quote quotes unicode|bytes string into valid "..." unicode|bytes string always quoted with ". # quote quotes unicode|bytes string into valid "..." unicode|bytes string always quoted with ".
def quote(s): def quote(s):
s, wasunicode = _bstr(s) s, wasunicode = _bstr(s)
qs = _quote(s) qs = _quote(s)
if wasunicode: if wasunicode:
qs = qs.decode('UTF-8') qs, _ = _ustr(qs)
return qs return qs
def _quote(s): def _quote(s):
...@@ -122,8 +151,8 @@ def unquote_next(s): ...@@ -122,8 +151,8 @@ def unquote_next(s):
s, wasunicode = _bstr(s) s, wasunicode = _bstr(s)
us, tail = _unquote_next(s) us, tail = _unquote_next(s)
if wasunicode: if wasunicode:
us = us.decode('UTF-8') us, _ = _ustr(us)
tail = tail.decode('UTF-8') tail, _ = _ustr(tail)
return us, tail return us, tail
def _unquote_next(s): def _unquote_next(s):
...@@ -220,3 +249,53 @@ def _utf8_decode_rune(s): ...@@ -220,3 +249,53 @@ def _utf8_decode_rune(s):
# invalid UTF-8 # invalid UTF-8
return _rune_error, 1 return _rune_error, 1
# _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3.
#
# Every byte that is not part of a valid UTF-8 sequence is decoded into
# surrogate code U+DC00 + byte, so that input information is not lost.
# UTF-8 encoded surrogates (U+D800 - U+DFFF) are treated as invalid too and
# are escaped byte by byte.
#
# NOTE the code handles s the py2 way: indexing/iterating s is expected to
# yield 1-character strings.
def _utf8_decode_surrogateescape(s): # -> unicode
    assert isinstance(s, bytes)
    outv = []
    emit = outv.append

    while len(s) > 0:
        r, width = _utf8_decode_rune(s)

        # _utf8_decode_rune reports invalid UTF-8 as (_rune_error, 1).
        # Check width as well: a validly encoded rune that happens to be equal
        # to _rune_error (presumably U+FFFD, encoded with 3 bytes - TODO
        # confirm) must be emitted as-is; treating it as error would escape
        # only its first byte and then skip all `width` bytes, losing data.
        if r == _rune_error  and  width == 1:
            b = ord(s[0])
            assert 0x80 <= b <= 0xff
            emit(unichr(0xdc00 + b))

        # python2 "correctly" decodes surrogates - don't allow that as
        # surrogates are not valid UTF-8:
        # https://github.com/python/cpython/blob/v3.8.1-118-gdbb37aac142/Objects/stringlib/codecs.h#L153-L157
        # (python3 raises UnicodeDecodeError for surrogates)
        #
        # The range is inclusive on both sides: U+DFFF is a surrogate too.
        elif 0xd800 <= ord(r) <= 0xdfff:
            for c in s[:width]:
                b = ord(c)
                # compare the byte value, not the 1-character string c:
                # on py2 `str >= int` is always true.
                if b >= 0x80:
                    emit(unichr(0xdc00 + b))
                else:
                    emit(unichr(b))

        else:
            emit(r)

        s = s[width:]

    return u''.join(outv)
# _utf8_encode_surrogateescape mimics s.encode('utf-8', 'surrogateescape') from py3.
def _utf8_encode_surrogateescape(s): # -> bytes
    assert isinstance(s, unicode)

    # _encode1 returns bytes for a single element of s.
    def _encode1(uc):
        code = ord(uc)
        if 0xdc80 <= code <= 0xdcff:
            # surrogate - emit unescaped byte
            return bchr(code & 0xff)
        return uc.encode('utf-8', 'strict')

    return b''.join(_encode1(uc) for uc in s)
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2019 Nexedi SA and Contributors. # Copyright (C) 2019-2020 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your # it under the terms of the GNU General Public License version 3, or (at your
...@@ -40,6 +40,8 @@ def test_golang_builtins(): ...@@ -40,6 +40,8 @@ def test_golang_builtins():
assert go is golang.go assert go is golang.go
assert chan is golang.chan assert chan is golang.chan
assert select is golang.select assert select is golang.select
assert b is golang.b
assert u is golang.u
# indirectly verify golang.__all__ # indirectly verify golang.__all__
for k in golang.__all__: for k in golang.__all__:
...@@ -92,17 +94,12 @@ def test_executable(): ...@@ -92,17 +94,12 @@ def test_executable():
out = pyout(['-c', 'import sys; print(sys.version)']) out = pyout(['-c', 'import sys; print(sys.version)'])
assert ('[GPython %s]' % golang.__version__) in str(out) assert ('[GPython %s]' % golang.__version__) in str(out)
# b converts s to UTF-8 encoded bytes.
def b(s):
    # imported locally, as in the original helper
    from golang.strconv import _bstr
    # _bstr returns (bytes, wasunicode); only the bytes are wanted
    return _bstr(s)[0]
# verify pymain. # verify pymain.
# #
# !gpython_only to make sure we get the same output when run via pymain (under # !gpython_only to make sure we get the same output when run via pymain (under
# gpython) and plain python (!gpython). # gpython) and plain python (!gpython).
def test_pymain(): def test_pymain():
from golang import b
from os.path import join, dirname, realpath from os.path import join, dirname, realpath
here = dirname(__file__) here = dirname(__file__)
testdata = join(dirname(__file__), 'testdata') testdata = join(dirname(__file__), 'testdata')
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment