Commit 5aa1de72 authored by Kirill Smelkov

golang_str_pickle: Fix bstr to pickle/unpickle in forward-compatible way wrt upcoming UTF-8bk

In 1ec5ed82 (golang_str_pickle: Fix it so that py3 can load what py2
saved and back) we changed how bstr and ustr are pickled so that the
pickling process is explicit and that both py2/py3 can load what any of
py2/py3 saved. It all works ok for that.

However, for protocol < 3 bstr is pickled via unicode data, with
instructions to unpickle it as bstr(unicode-data). The idea is generally ok,
but taking into account the planned introduction of UTF-8bk (see c0a53847
"golang_str: TODO UTF-8bk" for details), it might result in bstr data
saved before the UTF-8b -> UTF-8bk switch being loaded in corrupt form
after the switch.

-> Take care to avoid that by explicitly instructing the pickle stream to
always load data that was saved before the switch to UTF-8bk as UTF-8b.
parent 9ef32517
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2024 Nexedi SA and Contributors.
# Copyright (C) 2018-2025 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
......@@ -376,4 +376,5 @@ from ._golang import \
pyustr as ustr, \
pyuchr as uchr, \
pybiter as biter, \
pyuiter as uiter
pyuiter as uiter, \
_butf8b
......@@ -2017,6 +2017,11 @@ cdef _encoding_with_defaults(encoding, errors): # -> (encoding, errors)
# UnicodeEncodeError: 'utf-8' codec can't encode character '\udc00' in position 0: surrogates not allowed
#
# (*) aka UTF-8b (see http://hyperreal.org/~est/utf-8b/releases/utf-8b-20060413043934/kuhn-utf-8b.html)
#
# Call resulting encoding as UTF-8bk.
#
# TODO(kirr) adjust bstr pickling for protocol < 3 after switching bstr/ustr
# to decode/encode via UTF-8bk instead of UTF-8b.
from six import unichr # py2: unichr py3: chr
from six import int2byte as bchr # py2: chr py3: lambda x: bytes((x,))
......
# -*- coding: utf-8 -*-
# Copyright (C) 2023-2024 Nexedi SA and Contributors.
# Copyright (C) 2023-2025 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
......@@ -53,12 +53,20 @@ cdef _bstr__reduce_ex__(self, protocol):
# to achieve that.
if protocol < 3:
# use UNICODE for data
udata = _udata(pyu(self))
if protocol < 2:
return (self.__class__, (udata,)) # bstr UNICODE REDUCE
#
# explicitly mark to unpickle via _butf8b because with the introduction
# of UTF-8bk the way bstr decodes unicode will change, and so if we
# would use `bstr UNICODE` for pickling it will result in corrupt data
# to be loaded after the switch to UTF-8bk.
#
# TODO pickle via bstr UNICODE REDUCE/NEWOBJ after switch from UTF-8b to UTF-8bk.
udata = _utf8_decode_surrogateescape(self)
if self.__class__ is pybstr:
return (_butf8b, # _butf8b UNICODE REDUCE
(udata,))
else:
return (pycopyreg.__newobj__,
(self.__class__, udata)) # bstr UNICODE NEWOBJ
return (_butf8b, # _butf8b bstr UNICODE REDUCE
(self.__class__, udata))
else:
# use BYTES for data
bdata = _bdata(self)
......@@ -73,10 +81,25 @@ cdef _bstr__reduce_ex__(self, protocol):
cdef _ustr__reduce_ex__(self, protocol):
    # ustr is pickled as ustr(UNICODE): the reduce value carries the class of
    # self together with its unicode data.
    #
    # TODO after UTF-8bk we might want to switch to emitting ustr(BYTES)
    #      even if we do this, it should be backward compatible
    if protocol >= 2:
        # ustr UNICODE NEWOBJ
        ctor = pycopyreg.__newobj__
        args = (self.__class__, _udata(self))
    else:
        # ustr UNICODE REDUCE
        ctor = self.__class__
        args = (_udata(self),)
    return (ctor, args)
# `_butf8b [bcls] udata` is what a pickle stream invokes to restore a bstr
# whose data was saved as UTF-8b decoded unicode.
#
# It is called either as `_butf8b(udata)`, in which case pybstr is used as the
# class, or as `_butf8b(bcls, udata)` with the class given explicitly.
def _butf8b(*argv):
    cdef int nargs = len(argv)
    cdef object cls
    cdef object text
    if nargs == 2:
        cls, text = argv
    elif nargs == 1:
        cls  = pybstr
        text = argv[0]
    else:
        raise TypeError("_butf8b() takes 1 or 2 arguments; %d given" % nargs)
    # encode the unicode data back and hand the result to _pyb together with cls
    return _pyb(cls, _utf8_encode_surrogateescape(text))
# advertise the function under the "golang" module name
_butf8b.__module__ = "golang"
# -*- coding: utf-8 -*-
# Copyright (C) 2022-2024 Nexedi SA and Contributors.
# Copyright (C) 2022-2025 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
......@@ -128,13 +128,13 @@ def test_strings_pickle_bstr_ustr(pickle):
_ = assert_pickle
_(bs, 0,
b"cgolang\nbstr\n(V\\u043c\\u0438\\u0440\\udcff\ntR.") # bstr(UNICODE)
b"cgolang\n_butf8b\n(V\\u043c\\u0438\\u0440\\udcff\ntR.") # _butf8b(UNICODE)
_(us, 0,
b'cgolang\nustr\n(V\\u043c\\u0430\\u0439\\udcff\ntR.') # ustr(UNICODE)
_(bs, 1,
b'cgolang\nbstr\n(X\x09\x00\x00\x00' # bstr(BINUNICODE)
b'cgolang\n_butf8b\n(X\x09\x00\x00\x00' # _butf8b(BINUNICODE)
b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbftR.')
# NOTE BINUNICODE ...edb3bf not ...ff (see test_strings_pickle_loadsave_UNICODE for details)
......@@ -143,8 +143,8 @@ def test_strings_pickle_bstr_ustr(pickle):
b'\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbftR.')
_(bs, 2,
b'cgolang\nbstr\nX\x09\x00\x00\x00' # bstr(BINUNICODE)
b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbf\x85\x81.')
b'cgolang\n_butf8b\nX\x09\x00\x00\x00' # _butf8b(BINUNICODE)
b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbf\x85R.')
_(us, 2,
b'cgolang\nustr\nX\x09\x00\x00\x00' # ustr(BINUNICODE)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment