Commit 485fb56e authored by Victor Stinner

Issue #8383: pickle and pickletools use the surrogatepass error handler when
encoding unicode as UTF-8, to support lone surrogates and stay compatible with
Python 2.x and 3.0.
parent 36067606
...@@ -499,7 +499,7 @@ class _Pickler: ...@@ -499,7 +499,7 @@ class _Pickler:
def save_str(self, obj, pack=struct.pack): def save_str(self, obj, pack=struct.pack):
if self.bin: if self.bin:
encoded = obj.encode('utf-8') encoded = obj.encode('utf-8', 'surrogatepass')
n = len(encoded) n = len(encoded)
self.write(BINUNICODE + pack("<i", n) + encoded) self.write(BINUNICODE + pack("<i", n) + encoded)
else: else:
...@@ -966,7 +966,7 @@ class _Unpickler: ...@@ -966,7 +966,7 @@ class _Unpickler:
def load_binunicode(self): def load_binunicode(self):
len = mloads(b'i' + self.read(4)) len = mloads(b'i' + self.read(4))
self.append(str(self.read(len), 'utf-8')) self.append(str(self.read(len), 'utf-8', 'surrogatepass'))
dispatch[BINUNICODE[0]] = load_binunicode dispatch[BINUNICODE[0]] = load_binunicode
def load_short_binstring(self): def load_short_binstring(self):
......
...@@ -469,7 +469,7 @@ def read_unicodestring4(f): ...@@ -469,7 +469,7 @@ def read_unicodestring4(f):
raise ValueError("unicodestring4 byte count < 0: %d" % n) raise ValueError("unicodestring4 byte count < 0: %d" % n)
data = f.read(n) data = f.read(n)
if len(data) == n: if len(data) == n:
return str(data, 'utf-8') return str(data, 'utf-8', 'surrogatepass')
raise ValueError("expected %d bytes in a unicodestring4, but only %d " raise ValueError("expected %d bytes in a unicodestring4, but only %d "
"remain" % (n, len(data))) "remain" % (n, len(data)))
......
...@@ -515,7 +515,9 @@ class AbstractPickleTests(unittest.TestCase): ...@@ -515,7 +515,9 @@ class AbstractPickleTests(unittest.TestCase):
def test_unicode(self): def test_unicode(self):
endcases = ['', '<\\u>', '<\\\u1234>', '<\n>', endcases = ['', '<\\u>', '<\\\u1234>', '<\n>',
'<\\>', '<\\\U00012345>'] '<\\>', '<\\\U00012345>',
# surrogates
'<\udc80>']
for proto in protocols: for proto in protocols:
for u in endcases: for u in endcases:
p = self.dumps(u, proto) p = self.dumps(u, proto)
......
...@@ -312,6 +312,10 @@ C-API ...@@ -312,6 +312,10 @@ C-API
Library Library
------- -------
- Issue #8383: pickle and pickletools use surrogatepass error handler when
encoding unicode as utf8 to support lone surrogates and stay compatible with
Python 2.x and 3.0
- Issue #7585: difflib context and unified diffs now place a tab between - Issue #7585: difflib context and unified diffs now place a tab between
filename and date, conforming to the 'standards' they were originally filename and date, conforming to the 'standards' they were originally
designed to follow. This improves compatibility with patch tools. designed to follow. This improves compatibility with patch tools.
......
...@@ -1227,7 +1227,9 @@ save_unicode(PicklerObject *self, PyObject *obj) ...@@ -1227,7 +1227,9 @@ save_unicode(PicklerObject *self, PyObject *obj)
if (self->bin) { if (self->bin) {
char pdata[5]; char pdata[5];
encoded = PyUnicode_AsUTF8String(obj); encoded = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(obj),
PyUnicode_GET_SIZE(obj),
"surrogatepass");
if (encoded == NULL) if (encoded == NULL)
goto error; goto error;
...@@ -3352,7 +3354,7 @@ load_binunicode(UnpicklerObject *self) ...@@ -3352,7 +3354,7 @@ load_binunicode(UnpicklerObject *self)
if (unpickler_read(self, &s, size) < 0) if (unpickler_read(self, &s, size) < 0)
return -1; return -1;
str = PyUnicode_DecodeUTF8(s, size, NULL); str = PyUnicode_DecodeUTF8(s, size, "surrogatepass");
if (str == NULL) if (str == NULL)
return -1; return -1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment