Issue #8383: pickle and pickletools use surrogatepass error handler when encoding unicode as utf8 to support lone surrogates and stay compatible with Python 2.x and 3.0

Issue #8383: pickle and pickletools use surrogatepass error handler when
encoding unicode as utf8 to support lone surrogates and stay compatible with Python 2.x and 3.0
485fb56e · Victor Stinner · 36067606 · 485fb56e · 485fb56e · 485fb56e
Commit 485fb56e authored Apr 13, 2010 by Victor Stinner
Showing 5 changed files with 14 additions and 6 deletions

Lib/pickle.py +2 -2

Lib/pickletools.py +1 -1

Lib/test/pickletester.py +3 -1

Misc/NEWS +4 -0

Modules/_pickle.c +4 -2

No files found.
--- a/Lib/pickle.py
+++ b/Lib/pickle.py
@@ -499,7 +499,7 @@ class _Pickler:
    def save_str(self, obj, pack=struct.pack):
        if self.bin:
-            encoded = obj.encode('utf-8')
+            encoded = obj.encode('utf-8', 'surrogatepass')
            n = len(encoded)
            self.write(BINUNICODE + pack("<i", n) + encoded)
        else:
@@ -966,7 +966,7 @@ class _Unpickler:
    def load_binunicode(self):
        len = mloads(b'i' + self.read(4))
-        self.append(str(self.read(len), 'utf-8'))
+        self.append(str(self.read(len), 'utf-8', 'surrogatepass'))
    dispatch[BINUNICODE[0]] = load_binunicode
    def load_short_binstring(self):

--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py
@@ -469,7 +469,7 @@ def read_unicodestring4(f):
        raise ValueError("unicodestring4 byte count < 0: %d" % n)
    data = f.read(n)
    if len(data) == n:
-        return str(data, 'utf-8')
+        return str(data, 'utf-8', 'surrogatepass')
    raise ValueError("expected %d bytes in a unicodestring4, but only %d "
                     "remain" % (n, len(data)))

--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py
@@ -515,7 +515,9 @@ class AbstractPickleTests(unittest.TestCase):
    def test_unicode(self):
        endcases = ['', '<\\u>', '<\\\u1234>', '<\n>',
-                    '<\\>', '<\\\U00012345>']
+                    '<\\>', '<\\\U00012345>',
+                    # surrogates
+                    '<\udc80>']
        for proto in protocols:
            for u in endcases:
                p = self.dumps(u, proto)

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -312,6 +312,10 @@ C-API
 Library
 -------
+- Issue #8383: pickle and pickletools use surrogatepass error handler when
+  encoding unicode as utf8 to support lone surrogates and stay compatible with
+  Python 2.x and 3.0
 - Issue #7585: difflib context and unified diffs now place a tab between
  filename and date, conforming to the 'standards' they were originally
  designed to follow.  This improves compatibility with patch tools.

--- a/Modules/_pickle.c
+++ b/Modules/_pickle.c
@@ -1227,7 +1227,9 @@ save_unicode(PicklerObject *self, PyObject *obj)
    if (self->bin) {
        char pdata[5];
-        encoded = PyUnicode_AsUTF8String(obj);
+        encoded = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(obj),
+                                    PyUnicode_GET_SIZE(obj),
+                                    "surrogatepass");
        if (encoded == NULL)
            goto error;
@@ -3352,7 +3354,7 @@ load_binunicode(UnpicklerObject *self)
    if (unpickler_read(self, &s, size) < 0)
        return -1;
-    str = PyUnicode_DecodeUTF8(s, size, NULL);
+    str = PyUnicode_DecodeUTF8(s, size, "surrogatepass");
    if (str == NULL)
        return -1;