bpo-24214: Fixed the UTF-8 incremental decoder. (GH-12603)

The bug occurred when an encoded surrogate character was passed to the incremental decoder split across two chunks.

bpo-24214: Fixed the UTF-8 incremental decoder. (GH-12603)
The bug occurred when an encoded surrogate character was passed to the incremental decoder split across two chunks.
7a465cb5 · Serhiy Storchaka · GitHub · 38f4e468 · 7a465cb5 · 7a465cb5
Commit 7a465cb5 authored Mar 30, 2019 by Serhiy Storchaka Committed by GitHub Mar 30, 2019
3 changed files
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -406,6 +406,15 @@ class ReadTest(MixInCheckStateHandling):
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

+    def test_incremental_surrogatepass(self):
+        # Test incremental decoder for surrogatepass handler:
+        # see issue #24214
+        data = '\uD901'.encode(self.encoding, 'surrogatepass')
+        for i in range(1, len(data)):
+            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
+            self.assertEqual(dec.decode(data[:i]), '')
+            self.assertEqual(dec.decode(data[i:], True), '\uD901')
+

 class UTF32Test(ReadTest, unittest.TestCase):
    encoding = "utf-32"

--- a/Misc/NEWS.d/next/Core and Builtins/2019-03-28-15-22-45.bpo-24214.tZ6lYU.rst
+++ b/Misc/NEWS.d/next/Core and Builtins/2019-03-28-15-22-45.bpo-24214.tZ6lYU.rst
+Fixed support of the surrogatepass error handler in the UTF-8 incremental
+decoder.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4883,6 +4883,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
        case 2:
        case 3:
        case 4:
+            if (s == end || consumed) {
+                goto End;
+            }
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;