bpo-36311: Fixes decoding multibyte characters around chunk boundaries and...

bpo-36311: Fixes decoding multibyte characters around chunk boundaries and improves decoding performance (GH-15083)

bpo-36311: Fixes decoding multibyte characters around chunk boundaries and...
bpo-36311: Fixes decoding multibyte characters around chunk boundaries and improves decoding performance (GH-15083)
7ebdda0d · Steve Dower · GitHub · df0c21ff · 7ebdda0d · 7ebdda0d
Commit 7ebdda0d authored Aug 21, 2019 by Steve Dower Committed by GitHub Aug 21, 2019
3 changed files
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -3075,13 +3075,13 @@ class CodePageTest(unittest.TestCase):
            self.assertEqual(codec.name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
-    def test_large_input(self):
+    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
-        encoded = (b'01234567' * (2**28-1) +
+        encoded = (b'01234567' * ((size//8)-1) +
                   b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
-        self.assertEqual(len(encoded), 2**31+2)
+        self.assertEqual(len(encoded), size+2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        self.assertEqual(decoded[1], len(encoded))
        del encoded
@@ -3092,6 +3092,20 @@ class CodePageTest(unittest.TestCase):
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')

+    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
+    def test_large_utf8_input(self, size):
+        # Test input longer than INT_MAX.
+        # Input should contain a decodable multi-byte character
+        # surrounding INT_MAX
+        encoded = (b'0123456\xed\x84\x80' * (size//8))
+        self.assertEqual(len(encoded), size // 8 * 10)
+        decoded = codecs.code_page_decode(65001, encoded, 'ignore', True)
+        self.assertEqual(decoded[1], len(encoded))
+        del encoded
+        self.assertEqual(len(decoded[0]), size)
+        self.assertEqual(decoded[0][:10], '0123456\ud10001')
+        self.assertEqual(decoded[0][-11:], '56\ud1000123456\ud100')
+

 class ASCIITest(unittest.TestCase):
    def test_encode(self):

--- a/Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst
+++ b/Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst
+Decoding bytes objects larger than 2GiB is faster and no longer fails when a
+multibyte character spans a chunk boundary.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -7186,6 +7186,12 @@ PyUnicode_AsASCIIString(PyObject *unicode)
 #define NEED_RETRY
 #endif

+/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
+   transcoding from UTF-16), but INT_MAX / 4 performs better in
+   both cases and also avoids partial characters overrunning the
+   length limit in MultiByteToWideChar on Windows */
+#define DECODING_CHUNK_SIZE (INT_MAX/4)
+
 #ifndef WC_ERR_INVALID_CHARS
 #  define WC_ERR_INVALID_CHARS 0x0080
 #endif
@@ -7422,8 +7428,8 @@ decode_code_page_stateful(int code_page,
    do
    {
 #ifdef NEED_RETRY
-        if (size > INT_MAX) {
-            chunk_size = INT_MAX;
+        if (size > DECODING_CHUNK_SIZE) {
+            chunk_size = DECODING_CHUNK_SIZE;
            final = 0;
            done = 0;
        }
@@ -7827,10 +7833,8 @@ encode_code_page(int code_page,
    do
    {
 #ifdef NEED_RETRY
-        /* UTF-16 encoding may double the size, so use only INT_MAX/2
-           chunks. */
-        if (len > INT_MAX/2) {
-            chunk_len = INT_MAX/2;
+        if (len > DECODING_CHUNK_SIZE) {
+            chunk_len = DECODING_CHUNK_SIZE;
            done = 0;
        }
        else