Commit 2c7fd46e authored by Xiang Zhang, committed by GitHub

bpo-32583: Fix possible crashing in builtin Unicode decoders (#5325)

When using custom decode error handlers, it is possible for the built-in decoders
to write out of bounds and then crash.
parent 84521047
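
The crash scenario: a custom error handler can resume decoding before the reported end of the error (or replace exc.object with a longer byte string), so more input remains than the writer's size estimate accounted for. A minimal sketch of such a handler, mirroring the one added in the test below (the "example.*" handler name is illustrative, not part of the commit):

    import codecs

    def forward_shorter_than_end(exc):
        # Resume one byte past the error start, i.e. before exc.end,
        # leaving more input to decode than the writer planned for.
        if isinstance(exc, UnicodeDecodeError):
            return ('\ufffd', exc.start + 1)
        raise TypeError("don't know how to handle %r" % exc)

    codecs.register_error("example.forward_shorter_than_end",
                          forward_shorter_than_end)

    # With the fix, this decode call returns '\ufffd\ufffd\ufffd\ufffd\xd8\x00';
    # before it, the bytes re-decoded between exc.start + 1 and exc.end could
    # overflow the output buffer (reliably so in a debug build, per the test
    # comment below).
    print(b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
        'utf-16-le', 'example.forward_shorter_than_end'))
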
@@ -1044,6 +1044,58 @@ class CodecCallbackTest(unittest.TestCase):
         for (encoding, data) in baddata:
             self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
 
+    # issue32583
+    def test_crashing_decode_handler(self):
+        # better generating one more character to fill the extra space slot
+        # so in debug build it can steadily fail
+        def forward_shorter_than_end(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                # size one character, 0 < forward < exc.end
+                return ('\ufffd', exc.start+1)
+            else:
+                raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error(
+            "test.forward_shorter_than_end", forward_shorter_than_end)
+
+        self.assertEqual(
+            b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
+                'utf-16-le', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
+        )
+        self.assertEqual(
+            b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
+                'utf-16-be', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
+        )
+        self.assertEqual(
+            b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
+                'utf-32-le', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\u1111\x00'
+        )
+        self.assertEqual(
+            b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
+                'utf-32-be', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\u1111\x00'
+        )
+
+        def replace_with_long(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                exc.object = b"\x00" * 8
+                return ('\ufffd', exc.start)
+            else:
+                raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error("test.replace_with_long", replace_with_long)
+
+        self.assertEqual(
+            b'\x00'.decode('utf-16', 'test.replace_with_long'),
+            '\ufffd\x00\x00\x00\x00'
+        )
+        self.assertEqual(
+            b'\x00'.decode('utf-32', 'test.replace_with_long'),
+            '\ufffd\x00\x00'
+        )
+
     def test_fake_error_class(self):
         handlers = [
             codecs.strict_errors,
......
Fix possible crashes in built-in Unicode decoders caused by out-of-bounds
writes when custom decode error handlers are used.
@@ -4190,7 +4190,10 @@ unicode_decode_call_errorhandler_writer(
     Py_ssize_t insize;
     Py_ssize_t newpos;
     Py_ssize_t replen;
+    Py_ssize_t remain;
     PyObject *inputobj = NULL;
+    int need_to_grow = 0;
+    const char *new_inptr;
 
     if (*errorHandler == NULL) {
         *errorHandler = PyCodec_LookupError(errors);
@@ -4221,6 +4224,7 @@
     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
     if (!inputobj)
         goto onError;
+    remain = *inend - *input - *endinpos;
     *input = PyBytes_AS_STRING(inputobj);
     insize = PyBytes_GET_SIZE(inputobj);
     *inend = *input + insize;
@@ -4238,6 +4242,19 @@
     replen = PyUnicode_GET_LENGTH(repunicode);
     if (replen > 1) {
         writer->min_length += replen - 1;
+        need_to_grow = 1;
+    }
+    new_inptr = *input + newpos;
+    if (*inend - new_inptr > remain) {
+        /* We don't know the decoding algorithm here so we make the worst
+           assumption that one byte decodes to one unicode character.
+           If unfortunately one byte could decode to more unicode characters,
+           the decoder may write out-of-bound then. Is it possible for the
+           algorithms using this function? */
+        writer->min_length += *inend - new_inptr - remain;
+        need_to_grow = 1;
+    }
+    if (need_to_grow) {
         writer->overallocate = 1;
         if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
                                      PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
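
To make the worst-case bookkeeping above concrete, here is a small worked example (a sketch only; it follows the replace_with_long handler from the test and assumes the UTF-16 decoder reports the lone byte as a "truncated data" error covering input[0:1]):

    # b'\x00' decoded as UTF-16: the error spans input[0:1], so no bytes
    # were left after the error in the *original* input.
    remain = 1 - 0 - 1                    # *inend - *input - *endinpos == 0

    # The handler swaps in exc.object = b"\x00" * 8 and resumes at newpos = 0,
    # so 8 bytes now remain to be decoded instead of 0.
    left_after_swap = 8 - 0               # *inend - new_inptr

    # Worst case: one byte decodes to one character, so min_length grows by
    # the extra bytes the handler introduced.
    min_length = (1 + 1) // 2             # the decoder's initial estimate
    min_length += left_after_swap - remain
    assert min_length == 9                # room for '\ufffd' plus 4 NULs (5 chars)
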
@@ -4247,7 +4264,7 @@
         goto onError;
 
     *endinpos = newpos;
-    *inptr = *input + newpos;
+    *inptr = new_inptr;
 
     /* we made it! */
     Py_DECREF(restuple);
@@ -5572,7 +5589,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
 #endif
 
     /* Note: size will always be longer than the resulting Unicode
-       character count */
+       character count normally. Error handler will take care of
+       resizing when needed. */
     _PyUnicodeWriter_Init(&writer);
     writer.min_length = (e - q + 1) / 2;
     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
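
A quick arithmetic check of the revised comment (a sketch; the 5-character figure comes from the replace_with_long case in the test above):

    # Normally n input bytes decode to at most (n + 1) // 2 UTF-16 characters,
    # so writer.min_length = (e - q + 1) / 2 is a safe initial estimate.
    n = 8
    assert (n + 1) // 2 >= len((b'\x00' * n).decode('utf-16'))   # 4 NUL chars

    # A handler that enlarges exc.object breaks that relation for the original
    # input: the 1-byte b'\x00' ends up producing 5 characters ('\ufffd' plus
    # 4 NULs), while its initial estimate was (1 + 1) // 2 == 1, hence the
    # resizing done in the error handler path.
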
......