Commit 106c4145 authored by Mark Dickinson

Issue #14923: Optimize continuation-byte check in UTF-8 decoding. Patch by Serhiy Storchaka.

parent 16ad7a25
...@@ -15,6 +15,9 @@ ...@@ -15,6 +15,9 @@
# error C 'long' size should be either 4 or 8! # error C 'long' size should be either 4 or 8!
#endif #endif
/* 10xxxxxx */
#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
Py_LOCAL_INLINE(Py_UCS4) Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf8_decode)(const char **inptr, const char *end, STRINGLIB(utf8_decode)(const char **inptr, const char *end,
STRINGLIB_CHAR *dest, STRINGLIB_CHAR *dest,
...@@ -107,7 +110,7 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end, ...@@ -107,7 +110,7 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
break; break;
} }
ch2 = (unsigned char)s[1]; ch2 = (unsigned char)s[1];
if ((ch2 & 0xC0) != 0x80) if (!IS_CONTINUATION_BYTE(ch2))
/* invalid continuation byte */ /* invalid continuation byte */
goto InvalidContinuation; goto InvalidContinuation;
ch = (ch << 6) + ch2 - ch = (ch << 6) + ch2 -
...@@ -131,8 +134,8 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end, ...@@ -131,8 +134,8 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
} }
ch2 = (unsigned char)s[1]; ch2 = (unsigned char)s[1];
ch3 = (unsigned char)s[2]; ch3 = (unsigned char)s[2];
if ((ch2 & 0xC0) != 0x80 || if (!IS_CONTINUATION_BYTE(ch2) ||
(ch3 & 0xC0) != 0x80) { !IS_CONTINUATION_BYTE(ch3)) {
/* invalid continuation byte */ /* invalid continuation byte */
goto InvalidContinuation; goto InvalidContinuation;
} }
...@@ -172,9 +175,9 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end, ...@@ -172,9 +175,9 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
ch2 = (unsigned char)s[1]; ch2 = (unsigned char)s[1];
ch3 = (unsigned char)s[2]; ch3 = (unsigned char)s[2];
ch4 = (unsigned char)s[3]; ch4 = (unsigned char)s[3];
if ((ch2 & 0xC0) != 0x80 || if (!IS_CONTINUATION_BYTE(ch2) ||
(ch3 & 0xC0) != 0x80 || !IS_CONTINUATION_BYTE(ch3) ||
(ch4 & 0xC0) != 0x80) { !IS_CONTINUATION_BYTE(ch4)) {
/* invalid continuation byte */ /* invalid continuation byte */
goto InvalidContinuation; goto InvalidContinuation;
} }
...@@ -216,6 +219,7 @@ InvalidContinuation: ...@@ -216,6 +219,7 @@ InvalidContinuation:
} }
#undef ASCII_CHAR_MASK #undef ASCII_CHAR_MASK
#undef IS_CONTINUATION_BYTE
/* UTF-8 encoder specialized for a Unicode kind to avoid the slow /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment