Commit b2e324a2 authored by Alexander Barkov's avatar Alexander Barkov

MDEV-8416 ucs2: compare broken bytes as "greater than any non-broken character"

MDEV-8418 utf16: compare broken bytes as "greater than any non-broken character"
parent 35d8ac35
This diff is collapsed.
......@@ -64,13 +64,16 @@
@return - the number of bytes scanned
The including source file must define the following macros:
IS_MB1_CHAR(x)
IS_MB1_MB2HEAD_GAP(x) - optional, for better performance
IS_MB2_CHAR(x,y)
IS_MB3_CHAR(x,y,z) - for character sets with mbmaxlen>2
IS_MB1_CHAR(b0) - for character sets that have MB1 characters
IS_MB1_MBHEAD_UNUSED_GAP(b0) - optional, for better performance
IS_MB2_CHAR(b0,b1) - for character sets that have MB2 characters
IS_MB3_CHAR(b0,b1,b2) - for character sets that have MB3 characters
IS_MB4_CHAR(b0,b1,b2,b3) - for character sets that have MB4 characters
WEIGHT_PAD_SPACE
WEIGHT_MB1(x)
WEIGHT_MB2(x,y)
WEIGHT_MB1(b0) - for character sets that have MB1 characters
WEIGHT_MB2(b0,b1) - for character sets that have MB2 characters
WEIGHT_MB3(b0,b1,b2) - for character sets that have MB3 characters
WEIGHT_MB4(b0,b1,b2,b3) - for character sets that have MB4 characters
WEIGHT_ILSEQ(x)
*/
static inline uint
......@@ -82,11 +85,13 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
return 0;
}
#ifdef IS_MB1_CHAR
if (IS_MB1_CHAR(*str))
{
*weight= WEIGHT_MB1(*str); /* A valid single byte character*/
return 1;
}
#endif
#ifdef IS_MB1_MBHEAD_UNUSED_GAP
/*
......@@ -98,6 +103,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
goto bad;
#endif
#ifdef IS_MB2_CHAR
if (str + 2 > end) /* The string ended unexpectedly */
goto bad; /* Treat as a bad byte */
......@@ -106,6 +112,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
*weight= WEIGHT_MB2(str[0], str[1]);
return 2; /* A valid two-byte character */
}
#endif
#ifdef IS_MB3_CHAR
if (str + 3 > end) /* Incomplete three-byte character */
......
......@@ -149,7 +149,7 @@ typedef struct
A1A1 - MB2 or 8BIT+8BIT
E0E0 - MB2
*/
STRNNCOLL_PARAM strcoll_mb2_common[]=
static STRNNCOLL_PARAM strcoll_mb2_common[]=
{
/* Compare two good sequences */
{CSTR(""), CSTR(""), 0},
......@@ -210,7 +210,7 @@ STRNNCOLL_PARAM strcoll_mb2_common[]=
/*
For character sets that have good mb2 characters A1A1 and F9FE
*/
STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
static STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
{
/* Compare two good characters */
{CSTR(""), CSTR("\xF9\xFE"), -1},
......@@ -246,7 +246,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
A1A1 - a good mb2 character
F9FE - a bad sequence
*/
STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
static STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
{
/* Compare a good character to an illegal or an incomplete sequence */
{CSTR(""), CSTR("\xF9\xFE"), -1},
......@@ -283,7 +283,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
F9 - ILSEQ or H2
F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ)
*/
STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
static STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
{
/* Compare two good characters */
{CSTR(""), CSTR("\xA1"), -1},
......@@ -323,7 +323,7 @@ STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
and sort in this order:
8181 < A1 < E0E0
*/
STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
static STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
{
{CSTR("\x81\x81"), CSTR("\xA1"), -1},
{CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1},
......@@ -336,7 +336,7 @@ STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
/*
A shared test for eucjpms and ujis.
*/
STRNNCOLL_PARAM strcoll_ujis[]=
static STRNNCOLL_PARAM strcoll_ujis[]=
{
{CSTR("\x8E\xA1"), CSTR("\x8E"), -1}, /* Good MB2 vs incomplete MB2 */
{CSTR("\x8E\xA1"), CSTR("\x8F\xA1"), -1}, /* Good MB2 vs incomplete MB3 */
......@@ -347,7 +347,7 @@ STRNNCOLL_PARAM strcoll_ujis[]=
};
STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
static STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
{
{CSTR("\xC0"), CSTR("\xC1"), -1}, /* Unused byte vs unused byte */
{CSTR("\xC0"), CSTR("\xFF"), -1}, /* Unused byte vs unused byte */
......@@ -369,7 +369,7 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
};
STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
static STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
{
/* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */
{CSTR("\xF0\x90\x80\x80"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */
......@@ -412,6 +412,101 @@ STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
};
/*
  Comparison cases shared by the ucs2, utf16 and utf16le collations:
  test_strcollsp() feeds this table to the general_ci and bin variants
  of all three.

  Each row lists two byte strings followed by the expected sign of
  comparing the first to the second. The rows expect a complete two-byte
  character to sort before an incomplete (odd trailing byte) sequence.

  NOTE(review): STRNNCOLL_PARAM and CSTR are declared outside this view;
  the five-field terminator row suggests CSTR expands to a pointer/length
  pair - confirm against the typedef.
*/
static STRNNCOLL_PARAM strcoll_ucs2_common[]=
{
{CSTR("\xC0"), CSTR("\xC1"), -1}, /* Incomplete MB2 vs incomplete MB2 */
{CSTR("\xC0"), CSTR("\xFF"), -1}, /* Incomplete MB2 vs incomplete MB2 */
{CSTR("\xC2\xA1"), CSTR("\xC0"), -1}, /* MB2 vs incomplete MB2 */
{CSTR("\xC2\xA1"), CSTR("\xC2"), -1}, /* MB2 vs incomplete MB2 */
{CSTR("\xC2\xA0"), CSTR("\xC2\xA1"), -1}, /* MB2 vs MB2 */
{CSTR("\xC2\xA1"), CSTR("\xC2\xA2"), -1}, /* MB2 vs MB2 */
{CSTR("\xFF\xFF"), CSTR("\x00"),-1}, /* MB2 vs incomplete MB2 */
{CSTR("\xFF\xFF\xFF\xFF"), CSTR("\x00"),-1}, /* MB2+MB2 vs incomplete MB2 */
{CSTR("\xFF\xFF\xFF\xFF"), CSTR("\x00\x00\x00"), 1},/* MB2+MB2 vs MB2+incomplete MB2 */
{NULL, 0, NULL, 0, 0} /* End-of-table marker */
};
/*
  Tests that involve comparison to SPACE (explicit, or padded),
  with characters written high byte first (SPACE is the bytes 00 20).
  test_strcollsp() runs this table for the ucs2 and utf16 collations.

  The "vs empty" rows expect the empty string to compare as if it were
  SPACE: a character below SPACE sorts before it, SPACE itself compares
  equal, and a character above SPACE sorts after it.
*/
static STRNNCOLL_PARAM strcoll_ucs2_space[]=
{
{CSTR("\x00\x1F"), CSTR("\x00\x20"), -1}, /* MB2 vs MB2 */
{CSTR("\x00\x20"), CSTR("\x00\x21"), -1}, /* MB2 vs MB2 */
{CSTR("\x00\x1F"), CSTR(""), -1}, /* MB2 (below SPACE) vs empty */
{CSTR("\x00\x20"), CSTR(""), 0}, /* MB2 (SPACE) vs empty */
{CSTR("\x00\x21"), CSTR(""), 1}, /* MB2 (above SPACE) vs empty */
{NULL, 0, NULL, 0, 0} /* End-of-table marker */
};
/*
  Tests that involve comparison to SPACE (explicit, or padded),
  with characters written low byte first (SPACE is the bytes 20 00).
  test_strcollsp() runs this table for the utf16le collations; the rows
  mirror strcoll_ucs2_space with the two bytes of every character swapped.

  The "vs empty" rows expect the empty string to compare as if it were
  SPACE: a character below SPACE sorts before it, SPACE itself compares
  equal, and a character above SPACE sorts after it.
*/
static STRNNCOLL_PARAM strcoll_utf16le_space[]=
{
{CSTR("\x1F\x00"), CSTR("\x20\x00"), -1}, /* MB2 vs MB2 */
{CSTR("\x20\x00"), CSTR("\x21\x00"), -1}, /* MB2 vs MB2 */
{CSTR("\x1F\x00"), CSTR(""), -1}, /* MB2 (below SPACE) vs empty */
{CSTR("\x20\x00"), CSTR(""), 0}, /* MB2 (SPACE) vs empty */
{CSTR("\x21\x00"), CSTR(""), 1}, /* MB2 (above SPACE) vs empty */
{NULL, 0, NULL, 0, 0} /* End-of-table marker */
};
/*
  Four-byte (surrogate pair) comparison cases for the big-endian utf16
  collations; test_strcollsp() runs this table for utf16_general_ci and
  utf16_bin.

  Each row lists two byte strings followed by the expected sign of
  comparing the first to the second. The rows expect a well-formed
  four-byte character to sort before any broken or incomplete sequence.
*/
static STRNNCOLL_PARAM strcoll_utf16_common[]=
{
/* Minimum four-byte character: U+10000 == _utf16 0xD800DC00 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDB\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xE0\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xDC\x00"), -1},/* MB4 vs broken MB2 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDC"), -1},/* MB4 vs incomplete MB4 */
/* Maximum four-byte character: U+10FFFF == _utf16 0xDBFFDFFF */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xD8\x00\xDB\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xD8\x00\xE0\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xDC\x00"), -1},/* MB4 vs broken MB2 */
/*
  NOTE(review): the right-hand side below starts with 0xDC, which the
  "broken MB2" rows above treat as an invalid lead unit; the parallel row
  for U+10000 uses a valid lead followed by one trailing byte. Confirm
  whether "\xDB\xFF\xDF" was intended.
*/
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xDC\xFF\xDF"), -1},/* MB4 vs incomplete MB4 */
/* Broken MB4 vs broken MB4 */
/*
  NOTE(review): the left-hand side of the next row is the minimum
  four-byte character listed at the top of this table, and the right-hand
  side differs only in its last byte, so this row looks like
  "MB4 vs MB4" rather than a comparison of broken sequences - confirm.
*/
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDC\x01"),-1},/* Broken MB4 vs broken MB4 */
{CSTR("\xDB\xFF\xE0\xFE"), CSTR("\xDB\xFF\xE0\xFF"),-1},/* Broken MB4 vs broken MB4 */
{NULL, 0, NULL, 0, 0} /* End-of-table marker */
};
/*
  Four-byte (surrogate pair) comparison cases for the little-endian
  utf16le collations; test_strcollsp() runs this table for
  utf16le_general_ci and utf16le_bin. The rows follow
  strcoll_utf16_common with the two bytes of every 16-bit unit swapped.

  Each row lists two byte strings followed by the expected sign of
  comparing the first to the second. The rows expect a well-formed
  four-byte character to sort before any broken or incomplete sequence.
*/
static STRNNCOLL_PARAM strcoll_utf16le_common[]=
{
/* Minimum four-byte character: U+10000 == _utf16le 0x00D800DC */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00\xDB"),-1},/* MB4 vs broken MB4 */
/*
  NOTE(review): the second unit below is 0xD000, whereas the matching row
  for U+10FFFF further down and both rows in strcoll_utf16_common use
  0xE000. Possibly a typo for "\x00\xE0" - confirm.
*/
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00\xD0"),-1},/* MB4 vs broken MB4 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xDC"), -1},/* MB4 vs broken MB2 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00"), -1},/* MB4 vs incomplete MB4 */
/* Maximum four-byte character: U+10FFFF == _utf16le 0xFFDBFFDF */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xD8\x00\xDB"),-1},/* MB4 vs broken MB4 */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xD8\x00\xE0"),-1},/* MB4 vs broken MB4 */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xDC"), -1},/* MB4 vs broken MB2 */
/*
  NOTE(review): the first unit of the right-hand side below is 0xDCFF,
  which the "broken MB2" rows above treat as an invalid lead unit; the
  parallel row for U+10000 uses a valid lead followed by one trailing
  byte. Confirm whether "\xFF\xDB\xFF" was intended.
*/
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xFF\xDC\x00"), -1},/* MB4 vs incomplete MB4 */
/* Broken MB4 vs broken MB4 */
/*
  NOTE(review): the left-hand side of the next row is the minimum
  four-byte character listed at the top of this table, and the right-hand
  side differs only in the low byte of its second unit, so this row looks
  like "MB4 vs MB4" rather than a comparison of broken sequences - confirm.
*/
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x01\xDC"),-1},/* Broken MB4 vs broken MB4 */
{CSTR("\xFF\xDB\xFE\xE0"), CSTR("\xFF\xDB\xFF\xE0"),-1},/* Broken MB4 vs broken MB4 */
{NULL, 0, NULL, 0, 0} /* End-of-table marker */
};
static void
str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
{
......@@ -528,6 +623,12 @@ test_strcollsp()
failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_8181_A1_E0E0);
failed+= strcollsp(&my_charset_sjis_bin, strcoll_8181_A1_E0E0);
#endif
#ifdef HAVE_CHARSET_ucs2
failed+= strcollsp(&my_charset_ucs2_general_ci, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_ucs2_general_ci, strcoll_ucs2_space);
failed+= strcollsp(&my_charset_ucs2_bin, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_ucs2_bin, strcoll_ucs2_space);
#endif
#ifdef HAVE_CHARSET_ujis
failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_mb2_common);
failed+= strcollsp(&my_charset_ujis_bin, strcoll_mb2_common);
......@@ -536,6 +637,21 @@ test_strcollsp()
failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_ujis);
failed+= strcollsp(&my_charset_ujis_bin, strcoll_ujis);
#endif
#ifdef HAVE_CHARSET_utf16
failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_ucs2_space);
failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_utf16_common);
failed+= strcollsp(&my_charset_utf16_bin, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_utf16_bin, strcoll_ucs2_space);
failed+= strcollsp(&my_charset_utf16_bin, strcoll_utf16_common);
failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_ucs2_common);
failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_utf16le_space);
failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_utf16le_common);
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_space);
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_common);
#endif
#ifdef HAVE_CHARSET_utf8
failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common);
failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment