Commit 9ad8ff66 authored by Alexander Barkov's avatar Alexander Barkov

MDEV-8415 utf8: compare broken bytes as "greater than any non-broken character"

parent 95d07ee4
...@@ -5594,11 +5594,11 @@ a?z ...@@ -5594,11 +5594,11 @@ a?z
ab ab
az az
SET @query=CONCAT('SELECT ch FROM t1 WHERE ch>''a', 0xD1,''' ORDER BY ch'); SET @query=CONCAT('SELECT ch FROM t1 WHERE ch>''a', 0xD1,''' ORDER BY ch');
PREPARE stmt FROM @query; PREPARE stmt FROM @query;
EXECUTE stmt; EXECUTE stmt;
ch ch
z z
# 0xEA9A96 would be a good 3-byte character, 0xEA9A is an incomplete sequence # 0xEA9A96 would be a good 3-byte character, 0xEA9A is an incomplete sequence
SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0xEA9A,''' ORDER BY ch'); SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0xEA9A,''' ORDER BY ch');
...@@ -5802,11 +5802,11 @@ a?z ...@@ -5802,11 +5802,11 @@ a?z
ab ab
az az
SET @query=CONCAT('SELECT ch FROM t1 WHERE ch>''a', 0xD1,''' ORDER BY ch'); SET @query=CONCAT('SELECT ch FROM t1 WHERE ch>''a', 0xD1,''' ORDER BY ch');
PREPARE stmt FROM @query; PREPARE stmt FROM @query;
EXECUTE stmt; EXECUTE stmt;
ch ch
z z
# 0xEA9A96 would be a good 3-byte character, 0xEA9A is an incomplete sequence # 0xEA9A96 would be a good 3-byte character, 0xEA9A is an incomplete sequence
SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0xEA9A,''' ORDER BY ch'); SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0xEA9A,''' ORDER BY ch');
...@@ -6002,11 +6002,11 @@ a?z ...@@ -6002,11 +6002,11 @@ a?z
ab ab
az az
SET @query=CONCAT('SELECT ch FROM t1 WHERE ch>''a', 0xD1,''' ORDER BY ch'); SET @query=CONCAT('SELECT ch FROM t1 WHERE ch>''a', 0xD1,''' ORDER BY ch');
PREPARE stmt FROM @query; PREPARE stmt FROM @query;
EXECUTE stmt; EXECUTE stmt;
ch ch
z z
# 0xEA9A96 would be a good 3-byte character, 0xEA9A is an incomplete sequence # 0xEA9A96 would be a good 3-byte character, 0xEA9A is an incomplete sequence
SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0xEA9A,''' ORDER BY ch'); SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0xEA9A,''' ORDER BY ch');
......
...@@ -394,7 +394,7 @@ struct st_myisam_info ...@@ -394,7 +394,7 @@ struct st_myisam_info
#define PACK_TYPE_SELECTED 1 /* Bits in field->pack_type */ #define PACK_TYPE_SELECTED 1 /* Bits in field->pack_type */
#define PACK_TYPE_SPACE_FIELDS 2 #define PACK_TYPE_SPACE_FIELDS 2
#define PACK_TYPE_ZERO_FILL 4 #define PACK_TYPE_ZERO_FILL 4
#define MI_FOUND_WRONG_KEY 32738 /* Impossible value from ha_key_cmp */ #define MI_FOUND_WRONG_KEY 0x7FFFFFFF /* Impossible value from ha_key_cmp */
#define MI_MAX_KEY_BLOCK_SIZE (MI_MAX_KEY_BLOCK_LENGTH/MI_MIN_KEY_BLOCK_LENGTH) #define MI_MAX_KEY_BLOCK_SIZE (MI_MAX_KEY_BLOCK_LENGTH/MI_MIN_KEY_BLOCK_LENGTH)
#define MI_BLOCK_SIZE(key_length,data_pointer,key_pointer,block_size) (((((key_length)+(data_pointer)+(key_pointer))*4+(key_pointer)+2)/(block_size)+1)*(block_size)) #define MI_BLOCK_SIZE(key_length,data_pointer,key_pointer,block_size) (((((key_length)+(data_pointer)+(key_pointer))*4+(key_pointer)+2)/(block_size)+1)*(block_size))
......
This diff is collapsed.
...@@ -65,7 +65,9 @@ ...@@ -65,7 +65,9 @@
The including source file must define the following macros: The including source file must define the following macros:
IS_MB1_CHAR(x) IS_MB1_CHAR(x)
IS_MB1_MBHEAD_UNUSED_GAP(x) - optional, for better performance
IS_MB2_CHAR(x,y) IS_MB2_CHAR(x,y)
IS_MB3_CHAR(x,y,z) - for character sets with mbmaxlen>2
WEIGHT_PAD_SPACE WEIGHT_PAD_SPACE
WEIGHT_MB1(x) WEIGHT_MB1(x)
WEIGHT_MB2(x,y) WEIGHT_MB2(x,y)
...@@ -86,6 +88,16 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) ...@@ -86,6 +88,16 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
return 1; return 1;
} }
#ifdef IS_MB1_MBHEAD_UNUSED_GAP
/*
Quickly filter out unused bytes that are neither MB1 nor MBHEAD.
E.g. [0x80..0xC1] in utf8. This allows using simplified conditions
in IS_MB2_CHAR(), IS_MB3_CHAR(), etc.
*/
if (IS_MB1_MBHEAD_UNUSED_GAP(*str))
goto bad;
#endif
if (str + 2 > end) /* The string ended unexpectedly */ if (str + 2 > end) /* The string ended unexpectedly */
goto bad; /* Treat as a bad byte */ goto bad; /* Treat as a bad byte */
......
...@@ -347,6 +347,28 @@ STRNNCOLL_PARAM strcoll_ujis[]= ...@@ -347,6 +347,28 @@ STRNNCOLL_PARAM strcoll_ujis[]=
}; };
/*
  Comparison cases passed to strcollsp() for each utf8 collation tested in
  test_strcollsp(). The rows pair well-formed 2-byte and 3-byte sequences
  with unused bytes, incomplete sequences and broken sequences, as labelled
  by the per-row comments.
  Every row has -1 as its last field; presumably this is the expected sign
  of comparing the first string with the second (first sorts lower) -
  confirm against the STRNNCOLL_PARAM definition.
  The final all-zero row is presumably the end-of-list marker.
*/
STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
{
{CSTR("\xC0"), CSTR("\xC1"), -1}, /* Unused byte vs unused byte */
{CSTR("\xC0"), CSTR("\xFF"), -1}, /* Unused byte vs unused byte */
{CSTR("\xC2\xA1"), CSTR("\xC0"), -1}, /* MB2 vs unused byte */
{CSTR("\xC2\xA1"), CSTR("\xC2"), -1}, /* MB2 vs incomplete MB2 */
{CSTR("\xC2\xA1"), CSTR("\xC2\xA2"), -1}, /* MB2 vs MB2 */
{CSTR("\xC2\xA1"), CSTR("\xE0\xA0\x7F"),-1}, /* MB2 vs broken MB3 */
{CSTR("\xC2\xA1"), CSTR("\xE0\xA0\x80"),-1}, /* MB2 vs MB3 */
{CSTR("\xC2\xA1"), CSTR("\xE0\xA0\xBF"),-1}, /* MB2 vs MB3 */
{CSTR("\xC2\xA1"), CSTR("\xE0\xA0\xC0"),-1}, /* MB2 vs broken MB3 */
{CSTR("\xC2\xA1"), CSTR("\xE0\xA0"), -1}, /* MB2 vs incomplete MB3 */
{CSTR("\xE0\xA0\x7E"), CSTR("\xE0\xA0\x7F"),-1},/* Broken MB3 vs broken MB3 */
{CSTR("\xE0\xA0\x80"), CSTR("\xE0\xA0"), -1},/* MB3 vs incomplete MB3 */
{CSTR("\xE0\xA0\x80"), CSTR("\xE0\xA0\x7F"),-1},/* MB3 vs broken MB3 */
{CSTR("\xE0\xA0\x80"), CSTR("\xE0\xA0\xBF"),-1},/* MB3 vs MB3 */
{CSTR("\xE0\xA0\x80"), CSTR("\xE0\xA0\xC0"),-1},/* MB3 vs broken MB3 */
{CSTR("\xE0\xA0\xC0"), CSTR("\xE0\xA0\xC1"),-1},/* Broken MB3 vs broken MB3 */
{NULL, 0, NULL, 0, 0}
};
static void static void
str2hex(char *dst, size_t dstlen, const char *src, size_t srclen) str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
{ {
...@@ -470,6 +492,11 @@ test_strcollsp() ...@@ -470,6 +492,11 @@ test_strcollsp()
failed+= strcollsp(&my_charset_ujis_bin, strcoll_mb2_A1A1_mb2_F9FE); failed+= strcollsp(&my_charset_ujis_bin, strcoll_mb2_A1A1_mb2_F9FE);
failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_ujis); failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_ujis);
failed+= strcollsp(&my_charset_ujis_bin, strcoll_ujis); failed+= strcollsp(&my_charset_ujis_bin, strcoll_ujis);
#endif
#ifdef HAVE_CHARSET_utf8
failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common);
failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);
failed+= strcollsp(&my_charset_utf8_bin, strcoll_utf8mb3_common);
#endif #endif
return failed; return failed;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment