Bug#28916 LDML doesn't work for utf8

and is not described in the manual
- Adding missing initialization for utf8 collations
- Minor code clean-ups: renaming variables,
  moving code into a new separate function.
- Adding test, to check that both ucs2 and utf8 user
  defined collations work (ucs2_test_ci and utf8_test_ci)
- Adding Vietnamese collation as a complex user defined
  collation example.
parent b53b448d
......@@ -281,10 +281,11 @@ extern CHARSET_INFO my_charset_tis620_thai_ci;
extern CHARSET_INFO my_charset_tis620_bin;
extern CHARSET_INFO my_charset_ucs2_general_ci;
extern CHARSET_INFO my_charset_ucs2_bin;
extern CHARSET_INFO my_charset_ucs2_general_uca;
extern CHARSET_INFO my_charset_ucs2_unicode_ci;
extern CHARSET_INFO my_charset_ujis_japanese_ci;
extern CHARSET_INFO my_charset_ujis_bin;
extern CHARSET_INFO my_charset_utf8_general_ci;
extern CHARSET_INFO my_charset_utf8_unicode_ci;
extern CHARSET_INFO my_charset_utf8_bin;
extern CHARSET_INFO my_charset_cp1250_czech_ci;
......
drop table if exists t1;
set names utf8;
show variables like 'character_sets_dir%';
Variable_name Value
character_sets_dir MYSQL_TEST_DIR/std_data/
show collation like 'utf8_test_ci';
Collation Charset Id Default Compiled Sortlen
utf8_test_ci utf8 240 8
create table t1 (c1 char(1) character set utf8 collate utf8_test_ci);
insert into t1 values ('a');
select * from t1 where c1='b';
c1
a
drop table t1;
show collation like 'ucs2_test_ci';
Collation Charset Id Default Compiled Sortlen
ucs2_test_ci ucs2 241 8
create table t1 (c1 char(1) character set ucs2 collate ucs2_test_ci);
insert into t1 values ('a');
select * from t1 where c1='b';
c1
a
drop table t1;
show collation like 'ucs2_vn_ci';
Collation Charset Id Default Compiled Sortlen
ucs2_vn_ci ucs2 242 8
create table t1 (c1 char(1) character set ucs2 collate ucs2_vn_ci);
insert into t1 values (0x0061),(0x0041),(0x00E0),(0x00C0),(0x1EA3),(0x1EA2),
(0x00E3),(0x00C3),(0x00E1),(0x00C1),(0x1EA1),(0x1EA0);
insert into t1 values (0x0103),(0x0102),(0x1EB1),(0x1EB0),(0x1EB3),(0x1EB2),
(0x1EB5),(0x1EB4),(0x1EAF),(0x1EAE),(0x1EB7),(0x1EB6);
insert into t1 values (0x00E2),(0x00C2),(0x1EA7),(0x1EA6),(0x1EA9),(0x1EA8),
(0x1EAB),(0x1EAA),(0x1EA5),(0x1EA4),(0x1EAD),(0x1EAC);
insert into t1 values ('b'),('B'),('c'),('C');
insert into t1 values ('d'),('D'),(0x0111),(0x0110);
insert into t1 values (0x0065),(0x0045),(0x00E8),(0x00C8),(0x1EBB),(0x1EBA),
(0x1EBD),(0x1EBC),(0x00E9),(0x00C9),(0x1EB9),(0x1EB8);
insert into t1 values (0x00EA),(0x00CA),(0x1EC1),(0x1EC0),(0x1EC3),(0x1EC2),
(0x1EC5),(0x1EC4),(0x1EBF),(0x1EBE),(0x1EC7),(0x1EC6);
insert into t1 values ('g'),('G'),('h'),('H');
insert into t1 values (0x0069),(0x0049),(0x00EC),(0x00CC),(0x1EC9),(0x1EC8),
(0x0129),(0x0128),(0x00ED),(0x00CD),(0x1ECB),(0x1ECA);
insert into t1 values ('k'),('K'),('l'),('L'),('m'),('M');
insert into t1 values (0x006F),(0x004F),(0x00F2),(0x00D2),(0x1ECF),(0x1ECE),
(0x00F5),(0x00D5),(0x00F3),(0x00D3),(0x1ECD),(0x1ECC);
insert into t1 values (0x00F4),(0x00D4),(0x1ED3),(0x1ED2),(0x1ED5),(0x1ED4),
(0x1ED7),(0x1ED6),(0x1ED1),(0x1ED0),(0x1ED9),(0x1ED8);
insert into t1 values (0x01A1),(0x01A0),(0x1EDD),(0x1EDC),(0x1EDF),(0x1EDE),
(0x1EE1),(0x1EE0),(0x1EDB),(0x1EDA),(0x1EE3),(0x1EE2);
insert into t1 values ('p'),('P'),('q'),('Q'),('r'),('R'),('s'),('S'),('t'),('T');
insert into t1 values (0x0075),(0x0055),(0x00F9),(0x00D9),(0x1EE7),(0x1EE6),
(0x0169),(0x0168),(0x00FA),(0x00DA),(0x1EE5),(0x1EE4);
insert into t1 values (0x01B0),(0x01AF),(0x1EEB),(0x1EEA),(0x1EED),(0x1EEC),
(0x1EEF),(0x1EEE),(0x1EE9),(0x1EE8),(0x1EF1),(0x1EF0);
insert into t1 values ('v'),('V'),('x'),('X');
insert into t1 values (0x0079),(0x0059),(0x1EF3),(0x1EF2),(0x1EF7),(0x1EF6),
(0x1EF9),(0x1EF8),(0x00FD),(0x00DD),(0x1EF5),(0x1EF4);
select hex(c1) as h, c1 from t1 order by c1, h;
h c1
0041 A
0061 a
00C0 À
00C1 Á
00C3 Ã
00E0 à
00E1 á
00E3 ã
1EA0 Ạ
1EA1 ạ
1EA2 Ả
1EA3 ả
0102 Ă
0103 ă
1EAE Ắ
1EAF ắ
1EB0 Ằ
1EB1 ằ
1EB2 Ẳ
1EB3 ẳ
1EB4 Ẵ
1EB5 ẵ
1EB6 Ặ
1EB7 ặ
00C2 Â
00E2 â
1EA4 Ấ
1EA5 ấ
1EA6 Ầ
1EA7 ầ
1EA8 Ẩ
1EA9 ẩ
1EAA Ẫ
1EAB ẫ
1EAC Ậ
1EAD ậ
0042 B
0062 b
0043 C
0063 c
0044 D
0064 d
0110 Đ
0111 đ
0045 E
0065 e
00C8 È
00C9 É
00E8 è
00E9 é
1EB8 Ẹ
1EB9 ẹ
1EBA Ẻ
1EBB ẻ
1EBC Ẽ
1EBD ẽ
00CA Ê
00EA ê
1EBE Ế
1EBF ế
1EC0 Ề
1EC1 ề
1EC2 Ể
1EC3 ể
1EC4 Ễ
1EC5 ễ
1EC6 Ệ
1EC7 ệ
0047 G
0067 g
0048 H
0068 h
0049 I
0069 i
00CC Ì
00CD Í
00EC ì
00ED í
0128 Ĩ
0129 ĩ
1EC8 Ỉ
1EC9 ỉ
1ECA Ị
1ECB ị
004B K
006B k
004C L
006C l
004D M
006D m
004F O
006F o
00D2 Ò
00D3 Ó
00D5 Õ
00F2 ò
00F3 ó
00F5 õ
1ECC Ọ
1ECD ọ
1ECE Ỏ
1ECF ỏ
00D4 Ô
00F4 ô
1ED0 Ố
1ED1 ố
1ED2 Ồ
1ED3 ồ
1ED4 Ổ
1ED5 ổ
1ED6 Ỗ
1ED7 ỗ
1ED8 Ộ
1ED9 ộ
01A0 Ơ
01A1 ơ
1EDA Ớ
1EDB ớ
1EDC Ờ
1EDD ờ
1EDE Ở
1EDF ở
1EE0 Ỡ
1EE1 ỡ
1EE2 Ợ
1EE3 ợ
0050 P
0070 p
0051 Q
0071 q
0052 R
0072 r
0053 S
0073 s
0054 T
0074 t
0055 U
0075 u
00D9 Ù
00DA Ú
00F9 ù
00FA ú
0168 Ũ
0169 ũ
1EE4 Ụ
1EE5 ụ
1EE6 Ủ
1EE7 ủ
01AF Ư
01B0 ư
1EE8 Ứ
1EE9 ứ
1EEA Ừ
1EEB ừ
1EEC Ử
1EED ử
1EEE Ữ
1EEF ữ
1EF0 Ự
1EF1 ự
0056 V
0076 v
0058 X
0078 x
0059 Y
0079 y
00DD Ý
00FD ý
1EF2 Ỳ
1EF3 ỳ
1EF4 Ỵ
1EF5 ỵ
1EF6 Ỷ
1EF7 ỷ
1EF8 Ỹ
1EF9 ỹ
select group_concat(hex(c1) order by hex(c1)) from t1 group by c1;
group_concat(hex(c1) order by hex(c1))
0041,0061,00C0,00C1,00C3,00E0,00E1,00E3,1EA0,1EA1,1EA2,1EA3
0102,0103,1EAE,1EAF,1EB0,1EB1,1EB2,1EB3,1EB4,1EB5,1EB6,1EB7
00C2,00E2,1EA4,1EA5,1EA6,1EA7,1EA8,1EA9,1EAA,1EAB,1EAC,1EAD
0042,0062
0043,0063
0044,0064
0110,0111
0045,0065,00C8,00C9,00E8,00E9,1EB8,1EB9,1EBA,1EBB,1EBC,1EBD
00CA,00EA,1EBE,1EBF,1EC0,1EC1,1EC2,1EC3,1EC4,1EC5,1EC6,1EC7
0047,0067
0048,0068
0049,0069,00CC,00CD,00EC,00ED,0128,0129,1EC8,1EC9,1ECA,1ECB
004B,006B
004C,006C
004D,006D
004F,006F,00D2,00D3,00D5,00F2,00F3,00F5,1ECC,1ECD,1ECE,1ECF
00D4,00F4,1ED0,1ED1,1ED2,1ED3,1ED4,1ED5,1ED6,1ED7,1ED8,1ED9
01A0,01A1,1EDA,1EDB,1EDC,1EDD,1EDE,1EDF,1EE0,1EE1,1EE2,1EE3
0050,0070
0051,0071
0052,0072
0053,0073
0054,0074
0055,0075,00D9,00DA,00F9,00FA,0168,0169,1EE4,1EE5,1EE6,1EE7
01AF,01B0,1EE8,1EE9,1EEA,1EEB,1EEC,1EED,1EEE,1EEF,1EF0,1EF1
0056,0076
0058,0078
0059,0079,00DD,00FD,1EF2,1EF3,1EF4,1EF5,1EF6,1EF7,1EF8,1EF9
select group_concat(c1 order by hex(c1) SEPARATOR '') from t1 group by c1;
group_concat(c1 order by hex(c1) SEPARATOR '')
AaÀÁÃàáãẠạẢả
ĂăẮắẰằẲẳẴẵẶặ
ÂâẤấẦầẨẩẪẫẬậ
Bb
Cc
Dd
Đđ
EeÈÉèéẸẹẺẻẼẽ
ÊêẾếỀềỂểỄễỆệ
Gg
Hh
IiÌÍìíĨĩỈỉỊị
Kk
Ll
Mm
OoÒÓÕòóõỌọỎỏ
ÔôỐốỒồỔổỖỗỘộ
ƠơỚớỜờỞởỠỡỢợ
Pp
Qq
Rr
Ss
Tt
UuÙÚùúŨũỤụỦủ
ƯưỨứỪừỬửỮữỰự
Vv
Xx
YyÝýỲỳỴỵỶỷỸỹ
drop table t1;
<charsets>
<charset name="utf8">
<collation name="utf8_test_ci" id="240">
<rules>
<reset>a</reset>
<s>b</s>
</rules>
</collation>
</charset>
<charset name="ucs2">
<collation name="ucs2_test_ci" id="241">
<rules>
<reset>a</reset>
<s>b</s>
</rules>
</collation>
<collation name="ucs2_vn_ci" id="242">
<!-- Vietnamese experimental collation -->
<rules>
<reset>A</reset>
<p>\u0103</p><t>\u0102</t>
<s>\u1EB1</s><t>\u1EB0</t>
<s>\u1EB3</s><t>\u1EB2</t>
<s>\u1EB5</s><t>\u1EB4</t>
<s>\u1EAF</s><t>\u1EAE</t>
<s>\u1EB7</s><t>\u1EB6</t>
<p>\u00E2</p><t>\u00C2</t>
<s>\u1EA7</s><t>\u1EA6</t>
<s>\u1EA9</s><t>\u1EA8</t>
<s>\u1EAB</s><t>\u1EAA</t>
<s>\u1EA5</s><t>\u1EA4</t>
<s>\u1EAD</s><t>\u1EAC</t>
<reset>D</reset>
<p>\u0111</p><t>\u0110</t>
<reset>E</reset>
<p>\u00EA</p><t>\u00CA</t>
<s>\u1EC1</s><t>\u1EC0</t>
<s>\u1EC3</s><t>\u1EC2</t>
<s>\u1EC5</s><t>\u1EC4</t>
<s>\u1EBF</s><t>\u1EBE</t>
<s>\u1EC7</s><t>\u1EC6</t>
<reset>O</reset>
<p>\u00F4</p><t>\u00D4</t>
<s>\u1ED3</s><t>\u1ED2</t>
<s>\u1ED5</s><t>\u1ED4</t>
<s>\u1ED7</s><t>\u1ED6</t>
<s>\u1ED1</s><t>\u1ED0</t>
<s>\u1ED9</s><t>\u1ED8</t>
<p>\u01A1</p><t>\u01A0</t>
<s>\u1EDD</s><t>\u1EDC</t>
<s>\u1EDF</s><t>\u1EDE</t>
<s>\u1EE1</s><t>\u1EE0</t>
<s>\u1EDB</s><t>\u1EDA</t>
<s>\u1EE3</s><t>\u1EE2</t>
<reset>U</reset>
<p>\u01B0</p><t>\u01AF</t>
<s>\u1EEB</s><t>\u1EEA</t>
<s>\u1EED</s><t>\u1EEC</t>
<s>\u1EEF</s><t>\u1EEE</t>
<s>\u1EE9</s><t>\u1EE8</t>
<s>\u1EF1</s><t>\u1EF0</t>
</rules>
</collation>
</charset>
</charsets>
--character-sets-dir=$MYSQL_TEST_DIR/std_data/
--source include/have_ucs2.inc
--disable_warnings
drop table if exists t1;
--enable_warnings
set names utf8;
--replace_result $MYSQL_TEST_DIR MYSQL_TEST_DIR
show variables like 'character_sets_dir%';
show collation like 'utf8_test_ci';
create table t1 (c1 char(1) character set utf8 collate utf8_test_ci);
insert into t1 values ('a');
select * from t1 where c1='b';
drop table t1;
show collation like 'ucs2_test_ci';
create table t1 (c1 char(1) character set ucs2 collate ucs2_test_ci);
insert into t1 values ('a');
select * from t1 where c1='b';
drop table t1;
#
# Vietnamese experimental collation
#
show collation like 'ucs2_vn_ci';
create table t1 (c1 char(1) character set ucs2 collate ucs2_vn_ci);
insert into t1 values (0x0061),(0x0041),(0x00E0),(0x00C0),(0x1EA3),(0x1EA2),
(0x00E3),(0x00C3),(0x00E1),(0x00C1),(0x1EA1),(0x1EA0);
insert into t1 values (0x0103),(0x0102),(0x1EB1),(0x1EB0),(0x1EB3),(0x1EB2),
(0x1EB5),(0x1EB4),(0x1EAF),(0x1EAE),(0x1EB7),(0x1EB6);
insert into t1 values (0x00E2),(0x00C2),(0x1EA7),(0x1EA6),(0x1EA9),(0x1EA8),
(0x1EAB),(0x1EAA),(0x1EA5),(0x1EA4),(0x1EAD),(0x1EAC);
insert into t1 values ('b'),('B'),('c'),('C');
insert into t1 values ('d'),('D'),(0x0111),(0x0110);
insert into t1 values (0x0065),(0x0045),(0x00E8),(0x00C8),(0x1EBB),(0x1EBA),
(0x1EBD),(0x1EBC),(0x00E9),(0x00C9),(0x1EB9),(0x1EB8);
insert into t1 values (0x00EA),(0x00CA),(0x1EC1),(0x1EC0),(0x1EC3),(0x1EC2),
(0x1EC5),(0x1EC4),(0x1EBF),(0x1EBE),(0x1EC7),(0x1EC6);
insert into t1 values ('g'),('G'),('h'),('H');
insert into t1 values (0x0069),(0x0049),(0x00EC),(0x00CC),(0x1EC9),(0x1EC8),
(0x0129),(0x0128),(0x00ED),(0x00CD),(0x1ECB),(0x1ECA);
insert into t1 values ('k'),('K'),('l'),('L'),('m'),('M');
insert into t1 values (0x006F),(0x004F),(0x00F2),(0x00D2),(0x1ECF),(0x1ECE),
(0x00F5),(0x00D5),(0x00F3),(0x00D3),(0x1ECD),(0x1ECC);
insert into t1 values (0x00F4),(0x00D4),(0x1ED3),(0x1ED2),(0x1ED5),(0x1ED4),
(0x1ED7),(0x1ED6),(0x1ED1),(0x1ED0),(0x1ED9),(0x1ED8);
insert into t1 values (0x01A1),(0x01A0),(0x1EDD),(0x1EDC),(0x1EDF),(0x1EDE),
(0x1EE1),(0x1EE0),(0x1EDB),(0x1EDA),(0x1EE3),(0x1EE2);
insert into t1 values ('p'),('P'),('q'),('Q'),('r'),('R'),('s'),('S'),('t'),('T');
insert into t1 values (0x0075),(0x0055),(0x00F9),(0x00D9),(0x1EE7),(0x1EE6),
(0x0169),(0x0168),(0x00FA),(0x00DA),(0x1EE5),(0x1EE4);
insert into t1 values (0x01B0),(0x01AF),(0x1EEB),(0x1EEA),(0x1EED),(0x1EEC),
(0x1EEF),(0x1EEE),(0x1EE9),(0x1EE8),(0x1EF1),(0x1EF0);
insert into t1 values ('v'),('V'),('x'),('X');
insert into t1 values (0x0079),(0x0059),(0x1EF3),(0x1EF2),(0x1EF7),(0x1EF6),
(0x1EF9),(0x1EF8),(0x00FD),(0x00DD),(0x1EF5),(0x1EF4);
select hex(c1) as h, c1 from t1 order by c1, h;
select group_concat(hex(c1) order by hex(c1)) from t1 group by c1;
select group_concat(c1 order by hex(c1) SEPARATOR '') from t1 group by c1;
drop table t1;
......@@ -24,7 +24,6 @@
#ifdef HAVE_UCA_COLLATIONS
#ifdef HAVE_CHARSET_ucs2
extern CHARSET_INFO my_charset_ucs2_general_uca;
extern CHARSET_INFO my_charset_ucs2_icelandic_uca_ci;
extern CHARSET_INFO my_charset_ucs2_latvian_uca_ci;
extern CHARSET_INFO my_charset_ucs2_romanian_uca_ci;
......@@ -46,7 +45,6 @@ extern CHARSET_INFO my_charset_ucs2_hungarian_uca_ci;
#endif
#ifdef HAVE_CHARSET_utf8
extern CHARSET_INFO my_charset_utf8_general_uca_ci;
extern CHARSET_INFO my_charset_utf8_icelandic_uca_ci;
extern CHARSET_INFO my_charset_utf8_latvian_uca_ci;
extern CHARSET_INFO my_charset_utf8_romanian_uca_ci;
......@@ -134,7 +132,7 @@ my_bool init_compiled_charsets(myf flags __attribute__((unused)))
add_compiled_collation(&my_charset_ucs2_general_ci);
add_compiled_collation(&my_charset_ucs2_bin);
#ifdef HAVE_UCA_COLLATIONS
add_compiled_collation(&my_charset_ucs2_general_uca);
add_compiled_collation(&my_charset_ucs2_unicode_ci);
add_compiled_collation(&my_charset_ucs2_icelandic_uca_ci);
add_compiled_collation(&my_charset_ucs2_latvian_uca_ci);
add_compiled_collation(&my_charset_ucs2_romanian_uca_ci);
......@@ -168,7 +166,7 @@ my_bool init_compiled_charsets(myf flags __attribute__((unused)))
add_compiled_collation(&my_charset_utf8_general_cs);
#endif
#ifdef HAVE_UCA_COLLATIONS
add_compiled_collation(&my_charset_utf8_general_uca_ci);
add_compiled_collation(&my_charset_utf8_unicode_ci);
add_compiled_collation(&my_charset_utf8_icelandic_uca_ci);
add_compiled_collation(&my_charset_utf8_latvian_uca_ci);
add_compiled_collation(&my_charset_utf8_romanian_uca_ci);
......
......@@ -202,6 +202,19 @@ static my_bool simple_cs_is_full(CHARSET_INFO *cs)
}
static void
copy_uca_collation(CHARSET_INFO *to, CHARSET_INFO *from)
{
to->cset= from->cset;
to->coll= from->coll;
to->strxfrm_multiply= from->strxfrm_multiply;
to->min_sort_char= from->min_sort_char;
to->max_sort_char= from->max_sort_char;
to->mbminlen= from->mbminlen;
to->mbmaxlen= from->mbmaxlen;
}
static int add_collation(CHARSET_INFO *cs)
{
if (cs->name && (cs->number ||
......@@ -225,29 +238,30 @@ static int add_collation(CHARSET_INFO *cs)
if (!(all_charsets[cs->number]->state & MY_CS_COMPILED))
{
CHARSET_INFO *new= all_charsets[cs->number];
CHARSET_INFO *newcs= all_charsets[cs->number];
if (cs_copy_data(all_charsets[cs->number],cs))
return MY_XML_ERROR;
if (!strcmp(cs->csname,"ucs2") )
{
#if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS)
new->cset= my_charset_ucs2_general_uca.cset;
new->coll= my_charset_ucs2_general_uca.coll;
new->strxfrm_multiply= my_charset_ucs2_general_uca.strxfrm_multiply;
new->min_sort_char= my_charset_ucs2_general_uca.min_sort_char;
new->max_sort_char= my_charset_ucs2_general_uca.max_sort_char;
new->mbminlen= 2;
new->mbmaxlen= 2;
new->state |= MY_CS_AVAILABLE | MY_CS_LOADED;
copy_uca_collation(newcs, &my_charset_ucs2_unicode_ci);
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED;
#endif
}
else if (!strcmp(cs->csname, "utf8"))
{
#if defined (HAVE_CHARSET_utf8) && defined(HAVE_UCA_COLLATIONS)
copy_uca_collation(newcs, &my_charset_utf8_unicode_ci);
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED;
#endif
}
else
{
uchar *sort_order= all_charsets[cs->number]->sort_order;
simple_cs_init_functions(all_charsets[cs->number]);
new->mbminlen= 1;
new->mbmaxlen= 1;
newcs->mbminlen= 1;
newcs->mbmaxlen= 1;
if (simple_cs_is_full(all_charsets[cs->number]))
{
all_charsets[cs->number]->state |= MY_CS_LOADED;
......
......@@ -8073,7 +8073,7 @@ MY_COLLATION_HANDLER my_collation_ucs2_uca_handler =
my_propagate_complex
};
CHARSET_INFO my_charset_ucs2_general_uca=
CHARSET_INFO my_charset_ucs2_unicode_ci=
{
128,0,0, /* number */
MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE,
......@@ -8734,7 +8734,7 @@ static uchar ctype_utf8[] = {
extern MY_CHARSET_HANDLER my_charset_utf8_handler;
CHARSET_INFO my_charset_utf8_general_uca_ci=
CHARSET_INFO my_charset_utf8_unicode_ci=
{
192,0,0, /* number */
MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE,
......
......@@ -123,7 +123,7 @@ static struct my_cs_file_section_st * cs_file_sec(const char *attr, uint len)
}
#define MY_CS_CSDESCR_SIZE 64
#define MY_CS_TAILORING_SIZE 128
#define MY_CS_TAILORING_SIZE 1024
typedef struct my_cs_file_info
{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment