Bug#31081 server crash in regexp function

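Problem: the "regex" library written by Henry Spencer does not support character sets that are not ASCII-compatible (minimum character length greater than one byte), such as UCS2, so REGEXP on such data could crash the server. Fix: convert patterns and strings in those character sets to UTF8 before calling the regex functions.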

Bug#31081 server crash in regexp function
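Problem: the "regex" library written by Henry Spencer does not support character sets that are not ASCII-compatible (minimum character length greater than one byte), such as UCS2, so REGEXP on such data could crash the server. Fix: convert patterns and strings in those character sets to UTF8 before calling the regex functions.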
40f68cd4 · bar@mysql.com/bar.myoffice.izhnet.ru · 8dd6398e · 40f68cd4 · 40f68cd4 · 40f68cd4
Commit 40f68cd4 authored Oct 05, 2007 by bar@mysql.com/bar.myoffice.izhnet.ru
11 changed files
--- a/mysql-test/include/ctype_regex.inc
+++ b/mysql-test/include/ctype_regex.inc
+#
+# Shared REGEXP test, meant to be included with --source from the
+# character set specific test files.
+#
+# To test a desired collation, set session.collation_connection to
+# this collation before including this file
+#
+
+--disable_warnings
+drop table if exists t1;
+--enable_warnings
+
+#
+# Create a table with two nullable varchar(64) columns,
+# using the current values of
+# @@character_set_connection and @@collation_connection.
+# The rows produced by the SELECT only serve to define the column
+# types; they are removed again by the "delete" below.
+#
+
+create table t1 as
+select repeat(' ', 64) as s1, repeat(' ',64) as s2
+union
+select null, null;
+show create table t1;
+delete from t1;
+
+# String/pattern pairs (s1, s2) recorded as matching (1) in the
+# .result files.
+insert into t1 values('aaa','aaa');
+insert into t1 values('aaa|qqq','qqq');
+insert into t1 values('gheis','^[^a-dXYZ]+$');
+insert into t1 values('aab','^aa?b');
+insert into t1 values('Baaan','^Ba*n');
+insert into t1 values('aaa','qqq|aaa');
+insert into t1 values('qqq','qqq|aaa');
+
+# Pairs recorded as not matching (0).
+insert into t1 values('bbb','qqq|aaa');
+insert into t1 values('bbb','qqq');
+insert into t1 values('aaa','aba');
+
+# Pairs recorded as NULL: a NULL string, a NULL pattern, both NULL,
+# and a pattern with an unterminated bracket expression ('ghi[').
+# NOTE(review): the mapping of rows to results above assumes that rows
+# are returned in insertion order -- confirm against the .result files.
+insert into t1 values(null,'abc');
+insert into t1 values('def',null);
+insert into t1 values(null,null);
+insert into t1 values('ghi','ghi[');
+
+# Evaluate every (string, pattern) pair with a per-row pattern.
+select HIGH_PRIORITY s1 regexp s2 from t1;
+
+drop table t1;
--- a/mysql-test/r/ctype_uca.result
+++ b/mysql-test/r/ctype_uca.result
@@ -2754,4 +2754,49 @@ a
 c
 ch
 drop table t1;
+set collation_connection=ucs2_unicode_ci;
+drop table if exists t1;
+create table t1 as
+select repeat(' ', 64) as s1, repeat(' ',64) as s2
+union
+select null, null;
+show create table t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `s1` varchar(64) character set ucs2 collate ucs2_unicode_ci default NULL,
+  `s2` varchar(64) character set ucs2 collate ucs2_unicode_ci default NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+delete from t1;
+insert into t1 values('aaa','aaa');
+insert into t1 values('aaa|qqq','qqq');
+insert into t1 values('gheis','^[^a-dXYZ]+$');
+insert into t1 values('aab','^aa?b');
+insert into t1 values('Baaan','^Ba*n');
+insert into t1 values('aaa','qqq|aaa');
+insert into t1 values('qqq','qqq|aaa');
+insert into t1 values('bbb','qqq|aaa');
+insert into t1 values('bbb','qqq');
+insert into t1 values('aaa','aba');
+insert into t1 values(null,'abc');
+insert into t1 values('def',null);
+insert into t1 values(null,null);
+insert into t1 values('ghi','ghi[');
+select HIGH_PRIORITY s1 regexp s2 from t1;
+s1 regexp s2
+1
+1
+1
+1
+1
+1
+1
+0
+0
+0
+NULL
+NULL
+NULL
+NULL
+drop table t1;
+set names utf8;
 End for 5.0 tests
--- a/mysql-test/r/ctype_ucs.result
+++ b/mysql-test/r/ctype_ucs.result
@@ -922,4 +922,49 @@ ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_gen
 select * from t1 where a=if(b<10,_ucs2 0x0062,_ucs2 0x00C0);
 ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation '='
 drop table t1;
+set collation_connection=ucs2_general_ci;
+drop table if exists t1;
+create table t1 as
+select repeat(' ', 64) as s1, repeat(' ',64) as s2
+union
+select null, null;
+show create table t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `s1` varchar(64) character set ucs2 default NULL,
+  `s2` varchar(64) character set ucs2 default NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+delete from t1;
+insert into t1 values('aaa','aaa');
+insert into t1 values('aaa|qqq','qqq');
+insert into t1 values('gheis','^[^a-dXYZ]+$');
+insert into t1 values('aab','^aa?b');
+insert into t1 values('Baaan','^Ba*n');
+insert into t1 values('aaa','qqq|aaa');
+insert into t1 values('qqq','qqq|aaa');
+insert into t1 values('bbb','qqq|aaa');
+insert into t1 values('bbb','qqq');
+insert into t1 values('aaa','aba');
+insert into t1 values(null,'abc');
+insert into t1 values('def',null);
+insert into t1 values(null,null);
+insert into t1 values('ghi','ghi[');
+select HIGH_PRIORITY s1 regexp s2 from t1;
+s1 regexp s2
+1
+1
+1
+1
+1
+1
+1
+0
+0
+0
+NULL
+NULL
+NULL
+NULL
+drop table t1;
+set names latin1;
 End of 5.0 tests
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -267,6 +267,51 @@ b
 select * from t1 where a = 'b' and a != 'b';
 a
 drop table t1;
+set collation_connection=utf8_general_ci;
+drop table if exists t1;
+create table t1 as
+select repeat(' ', 64) as s1, repeat(' ',64) as s2
+union
+select null, null;
+show create table t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `s1` varchar(64) character set utf8 default NULL,
+  `s2` varchar(64) character set utf8 default NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+delete from t1;
+insert into t1 values('aaa','aaa');
+insert into t1 values('aaa|qqq','qqq');
+insert into t1 values('gheis','^[^a-dXYZ]+$');
+insert into t1 values('aab','^aa?b');
+insert into t1 values('Baaan','^Ba*n');
+insert into t1 values('aaa','qqq|aaa');
+insert into t1 values('qqq','qqq|aaa');
+insert into t1 values('bbb','qqq|aaa');
+insert into t1 values('bbb','qqq');
+insert into t1 values('aaa','aba');
+insert into t1 values(null,'abc');
+insert into t1 values('def',null);
+insert into t1 values(null,null);
+insert into t1 values('ghi','ghi[');
+select HIGH_PRIORITY s1 regexp s2 from t1;
+s1 regexp s2
+1
+1
+1
+1
+1
+1
+1
+0
+0
+0
+NULL
+NULL
+NULL
+NULL
+drop table t1;
+set names utf8;
 set names utf8;
 select  'вася'  rlike '[[:<:]]вася[[:>:]]';
 'вася'  rlike '[[:<:]]вася[[:>:]]'

--- a/mysql-test/r/func_regexp.result
+++ b/mysql-test/r/func_regexp.result
 drop table if exists t1;
-create table t1 (s1 char(64),s2 char(64));
+set names latin1;
+drop table if exists t1;
+create table t1 as
+select repeat(' ', 64) as s1, repeat(' ',64) as s2
+union
+select null, null;
+show create table t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `s1` varchar(64) default NULL,
+  `s2` varchar(64) default NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+delete from t1;
 insert into t1 values('aaa','aaa');
 insert into t1 values('aaa|qqq','qqq');
 insert into t1 values('gheis','^[^a-dXYZ]+$');

--- a/mysql-test/t/ctype_uca.test
+++ b/mysql-test/t/ctype_uca.test
@@ -538,4 +538,8 @@ alter table t1 convert to character set ucs2 collate ucs2_czech_ci;
 select * from t1 where a like 'c%';
 drop table t1;

+set collation_connection=ucs2_unicode_ci;
+--source include/ctype_regex.inc
+set names utf8;
+
 -- echo End for 5.0 tests
--- a/mysql-test/t/ctype_ucs.test
+++ b/mysql-test/t/ctype_ucs.test
@@ -651,4 +651,8 @@ select * from t1 where a=if(b<10,_ucs2 0x00C0,_ucs2 0x0062);
 select * from t1 where a=if(b<10,_ucs2 0x0062,_ucs2 0x00C0);
 drop table t1;

+set collation_connection=ucs2_general_ci;
+--source include/ctype_regex.inc
+set names latin1;
+
 --echo End of 5.0 tests
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -185,6 +185,13 @@ select * from t1 where a = 'b' and a = 'b';
 select * from t1 where a = 'b' and a != 'b';
 drop table t1;

+#
+# Testing regexp
+#
+set collation_connection=utf8_general_ci;
+--source include/ctype_regex.inc
+set names utf8;
+
 #
 # Bug #3928 regexp [[:>:]] and UTF-8
 #

--- a/mysql-test/t/func_regexp.test
+++ b/mysql-test/t/func_regexp.test
@@ -6,28 +6,9 @@
 drop table if exists t1;
 --enable_warnings

-create table t1 (s1 char(64),s2 char(64));
+set names latin1;
+--source include/ctype_regex.inc

-insert into t1 values('aaa','aaa');
-insert into t1 values('aaa|qqq','qqq');
-insert into t1 values('gheis','^[^a-dXYZ]+$');
-insert into t1 values('aab','^aa?b');
-insert into t1 values('Baaan','^Ba*n');
-insert into t1 values('aaa','qqq|aaa');
-insert into t1 values('qqq','qqq|aaa');
-
-insert into t1 values('bbb','qqq|aaa');
-insert into t1 values('bbb','qqq');
-insert into t1 values('aaa','aba');
-
-insert into t1 values(null,'abc');
-insert into t1 values('def',null);
-insert into t1 values(null,null);
-insert into t1 values('ghi','ghi[');
-
-select HIGH_PRIORITY s1 regexp s2 from t1;
-
-drop table t1;

 #
 # This test a bug in regexp on Alpha

--- a/sql/item_cmpfunc.cc
+++ b/sql/item_cmpfunc.cc
@@ -4225,6 +4225,51 @@ void Item_func_like::cleanup()

 #ifdef USE_REGEX

+/*
+  Compile the pattern argument (args[1]) into preg.
+
+  SYNOPSIS
+    regcomp()
+    send_error   When TRUE, a compilation failure is reported with
+                 my_error(ER_REGEXP_ERROR); when FALSE it is silent.
+
+  DESCRIPTION
+    The pattern is evaluated with val_str(). If a regex is already
+    compiled and the pattern is equal to prev_regexp, the compiled
+    regex is reused. Otherwise any previously compiled regex is freed,
+    the pattern is remembered in prev_regexp and compiled with
+    regex_lib_flags and regex_lib_charset. When the comparison
+    collation differs from regex_lib_charset, the pattern is first
+    converted to regex_lib_charset through the "conv" buffer.
+
+  RETURN
+    FALSE   preg holds a compiled regex for the current pattern
+    TRUE    the pattern is NULL, the charset conversion failed,
+            or the pattern could not be compiled
+*/
+bool
+Item_func_regex::regcomp(bool send_error)
+{
+  char buff[MAX_FIELD_WIDTH];
+  String tmp(buff,sizeof(buff),&my_charset_bin);
+  String *res= args[1]->val_str(&tmp);
+  int error;
+
+  if (args[1]->null_value)
+    return TRUE;
+
+  if (regex_compiled)
+  {
+    /* Same pattern as last time: keep the compiled regex */
+    if (!stringcmp(res, &prev_regexp))
+      return FALSE;
+    my_regfree(&preg);
+    regex_compiled= 0;
+  }
+  /*
+    Remember the pattern on every (re)compilation, including the very
+    first one; otherwise the second call would always recompile because
+    prev_regexp would still be empty.
+  */
+  prev_regexp.copy(*res);
+
+  if (cmp_collation.collation != regex_lib_charset)
+  {
+    /* Convert the pattern (e.g. UCS2) to the regex library charset */
+    uint dummy_errors;
+    if (conv.copy(res->ptr(), res->length(), res->charset(),
+                  regex_lib_charset, &dummy_errors))
+      return TRUE;
+    res= &conv;
+  }
+
+  /*
+    Use c_ptr_safe(), as the my_regexec() call in val_int() does, to
+    obtain the terminated C string handed to the regex library.
+  */
+  if ((error= my_regcomp(&preg, res->c_ptr_safe(),
+                         regex_lib_flags, regex_lib_charset)))
+  {
+    if (send_error)
+    {
+      /* The pattern is no longer needed, so buff is reused for the message */
+      (void) my_regerror(error, &preg, buff, sizeof(buff));
+      my_error(ER_REGEXP_ERROR, MYF(0), buff);
+    }
+    return TRUE;
+  }
+  regex_compiled= 1;
+  return FALSE;
+}
+
+
 bool
 Item_func_regex::fix_fields(THD *thd, Item **ref)
 {
@@ -4241,34 +4286,33 @@ Item_func_regex::fix_fields(THD *thd, Item **ref)
  if (agg_arg_charsets(cmp_collation, args, 2, MY_COLL_CMP_CONV, 1))
    return TRUE;

+  regex_lib_flags= (cmp_collation.collation->state &
+                    (MY_CS_BINSORT | MY_CS_CSSORT)) ?
+                   REG_EXTENDED | REG_NOSUB :
+                   REG_EXTENDED | REG_NOSUB | REG_ICASE;
+  /*
+    In the case of UCS2 and other non-ASCII-compatible character sets
+    (mbminlen > 1), we will convert patterns and strings to UTF8.
+  */
+  regex_lib_charset= (cmp_collation.collation->mbminlen > 1) ?
+                     &my_charset_utf8_general_ci :
+                     cmp_collation.collation;
+
  used_tables_cache=args[0]->used_tables() | args[1]->used_tables();
  not_null_tables_cache= (args[0]->not_null_tables() |
 			  args[1]->not_null_tables());
  const_item_cache=args[0]->const_item() && args[1]->const_item();
  if (!regex_compiled && args[1]->const_item())
  {
-    char buff[MAX_FIELD_WIDTH];
-    String tmp(buff,sizeof(buff),&my_charset_bin);
-    String *res=args[1]->val_str(&tmp);
    if (args[1]->null_value)
    {						// Will always return NULL
      maybe_null=1;
      return FALSE;
    }
-    int error;
-    if ((error= my_regcomp(&preg,res->c_ptr(),
-                           ((cmp_collation.collation->state &
-                             (MY_CS_BINSORT | MY_CS_CSSORT)) ?
-                            REG_EXTENDED | REG_NOSUB :
-                            REG_EXTENDED | REG_NOSUB | REG_ICASE),
-                           cmp_collation.collation)))
-    {
-      (void) my_regerror(error,&preg,buff,sizeof(buff));
-      my_error(ER_REGEXP_ERROR, MYF(0), buff);
+    if (regcomp(TRUE))
      return TRUE;
-    }
-    regex_compiled=regex_is_const=1;
-    maybe_null=args[0]->maybe_null;
+    regex_is_const= 1;
+    maybe_null= args[0]->maybe_null;
  }
  else
    maybe_null=1;
@@ -4281,47 +4325,25 @@ longlong Item_func_regex::val_int()
 {
  DBUG_ASSERT(fixed == 1);
  char buff[MAX_FIELD_WIDTH];
-  String *res, tmp(buff,sizeof(buff),&my_charset_bin);
+  String tmp(buff,sizeof(buff),&my_charset_bin);
+  String *res= args[0]->val_str(&tmp);

-  res=args[0]->val_str(&tmp);
-  if (args[0]->null_value)
-  {
-    null_value=1;
+  if ((null_value= (args[0]->null_value ||
+                    (!regex_is_const && regcomp(FALSE)))))
    return 0;
-  }
-  if (!regex_is_const)
-  {
-    char buff2[MAX_FIELD_WIDTH];
-    String *res2, tmp2(buff2,sizeof(buff2),&my_charset_bin);

-    res2= args[1]->val_str(&tmp2);
-    if (args[1]->null_value)
+  if (cmp_collation.collation != regex_lib_charset)
+  {
+    /* Convert UCS2 strings to UTF8 */
+    uint dummy_errors;
+    if (conv.copy(res->ptr(), res->length(), res->charset(),
+                  regex_lib_charset, &dummy_errors))
    {
-      null_value=1;
+      null_value= 1;
      return 0;
    }
-    if (!regex_compiled || stringcmp(res2,&prev_regexp))
-    {
-      prev_regexp.copy(*res2);
-      if (regex_compiled)
-      {
-	my_regfree(&preg);
-	regex_compiled=0;
-      }
-      if (my_regcomp(&preg,res2->c_ptr_safe(),
-                     ((cmp_collation.collation->state &
-                       (MY_CS_BINSORT | MY_CS_CSSORT)) ?
-                      REG_EXTENDED | REG_NOSUB :
-                      REG_EXTENDED | REG_NOSUB | REG_ICASE),
-                     cmp_collation.collation))
-      {
-	null_value=1;
-	return 0;
-      }
-      regex_compiled=1;
-    }
+    res= &conv;
  }
-  null_value=0;
  return my_regexec(&preg,res->c_ptr_safe(),0,(my_regmatch_t*) 0,0) ? 0 : 1;
 }


--- a/sql/item_cmpfunc.h
+++ b/sql/item_cmpfunc.h
@@ -1313,6 +1313,10 @@ class Item_func_regex :public Item_bool_func
  bool regex_is_const;
  String prev_regexp;
  DTCollation cmp_collation;
+  CHARSET_INFO *regex_lib_charset;
+  int regex_lib_flags;
+  String conv;
+  bool regcomp(bool send_error);
 public:
  Item_func_regex(Item *a,Item *b) :Item_bool_func(a,b),
    regex_compiled(0),regex_is_const(0) {}