MDEV-9369 IN operator with ( num, NULL ) gives inconsistent result

Based on this commit into MySQL-5.7: > commit 8e51b845aafc8b4cdebd763c8aebda262ac2d4cd > Author: Guilhem Bichot <guilhem.bichot@oracle.com> > Date: Mon Nov 4 15:44:55 2013 +0100 > > Bug#13944462 'NULL IN (XX)' RETURNS WRONG RESULTS

MDEV-9369 IN operator with ( num, NULL ) gives inconsistent result
Based on this commit into MySQL-5.7: > commit 8e51b845aafc8b4cdebd763c8aebda262ac2d4cd > Author: Guilhem Bichot <guilhem.bichot@oracle.com> > Date: Mon Nov 4 15:44:55 2013 +0100 > > Bug#13944462 'NULL IN (XX)' RETURNS WRONG RESULTS
94768542 · Alexander Barkov · 7cb16dc2 · 94768542 · 94768542 · 94768542
Commit 94768542 authored Mar 21, 2016 by Alexander Barkov
5 changed files
--- a/mysql-test/r/row.result
+++ b/mysql-test/r/row.result
@@ -50,6 +50,12 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	No tables used
 Warnings:
 Note	1003	select ((1,2,(3,4)) in ((3,2,(3,4)),(1,2,(3,NULL)))) AS `row(1,2,row(3,4)) IN (row(3,2,row(3,4)), row(1,2,row(3,NULL)))`
+select row(1,2,row(3,null)) IN (row(3,2,row(3,4)), row(1,2,row(4,5)));
+row(1,2,row(3,null)) IN (row(3,2,row(3,4)), row(1,2,row(4,5)))
+0
+select row(1,2,row(3,null)) IN (row(3,2,row(3,4)), row(1,2,row(3,5)));
+row(1,2,row(3,null)) IN (row(3,2,row(3,4)), row(1,2,row(3,5)))
+NULL
 SELECT (1,2,3)=(0,NULL,3);
 (1,2,3)=(0,NULL,3)
 0
@@ -130,7 +136,7 @@ ROW(c,2,3) IN(row(1,b,a), row(2,3,1))
 0
 0
 1
-NULL
+0
 select ROW(a,b,c) IN(row(1,2,3), row(3,2,1)) from t1;
 ROW(a,b,c) IN(row(1,2,3), row(3,2,1))
 1
@@ -509,5 +515,11 @@ Warnings:
 Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` where ((`test`.`t1`.`a` = 10) and (`test`.`t1`.`b` = 10))
 DROP TABLE t1;
 #
+# MDEV-9369 IN operator with ( num, NULL ) gives inconsistent result
+#
+SELECT (1,null) NOT IN ((2,2),(3,3)), (1,null) NOT IN ((2,2)), (1,null) NOT IN ((3,3));
+(1,null) NOT IN ((2,2),(3,3))	(1,null) NOT IN ((2,2))	(1,null) NOT IN ((3,3))
+1	1	1
+#
 # End of 10.1 tests
 #
--- a/mysql-test/t/row.test
+++ b/mysql-test/t/row.test
@@ -19,10 +19,12 @@ select (1,2,(3,4)) IN ((3,2,(3,4)), (1,2,(3,4)));
 select row(1,2,row(3,4)) IN (row(3,2,row(3,4)), row(1,2,4));
 select row(1,2,row(3,4)) IN (row(3,2,row(3,4)), row(1,2,row(3,NULL)));
 explain extended select row(1,2,row(3,4)) IN (row(3,2,row(3,4)), row(1,2,row(3,NULL)));
+select row(1,2,row(3,null)) IN (row(3,2,row(3,4)), row(1,2,row(4,5)));
+select row(1,2,row(3,null)) IN (row(3,2,row(3,4)), row(1,2,row(3,5)));
+

 SELECT (1,2,3)=(0,NULL,3);
 SELECT (1,2,3)=(1,NULL,3);
-# here's something for Sanja to fix :)
 SELECT (1,2,3)=(1,NULL,0);

 SELECT ROW(1,2,3)=ROW(1,2,3);
@@ -300,6 +302,13 @@ EXPLAIN EXTENDED SELECT * FROM t1 WHERE a=10 AND b=10 AND a>=10;
 EXPLAIN EXTENDED SELECT * FROM t1 WHERE (a,b)=(10,10) AND a>=10;
 DROP TABLE t1;

+
+--echo #
+--echo # MDEV-9369 IN operator with ( num, NULL ) gives inconsistent result
+--echo #
+SELECT (1,null) NOT IN ((2,2),(3,3)), (1,null) NOT IN ((2,2)), (1,null) NOT IN ((3,3));
+
+
 --echo #
 --echo # End of 10.1 tests
 --echo #
--- a/sql/item_cmpfunc.cc
+++ b/sql/item_cmpfunc.cc
@@ -2914,7 +2914,7 @@ Item *Item_func_case::find_item(String *str)
          return else_expr_num != -1 ? args[else_expr_num] : 0;
        value_added_map|= 1U << (uint)cmp_type;
      }
-      if (!cmp_items[(uint)cmp_type]->cmp(args[i]) && !args[i]->null_value)
+      if (cmp_items[(uint)cmp_type]->cmp(args[i]) == FALSE)
        return args[i + 1];
    }
  }
@@ -3545,11 +3545,11 @@ static int cmp_decimal(void *cmp_arg, my_decimal *a, my_decimal *b)
 }


-int in_vector::find(Item *item)
+bool in_vector::find(Item *item)
 {
  uchar *result=get_value(item);
  if (!result || !used_count)
-    return 0;				// Null value
+    return false;				// Null value

  uint start,end;
  start=0; end=used_count-1;
@@ -3558,13 +3558,13 @@ int in_vector::find(Item *item)
    uint mid=(start+end+1)/2;
    int res;
    if ((res=(*compare)(collation, base+mid*size, result)) == 0)
-      return 1;
+      return true;
    if (res < 0)
      start=mid;
    else
      end=mid-1;
  }
-  return (int) ((*compare)(collation, base+start*size, result) == 0);
+  return ((*compare)(collation, base+start*size, result) == 0);
 }

 in_string::in_string(uint elements,qsort2_cmp cmp_func, CHARSET_INFO *cs)
@@ -3894,14 +3894,20 @@ int cmp_item_row::cmp(Item *arg)
  arg->bring_value();
  for (uint i=0; i < n; i++)
  {
-    if (comparators[i]->cmp(arg->element_index(i)))
+    const int rc= comparators[i]->cmp(arg->element_index(i));
+    switch (rc)
    {
-      if (!arg->element_index(i)->null_value)
-	return 1;
-      was_null= 1;
+    case UNKNOWN:
+      was_null= true;
+      break;
+    case TRUE:
+      return TRUE;
+    case FALSE:
+      break;                                    // elements #i are equal
    }
+    arg->null_value|= arg->element_index(i)->null_value;
  }
-  return (arg->null_value= was_null);
+  return was_null ? UNKNOWN : FALSE;
 }


@@ -3924,15 +3930,15 @@ void cmp_item_decimal::store_value(Item *item)
  /* val may be zero if item is nnull */
  if (val && val != &value)
    my_decimal2decimal(val, &value);
+  m_null_value= item->null_value;
 }


 int cmp_item_decimal::cmp(Item *arg)
 {
  my_decimal tmp_buf, *tmp= arg->val_decimal(&tmp_buf);
-  if (arg->null_value)
-    return 1;
-  return my_decimal_cmp(&value, tmp);
+  return (m_null_value || arg->null_value) ?
+    UNKNOWN : (my_decimal_cmp(&value, tmp) != 0);
 }


@@ -3956,12 +3962,14 @@ void cmp_item_datetime::store_value(Item *item)
  enum_field_types f_type=
    tmp_item[0]->field_type_for_temporal_comparison(warn_item);
  value= get_datetime_value(thd, &tmp_item, &lval_cache, f_type, &is_null);
+  m_null_value= item->null_value;
 }


 int cmp_item_datetime::cmp(Item *arg)
 {
-  return value != arg->val_temporal_packed(warn_item);
+  const bool rc= value != arg->val_temporal_packed(warn_item);
+  return (m_null_value || arg->null_value) ? UNKNOWN : rc;
 }


@@ -3985,10 +3993,10 @@ bool Item_func_in::count_sargable_conds(uchar *arg)
 }


-bool Item_func_in::nulls_in_row()
+bool Item_func_in::list_contains_null()
 {
  Item **arg,**arg_end;
-  for (arg= args+1, arg_end= args+arg_count; arg != arg_end ; arg++)
+  for (arg= args + 1, arg_end= args+arg_count; arg != arg_end ; arg++)
  {
    if ((*arg)->null_inside())
      return 1;
@@ -4103,6 +4111,32 @@ void Item_func_in::fix_length_and_dec()
    }
  }

+  /*
+    First conditions for bisection to be possible:
+     1. All types are similar, and
+     2. All expressions in <in value list> are const
+  */
+  bool bisection_possible=
+    type_cnt == 1 &&                                   // 1
+    const_itm;                                         // 2
+  if (bisection_possible)
+  {
+    /*
+      In the presence of NULLs, the correct result of evaluating this item
+      must be UNKNOWN or FALSE. To achieve that:
+      - If type is scalar, we can use bisection and the "have_null" boolean.
+      - If type is ROW, we will need to scan all of <in value list> when
+        searching, so bisection is impossible. Unless:
+        3. UNKNOWN and FALSE are equivalent results
+        4. Neither left expression nor <in value list> contain any NULL value
+      */
+
+    if (m_compare_type == ROW_RESULT &&
+        ((!is_top_level_item() || negated) &&              // 3
+         (list_contains_null() || args[0]->maybe_null)))   // 4
+      bisection_possible= false;
+  }
+
  if (type_cnt == 1)
  {
    if (m_compare_type == STRING_RESULT &&
@@ -4115,7 +4149,7 @@ void Item_func_in::fix_length_and_dec()
      uint cols= args[0]->cols();
      cmp_item_row *cmp= 0;

-      if (const_itm && !nulls_in_row())
+      if (bisection_possible)
      {
        array= new (thd->mem_root) in_row(thd, arg_count-1, 0);
        cmp= &((in_row*)array)->tmp;
@@ -4144,11 +4178,8 @@ void Item_func_in::fix_length_and_dec()
      }
    }
  }
-  /*
-    Row item with NULLs inside can return NULL or FALSE =>
-    they can't be processed as static
-  */
-  if (type_cnt == 1 && const_itm && !nulls_in_row())
+
+  if (bisection_possible)
  {
    /*
      IN must compare INT columns and constants as int values (the same
@@ -4203,20 +4234,25 @@ void Item_func_in::fix_length_and_dec()
      array= new (thd->mem_root) in_datetime(date_arg, arg_count - 1);
      break;
    }
-    if (array && !(thd->is_fatal_error))		// If not EOM
+    if (!array || thd->is_fatal_error)		// OOM
+      return;
+    uint j=0;
+    for (uint i=1 ; i < arg_count ; i++)
    {
-      uint j=0;
-      for (uint i=1 ; i < arg_count ; i++)
+      array->set(j,args[i]);
+      if (!args[i]->null_value)
+        j++; // include this cell in the array.
+      else
      {
-        array->set(j,args[i]);
-        if (!args[i]->null_value)                      // Skip NULL values
-          j++;
-        else
-          have_null= 1;
+        /*
+          We don't put NULL values in array, to avoid erronous matches in
+          bisection.
+        */
+        have_null= 1;
      }
-      if ((array->used_count= j))
-	array->sort();
    }
+    if ((array->used_count= j))
+      array->sort();
  }
  else
  {
@@ -4284,7 +4320,14 @@ longlong Item_func_in::val_int()
  uint value_added_map= 0;
  if (array)
  {
-    int tmp=array->find(args[0]);
+    bool tmp=array->find(args[0]);
+    /*
+      NULL on left -> UNKNOWN.
+      Found no match, and NULL on right -> UNKNOWN.
+      NULL on right can never give a match, as it is not stored in
+      array.
+      See also the 'bisection_possible' variable in fix_length_and_dec().
+    */
    null_value=args[0]->null_value || (!tmp && have_null);
    return (longlong) (!null_value && tmp != negated);
  }
@@ -4306,13 +4349,12 @@ longlong Item_func_in::val_int()
    if (!(value_added_map & (1U << (uint)cmp_type)))
    {
      in_item->store_value(args[0]);
-      if ((null_value= args[0]->null_value))
-        return 0;
      value_added_map|= 1U << (uint)cmp_type;
    }
-    if (!in_item->cmp(args[i]) && !args[i]->null_value)
+    const int rc= in_item->cmp(args[i]);
+    if (rc == FALSE)
      return (longlong) (!negated);
-    have_null|= args[i]->null_value;
+    have_null|= (rc == UNKNOWN);
  }

  null_value= have_null;
@@ -6434,7 +6476,8 @@ longlong Item_equal::val_int()
    /* Skip fields of tables that has not been read yet */
    if (!field->table->status || (field->table->status & STATUS_NULL_ROW))
    {
-      if (eval_item->cmp(item) || (null_value= item->null_value))
+      const int rc= eval_item->cmp(item);
+      if ((rc == TRUE) || (null_value= (rc == UNKNOWN)))
        return 0;
    }
  }

--- a/sql/item_cmpfunc.h
+++ b/sql/item_cmpfunc.h
@@ -796,6 +796,7 @@ public:
 public:
  inline void negate() { negated= !negated; }
  inline void top_level_item() { pred_level= 1; }
+  bool is_top_level_item() const { return pred_level; }
  Item *neg_transformer(THD *thd)
  {
    negated= !negated;
@@ -1075,7 +1076,7 @@ public:
  {
    my_qsort2(base,used_count,size,compare,(void*)collation);
  }
-  int find(Item *item);
+  bool find(Item *item);
  
  /* 
    Create an instance of Item_{type} (e.g. Item_decimal) constant object
@@ -1243,6 +1244,10 @@ public:
  cmp_item() { cmp_charset= &my_charset_bin; }
  virtual ~cmp_item() {}
  virtual void store_value(Item *item)= 0;
+  /**
+     @returns result (TRUE, FALSE or UNKNOWN) of
+     "stored argument's value <> item's value"
+  */
  virtual int cmp(Item *item)= 0;
  // for optimized IN with row
  virtual int compare(cmp_item *item)= 0;
@@ -1255,7 +1260,14 @@ public:
  }
 };

-class cmp_item_string :public cmp_item 
+/// cmp_item which stores a scalar (i.e. non-ROW).
+class cmp_item_scalar : public cmp_item
+{
+protected:
+  bool m_null_value;                            ///< If stored value is NULL
+};
+
+class cmp_item_string : public cmp_item_scalar
 {
 protected:
  String *value_res;
@@ -1281,14 +1293,20 @@ public:
  void store_value(Item *item)
  {
    value_res= item->val_str(&value);
+    m_null_value= item->null_value;
  }
  int cmp(Item *arg)
  {
    char buff[STRING_BUFFER_USUAL_SIZE];
-    String tmp(buff, sizeof(buff), cmp_charset), *res;
-    res= arg->val_str(&tmp);
-    return (value_res ? (res ? sortcmp(value_res, res, cmp_charset) : 1) :
-            (res ? -1 : 0));
+    String tmp(buff, sizeof(buff), cmp_charset), *res= arg->val_str(&tmp);
+    if (m_null_value || arg->null_value)
+      return UNKNOWN;
+    if (value_res && res)
+      return sortcmp(value_res, res, cmp_charset) != 0;
+    else if (!value_res && !res)
+      return FALSE;
+    else
+      return TRUE;
  }
  int compare(cmp_item *ci)
  {
@@ -1303,7 +1321,7 @@ public:
  }
 };

-class cmp_item_int :public cmp_item
+class cmp_item_int : public cmp_item_scalar
 {
  longlong value;
 public:
@@ -1311,10 +1329,12 @@ public:
  void store_value(Item *item)
  {
    value= item->val_int();
+    m_null_value= item->null_value;
  }
  int cmp(Item *arg)
  {
-    return value != arg->val_int();
+    const bool rc= value != arg->val_int();
+    return (m_null_value || arg->null_value) ? UNKNOWN : rc;
  }
  int compare(cmp_item *ci)
  {
@@ -1330,7 +1350,7 @@ public:
  If the left item is a constant one then its value is cached in the
  lval_cache variable.
 */
-class cmp_item_datetime :public cmp_item
+class cmp_item_datetime : public cmp_item_scalar
 {
  longlong value;
 public:
@@ -1348,7 +1368,7 @@ public:
  cmp_item *make_same();
 };

-class cmp_item_real :public cmp_item
+class cmp_item_real : public cmp_item_scalar
 {
  double value;
 public:
@@ -1356,10 +1376,12 @@ public:
  void store_value(Item *item)
  {
    value= item->val_real();
+    m_null_value= item->null_value;
  }
  int cmp(Item *arg)
  {
-    return value != arg->val_real();
+    const bool rc= value != arg->val_real();
+    return (m_null_value || arg->null_value) ? UNKNOWN : rc;
  }
  int compare(cmp_item *ci)
  {
@@ -1370,7 +1392,7 @@ public:
 };


-class cmp_item_decimal :public cmp_item
+class cmp_item_decimal : public cmp_item_scalar
 {
  my_decimal value;
 public:
@@ -1397,12 +1419,13 @@ public:
  void store_value(Item *item)
  {
    value_res= item->val_str(&value);
+    m_null_value= item->null_value;
  }
  int cmp(Item *item)
  {
    // Should never be called
-    DBUG_ASSERT(0);
-    return 1;
+    DBUG_ASSERT(false);
+    return TRUE;
  }
  int compare(cmp_item *ci)
  {
@@ -1467,33 +1490,43 @@ public:
 };

 /*
-  The Item_func_in class implements the in_expr IN(values_list) function.
+  The Item_func_in class implements
+  in_expr IN (<in value list>)
+  and
+  in_expr NOT IN (<in value list>)

  The current implementation distinguishes 2 cases:
-  1) all items in the value_list are constants and have the same
+  1) all items in <in value list> are constants and have the same
    result type. This case is handled by in_vector class.
-  2) items in the value_list have different result types or there is some
-    non-constant items.
-    In this case Item_func_in employs several cmp_item objects to performs
-    comparisons of in_expr and an item from the values_list. One cmp_item
+  2) otherwise Item_func_in employs several cmp_item objects to perform
+    comparisons of in_expr and an item from <in value list>. One cmp_item
    object for each result type. Different result types are collected in the
    fix_length_and_dec() member function by means of collect_cmp_types()
    function.
 */
 class Item_func_in :public Item_func_opt_neg
 {
+  /**
+     Usable if <in value list> is made only of constants. Returns true if one
+     of these constants contains a NULL. Example:
+     IN ( (-5, (12,NULL)), ... ).
+  */
+  bool list_contains_null();
 protected:
  SEL_TREE *get_func_mm_tree(RANGE_OPT_PARAM *param,
                             Field *field, Item *value);
 public:
-  /* 
-    an array of values when the right hand arguments of IN
-    are all SQL constant and there are no nulls 
-  */
+  /// An array of values, created when the bisection lookup method is used
  in_vector *array;
+  /**
+    If there is some NULL among <in value list>, during a val_int() call; for
+    example
+    IN ( (1,(3,'col')), ... ), where 'col' is a column which evaluates to
+    NULL.
+  */
  bool have_null;
-  /* 
-    true when all arguments of the IN clause are of compatible types
+  /**
+    true when all arguments of the IN list are of compatible types
    and can be used safely as comparisons for key conditions
  */
  bool arg_types_compatible;
@@ -1547,7 +1580,6 @@ public:
  virtual void print(String *str, enum_query_type query_type);
  enum Functype functype() const { return IN_FUNC; }
  const char *func_name() const { return " IN "; }
-  bool nulls_in_row();
  bool eval_not_null_tables(uchar *opt_arg);
  void fix_after_pullout(st_select_lex *new_parent, Item **ref);
  bool count_sargable_conds(uchar *arg);

--- a/sql/item_row.h
+++ b/sql/item_row.h
@@ -2,7 +2,7 @@
 #define ITEM_ROW_INCLUDED

 /*
-   Copyright (c) 2002, 2010, Oracle and/or its affiliates.
+   Copyright (c) 2002, 2013, Oracle and/or its affiliates.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -28,11 +28,20 @@
  @endverbatim
 */

+
+/**
+   Item which stores (x,y,...) and ROW(x,y,...).
+   Note that this can be recursive: ((x,y),(z,t)) is a ROW of ROWs.
+*/
 class Item_row: public Item,
                private Item_args,
                private Used_tables_and_const_cache
 {
  table_map not_null_tables_cache;
+  /**
+    If elements are made only of constants, of which one or more are
+    NULL. For example, this item is (1,2,NULL), or ( (1,NULL), (2,3) ).
+  */
  bool with_null;
 public:
  Item_row(THD *thd, List<Item> &list):