Commit 26a3d567 authored by Sergey Petrunya's avatar Sergey Petrunya

MDEV-5992: EITS: Selectivity of non-indexed condition is counted twice in table's fanout

MDEV-5984: EITS: Incorrect filtered% value for single-table select with range access
- Fix calculate_cond_selectivity_for_table() to work correctly with range accesses 
  over multi-component keys:
  = First, take selectivity of all possible range scans into account. Remember which 
    fields were used bt the range scans.
  = Then, calculate selectivity produced by sargable predicates on fields. If a 
    field was used in a possible range access, assume its selectivity is already
    taken into account.
- Fix table_cond_selectivity(): when quick select is used, selectivity of
  COND(table) is taken into account in matching_candidates_in_table(). In
  table_cond_selectivity() we should not apply it for the second time.
parent 41a2ca5c
......@@ -118,6 +118,27 @@ Warnings:
Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` <= <cache>(-(1)))
drop table t1, t2;
#
# MDEV-5984: EITS: Incorrect filtered% value for single-table select with range access
#
create table t1(a int);
insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t2 (a int, b int, col1 varchar(64), col2 varchar(64), key(a,b));
insert into t2 select A.a+10*B.a, C.a+10*D.a, 'filler-data1', 'filler-data2' from t1 A, t1 B, t1 C, t1 D;
set histogram_size=100;
set optimizer_use_condition_selectivity=4;
set use_stat_tables='preferably';
analyze table t2 persistent for all;
Table Op Msg_type Msg_text
test.t2 analyze status Engine-independent statistics collected
test.t2 analyze status Table is already up to date
# This must show filtered=100%:
explain extended select * from t2 where a in (1,2,3) and b in (1,2,3);
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t2 range a a 10 NULL 9 100.00 Using index condition
Warnings:
Note 1003 select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t2`.`col1` AS `col1`,`test`.`t2`.`col2` AS `col2` from `test`.`t2` where ((`test`.`t2`.`a` in (1,2,3)) and (`test`.`t2`.`b` in (1,2,3)))
drop table t2, t1;
#
# End of the test file
#
set use_stat_tables= @save_use_stat_tables;
......
......@@ -84,7 +84,23 @@ explain extended select * from t1 where col1 in (-1,-2,-3);
explain extended select * from t1 where col1<=-1;
drop table t1, t2;
--echo #
--echo # MDEV-5984: EITS: Incorrect filtered% value for single-table select with range access
--echo #
create table t1(a int);
insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t2 (a int, b int, col1 varchar(64), col2 varchar(64), key(a,b));
insert into t2 select A.a+10*B.a, C.a+10*D.a, 'filler-data1', 'filler-data2' from t1 A, t1 B, t1 C, t1 D;
set histogram_size=100;
set optimizer_use_condition_selectivity=4;
set use_stat_tables='preferably';
analyze table t2 persistent for all;
--echo # This must show filtered=100%:
explain extended select * from t2 where a in (1,2,3) and b in (1,2,3);
drop table t2, t1;
--echo #
--echo # End of the test file
--echo #
......
......@@ -3390,6 +3390,17 @@ double records_in_column_ranges(PARAM *param, uint idx,
on the rows of 'table' in the processed query.
The calculated selectivity is assigned to the field table->cond_selectivity.
Selectivity is calculated as a product of selectivities imposed by:
1. possible range accesses. (if multiple range accesses use the same
restrictions on the same field, we make adjustments for that)
2. Sargable conditions on fields for which we have column statistics (if
a field is used in a possible range access, we assume that selectivity
is already provided by the range access' estimates)
3. Reading a few records from the table pages and checking the condition
selectivity (this is used for conditions like "column LIKE '%val%'"
where approaches #1 and #2 do not provide selectivity data).
NOTE
Currently the selectivities of range conditions over different columns are
considered independent.
......@@ -3415,14 +3426,90 @@ bool calculate_cond_selectivity_for_table(THD *thd, TABLE *table, Item *cond)
if (table->pos_in_table_list->schema_table)
DBUG_RETURN(FALSE);
MY_BITMAP handled_columns;
my_bitmap_map* buf;
if (!(buf= (my_bitmap_map*)thd->alloc(table->s->column_bitmap_size)))
DBUG_RETURN(TRUE);
my_bitmap_init(&handled_columns, buf, table->s->fields, FALSE);
/*
First, take into account possible range accesses.
range access estimates are the most precise, we prefer them to any other
estimate sources.
*/
for (keynr= 0; keynr < table->s->keys; keynr++)
{
if (table->quick_keys.is_set(keynr))
set_if_bigger(max_quick_key_parts, table->quick_key_parts[keynr]);
}
/*
Walk through all indexes, indexes where range access uses more keyparts
go first.
*/
for (uint quick_key_parts= max_quick_key_parts;
quick_key_parts; quick_key_parts--)
{
for (keynr= 0; keynr < table->s->keys; keynr++)
{
if (table->quick_keys.is_set(keynr) &&
table->quick_key_parts[keynr] == quick_key_parts)
{
uint i;
uint used_key_parts= table->quick_key_parts[keynr];
double quick_cond_selectivity= table->quick_rows[keynr] /
table_records;
KEY *key_info= table->key_info + keynr;
KEY_PART_INFO* key_part= key_info->key_part;
/*
Suppose, there are range conditions on two keys
KEY1 (col1, col2)
KEY2 (col3, col2)
we don't want to count selectivity of condition on col2 twice.
First, find the longest key prefix that's made of columns whose
selectivity wasn't already accounted for.
*/
for (i= 0; i < used_key_parts; i++, key_part++)
{
if (bitmap_is_set(&handled_columns, key_part->fieldnr-1))
break;
bitmap_set_bit(&handled_columns, key_part->fieldnr-1);
}
if (i)
{
/*
There is at least 1-column prefix of columns whose selectivity has
not yet been accounted for.
*/
table->cond_selectivity*= quick_cond_selectivity;
if (i != used_key_parts)
{
/*
Range access got us estimate for #used_key_parts.
We need estimate for #(i-1) key parts.
*/
double f1= key_info->actual_rec_per_key(i-1);
double f2= key_info->actual_rec_per_key(i);
table->cond_selectivity*= f1 / f2;
}
}
}
}
}
/*
Second step: calculate the selectivity of the range conditions not
supported by any index
*/
bitmap_subtract(used_fields, &handled_columns);
/* no need to do: my_bitmap_free(&handled_columns); */
if (thd->variables.optimizer_use_condition_selectivity > 2 &&
!bitmap_is_clear_all(used_fields))
{
/*
Calculate the selectivity of the range conditions not supported
by any index
*/
PARAM param;
MEM_ROOT alloc;
SEL_TREE *tree;
......@@ -3509,47 +3596,8 @@ bool calculate_cond_selectivity_for_table(THD *thd, TABLE *table, Item *cond)
bitmap_clear_all(used_fields);
for (keynr= 0; keynr < table->s->keys; keynr++)
{
if (table->quick_keys.is_set(keynr))
set_if_bigger(max_quick_key_parts, table->quick_key_parts[keynr]);
}
for (uint quick_key_parts= max_quick_key_parts;
quick_key_parts; quick_key_parts--)
{
for (keynr= 0; keynr < table->s->keys; keynr++)
{
if (table->quick_keys.is_set(keynr) &&
table->quick_key_parts[keynr] == quick_key_parts)
{
uint i;
uint used_key_parts= table->quick_key_parts[keynr];
double quick_cond_selectivity= table->quick_rows[keynr] /
table_records;
KEY *key_info= table->key_info + keynr;
KEY_PART_INFO* key_part= key_info->key_part;
for (i= 0; i < used_key_parts; i++, key_part++)
{
if (bitmap_is_set(used_fields, key_part->fieldnr-1))
break;
bitmap_set_bit(used_fields, key_part->fieldnr-1);
}
if (i)
{
table->cond_selectivity*= quick_cond_selectivity;
if (i != used_key_parts)
{
double f1= key_info->actual_rec_per_key(i-1);
double f2= key_info->actual_rec_per_key(i);
table->cond_selectivity*= f1 / f2;
}
}
}
}
}
/* Calculate selectivity of probably highly selective predicates */
/* Check if we can improve selectivity estimates by using sampling */
ulong check_rows=
MY_MIN(thd->variables.optimizer_selectivity_sampling_limit,
(ulong) (table_records * SELECTIVITY_SAMPLING_SHARE));
......
......@@ -5557,7 +5557,20 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key)
}
/* Estimate of the number matching candidates in the joined table */
/*
Estimate how many records we will get if we read just this table and apply
a part of WHERE that can be checked for it.
@detail
Estimate how many records we will get if we
- read the given table with its "independent" access method (either quick
select or full table/index scan),
- apply the part of WHERE that refers only to this table.
@seealso
table_cond_selectivity() produces selectivity of condition that is checked
after joining rows from this table to rows from preceding tables.
*/
inline
double matching_candidates_in_table(JOIN_TAB *s, bool with_found_constraint,
......@@ -7236,14 +7249,25 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
/**
@brief
Get the selectivity of conditions when joining a table
Get the selectivity of conditions when joining a table
@param join The optimized join
@param s The table to be joined for evaluation
@param rem_tables The bitmap of tables to be joined later
@detail
Get selectivity of conditions that can be applied when joining this table
with previous tables.
For quick selects and full table scans, selectivity of COND(this_table)
is accounted for in matching_candidates_in_table(). Here, we only count
selectivity of COND(this_table, previous_tables).
For other access methods, we need to calculate selectivity of the whole
condition, "COND(this_table) AND COND(this_table, previous_tables)".
@retval
selectivity of the conditions imposed on the rows of s
selectivity of the conditions imposed on the rows of s
*/
static
......@@ -7255,22 +7279,21 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
TABLE *table= s->table;
MY_BITMAP *read_set= table->read_set;
double sel= s->table->cond_selectivity;
double table_records= table->stat_records();
POSITION *pos= &join->positions[idx];
uint keyparts= 0;
uint found_part_ref_or_null= 0;
/* Discount the selectivity of the access method used to join table s */
if (s->quick && s->quick->index != MAX_KEY)
{
if (pos->key == 0 && table_records > 0)
{
sel/= table->quick_rows[s->quick->index]/table_records;
}
}
else if (pos->key != 0)
if (pos->key != 0)
{
/* A ref/ access or hash join is used to join table */
/*
A ref access or hash join is used for this table.
It could have some parts with "t.key_part=const". Using ref access
means that we will only get records where the condition holds, so we
should remove its selectivity from the condition selectivity.
(TODO: more details about the "t.key=othertable.col" case)
*/
KEYUSE *keyuse= pos->key;
KEYUSE *prev_ref_keyuse= keyuse;
uint key= keyuse->key;
......@@ -7317,9 +7340,14 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
}
else
{
/*
The table is accessed with full table scan, or quick select.
Selectivity of COND(table) is already accounted for in
matching_candidates_in_table().
*/
sel= 1;
}
/*
If the field f from the table is equal to a field from one the
earlier joined tables then the selectivity of the range conditions
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment