Commit cb5b6c7e authored by Sergey Petrunya's avatar Sergey Petrunya

MDEV-5177: ha_partition and innodb index intersection produce fewer rows (MySQL Bug#70703)

(This is attempt at fix #2) (re-commit with fixed typo)
- Moved the testcase from partition_test to partition_innodb.test where it can really work.
- Made ordered index scans over ha_partition tables to satisfy ROR property for 
  the case where underlying table uses extended keys.
parent 525c00c6
......@@ -2494,45 +2494,6 @@ i
4
DROP TABLE t1;
#
# MDEV-5177: ha_partition and innodb index intersection produce fewer rows (MySQL Bug#70703)
#
create table t1 (
a int not null,
b int not null,
pk int not null,
primary key (pk),
key(a),
key(b)
) partition by hash(pk) partitions 10;
insert into t1 values (1,2,4);
insert into t1 values (1,0,17);
insert into t1 values (1,2,25);
insert into t1 values (10,20,122);
insert into t1 values (10,20,123);
create table t2 (a int);
insert into t2 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
insert into t1 select 1,2, 200 + A.a + 10*B.a + 100*C.a from t2 A, t2 B, t2 C;
insert into t1 select 10+A.a + 10*B.a + 100*C.a + 1000*D.a,
10+A.a + 10*B.a + 100*C.a + 1000*D.a,
2000 + A.a + 10*B.a + 100*C.a + 1000*D.a
from t2 A, t2 B, t2 C ,t2 D;
explain select * from t1 where a=1 and b=2 and pk between 1 and 999999 ;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ref PRIMARY,a,b b 4 const 982 Using where
create temporary table t3 as
select * from t1 where a=1 and b=2 and pk between 1 and 999 ;
select count(*) from t3;
count(*)
802
drop table t3;
create temporary table t3 as
select * from t1 ignore index(a,b) where a=1 and b=2 and pk between 1 and 999 ;
select count(*) from t3;
count(*)
802
drop table t3;
drop table t1,t2;
#
# MDEV-5555: Incorrect index_merge on BTREE indices
#
CREATE TABLE t1 (
......
......@@ -639,4 +639,43 @@ col1 col2 col3
1 2 2013-03-11 16:33:04
1 2 2013-03-11 16:33:24
DROP TABLE t1;
#
# MDEV-5177: ha_partition and innodb index intersection produce fewer rows (MySQL Bug#70703)
#
create table t1 (
a int not null,
b int not null,
pk int not null,
primary key (pk),
key(a),
key(b)
) engine=innodb partition by hash(pk) partitions 10;
insert into t1 values (1,2,4);
insert into t1 values (1,0,17);
insert into t1 values (1,2,25);
insert into t1 values (10,20,122);
insert into t1 values (10,20,123);
create table t2 (a int);
insert into t2 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
insert into t1 select 1,2, 200 + A.a + 10*B.a + 100*C.a from t2 A, t2 B, t2 C;
insert into t1 select 10+A.a + 10*B.a + 100*C.a + 1000*D.a,
10+A.a + 10*B.a + 100*C.a + 1000*D.a,
2000 + A.a + 10*B.a + 100*C.a + 1000*D.a
from t2 A, t2 B, t2 C ,t2 D;
explain select * from t1 where a=1 and b=2 and pk between 1 and 999999 ;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 index_merge PRIMARY,a,b b,a 4,4 NULL # Using intersect(b,a); Using where; Using index
create temporary table t3 as
select * from t1 where a=1 and b=2 and pk between 1 and 999 ;
select count(*) from t3;
count(*)
802
drop table t3;
create temporary table t3 as
select * from t1 ignore index(a,b) where a=1 and b=2 and pk between 1 and 999 ;
select count(*) from t3;
count(*)
802
drop table t3;
drop table t1,t2;
set global default_storage_engine=default;
......@@ -734,8 +734,8 @@ a b
7 1
35 2
3 3
2 4
30 4
2 4
4 5
6 6
select * from t1 force index (b) where b < 10 ORDER BY b;
......@@ -744,16 +744,16 @@ a b
7 1
35 2
3 3
2 4
30 4
2 4
4 5
6 6
select * from t1 force index (b) where b < 10 ORDER BY b DESC;
a b
6 6
4 5
30 4
2 4
30 4
3 3
35 2
7 1
......
......@@ -2495,52 +2495,6 @@ ALTER TABLE t1 ADD PARTITION PARTITIONS 2;
SELECT * from t1 order by i;
DROP TABLE t1;
--echo #
--echo # MDEV-5177: ha_partition and innodb index intersection produce fewer rows (MySQL Bug#70703)
--echo #
create table t1 (
a int not null,
b int not null,
pk int not null,
primary key (pk),
key(a),
key(b)
) partition by hash(pk) partitions 10;
insert into t1 values (1,2,4); # both
insert into t1 values (1,0,17); # left
insert into t1 values (1,2,25); # both
insert into t1 values (10,20,122);
insert into t1 values (10,20,123);
# Now, fill in some data so that the optimizer choses index_merge
create table t2 (a int);
insert into t2 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
insert into t1 select 1,2, 200 + A.a + 10*B.a + 100*C.a from t2 A, t2 B, t2 C;
insert into t1 select 10+A.a + 10*B.a + 100*C.a + 1000*D.a,
10+A.a + 10*B.a + 100*C.a + 1000*D.a,
2000 + A.a + 10*B.a + 100*C.a + 1000*D.a
from t2 A, t2 B, t2 C ,t2 D;
# This should show index_merge, using intersect
explain select * from t1 where a=1 and b=2 and pk between 1 and 999999 ;
# 794 rows in output
create temporary table t3 as
select * from t1 where a=1 and b=2 and pk between 1 and 999 ;
select count(*) from t3;
drop table t3;
# 802 rows in output
create temporary table t3 as
select * from t1 ignore index(a,b) where a=1 and b=2 and pk between 1 and 999 ;
select count(*) from t3;
drop table t3;
drop table t1,t2;
--echo #
--echo # MDEV-5555: Incorrect index_merge on BTREE indices
--echo #
......
......@@ -716,5 +716,52 @@ GROUP BY 1, 2, 3;
DROP TABLE t1;
--echo #
--echo # MDEV-5177: ha_partition and innodb index intersection produce fewer rows (MySQL Bug#70703)
--echo #
create table t1 (
a int not null,
b int not null,
pk int not null,
primary key (pk),
key(a),
key(b)
) engine=innodb partition by hash(pk) partitions 10;
insert into t1 values (1,2,4); # both
insert into t1 values (1,0,17); # left
insert into t1 values (1,2,25); # both
insert into t1 values (10,20,122);
insert into t1 values (10,20,123);
# Now, fill in some data so that the optimizer choses index_merge
create table t2 (a int);
insert into t2 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
insert into t1 select 1,2, 200 + A.a + 10*B.a + 100*C.a from t2 A, t2 B, t2 C;
insert into t1 select 10+A.a + 10*B.a + 100*C.a + 1000*D.a,
10+A.a + 10*B.a + 100*C.a + 1000*D.a,
2000 + A.a + 10*B.a + 100*C.a + 1000*D.a
from t2 A, t2 B, t2 C ,t2 D;
# This should show index_merge, using intersect
--replace_column 9 #
explain select * from t1 where a=1 and b=2 and pk between 1 and 999999 ;
# 794 rows in output
create temporary table t3 as
select * from t1 where a=1 and b=2 and pk between 1 and 999 ;
select count(*) from t3;
drop table t3;
# 802 rows in output
create temporary table t3 as
select * from t1 ignore index(a,b) where a=1 and b=2 and pk between 1 and 999 ;
select count(*) from t3;
drop table t3;
drop table t1,t2;
set global default_storage_engine=default;
......@@ -80,7 +80,8 @@ static handler *partition_create_handler(handlerton *hton,
static uint partition_flags();
static uint alter_table_flags(uint flags);
extern "C" int cmp_key_then_part_id(void *key_p, uchar *ref1, uchar *ref2);
extern "C" int cmp_key_part_id(void *key_p, uchar *ref1, uchar *ref2);
extern "C" int cmp_key_rowid_part_id(void *ptr, uchar *ref1, uchar *ref2);
static int partition_initialize(void *p)
{
......@@ -4514,7 +4515,10 @@ bool ha_partition::init_record_priority_queue()
uint alloc_len;
uint used_parts= bitmap_bits_set(&m_part_info->used_partitions);
/* Allocate record buffer for each used partition. */
alloc_len= used_parts * (m_rec_length + PARTITION_BYTES_IN_POS);
m_priority_queue_rec_len= m_rec_length + PARTITION_BYTES_IN_POS;
if (!m_using_extended_keys)
m_priority_queue_rec_len += m_file[0]->ref_length;
alloc_len= used_parts * m_priority_queue_rec_len;
/* Allocate a key for temporary use when setting up the scan. */
alloc_len+= table_share->max_key_length;
......@@ -4536,13 +4540,25 @@ bool ha_partition::init_record_priority_queue()
{
DBUG_PRINT("info", ("init rec-buf for part %u", i));
int2store(ptr, i);
ptr+= m_rec_length + PARTITION_BYTES_IN_POS;
ptr+= m_priority_queue_rec_len;
}
} while (++i < m_tot_parts);
m_start_key.key= (const uchar*)ptr;
/* Initialize priority queue, initialized to reading forward. */
if (init_queue(&m_queue, used_parts, 0,
0, cmp_key_then_part_id, (void*)m_curr_key_info, 0, 0))
int (*cmp_func)(void *, uchar *, uchar *);
void *cmp_arg;
if (!m_using_extended_keys)
{
cmp_func= cmp_key_rowid_part_id;
cmp_arg= (void*)this;
}
else
{
cmp_func= cmp_key_part_id;
cmp_arg= (void*)m_curr_key_info;
}
if (init_queue(&m_queue, used_parts, 0, 0, cmp_func, cmp_arg, 0, 0))
{
my_free(m_ordered_rec_buffer);
m_ordered_rec_buffer= NULL;
......@@ -4609,9 +4625,13 @@ int ha_partition::index_init(uint inx, bool sorted)
DBUG_PRINT("info", ("Clustered pk, using pk as secondary cmp"));
m_curr_key_info[1]= table->key_info+table->s->primary_key;
m_curr_key_info[2]= NULL;
m_using_extended_keys= TRUE;
}
else
{
m_curr_key_info[1]= NULL;
m_using_extended_keys= FALSE;
}
if (init_record_priority_queue())
DBUG_RETURN(HA_ERR_OUT_OF_MEM);
......@@ -4738,36 +4758,12 @@ int ha_partition::index_read_map(uchar *buf, const uchar *key,
}
/*
@brief
Provide ordering by (key_value, partition_id).
@detail
Ordering by partition id is required so that key scans on key=const
return rows in rowid order (this is required for some variants of
index_merge to work).
In ha_partition, rowid is a (partition_id, underlying_table_rowid).
handle_ordered_index_scan must return rows ordered by (key, rowid).
If two rows have the same key value and come from different partitions,
it is sufficient to return them in the order of their partition_id.
*/
extern "C" int cmp_key_then_part_id(void *key_p, uchar *ref1, uchar *ref2)
/* Compare two part_no partition numbers */
static int cmp_part_ids(uchar *ref1, uchar *ref2)
{
my_ptrdiff_t diff1, diff2;
int res;
if ((res= key_rec_cmp(key_p, ref1 + PARTITION_BYTES_IN_POS,
ref2 + PARTITION_BYTES_IN_POS)))
{
return res;
}
/* The following was taken from ha_partition::cmp_ref */
diff1= ref2[1] - ref1[1];
diff2= ref2[0] - ref1[0];
my_ptrdiff_t diff1= ref2[1] - ref1[1];
my_ptrdiff_t diff2= ref2[0] - ref1[0];
if (!diff1 && !diff2)
return 0;
......@@ -4784,6 +4780,45 @@ extern "C" int cmp_key_then_part_id(void *key_p, uchar *ref1, uchar *ref2)
}
/*
@brief
Provide ordering by (key_value, part_no).
*/
extern "C" int cmp_key_part_id(void *key_p, uchar *ref1, uchar *ref2)
{
int res;
if ((res= key_rec_cmp(key_p, ref1 + PARTITION_BYTES_IN_POS,
ref2 + PARTITION_BYTES_IN_POS)))
{
return res;
}
return cmp_part_ids(ref1, ref2);
}
/*
@brief
Provide ordering by (key_value, underying_table_rowid, part_no).
*/
extern "C" int cmp_key_rowid_part_id(void *ptr, uchar *ref1, uchar *ref2)
{
ha_partition *file= (ha_partition*)ptr;
int res;
if ((res= key_rec_cmp(file->m_curr_key_info, ref1 + PARTITION_BYTES_IN_POS,
ref2 + PARTITION_BYTES_IN_POS)))
{
return res;
}
if ((res= file->m_file[0]->cmp_ref(ref1 + PARTITION_BYTES_IN_POS + file->m_rec_length,
ref2 + PARTITION_BYTES_IN_POS + file->m_rec_length)))
{
return res;
}
return cmp_part_ids(ref1, ref2);
}
/**
Common routine for a number of index_read variants
......@@ -5484,7 +5519,7 @@ int ha_partition::handle_ordered_index_scan(uchar *buf, bool reverse_order)
for (; first_used_part < m_part_spec.start_part; first_used_part++)
{
if (bitmap_is_set(&(m_part_info->used_partitions), first_used_part))
part_rec_buf_ptr+= m_rec_length + PARTITION_BYTES_IN_POS;
part_rec_buf_ptr+= m_priority_queue_rec_len;
}
DBUG_PRINT("info", ("m_part_spec.start_part %u first_used_part %u",
m_part_spec.start_part, first_used_part));
......@@ -5539,6 +5574,11 @@ int ha_partition::handle_ordered_index_scan(uchar *buf, bool reverse_order)
if (!error)
{
found= TRUE;
if (!m_using_extended_keys)
{
file->position(rec_buf_ptr);
memcpy(rec_buf_ptr + m_rec_length, file->ref, file->ref_length);
}
/*
Initialize queue without order first, simply insert
*/
......@@ -5555,7 +5595,7 @@ int ha_partition::handle_ordered_index_scan(uchar *buf, bool reverse_order)
m_key_not_found= true;
saved_error= error;
}
part_rec_buf_ptr+= m_rec_length + PARTITION_BYTES_IN_POS;
part_rec_buf_ptr+= m_priority_queue_rec_len;
}
if (found)
{
......@@ -5564,7 +5604,7 @@ int ha_partition::handle_ordered_index_scan(uchar *buf, bool reverse_order)
after that read the first entry and copy it to the buffer to return in.
*/
queue_set_max_at_top(&m_queue, reverse_order);
queue_set_cmp_arg(&m_queue, (void*)m_curr_key_info);
queue_set_cmp_arg(&m_queue, m_using_extended_keys? m_curr_key_info : (void*)this);
m_queue.elements= j - queue_first_element(&m_queue);
queue_fix(&m_queue);
return_top_record(buf);
......@@ -5640,7 +5680,7 @@ int ha_partition::handle_ordered_index_scan_key_not_found()
else if (error != HA_ERR_END_OF_FILE && error != HA_ERR_KEY_NOT_FOUND)
DBUG_RETURN(error);
}
part_buf+= m_rec_length + PARTITION_BYTES_IN_POS;
part_buf += m_priority_queue_rec_len;
}
DBUG_ASSERT(curr_rec_buf);
bitmap_clear_all(&m_key_not_found_partitions);
......@@ -5724,6 +5764,13 @@ int ha_partition::handle_ordered_next(uchar *buf, bool is_next_same)
else
error= file->ha_index_next_same(rec_buf, m_start_key.key,
m_start_key.length);
if (!m_using_extended_keys)
{
file->position(rec_buf);
memcpy(rec_buf + m_rec_length, file->ref, file->ref_length);
}
if (error)
{
if (error == HA_ERR_END_OF_FILE)
......@@ -7711,19 +7758,29 @@ uint ha_partition::min_record_length(uint options) const
int ha_partition::cmp_ref(const uchar *ref1, const uchar *ref2)
{
uint part_id;
int cmp;
my_ptrdiff_t diff1, diff2;
handler *file;
DBUG_ENTER("ha_partition::cmp_ref");
cmp = m_file[0]->cmp_ref((ref1 + PARTITION_BYTES_IN_POS),
(ref2 + PARTITION_BYTES_IN_POS));
if (cmp)
DBUG_RETURN(cmp);
if ((ref1[0] == ref2[0]) && (ref1[1] == ref2[1]))
{
part_id= uint2korr(ref1);
file= m_file[part_id];
DBUG_ASSERT(part_id < m_tot_parts);
DBUG_RETURN(file->cmp_ref((ref1 + PARTITION_BYTES_IN_POS),
(ref2 + PARTITION_BYTES_IN_POS)));
/* This means that the references are same and are in same partition.*/
DBUG_RETURN(0);
}
/*
In Innodb we compare with either primary key value or global DB_ROW_ID so
it is not possible that the two references are equal and are in different
partitions, but in myisam it is possible since we are comparing offsets.
Remove this assert if DB_ROW_ID is changed to be per partition.
*/
DBUG_ASSERT(!m_innodb);
diff1= ref2[1] - ref1[1];
diff2= ref2[0] - ref1[0];
if (diff1 > 0)
......
......@@ -49,6 +49,8 @@ enum partition_keywords
/* offset to the engines array */
#define PAR_ENGINES_OFFSET 12
extern "C" int cmp_key_rowid_part_id(void *ptr, uchar *ref1, uchar *ref2);
class ha_partition :public handler
{
private:
......@@ -88,6 +90,22 @@ private:
uchar *m_rec0; // table->record[0]
const uchar *m_err_rec; // record which gave error
QUEUE m_queue; // Prio queue used by sorted read
/*
Length of an element in m_ordered_rec_buffer. The elements are composed of
[part_no] [table->record copy] [underlying_table_rowid]
underlying_table_rowid is only stored when the table has no extended keys.
*/
uint m_priority_queue_rec_len;
/*
If true, then sorting records by key value also sorts them by their
underlying_table_rowid.
*/
bool m_using_extended_keys;
/*
Since the partition handler is a handler on top of other handlers, it
is necessary to keep information about what the underlying handler
......@@ -1172,6 +1190,9 @@ public:
DBUG_ASSERT(h == m_file[i]->ht);
return h;
}
friend int cmp_key_rowid_part_id(void *ptr, uchar *ref1, uchar *ref2);
};
#endif /* HA_PARTITION_INCLUDED */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment