Commit badc6a12 authored by Andrei Elkin's avatar Andrei Elkin

Bug#38205 Row-based Replication (RBR) causes inconsistencies: HA_ERR_FOUND_DUP

Bug#319  if while a non-transactional slave is replicating a transaction possible problem 

It is impossible to roll back a mixed engines transaction when one of the engine is
non-transaction. In replication that fact is crucial because the slave can not safely
re-apply a transction that was interrupted with STOP SLAVE.

Fixed with making STOP SLAVE not be effective immediately in the case the current
group of replication events has modified a non-transaction table. In order for slave to leave
either the group needs finishing or the user issues KILL QUERY|CONNECTION slave_thread_id.
parent 5d5f0fcd
stop slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
reset master;
reset slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
start slave;
create table t1i(n int primary key) engine=innodb;
create table t2m(n int primary key) engine=myisam;
begin;
insert into t1i values (1);
insert into t1i values (2);
insert into t1i values (3);
commit;
begin;
insert into t1i values (5);
begin;
insert into t1i values (4);
insert into t2m values (1);
update t1i set n = 5 where n = 4;
commit;
zero
0
*** kill sql thread ***
rollback;
*** sql thread is *not* running: No ***
*** the prove: the killed slave has not finished the current transaction ***
three
3
one
1
zero
0
delete from t2m;
start slave sql_thread;
delete from t1i;
delete from t2m;
begin;
insert into t1i values (5);
begin;
insert into t1i values (4);
update t1i set n = 5 where n = 4;
commit;
zero
0
stop slave sql_thread;
rollback;
*** sql thread is running: No ***
*** the prove: the stopped slave has rolled back the current transaction ***
zero
0
zero
0
one
1
start slave sql_thread;
drop table t1i, t2m;
#
# Bug #38205 Row-based Replication (RBR) causes inconsistencies: HA_ERR_FOUND_DUPP_KEY
#
# Verifying the fact that STOP SLAVE in the middle of a group execution waits
# for the end of the group before the slave sql thread will stop.
# The patch refines STOP SLAVE to not interrupt a transaction or other type of
# the replication events group (the part I).
# Killing the sql thread continues to provide a "hard" stop (the part II).
#
# Non-deterministic tests
#
source include/master-slave.inc;
source include/have_innodb.inc;
#
# Part II, killed sql slave leaves instantly
#
# A. multi-statement transaction as the replication group
connection master;
create table t1i(n int primary key) engine=innodb;
create table t2m(n int primary key) engine=myisam;
sync_slave_with_master;
connection master;
begin;
insert into t1i values (1);
insert into t1i values (2);
insert into t1i values (3);
commit;
sync_slave_with_master;
#
# todo: first challenge is to find out the SQL thread id
# the following is not fully reliable
#
let $id=`SELECT id from information_schema.processlist where user like 'system user' and state like '%Has read all relay log%' or user like 'system user' and state like '%Reading event from the relay log%'`;
connection slave;
begin;
insert into t1i values (5);
connection master;
let $pos0_master= query_get_value(SHOW MASTER STATUS, Position, 1);
begin;
insert into t1i values (4);
insert into t2m values (1); # non-ta update
update t1i set n = 5 where n = 4; # to block at. can't be played with killed
commit;
let $pos1_master= query_get_value(SHOW MASTER STATUS, Position, 1);
connection slave;
# slave sql thread must be locked out by the conn `slave' explicit lock
let $pos0_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1);
--disable_query_log
eval select $pos0_master - $pos0_slave as zero;
--enable_query_log
connection slave1;
let $count= 1;
let $table= t2m;
source include/wait_until_rows_count.inc;
#
# todo: may fail as said above
#
--echo *** kill sql thread ***
--disable_query_log
eval kill connection $id;
--enable_query_log
connection slave;
rollback; # release the sql thread
connection slave1;
source include/wait_for_slave_sql_to_stop.inc;
let $sql_status= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1);
--echo *** sql thread is *not* running: $sql_status ***
let $pos1_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1);
connection slave;
--echo *** the prove: the killed slave has not finished the current transaction ***
--disable_query_log
select count(*) as three from t1i;
eval select $pos1_master > $pos1_slave as one;
eval select $pos1_slave - $pos0_slave as zero;
--enable_query_log
delete from t2m; # remove the row to be able to replay
start slave sql_thread;
#
# Part I: B The homogenous transaction remains interuptable in between
#
connection master;
delete from t1i;
delete from t2m;
sync_slave_with_master;
begin;
insert into t1i values (5);
connection master;
let $pos0_master= query_get_value(SHOW MASTER STATUS, Position, 1);
begin;
insert into t1i values (4);
update t1i set n = 5 where n = 4; # to block at. not to be played
commit;
let $pos1_master= query_get_value(SHOW MASTER STATUS, Position, 1);
connection slave1;
# slave sql can't advance as must be locked by the conn `slave' trans
let $pos0_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1);
--disable_query_log
eval select $pos0_master - $pos0_slave as zero;
--enable_query_log
#
# the replicated trans is blocked by the slave's local.
# However, it's not easy to catch the exact moment when it happens.
# The test issues sleep which makes the test either non-deterministic or
# wasting too much time.
#
--sleep 3
send stop slave sql_thread;
connection slave;
rollback; # release the sql thread
connection slave1;
reap;
source include/wait_for_slave_sql_to_stop.inc;
let $sql_status= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1);
--echo *** sql thread is running: $sql_status ***
let $pos1_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1);
--echo *** the prove: the stopped slave has rolled back the current transaction ***
--disable_query_log
select count(*) as zero from t1i;
eval select $pos0_master - $pos0_slave as zero;
eval select $pos1_master > $pos0_slave as one;
--enable_query_log
start slave sql_thread;
# clean-up
connection master;
drop table t1i, t2m;
sync_slave_with_master;
...@@ -10,3 +10,31 @@ start slave; ...@@ -10,3 +10,31 @@ start slave;
stop slave io_thread; stop slave io_thread;
start slave io_thread; start slave io_thread;
drop table t1; drop table t1;
create table t1i(n int primary key) engine=innodb;
create table t2m(n int primary key) engine=myisam;
begin;
insert into t1i values (1);
insert into t1i values (2);
insert into t1i values (3);
commit;
begin;
insert into t1i values (5);
begin;
insert into t1i values (4);
insert into t2m values (1);
insert into t1i values (5);
commit;
zero
0
stop slave;
rollback;
*** sql thread is running: No ***
*** the prove: the stopped slave has finished the current transaction ***
five
5
zero
0
one
1
start slave;
drop table t1i, t2m;
source include/master-slave.inc; source include/master-slave.inc;
source include/have_innodb.inc;
# #
# Bug#6148 () # Bug#6148 ()
...@@ -35,4 +36,88 @@ save_master_pos; ...@@ -35,4 +36,88 @@ save_master_pos;
connection slave; connection slave;
sync_with_master; sync_with_master;
# End of 4.1 tests
#
# Bug #38205 Row-based Replication (RBR) causes inconsistencies...
#
# Verifying that STOP SLAVE does not interrupt excution of a group
# execution of events if the group can not roll back.
# Killing the sql thread continues to provide a "hard" stop (the
# part II, moved to the bugs suite as it's hard to make it
# deterministic with KILL).
#
#
# Part I. The being stopped sql thread finishes first the current group of
# events if the group contains an event on a non-transaction table.
connection master;
create table t1i(n int primary key) engine=innodb;
create table t2m(n int primary key) engine=myisam;
begin;
insert into t1i values (1);
insert into t1i values (2);
insert into t1i values (3);
commit;
sync_slave_with_master;
connection slave;
begin;
insert into t1i values (5);
connection master;
let $pos0_master= query_get_value(SHOW MASTER STATUS, Position, 1);
begin;
insert into t1i values (4);
insert into t2m values (1); # non-ta update to process
insert into t1i values (5); # to block at. to be played with stopped
commit;
connection slave;
# slave sql thread must be locked out by the conn `slave' explicit lock
let $pos0_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1);
--disable_query_log
eval select $pos0_master - $pos0_slave as zero;
--enable_query_log
connection slave1;
let $count= 1;
let $table= t2m;
source include/wait_until_rows_count.inc;
send stop slave;
connection slave;
rollback; # release the sql thread
connection slave1;
reap;
source include/wait_for_slave_to_stop.inc;
let $sql_status= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1);
--echo *** sql thread is running: $sql_status ***
connection master;
let $pos1_master= query_get_value(SHOW MASTER STATUS, Position, 1);
connection slave;
source include/wait_for_slave_sql_to_stop.inc;
let $pos1_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1);
--echo *** the prove: the stopped slave has finished the current transaction ***
--disable_query_log
select count(*) as five from t1i;
eval select $pos1_master - $pos1_slave as zero;
eval select $pos1_slave > $pos0_slave as one;
--enable_query_log
start slave;
# clean-up
connection master;
drop table t1i, t2m;
sync_slave_with_master;
# End of tests
...@@ -7114,7 +7114,12 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli) ...@@ -7114,7 +7114,12 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
*/ */
lex_start(thd); lex_start(thd);
mysql_reset_thd_for_next_command(thd); mysql_reset_thd_for_next_command(thd);
/*
The current statement is just about to begin and
has not yet modified anything. Note, all.modified is reset
by mysql_reset_thd_for_next_command.
*/
thd->transaction.stmt.modified_non_trans_table= FALSE;
/* /*
Check if the slave is set to use SBR. If so, it should switch Check if the slave is set to use SBR. If so, it should switch
to using RBR until the end of the "statement", i.e., next to using RBR until the end of the "statement", i.e., next
...@@ -7217,6 +7222,7 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli) ...@@ -7217,6 +7222,7 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
if (table) if (table)
{ {
bool transactional_table= table->file->has_transactions();
/* /*
table == NULL means that this table should not be replicated table == NULL means that this table should not be replicated
(this was set up by Table_map_log_event::do_apply_event() (this was set up by Table_map_log_event::do_apply_event()
...@@ -7348,6 +7354,9 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli) ...@@ -7348,6 +7354,9 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
m_curr_row= m_curr_row_end; m_curr_row= m_curr_row_end;
if (error == 0 && !transactional_table)
thd->transaction.all.modified_non_trans_table=
thd->transaction.stmt.modified_non_trans_table= TRUE;
} // row processing loop } // row processing loop
DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event", DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event",
......
...@@ -687,6 +687,9 @@ static bool sql_slave_killed(THD* thd, Relay_log_info* rli) ...@@ -687,6 +687,9 @@ static bool sql_slave_killed(THD* thd, Relay_log_info* rli)
DBUG_ASSERT(rli->slave_running == 1);// tracking buffer overrun DBUG_ASSERT(rli->slave_running == 1);// tracking buffer overrun
if (abort_loop || thd->killed || rli->abort_slave) if (abort_loop || thd->killed || rli->abort_slave)
{ {
if (rli->abort_slave && rli->is_in_group() &&
thd->transaction.all.modified_non_trans_table)
DBUG_RETURN(0);
/* /*
If we are in an unsafe situation (stopping could corrupt replication), If we are in an unsafe situation (stopping could corrupt replication),
we give one minute to the slave SQL thread of grace before really we give one minute to the slave SQL thread of grace before really
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment