BUG#20697 slave fails to rollback replicated transaction hang over innodb_lock_wait_timeou

Transaction on the slave sql thread got blocked against a slave's mysqld local ta's
lock. Since the default, slave-transaction-retries=10, there was replaying of the 
replicated ta. That failed because of a new started from 5.0.13 policy not to rollback
a timed-out transaction. Effectively the first round of a timed-out ta becomes committed
by the replaying's first "BEGIN".

It was decided to backport already existed method working in 5.1 implemented in
bug #16228 for handling symmetrical deadlock problem. That patch introduced end_trans
execution whenever a replicated ta deadlocks or timed-out.

Note, that this solution can be practically suboptimal - in the light of the changed behavior
due to timeout we still could replay only the last statement -  only with a high rate of timeouting
replicated transactions.
parent 40bcedf4
...@@ -6,7 +6,7 @@ drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; ...@@ -6,7 +6,7 @@ drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
start slave; start slave;
create table t1 (a int not null, key(a)) engine=innodb; create table t1 (a int not null, key(a)) engine=innodb;
create table t2 (a int not null, key(a)) engine=innodb; create table t2 (a int not null, key(a)) engine=innodb;
create table t3 (a int) engine=innodb; create table t3 (a int unique) engine=innodb;
create table t4 (a int) engine=innodb; create table t4 (a int) engine=innodb;
show variables like 'slave_transaction_retries'; show variables like 'slave_transaction_retries';
Variable_name Value Variable_name Value
...@@ -35,14 +35,14 @@ begin; ...@@ -35,14 +35,14 @@ begin;
select * from t1 for update; select * from t1 for update;
a a
start slave; start slave;
insert into t2 values(22); insert into t2 values(201);
commit; commit;
select * from t1; select * from t1;
a a
1 1
select * from t2; select * from t2;
a a
22 201
show slave status; show slave status;
Slave_IO_State # Slave_IO_State #
Master_Host 127.0.0.1 Master_Host 127.0.0.1
...@@ -50,7 +50,7 @@ Master_User root ...@@ -50,7 +50,7 @@ Master_User root
Master_Port MASTER_MYPORT Master_Port MASTER_MYPORT
Connect_Retry 1 Connect_Retry 1
Master_Log_File master-bin.000001 Master_Log_File master-bin.000001
Read_Master_Log_Pos 18911 Read_Master_Log_Pos 18918
Relay_Log_File # Relay_Log_File #
Relay_Log_Pos # Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001 Relay_Master_Log_File master-bin.000001
...@@ -65,7 +65,7 @@ Replicate_Wild_Ignore_Table ...@@ -65,7 +65,7 @@ Replicate_Wild_Ignore_Table
Last_Errno 0 Last_Errno 0
Last_Error Last_Error
Skip_Counter 0 Skip_Counter 0
Exec_Master_Log_Pos 18911 Exec_Master_Log_Pos 18918
Relay_Log_Space # Relay_Log_Space #
Until_Condition None Until_Condition None
Until_Log_File Until_Log_File
...@@ -78,12 +78,16 @@ Master_SSL_Cipher ...@@ -78,12 +78,16 @@ Master_SSL_Cipher
Master_SSL_Key Master_SSL_Key
Seconds_Behind_Master # Seconds_Behind_Master #
stop slave; stop slave;
change master to master_log_pos=532; delete from t3;
change master to master_log_pos=539;
begin; begin;
select * from t2 for update; select * from t2 for update;
a a
22 201
start slave; start slave;
select count(*) from t3 /* must be zero */;
count(*)
0
commit; commit;
select * from t1; select * from t1;
a a
...@@ -91,7 +95,7 @@ a ...@@ -91,7 +95,7 @@ a
1 1
select * from t2; select * from t2;
a a
22 201
show slave status; show slave status;
Slave_IO_State # Slave_IO_State #
Master_Host 127.0.0.1 Master_Host 127.0.0.1
...@@ -99,7 +103,7 @@ Master_User root ...@@ -99,7 +103,7 @@ Master_User root
Master_Port MASTER_MYPORT Master_Port MASTER_MYPORT
Connect_Retry 1 Connect_Retry 1
Master_Log_File master-bin.000001 Master_Log_File master-bin.000001
Read_Master_Log_Pos 18911 Read_Master_Log_Pos 18918
Relay_Log_File # Relay_Log_File #
Relay_Log_Pos # Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001 Relay_Master_Log_File master-bin.000001
...@@ -114,7 +118,7 @@ Replicate_Wild_Ignore_Table ...@@ -114,7 +118,7 @@ Replicate_Wild_Ignore_Table
Last_Errno 0 Last_Errno 0
Last_Error Last_Error
Skip_Counter 0 Skip_Counter 0
Exec_Master_Log_Pos 18911 Exec_Master_Log_Pos 18918
Relay_Log_Space # Relay_Log_Space #
Until_Condition None Until_Condition None
Until_Log_File Until_Log_File
...@@ -128,12 +132,16 @@ Master_SSL_Key ...@@ -128,12 +132,16 @@ Master_SSL_Key
Seconds_Behind_Master # Seconds_Behind_Master #
set global max_relay_log_size=0; set global max_relay_log_size=0;
stop slave; stop slave;
change master to master_log_pos=532; delete from t3;
change master to master_log_pos=539;
begin; begin;
select * from t2 for update; select * from t2 for update;
a a
22 201
start slave; start slave;
select count(*) from t3 /* must be zero */;
count(*)
0
commit; commit;
select * from t1; select * from t1;
a a
...@@ -142,7 +150,7 @@ a ...@@ -142,7 +150,7 @@ a
1 1
select * from t2; select * from t2;
a a
22 201
show slave status; show slave status;
Slave_IO_State # Slave_IO_State #
Master_Host 127.0.0.1 Master_Host 127.0.0.1
...@@ -150,7 +158,7 @@ Master_User root ...@@ -150,7 +158,7 @@ Master_User root
Master_Port MASTER_MYPORT Master_Port MASTER_MYPORT
Connect_Retry 1 Connect_Retry 1
Master_Log_File master-bin.000001 Master_Log_File master-bin.000001
Read_Master_Log_Pos 18911 Read_Master_Log_Pos 18918
Relay_Log_File # Relay_Log_File #
Relay_Log_Pos # Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001 Relay_Master_Log_File master-bin.000001
...@@ -165,7 +173,7 @@ Replicate_Wild_Ignore_Table ...@@ -165,7 +173,7 @@ Replicate_Wild_Ignore_Table
Last_Errno 0 Last_Errno 0
Last_Error Last_Error
Skip_Counter 0 Skip_Counter 0
Exec_Master_Log_Pos 18911 Exec_Master_Log_Pos 18918
Relay_Log_Space # Relay_Log_Space #
Until_Condition None Until_Condition None
Until_Log_File Until_Log_File
...@@ -177,4 +185,91 @@ Master_SSL_Cert ...@@ -177,4 +185,91 @@ Master_SSL_Cert
Master_SSL_Cipher Master_SSL_Cipher
Master_SSL_Key Master_SSL_Key
Seconds_Behind_Master # Seconds_Behind_Master #
drop table if exists t1;
create table t1 (f int unique) engine=innodb;
insert into t1 values (0);
begin;
select * from t1 where f = 0 for update;
f
0
begin;
insert into t1 values (1);
update t1 set f=-1 where f = 0;
commit;
insert into t1 values (2);
show slave status;
Slave_IO_State #
Master_Host 127.0.0.1
Master_User root
Master_Port MASTER_MYPORT
Connect_Retry 1
Master_Log_File master-bin.000001
Read_Master_Log_Pos 19618
Relay_Log_File #
Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001
Slave_IO_Running #
Slave_SQL_Running No
Replicate_Do_DB
Replicate_Ignore_DB
Replicate_Do_Table
Replicate_Ignore_Table
Replicate_Wild_Do_Table
Replicate_Wild_Ignore_Table
Last_Errno 1205
Last_Error Error 'Lock wait timeout exceeded; try restarting transaction' on query. Default database: 'test'. Query: 'update t1 set f=-1 where f = 0'
Skip_Counter 0
Exec_Master_Log_Pos 19227
Relay_Log_Space #
Until_Condition None
Until_Log_File
Until_Log_Pos 0
Master_SSL_Allowed No
Master_SSL_CA_File
Master_SSL_CA_Path
Master_SSL_Cert
Master_SSL_Cipher
Master_SSL_Key
Seconds_Behind_Master #
set @@global.sql_slave_skip_counter = 4;
start slave;
show slave status;
Slave_IO_State #
Master_Host 127.0.0.1
Master_User root
Master_Port MASTER_MYPORT
Connect_Retry 1
Master_Log_File master-bin.000001
Read_Master_Log_Pos 19618
Relay_Log_File #
Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001
Slave_IO_Running #
Slave_SQL_Running Yes
Replicate_Do_DB
Replicate_Ignore_DB
Replicate_Do_Table
Replicate_Ignore_Table
Replicate_Wild_Do_Table
Replicate_Wild_Ignore_Table
Last_Errno 0
Last_Error
Skip_Counter 0
Exec_Master_Log_Pos 19618
Relay_Log_Space #
Until_Condition None
Until_Log_File
Until_Log_Pos 0
Master_SSL_Allowed No
Master_SSL_CA_File
Master_SSL_CA_Path
Master_SSL_Cert
Master_SSL_Cipher
Master_SSL_Key
Seconds_Behind_Master #
select * from t1;
f
0
2
commit;
drop table t1,t2,t3,t4; drop table t1,t2,t3,t4;
...@@ -16,7 +16,8 @@ source include/master-slave.inc; ...@@ -16,7 +16,8 @@ source include/master-slave.inc;
connection master; connection master;
create table t1 (a int not null, key(a)) engine=innodb; create table t1 (a int not null, key(a)) engine=innodb;
create table t2 (a int not null, key(a)) engine=innodb; create table t2 (a int not null, key(a)) engine=innodb;
create table t3 (a int) engine=innodb; # requiring 'unique' for the timeout part of the test
create table t3 (a int unique) engine=innodb;
create table t4 (a int) engine=innodb; create table t4 (a int) engine=innodb;
show variables like 'slave_transaction_retries'; show variables like 'slave_transaction_retries';
sync_slave_with_master; sync_slave_with_master;
...@@ -31,8 +32,7 @@ stop slave; ...@@ -31,8 +32,7 @@ stop slave;
connection master; connection master;
begin; begin;
# Let's keep BEGIN and the locked statement in two different relay logs. # Let's keep BEGIN and the locked statement in two different relay logs.
let $1=200; let $1=200;disable_query_log;
disable_query_log;
while ($1) while ($1)
{ {
eval insert into t3 values( $1 ); eval insert into t3 values( $1 );
...@@ -59,7 +59,7 @@ enable_query_log; ...@@ -59,7 +59,7 @@ enable_query_log;
select * from t1 for update; select * from t1 for update;
start slave; start slave;
--real_sleep 3 # hope that slave is blocked now --real_sleep 3 # hope that slave is blocked now
insert into t2 values(22); # provoke deadlock, slave should be victim insert into t2 values(201); # provoke deadlock, slave should be victim
commit; commit;
sync_with_master; sync_with_master;
select * from t1; # check that slave succeeded finally select * from t1; # check that slave succeeded finally
...@@ -74,11 +74,13 @@ show slave status; ...@@ -74,11 +74,13 @@ show slave status;
# 2) Test lock wait timeout # 2) Test lock wait timeout
stop slave; stop slave;
change master to master_log_pos=532; # the BEGIN log event delete from t3;
change master to master_log_pos=539; # the BEGIN log event
begin; begin;
select * from t2 for update; # hold lock select * from t2 for update; # hold lock
start slave; start slave;
--real_sleep 10 # slave should have blocked, and be retrying --real_sleep 10 # slave should have blocked, and be retrying
select count(*) from t3 /* must be zero */; # replaying begins after rollback
commit; commit;
sync_with_master; sync_with_master;
select * from t1; # check that slave succeeded finally select * from t1; # check that slave succeeded finally
...@@ -97,11 +99,13 @@ set global max_relay_log_size=0; ...@@ -97,11 +99,13 @@ set global max_relay_log_size=0;
# This is really copy-paste of 2) of above # This is really copy-paste of 2) of above
stop slave; stop slave;
change master to master_log_pos=532; delete from t3;
change master to master_log_pos=539;
begin; begin;
select * from t2 for update; select * from t2 for update;
start slave; start slave;
--real_sleep 10 --real_sleep 10
select count(*) from t3 /* must be zero */; # replaying begins after rollback
commit; commit;
sync_with_master; sync_with_master;
select * from t1; select * from t1;
......
...@@ -3345,9 +3345,9 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli) ...@@ -3345,9 +3345,9 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
const char *errmsg; const char *errmsg;
/* /*
We were in a transaction which has been rolled back because of a We were in a transaction which has been rolled back because of a
deadlock (currently, InnoDB deadlock detected by InnoDB) or lock Sonera deadlock. if lock wait timeout (innodb_lock_wait_timeout exceeded)
wait timeout (innodb_lock_wait_timeout exceeded); let's seek back to there is no rollback since 5.0.13 (ref: manual).
BEGIN log event and retry it all again. let's seek back to BEGIN log event and retry it all again.
We have to not only seek but also We have to not only seek but also
a) init_master_info(), to seek back to hot relay log's start for later a) init_master_info(), to seek back to hot relay log's start for later
(for when we will come back to this hot log after re-processing the (for when we will come back to this hot log after re-processing the
...@@ -3369,6 +3369,7 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli) ...@@ -3369,6 +3369,7 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
else else
{ {
exec_res= 0; exec_res= 0;
end_trans(thd, ROLLBACK);
/* chance for concurrent connection to get more locks */ /* chance for concurrent connection to get more locks */
safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE), safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE),
(CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli); (CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli);
...@@ -3386,9 +3387,17 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli) ...@@ -3386,9 +3387,17 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
"the slave_transaction_retries variable.", "the slave_transaction_retries variable.",
slave_trans_retries); slave_trans_retries);
} }
if (!((thd->options & OPTION_BEGIN) && opt_using_transactions)) else if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
rli->trans_retries= 0; // restart from fresh {
} /*
Only reset the retry counter if the event succeeded or
failed with a non-transient error. On a successful event,
the execution will proceed as usual; in the case of a
non-transient error, the slave will stop with an error.
*/
rli->trans_retries= 0; // restart from fresh
}
}
return exec_res; return exec_res;
} }
else else
...@@ -4613,7 +4622,7 @@ static int connect_to_master(THD* thd, MYSQL* mysql, MASTER_INFO* mi, ...@@ -4613,7 +4622,7 @@ static int connect_to_master(THD* thd, MYSQL* mysql, MASTER_INFO* mi,
suppress_warnings= 0; suppress_warnings= 0;
sql_print_error("Slave I/O thread: error %s to master \ sql_print_error("Slave I/O thread: error %s to master \
'%s@%s:%d': \ '%s@%s:%d': \
Error: '%s' errno: %d retry-time: %d retries: %d", Error: '%s' errno: %d retry-time: %d retries: %lu",
(reconnect ? "reconnecting" : "connecting"), (reconnect ? "reconnecting" : "connecting"),
mi->user,mi->host,mi->port, mi->user,mi->host,mi->port,
mysql_error(mysql), last_errno, mysql_error(mysql), last_errno,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment