BUG#20697 slave fails to rollback replicated transaction hang over innodb_lock_wait_timeou

Transaction on the slave sql thread got blocked against a slave's mysqld local ta's
lock. Since the default, slave-transaction-retries=10, there was replaying of the 
replicated ta. That failed because of a new started from 5.0.13 policy not to rollback
a timed-out transaction. Effectively the first round of a timed-out ta becomes committed
by the replaying's first "BEGIN".

It was decided to backport already existed method working in 5.1 implemented in
bug #16228 for handling symmetrical deadlock problem. That patch introduced end_trans
execution whenever a replicated ta deadlocks or timed-out.

Note, that this solution can be practically suboptimal - in the light of the changed behavior
due to timeout we still could replay only the last statement -  only with a high rate of timeouting
replicated transactions.
parent 40bcedf4
......@@ -6,7 +6,7 @@ drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
start slave;
create table t1 (a int not null, key(a)) engine=innodb;
create table t2 (a int not null, key(a)) engine=innodb;
create table t3 (a int) engine=innodb;
create table t3 (a int unique) engine=innodb;
create table t4 (a int) engine=innodb;
show variables like 'slave_transaction_retries';
Variable_name Value
......@@ -35,14 +35,14 @@ begin;
select * from t1 for update;
a
start slave;
insert into t2 values(22);
insert into t2 values(201);
commit;
select * from t1;
a
1
select * from t2;
a
22
201
show slave status;
Slave_IO_State #
Master_Host 127.0.0.1
......@@ -50,7 +50,7 @@ Master_User root
Master_Port MASTER_MYPORT
Connect_Retry 1
Master_Log_File master-bin.000001
Read_Master_Log_Pos 18911
Read_Master_Log_Pos 18918
Relay_Log_File #
Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001
......@@ -65,7 +65,7 @@ Replicate_Wild_Ignore_Table
Last_Errno 0
Last_Error
Skip_Counter 0
Exec_Master_Log_Pos 18911
Exec_Master_Log_Pos 18918
Relay_Log_Space #
Until_Condition None
Until_Log_File
......@@ -78,12 +78,16 @@ Master_SSL_Cipher
Master_SSL_Key
Seconds_Behind_Master #
stop slave;
change master to master_log_pos=532;
delete from t3;
change master to master_log_pos=539;
begin;
select * from t2 for update;
a
22
201
start slave;
select count(*) from t3 /* must be zero */;
count(*)
0
commit;
select * from t1;
a
......@@ -91,7 +95,7 @@ a
1
select * from t2;
a
22
201
show slave status;
Slave_IO_State #
Master_Host 127.0.0.1
......@@ -99,7 +103,7 @@ Master_User root
Master_Port MASTER_MYPORT
Connect_Retry 1
Master_Log_File master-bin.000001
Read_Master_Log_Pos 18911
Read_Master_Log_Pos 18918
Relay_Log_File #
Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001
......@@ -114,7 +118,7 @@ Replicate_Wild_Ignore_Table
Last_Errno 0
Last_Error
Skip_Counter 0
Exec_Master_Log_Pos 18911
Exec_Master_Log_Pos 18918
Relay_Log_Space #
Until_Condition None
Until_Log_File
......@@ -128,12 +132,16 @@ Master_SSL_Key
Seconds_Behind_Master #
set global max_relay_log_size=0;
stop slave;
change master to master_log_pos=532;
delete from t3;
change master to master_log_pos=539;
begin;
select * from t2 for update;
a
22
201
start slave;
select count(*) from t3 /* must be zero */;
count(*)
0
commit;
select * from t1;
a
......@@ -142,7 +150,7 @@ a
1
select * from t2;
a
22
201
show slave status;
Slave_IO_State #
Master_Host 127.0.0.1
......@@ -150,7 +158,7 @@ Master_User root
Master_Port MASTER_MYPORT
Connect_Retry 1
Master_Log_File master-bin.000001
Read_Master_Log_Pos 18911
Read_Master_Log_Pos 18918
Relay_Log_File #
Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001
......@@ -165,7 +173,7 @@ Replicate_Wild_Ignore_Table
Last_Errno 0
Last_Error
Skip_Counter 0
Exec_Master_Log_Pos 18911
Exec_Master_Log_Pos 18918
Relay_Log_Space #
Until_Condition None
Until_Log_File
......@@ -177,4 +185,91 @@ Master_SSL_Cert
Master_SSL_Cipher
Master_SSL_Key
Seconds_Behind_Master #
drop table if exists t1;
create table t1 (f int unique) engine=innodb;
insert into t1 values (0);
begin;
select * from t1 where f = 0 for update;
f
0
begin;
insert into t1 values (1);
update t1 set f=-1 where f = 0;
commit;
insert into t1 values (2);
show slave status;
Slave_IO_State #
Master_Host 127.0.0.1
Master_User root
Master_Port MASTER_MYPORT
Connect_Retry 1
Master_Log_File master-bin.000001
Read_Master_Log_Pos 19618
Relay_Log_File #
Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001
Slave_IO_Running #
Slave_SQL_Running No
Replicate_Do_DB
Replicate_Ignore_DB
Replicate_Do_Table
Replicate_Ignore_Table
Replicate_Wild_Do_Table
Replicate_Wild_Ignore_Table
Last_Errno 1205
Last_Error Error 'Lock wait timeout exceeded; try restarting transaction' on query. Default database: 'test'. Query: 'update t1 set f=-1 where f = 0'
Skip_Counter 0
Exec_Master_Log_Pos 19227
Relay_Log_Space #
Until_Condition None
Until_Log_File
Until_Log_Pos 0
Master_SSL_Allowed No
Master_SSL_CA_File
Master_SSL_CA_Path
Master_SSL_Cert
Master_SSL_Cipher
Master_SSL_Key
Seconds_Behind_Master #
set @@global.sql_slave_skip_counter = 4;
start slave;
show slave status;
Slave_IO_State #
Master_Host 127.0.0.1
Master_User root
Master_Port MASTER_MYPORT
Connect_Retry 1
Master_Log_File master-bin.000001
Read_Master_Log_Pos 19618
Relay_Log_File #
Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001
Slave_IO_Running #
Slave_SQL_Running Yes
Replicate_Do_DB
Replicate_Ignore_DB
Replicate_Do_Table
Replicate_Ignore_Table
Replicate_Wild_Do_Table
Replicate_Wild_Ignore_Table
Last_Errno 0
Last_Error
Skip_Counter 0
Exec_Master_Log_Pos 19618
Relay_Log_Space #
Until_Condition None
Until_Log_File
Until_Log_Pos 0
Master_SSL_Allowed No
Master_SSL_CA_File
Master_SSL_CA_Path
Master_SSL_Cert
Master_SSL_Cipher
Master_SSL_Key
Seconds_Behind_Master #
select * from t1;
f
0
2
commit;
drop table t1,t2,t3,t4;
......@@ -16,7 +16,8 @@ source include/master-slave.inc;
connection master;
create table t1 (a int not null, key(a)) engine=innodb;
create table t2 (a int not null, key(a)) engine=innodb;
create table t3 (a int) engine=innodb;
# requiring 'unique' for the timeout part of the test
create table t3 (a int unique) engine=innodb;
create table t4 (a int) engine=innodb;
show variables like 'slave_transaction_retries';
sync_slave_with_master;
......@@ -31,8 +32,7 @@ stop slave;
connection master;
begin;
# Let's keep BEGIN and the locked statement in two different relay logs.
let $1=200;
disable_query_log;
let $1=200;disable_query_log;
while ($1)
{
eval insert into t3 values( $1 );
......@@ -59,7 +59,7 @@ enable_query_log;
select * from t1 for update;
start slave;
--real_sleep 3 # hope that slave is blocked now
insert into t2 values(22); # provoke deadlock, slave should be victim
insert into t2 values(201); # provoke deadlock, slave should be victim
commit;
sync_with_master;
select * from t1; # check that slave succeeded finally
......@@ -74,11 +74,13 @@ show slave status;
# 2) Test lock wait timeout
stop slave;
change master to master_log_pos=532; # the BEGIN log event
delete from t3;
change master to master_log_pos=539; # the BEGIN log event
begin;
select * from t2 for update; # hold lock
start slave;
--real_sleep 10 # slave should have blocked, and be retrying
select count(*) from t3 /* must be zero */; # replaying begins after rollback
commit;
sync_with_master;
select * from t1; # check that slave succeeded finally
......@@ -97,11 +99,13 @@ set global max_relay_log_size=0;
# This is really copy-paste of 2) of above
stop slave;
change master to master_log_pos=532;
delete from t3;
change master to master_log_pos=539;
begin;
select * from t2 for update;
start slave;
--real_sleep 10
select count(*) from t3 /* must be zero */; # replaying begins after rollback
commit;
sync_with_master;
select * from t1;
......
......@@ -3345,9 +3345,9 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
const char *errmsg;
/*
We were in a transaction which has been rolled back because of a
deadlock (currently, InnoDB deadlock detected by InnoDB) or lock
wait timeout (innodb_lock_wait_timeout exceeded); let's seek back to
BEGIN log event and retry it all again.
Sonera deadlock. if lock wait timeout (innodb_lock_wait_timeout exceeded)
there is no rollback since 5.0.13 (ref: manual).
let's seek back to BEGIN log event and retry it all again.
We have to not only seek but also
a) init_master_info(), to seek back to hot relay log's start for later
(for when we will come back to this hot log after re-processing the
......@@ -3369,6 +3369,7 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
else
{
exec_res= 0;
end_trans(thd, ROLLBACK);
/* chance for concurrent connection to get more locks */
safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE),
(CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli);
......@@ -3386,9 +3387,17 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
"the slave_transaction_retries variable.",
slave_trans_retries);
}
if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
rli->trans_retries= 0; // restart from fresh
}
else if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
{
/*
Only reset the retry counter if the event succeeded or
failed with a non-transient error. On a successful event,
the execution will proceed as usual; in the case of a
non-transient error, the slave will stop with an error.
*/
rli->trans_retries= 0; // restart from fresh
}
}
return exec_res;
}
else
......@@ -4613,7 +4622,7 @@ static int connect_to_master(THD* thd, MYSQL* mysql, MASTER_INFO* mi,
suppress_warnings= 0;
sql_print_error("Slave I/O thread: error %s to master \
'%s@%s:%d': \
Error: '%s' errno: %d retry-time: %d retries: %d",
Error: '%s' errno: %d retry-time: %d retries: %lu",
(reconnect ? "reconnecting" : "connecting"),
mi->user,mi->host,mi->port,
mysql_error(mysql), last_errno,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment