BUG#24860 (Incorrect SLAVE_TRANSACTION_RETRIES code can result in slave stuck):

If a temporary error occured inside a group on an event that was not the first
event of the group, the slave could get stuck because the retry counter is reset
whenever an event was executed successfully.

This patch only reset the retry counter when an entire group has been successfully
executed, or failed with a non-transient error.
parent 24ea15a2
stop slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
reset master;
reset slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
start slave;
**** On Master ****
SET SESSION BINLOG_FORMAT=ROW;
CREATE TABLE t1 (a INT PRIMARY KEY, b INT);
INSERT INTO t1 VALUES (1,1), (2,2), (3,3), (4,4);
**** On Slave ****
SHOW STATUS LIKE 'Slave_retried_transactions';
Variable_name Value
Slave_retried_transactions 0
UPDATE t1 SET a = 5, b = 47 WHERE a = 1;
SELECT * FROM t1;
a b
5 47
2 2
3 3
4 4
**** On Master ****
UPDATE t1 SET a = 5, b = 5 WHERE a = 1;
SELECT * FROM t1;
a b
5 5
2 2
3 3
4 4
**** On Slave ****
SHOW STATUS LIKE 'Slave_retried_transactions';
Variable_name Value
Slave_retried_transactions 2
SELECT * FROM t1;
a b
5 47
2 2
3 3
4 4
SHOW SLAVE STATUS;
Slave_IO_State #
Master_Host 127.0.0.1
Master_User root
Master_Port MASTER_PORT
Connect_Retry 1
Master_Log_File master-bin.000001
Read_Master_Log_Pos 408
Relay_Log_File #
Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001
Slave_IO_Running Yes
Slave_SQL_Running No
Replicate_Do_DB
Replicate_Ignore_DB
Replicate_Do_Table
Replicate_Ignore_Table #
Replicate_Wild_Do_Table
Replicate_Wild_Ignore_Table
Last_Errno 1032
Last_Error Error in Update_rows event: error during transaction execution on table test.t1. Can't find record in 't1'
Skip_Counter 0
Exec_Master_Log_Pos 318
Relay_Log_Space #
Until_Condition None
Until_Log_File
Until_Log_Pos 0
Master_SSL_Allowed No
Master_SSL_CA_File
Master_SSL_CA_Path
Master_SSL_Cert
Master_SSL_Cipher
Master_SSL_Key
Seconds_Behind_Master #
Master_SSL_Verify_Server_Cert No
Last_IO_Errno #
Last_IO_Error #
Last_SQL_Errno 1032
Last_SQL_Error Error in Update_rows event: error during transaction execution on table test.t1. Can't find record in 't1'
DROP TABLE t1;
**** On Master ****
DROP TABLE t1;
--loose-debug="+d,all_errors_are_temporary_errors" --slave-transaction-retries=2
source include/master-slave.inc;
--echo **** On Master ****
connection master;
SET SESSION BINLOG_FORMAT=ROW;
CREATE TABLE t1 (a INT PRIMARY KEY, b INT);
INSERT INTO t1 VALUES (1,1), (2,2), (3,3), (4,4);
--echo **** On Slave ****
sync_slave_with_master;
SHOW STATUS LIKE 'Slave_retried_transactions';
UPDATE t1 SET a = 5, b = 47 WHERE a = 1;
SELECT * FROM t1;
--echo **** On Master ****
connection master;
UPDATE t1 SET a = 5, b = 5 WHERE a = 1;
save_master_pos;
SELECT * FROM t1;
#SHOW BINLOG EVENTS;
--echo **** On Slave ****
connection slave;
source include/wait_for_slave_sql_to_stop.inc;
SHOW STATUS LIKE 'Slave_retried_transactions';
SELECT * FROM t1;
source include/show_slave_status.inc;
DROP TABLE t1;
--echo **** On Master ****
connection master;
DROP TABLE t1;
......@@ -1714,7 +1714,14 @@ static int has_temporary_error(THD *thd)
DBUG_ENTER("has_temporary_error");
if (thd->is_fatal_error)
{
DBUG_PRINT("info", ("thd->net.last_errno: %s", ER(thd->net.last_errno)));
DBUG_RETURN(0);
}
DBUG_EXECUTE_IF("all_errors_are_temporary_errors",
if (thd->net.last_errno)
thd->net.last_errno= ER_LOCK_DEADLOCK;);
/*
Temporary error codes:
......@@ -1723,7 +1730,10 @@ static int has_temporary_error(THD *thd)
*/
if (thd->net.last_errno == ER_LOCK_DEADLOCK ||
thd->net.last_errno == ER_LOCK_WAIT_TIMEOUT)
{
DBUG_PRINT("info", ("thd->net.last_errno: %s", ER(thd->net.last_errno)));
DBUG_RETURN(1);
}
#ifdef HAVE_NDB_BINLOG
/*
......@@ -1905,7 +1915,8 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli)
}
if (slave_trans_retries)
{
if (exec_res && has_temporary_error(thd))
int temp_err;
if (exec_res && (temp_err= has_temporary_error(thd)))
{
const char *errmsg;
/*
......@@ -1953,15 +1964,19 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli)
"the slave_transaction_retries variable.",
slave_trans_retries);
}
else if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
else if (exec_res && !temp_err ||
(opt_using_transactions &&
rli->group_relay_log_pos == rli->event_relay_log_pos))
{
/*
Only reset the retry counter if the event succeeded or
failed with a non-transient error. On a successful event,
the execution will proceed as usual; in the case of a
Only reset the retry counter if the entire group succeeded
or failed with a non-transient error. On a successful
event, the execution will proceed as usual; in the case of a
non-transient error, the slave will stop with an error.
*/
rli->trans_retries= 0; // restart from fresh
DBUG_PRINT("info", ("Resetting retry counter, rli->trans_retries: %d",
rli->trans_retries));
}
}
DBUG_RETURN(exec_res);
......@@ -2450,6 +2465,7 @@ pthread_handler_t handle_slave_sql(void *arg)
rli->ignore_log_space_limit= 0;
pthread_mutex_unlock(&rli->log_space_lock);
rli->trans_retries= 0; // start from "no error"
DBUG_PRINT("info", ("rli->trans_retries: %d", rli->trans_retries));
if (init_relay_log_pos(rli,
rli->group_relay_log_name,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment