Bug#22725 Replication outages from ER_SERVER_SHUTDOWN (1053) set in replication events

  
The reason for the bug was that replaying of a query on slave could not be possible since its event
was recorded with the killed error. Due to the specific of handling INSERT, which per-row-while-loop is 
unbreakable to killing, the query on transactional table should have not appeared in binlog unless
there was  a call to a stored routine that got interrupted with killing (and then there must be an error
returned out of the loop).
   
The offered solution added the following rule for binlogging of INSERT that accounts the above
specifics:
For INSERT on transactional-table if the error was not set the only raised flag
is harmless and is ignored via masking out on time of creation of binlog event.
   
For both table types the combination of raised error and KILLED flag indicates that there
was potentially partial execution on master and consistency is under the question.
In that case the code continues to binlog an event with an appropriate killed error.
 
The fix relies on the specified behaviour of stored routine that must propagate the error 
to the top level query handling if the thd->killed flag was raised in the routine execution.
   
The patch adds an arg with the default killed-status-unset value to Query_log_event::Query_log_event.
parent 3b598218
create function bug27563()
RETURNS int(11)
DETERMINISTIC
begin
select get_lock("a", 10) into @a;
return 1;
end|
create function bug27565()
RETURNS int(11)
DETERMINISTIC
begin
select a from t1 where a=1 into @a for update;
return 1;
end|
create table t1 (a int auto_increment, b int, PRIMARY KEY (a)) ENGINE=InnoDB;
create table t2 (a int auto_increment, b int, PRIMARY KEY (a)) ENGINE=MyISAM;
create table t3 (a int auto_increment, b int, PRIMARY KEY (a)) ENGINE=InnoDB;
reset master;
select get_lock("a", 20);
get_lock("a", 20)
1
insert into t1 values (bug27563(),1);
kill query 3;
affected rows: 1
show master status /* must be only FD event unless Bug#27563 */;
File Position Binlog_Do_DB Binlog_Ignore_DB
master-bin.000001 260
select count(*) from t1 /* must be zero unless Bug#27563 */;
count(*)
1
begin;
insert into t1 values (bug27563(),1);
kill query 3;
affected rows: 1
select count(*) from t1 /* must be zero unless Bug#27563 */;
count(*)
2
commit;
reset master;
insert into t2 values (bug27563(),1);
kill query 3;
select count(*) from t2 /* must be one */;
count(*)
1
show master status /* must have the insert event more to FD */;
File Position Binlog_Do_DB Binlog_Ignore_DB
master-bin.000001 225
select RELEASE_LOCK("a");
RELEASE_LOCK("a")
1
delete from t1;
delete from t2;
insert into t1 values (1,1);
insert into t2 values (1,1);
begin;
update t1 set b=0 where a=1;
update t2 set b=bug27565()-1 where a=1;
kill query 3;
commit;
Got one of the listed errors
select * from t1 /* must be: (1,0) */;
a b
1 0
select * from t2 /* must be as before: (1,1) */;
a b
1 1
delete from t3;
reset master;
begin;
update t1 set b=0 where a=1;
insert into t3 values (0,0),(1,bug27565());
kill query 3;
rollback;
Got one of the listed errors
select count(*) from t3 /* must be zero */;
count(*)
0
show master status /* nothing in binlog */;
File Position Binlog_Do_DB Binlog_Ignore_DB
master-bin.000001 98
delete from t2;
reset master;
begin;
update t1 set b=0 where a=1;
insert into t2 values (0,0),(1,bug27565()) /* non-ta t2 */;
kill query 3;
rollback;
Got one of the listed errors
select count(*) from t2 /* count must be one */;
count(*)
1
show master status /* insert into non-ta must be in binlog */;
File Position Binlog_Do_DB Binlog_Ignore_DB
master-bin.000001 247
select
(@a:=load_file("MYSQLTEST_VARDIR/tmp/kill_query_calling_sp.binlog"))
is not null;
(@a:=load_file("MYSQLTEST_VARDIR/tmp/kill_query_calling_sp.binlog"))
is not null
1
select @a like "%#%error_code=1317%" /* must return 1 */;
@a like "%#%error_code=1317%"
1
drop table t1,t2,t3;
drop function bug27563;
drop function bug27565;
-- source include/have_innodb.inc
###
### bug#22725 : incorrect killed error in binlogged query
### and
### Bug#27563 killing noticed in SF() stack but the error gets missed in action
### Bug#27565 killed query of SF() is not reported correctly and
###
connect (con1, localhost, root,,);
connect (con2, localhost, root,,);
# the function is *insensitive* to killing - TO FIX IN BUG#27563
# the function is used in the test anyway with `TODO' left
# to correct results afterwards
delimiter |;
create function bug27563()
RETURNS int(11)
DETERMINISTIC
begin
select get_lock("a", 10) into @a;
return 1;
end|
delimiter ;|
# the function is sensitive to killing requiring innodb though with wrong client error
# TO FIX in BUG#27565; TODO: remove --error 1105 afterwards
delimiter |;
create function bug27565()
RETURNS int(11)
DETERMINISTIC
begin
select a from t1 where a=1 into @a for update;
return 1;
end|
delimiter ;|
create table t1 (a int auto_increment, b int, PRIMARY KEY (a)) ENGINE=InnoDB;
create table t2 (a int auto_increment, b int, PRIMARY KEY (a)) ENGINE=MyISAM;
create table t3 (a int auto_increment, b int, PRIMARY KEY (a)) ENGINE=InnoDB;
reset master;
### ta table case: killing causes rollback
# A. autocommit ON
connection con1;
select get_lock("a", 20);
connection con2;
let $ID= `select connection_id()`;
send insert into t1 values (bug27563(),1);
connection con1;
eval kill query $ID;
connection con2;
# todo (re-record test): after bugs 27563,27565 got fixed affected rows will report zero
--enable_info
# todo: remove 0 return after fixing Bug#27563
--error 0,ER_QUERY_INTERRUPTED
reap;
--disable_info
###--replace_column 2 # 5 #
### show binlog events from 98 /* nothing in binlog unless Bug#27563 */;
show master status /* must be only FD event unless Bug#27563 */;
select count(*) from t1 /* must be zero unless Bug#27563 */;
# M. multi-statement-ta
connection con2;
let $ID= `select connection_id()`;
begin;
send insert into t1 values (bug27563(),1);
connection con1;
eval kill query $ID;
connection con2;
# todo (re-record test): after bugs 27563,27565 got fixed affected rows will report zero
--enable_info
# todo: remove 0 return after fixing Bug#27563
--error 0,ER_QUERY_INTERRUPTED
reap;
--disable_info
select count(*) from t1 /* must be zero unless Bug#27563 */;
commit;
### non-ta table case: killing must be recorded in binlog
reset master;
connection con2;
let $ID= `select connection_id()`;
send insert into t2 values (bug27563(),1);
connection con1;
eval kill query $ID;
connection con2;
# todo: remove 0 return after fixing Bug#27563
--error 0,ER_QUERY_INTERRUPTED
reap;
select count(*) from t2 /* must be one */;
#show binlog events from 98 /* must have the insert on non-ta table */;
show master status /* must have the insert event more to FD */;
# the value of the error flag of KILLED_QUERY is tested further
connection con1;
select RELEASE_LOCK("a");
### test with effective killing of SF()
delete from t1;
delete from t2;
insert into t1 values (1,1);
insert into t2 values (1,1);
#
# Bug#27565
# test where KILL is propagated as error to the top level
# still another bug with the error message to the user
# todo: fix reexecute the result file after fixing
#
begin; update t1 set b=0 where a=1;
connection con2;
let $ID= `select connection_id()`;
send update t2 set b=bug27565()-1 where a=1;
connection con1;
eval kill query $ID;
commit;
connection con2;
# todo: fix Bug #27565 killed query of SF() is not reported correctly and
# remove 1105 (wrong)
#--error ER_QUERY_INTERRUPTED
--error 1105,ER_QUERY_INTERRUPTED
reap;
select * from t1 /* must be: (1,0) */;
select * from t2 /* must be as before: (1,1) */;
## bug#22725 with effective and propagating killing
#
# top-level ta-table
connection con1;
delete from t3;
reset master;
begin; update t1 set b=0 where a=1;
connection con2;
let $ID= `select connection_id()`;
# the query won't perform completely since the function gets interrupted
send insert into t3 values (0,0),(1,bug27565());
connection con1;
eval kill query $ID;
rollback;
connection con2;
# todo: fix Bug #27565 killed query of SF() is not reported correctly and
# remove 1105 (wrong)
#--error ER_QUERY_INTERRUPTED
--error 1105,ER_QUERY_INTERRUPTED
reap;
select count(*) from t3 /* must be zero */;
show master status /* nothing in binlog */;
# top-level non-ta-table
connection con1;
delete from t2;
reset master;
begin; update t1 set b=0 where a=1;
connection con2;
let $ID= `select connection_id()`;
# the query won't perform completely since the function gets intrurrupted
send insert into t2 values (0,0),(1,bug27565()) /* non-ta t2 */;
connection con1;
eval kill query $ID;
rollback;
connection con2;
# todo: fix Bug #27565 killed query of SF() is not reported correctly and
# remove 1105 (wrong)
#--error ER_QUERY_INTERRUPTED
--error 1105,ER_QUERY_INTERRUPTED
reap;
select count(*) from t2 /* count must be one */;
show master status /* insert into non-ta must be in binlog */;
--exec $MYSQL_BINLOG --start-position=126 $MYSQLTEST_VARDIR/log/master-bin.000001 > $MYSQLTEST_VARDIR/tmp/kill_query_calling_sp.binlog
--replace_result $MYSQLTEST_VARDIR MYSQLTEST_VARDIR
eval select
(@a:=load_file("$MYSQLTEST_VARDIR/tmp/kill_query_calling_sp.binlog"))
is not null;
--replace_result $MYSQL_TEST_DIR MYSQL_TEST_DIR
eval select @a like "%#%error_code=1317%" /* must return 1 */;
system rm $MYSQLTEST_VARDIR/tmp/kill_query_calling_sp.binlog ;
drop table t1,t2,t3;
drop function bug27563;
drop function bug27565;
......@@ -1279,20 +1279,31 @@ Query_log_event::Query_log_event()
/*
Query_log_event::Query_log_event()
SYNOPSIS
Query_log_event::Query_log_event()
thd - thread handle
query_arg - array of char representing the query
query_length - size of the `query_arg' array
using_trans - there is a modified transactional table
suppress_use - suppress the generation of 'USE' statements
killed_status_arg - an optional with default to THD::KILLED_NO_VALUE
if the value is different from the default, the arg
is set to the current thd->killed value.
A caller might need to masquerade thd->killed with
THD::NOT_KILLED.
DESCRIPTION
Creates an event for binlogging
The value for local `killed_status' can be supplied by caller.
*/
Query_log_event::Query_log_event(THD* thd_arg, const char* query_arg,
ulong query_length, bool using_trans,
bool suppress_use)
bool suppress_use, THD::killed_state killed_status_arg)
:Log_event(thd_arg,
((thd_arg->tmp_table_used ? LOG_EVENT_THREAD_SPECIFIC_F : 0)
| (suppress_use ? LOG_EVENT_SUPPRESS_USE_F : 0)),
using_trans),
data_buf(0), query(query_arg), catalog(thd_arg->catalog),
db(thd_arg->db), q_len((uint32) query_length),
error_code((thd_arg->killed != THD::NOT_KILLED) ?
((thd_arg->system_thread & SYSTEM_THREAD_DELAYED_INSERT) ?
0 : thd->killed_errno()) : thd_arg->net.last_errno),
thread_id(thd_arg->thread_id),
/* save the original thread id; we already know the server id */
slave_proxy_id(thd_arg->variables.pseudo_thread_id),
......@@ -1304,6 +1315,14 @@ Query_log_event::Query_log_event(THD* thd_arg, const char* query_arg,
charset_database_number(0)
{
time_t end_time;
if (killed_status_arg == THD::KILLED_NO_VALUE)
killed_status_arg= thd_arg->killed;
error_code=
(killed_status_arg == THD::NOT_KILLED) ? thd_arg->net.last_errno :
((thd_arg->system_thread & SYSTEM_THREAD_DELAYED_INSERT) ? 0 :
thd->killed_errno());
time(&end_time);
exec_time = (ulong) (end_time - thd->start_time);
catalog_len = (catalog) ? (uint32) strlen(catalog) : 0;
......
......@@ -804,7 +804,8 @@ public:
#ifndef MYSQL_CLIENT
Query_log_event(THD* thd_arg, const char* query_arg, ulong query_length,
bool using_trans, bool suppress_use);
bool using_trans, bool suppress_use,
THD::killed_state killed_err_arg= THD::KILLED_NO_VALUE);
const char* get_db() { return db; }
#ifdef HAVE_REPLICATION
void pack_info(Protocol* protocol);
......
......@@ -1397,7 +1397,14 @@ public:
DYNAMIC_ARRAY user_var_events; /* For user variables replication */
MEM_ROOT *user_var_events_alloc; /* Allocate above array elements here */
enum killed_state { NOT_KILLED=0, KILL_BAD_DATA=1, KILL_CONNECTION=ER_SERVER_SHUTDOWN, KILL_QUERY=ER_QUERY_INTERRUPTED };
enum killed_state
{
NOT_KILLED=0,
KILL_BAD_DATA=1,
KILL_CONNECTION=ER_SERVER_SHUTDOWN,
KILL_QUERY=ER_QUERY_INTERRUPTED,
KILLED_NO_VALUE /* means neither of the states */
};
killed_state volatile killed;
/* scramble - random string sent to client on handshake */
......@@ -1670,7 +1677,8 @@ public:
void end_statement();
inline int killed_errno() const
{
return killed != KILL_BAD_DATA ? killed : 0;
killed_state killed_val; /* to cache the volatile 'killed' */
return (killed_val= killed) != KILL_BAD_DATA ? killed_val : 0;
}
inline void send_kill_message() const
{
......
......@@ -870,9 +870,35 @@ bool mysql_insert(THD *thd,TABLE_LIST *table_list,
if (mysql_bin_log.is_open())
{
if (error <= 0)
{
/*
[Guilhem wrote] Temporary errors may have filled
thd->net.last_error/errno. For example if there has
been a disk full error when writing the row, and it was
MyISAM, then thd->net.last_error/errno will be set to
"disk full"... and the my_pwrite() will wait until free
space appears, and so when it finishes then the
write_row() was entirely successful
*/
/* todo: consider removing */
thd->clear_error();
}
/* bug#22725:
A query which per-row-loop can not be interrupted with
KILLED, like INSERT, and that does not invoke stored
routines can be binlogged with neglecting the KILLED error.
If there was no error (error == zero) until after the end of
inserting loop the KILLED flag that appeared later can be
disregarded since previously possible invocation of stored
routines did not result in any error due to the KILLED. In
such case the flag is ignored for constructing binlog event.
*/
Query_log_event qinfo(thd, thd->query, thd->query_length,
transactional_table, FALSE);
transactional_table, FALSE,
(error>0) ? thd->killed : THD::NOT_KILLED);
assert(thd->killed != THD::KILL_BAD_DATA || error > 0);
if (mysql_bin_log.write(&qinfo) && transactional_table)
error=1;
}
......
......@@ -521,6 +521,37 @@ int mysql_update(THD *thd,
table->file->unlock_row();
thd->row_count++;
}
/*
todo bug#27571: to avoid asynchronization of `error' and
`error_code' of binlog event constructor
The concept, which is a bit different for insert(!), is to
replace `error' assignment with the following lines
killed_status= thd->killed; // get the status of the volatile
Notice: thd->killed is type of "state" whereas the lhs has
"status" the suffix which translates according to WordNet: a state
at a particular time - at the time of the end of per-row loop in
our case. Binlogging ops are conducted with the status.
error= (killed_status == THD::NOT_KILLED)? error : 1;
which applies to most mysql_$query functions.
Event's constructor will accept `killed_status' as an argument:
Query_log_event qinfo(..., killed_status);
thd->killed might be changed after killed_status had got cached and this
won't affect binlogging event but other effects remain.
Open issue: In a case the error happened not because of KILLED -
and then KILLED was caught later still within the loop - we shall
do something to avoid binlogging of incorrect ER_SERVER_SHUTDOWN
error_code.
*/
if (thd->killed && !error)
error= 1; // Aborted
end_read_record(&info);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment