Commit 80649ee8 authored by unknown's avatar unknown

Fix for BUG#1870

    "CHANGE MASTER makes SQL thread restart from coordinates of I/O thread".
    So, in CHANGE MASTER:
    when it seems reasonable that the user did not want to discontinue
    replication (i.e. when the user specifies none of host, port, master_log_file
    or master_log_pos; this will be documented), set the coordinates of the
    I/O thread to those of the SQL thread. This way, the SQL thread will see
    no discontinuity in the relay log (i.e. will skip no events), because
    the I/O thread will fill the brand new relay log with the events which
    are just after the position where the SQL thread had stopped
    (before CHANGE MASTER was issued).
    And a new test for this bug.


mysql-test/r/rpl_loaddata.result:
  After CHANGE MASTER, the coordinates of the I/O thread are now the last ones of the SQL thread, so the expected result is updated.
sql/sql_repl.cc:
  Fix for BUG#1870
  "CHANGE MASTER makes SQL thread restart from coordinates of I/O thread".
  So, in CHANGE MASTER:
  when it seems reasonable that the user did not want to discontinue
  replication (i.e. when the user specifies none of host, port, master_log_file
  or master_log_pos; this will be documented), set the coordinates of the
  I/O thread to those of the SQL thread. This way, the SQL thread will see
  no discontinuity in the relay log (i.e. will skip no events), because
  the I/O thread will fill the brand new relay log with the events which
  are just after the position where the SQL thread had stopped
  (before CHANGE MASTER was issued).
parent fdd0e707
slave stop;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
reset master;
reset slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
slave start;
select get_lock("a",5);
get_lock("a",5)
1
create table t1(n int);
insert into t1 values(1+get_lock("a",10)*0);
insert into t1 values(2);
stop slave;
select * from t1;
n
1
show slave status;
Master_Host Master_User Master_Port Connect_retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_do_db Replicate_ignore_db Last_errno Last_error Skip_counter Exec_master_log_pos Relay_log_space
127.0.0.1 root 9306 1 master-bin.001 273 slave-relay-bin.002 255 master-bin.001 No No 0 0 214 314
change master to master_user='root';
show slave status;
Master_Host Master_User Master_Port Connect_retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_do_db Replicate_ignore_db Last_errno Last_error Skip_counter Exec_master_log_pos Relay_log_space
127.0.0.1 root 9306 1 master-bin.001 214 slave-relay-bin.001 4 master-bin.001 No No 0 0 214 4
select release_lock("a");
release_lock("a")
1
start slave;
select * from t1;
n
1
2
drop table t1;
......@@ -43,7 +43,7 @@ change master to master_user='test';
change master to master_user='root';
show slave status;
Master_Host Master_User Master_Port Connect_retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_do_db Replicate_ignore_db Last_errno Last_error Skip_counter Exec_master_log_pos Relay_log_space
127.0.0.1 root MASTER_PORT 1 master-bin.001 1442 slave-relay-bin.001 4 master-bin.001 No No 0 0 1442 4
127.0.0.1 root MASTER_PORT 1 master-bin.001 1419 slave-relay-bin.001 4 master-bin.001 No No 0 0 1419 4
set global sql_slave_skip_counter=1;
start slave;
set sql_log_bin=0;
......
# Test for BUG#1870: "CHANGE MASTER makes SQL thread restart from
# coordinates of I/O thread".
# Scenario: leave the slave stopped with its I/O thread ahead of its SQL
# thread, issue a CHANGE MASTER that only changes master_user, and check
# that no event is skipped once the slave is restarted.
source include/master-slave.inc;

# Take lock "a" on the slave connection; the first insert replicated from
# the master also asks for this lock.
# NOTE(review): presumably this is what stalls the slave SQL thread on that
# insert while the I/O thread keeps reading - confirm.
connection slave;
select get_lock("a",5);

# On the master: two inserts, the first one calling get_lock("a",10).
connection master;
create table t1(n int);
insert into t1 values(1+get_lock("a",10)*0);
insert into t1 values(2);
save_master_pos;

connection slave;
sleep 3; # can't sync_with_master as we should be blocked
stop slave;
# The recorded result expects only row 1 here, and a Read_Master_Log_Pos
# greater than Exec_master_log_pos in the first SHOW SLAVE STATUS.
select * from t1;
show slave status;

# CHANGE MASTER without host, port, master_log_file or master_log_pos:
# the I/O thread coordinates must be reset to those of the SQL thread
# (the second SHOW SLAVE STATUS expects Read_Master_Log_Pos equal to
# Exec_master_log_pos).
change master to master_user='root';
show slave status;

# Before the fix, the slave restarted from after the values(2) insert and
# so lost that row (that was the bug). It must now resume from where the
# SQL thread stopped, so row 2 is expected below.
select release_lock("a");
start slave;
sync_with_master;
select * from t1;

# Cleanup: drop the table on the master and wait for the slave to catch up.
connection master;
drop table t1;
save_master_pos;
connection slave;
sync_with_master;
......@@ -853,8 +853,8 @@ void kill_zombie_dump_threads(uint32 slave_server_id)
int change_master(THD* thd, MASTER_INFO* mi)
{
int thread_mask;
const char* errmsg=0;
bool need_relay_log_purge=1;
const char* errmsg= 0;
bool need_relay_log_purge= 1;
DBUG_ENTER("change_master");
lock_slave_threads(mi);
......@@ -928,6 +928,36 @@ int change_master(THD* thd, MASTER_INFO* mi)
mi->rli.relay_log_pos=lex_mi->relay_log_pos;
}
/*
If the user specified neither host nor port nor any log name nor any log
pos, i.e. specified only user/password/master_connect_retry, they probably
want replication to resume from where it had left off, i.e. from the
coordinates of the **SQL** thread (imagine the case where the I/O is ahead
of the SQL; restarting from the coordinates of the I/O would lose some
events which is probably unwanted when you are just doing minor changes
like changing master_connect_retry).
A side-effect is that if only the I/O thread was started, this thread may
restart from ''/4 after the CHANGE MASTER. That's a minor problem (it is a
much more unlikely situation than the one we are fixing here).
Note: coordinates of the SQL thread must be read here, before the
'if (need_relay_log_purge)' block which resets them.
*/
if (!lex_mi->host && !lex_mi->port &&
!lex_mi->log_file_name && !lex_mi->pos &&
need_relay_log_purge)
{
/*
Sometimes mi->rli.master_log_pos == 0 (it happens when the SQL thread is
not initialized), so we use a max().
What happens to mi->rli.master_log_pos during the initialization stages
of replication is not 100% clear, so we guard against problems using
max().
*/
mi->master_log_pos = max(BIN_LOG_HEADER_SIZE, mi->rli.master_log_pos);
strmake(mi->master_log_name,mi->rli.master_log_name,
sizeof(mi->master_log_name)-1);
}
flush_master_info(mi);
if (need_relay_log_purge)
{
......@@ -959,10 +989,21 @@ int change_master(THD* thd, MASTER_INFO* mi)
}
}
DBUG_PRINT("info", ("master_log_pos: %d", (ulong) mi->master_log_pos));
/* If changing RELAY_LOG_FILE or RELAY_LOG_POS, this will be nonsense: */
/*
Coordinates in rli were spoilt by the 'if (need_relay_log_purge)' block,
so restore them to good values. If we left them to ''/0, that would work;
but that would fail in the case of 2 successive CHANGE MASTER (without a
START SLAVE in between): because the first one would set the coords in mi to
the good values of those in rli, then set those in rli to ''/0; then the
second CHANGE MASTER would set the coords in mi to those of rli, i.e. to
''/0: we would have lost all copies of the original good coordinates.
That's why we always save good coords in rli.
*/
mi->rli.master_log_pos = mi->master_log_pos;
strmake(mi->rli.master_log_name,mi->master_log_name,
sizeof(mi->rli.master_log_name)-1);
if (!mi->rli.master_log_name[0]) // uninitialized case
mi->rli.master_log_pos=0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment