MDEV-26: Global transaction ID.

Fix problems related to reconnect. When we need to reconnect (ie. explict stop/start of just the IO thread by user, or automatic reconnect due to loosing network connection with the master), it is a bit complex to correctly resume at the right point without causing duplicate or missing events in the relay log. The previous code had multiple problems in this regard. With this patch, the problem is solved as follows. The IO thread keeps track (in memory) of which GTID was last queued to the relay log. If it needs to reconnect, it resumes at that GTID position. It also counts number of events received within the last, possibly partial, event group, and skips the same number of events after a reconnect, so that events already enqueued before the reconnect are not duplicated. (There is no need to keep any persistent state; whenever we restart slave threads after both of them being stopped (such as after server restart), we erase the relay logs and start over from the last GTID applied by SQL thread. But while the SQL thread is running, this patch is needed to get correct relay log).

MDEV-26: Global transaction ID.
Fix problems related to reconnect. When we need to reconnect (ie. explict stop/start of just the IO thread by user, or automatic reconnect due to loosing network connection with the master), it is a bit complex to correctly resume at the right point without causing duplicate or missing events in the relay log. The previous code had multiple problems in this regard. With this patch, the problem is solved as follows. The IO thread keeps track (in memory) of which GTID was last queued to the relay log. If it needs to reconnect, it resumes at that GTID position. It also counts number of events received within the last, possibly partial, event group, and skips the same number of events after a reconnect, so that events already enqueued before the reconnect are not duplicated. (There is no need to keep any persistent state; whenever we restart slave threads after both of them being stopped (such as after server restart), we erase the relay logs and start over from the last GTID applied by SQL thread. But while the SQL thread is running, this patch is needed to get correct relay log).
5cb486d1 · unknown · 7ad47ab0 · 5cb486d1 · 5cb486d1 · 5cb486d1
Commit 5cb486d1 authored Jun 05, 2013 by unknown
12 changed files
--- a/mysql-test/include/kill_binlog_dump_threads.inc
+++ b/mysql-test/include/kill_binlog_dump_threads.inc
+# ==== Purpose ====
+#
+# Terminate all binlog dump threads on a master.
+#
+# This is sometimes useful, as normally such dump threads can hang
+# around for some time before they notice that the slave has disconnected.
+#
+# Note that if there are active slave connections, they might try to
+# reconnect as their dump threads are killed, which may not lead to the
+# desired results.
+#
+#
+# ==== Usage ====
+#
+# [--let $kill_timeout= NUMBER]
+# --source include/stop_slavekill_binlog_dump_threads.inc
+#
+# Parameters:
+#   $kill_timeout
+#     Maximum number of seconds to wait for dump threads to disappear.
+
+
+--let $include_filename= kill_binlog_dump_threads.inc
+--source include/begin_include_file.inc
+
+--disable_query_log
+
+let $wait_counter= 300;
+if ($kill_timeout)
+{
+  let $wait_counter= `SELECT $kill_timeout * 10`;
+}
+
+let $success= 0;
+while ($wait_counter)
+{
+    dec $wait_counter;
+    let $_tid= `SELECT id FROM information_schema.processlist WHERE command = 'Binlog Dump' LIMIT 1`;
+    if ($_tid)
+    {
+        eval KILL QUERY $_tid;
+    }
+    if (!$_tid)
+    {
+        let $wait_counter= 0;
+	let $success= 1;
+    }
+    if (!$success)
+    {
+        real_sleep 0.1;
+    }
+}
+if (!$success)
+{
+    SHOW FULL PROCESSLIST;
+    --die Timeout while waiting for binlog dump threads to disappear.
+}
+
+--enable_query_log
+
+--let $include_filename= kill_binlog_dump_threads.inc
+--source include/end_include_file.inc
--- a/mysql-test/suite/rpl/r/rpl_gtid_reconnect.result
+++ b/mysql-test/suite/rpl/r/rpl_gtid_reconnect.result
+include/rpl_init.inc [topology=1->2]
+include/stop_slave.inc
+CHANGE MASTER TO master_use_gtid= current_pos;
+include/start_slave.inc
+CREATE TABLE t1 (a INT);
+FLUSH LOGS;
+SET gtid_domain_id=10;
+INSERT INTO t1 VALUES (1);
+INSERT INTO t1 VALUES (2);
+SET gtid_seq_no=100;
+INSERT INTO t1 VALUES (3);
+INSERT INTO t1 VALUES (4);
+INSERT INTO t1 VALUES (5);
+include/stop_slave.inc
+SELECT * FROM t1 ORDER BY a;
+a
+1
+2
+3
+4
+5
+include/kill_binlog_dump_threads.inc
+INSERT INTO t1 VALUES (10);
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
+include/start_slave.inc
+SELECT * FROM t1 ORDER BY a;
+a
+1
+2
+3
+4
+5
+10
+include/stop_slave.inc
+TRUNCATE t1;
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= "";
+SET GLOBAL debug_dbug= @old_debug;
+TRUNCATE t1;
+RESET MASTER;
+SET gtid_domain_id=10;
+SET gtid_seq_no=50;
+INSERT INTO t1 VALUES (1);
+SET gtid_domain_id=11;
+INSERT INTO t1 VALUES (11);
+SET gtid_domain_id=10;
+SET gtid_seq_no=100;
+INSERT INTO t1 VALUES (2);
+SET gtid_domain_id=11;
+INSERT INTO t1 VALUES (12);
+SET gtid_domain_id=10;
+INSERT INTO t1 VALUES (3);
+SET gtid_domain_id=11;
+SET gtid_seq_no=200;
+INSERT INTO t1 VALUES (13);
+START SLAVE UNTIL master_gtid_pos="10-1-50,11-1-200";
+include/wait_for_slave_to_stop.inc
+SELECT * FROM t1 ORDER BY a;
+a
+1
+11
+12
+13
+include/kill_binlog_dump_threads.inc
+INSERT INTO t1 VALUES (20);
+SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
+include/start_slave.inc
+SELECT * FROM t1 ORDER BY a;
+a
+1
+2
+3
+11
+12
+13
+20
+include/stop_slave.inc
+TRUNCATE t1;
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= "";
+SET GLOBAL debug_dbug= @old_debug;
+TRUNCATE t1;
+RESET MASTER;
+include/kill_binlog_dump_threads.inc
+SET gtid_domain_id= 9;
+SET gtid_seq_no= 50;
+INSERT INTO t1 VALUES (1);
+SET gtid_domain_id= 10;
+INSERT INTO t1 VALUES (11);
+SET gtid_domain_id= 9;
+INSERT INTO t1 VALUES (2);
+SET gtid_domain_id= 10;
+SET gtid_seq_no= 100;
+INSERT INTO t1 VALUES (12);
+SET gtid_domain_id= 9;
+INSERT INTO t1 VALUES (3);
+SET gtid_domain_id= 10;
+SET gtid_seq_no= 200;
+INSERT INTO t1 VALUES (13);
+SET gtid_domain_id= 10;
+SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
+START SLAVE UNTIL master_gtid_pos="9-1-50,10-1-200";
+include/wait_for_slave_to_stop.inc
+SELECT * FROM t1 ORDER BY a;
+a
+1
+11
+12
+13
+SET GLOBAL debug_dbug= @old_debug;
+INSERT INTO t1 VALUES (20);
+include/start_slave.inc
+SELECT * FROM t1 ORDER BY a;
+a
+1
+2
+3
+11
+12
+13
+20
+*** Test when slave IO thread needs to reconnect in the middle of an event group. ***
+include/stop_slave.inc
+TRUNCATE t1;
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= "";
+SET GLOBAL debug_dbug= @old_debug;
+TRUNCATE t1;
+RESET MASTER;
+include/kill_binlog_dump_threads.inc
+SET GLOBAL debug_dbug="+d,binlog_force_reconnect_after_22_events";
+CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB;
+INSERT INTO t2 VALUES (1);
+BEGIN;
+INSERT INTO t2 VALUES (10);
+INSERT INTO t2 VALUES (11);
+COMMIT;
+BEGIN;
+INSERT INTO t2 VALUES (20);
+INSERT INTO t2 VALUES (21);
+INSERT INTO t2 VALUES (22);
+INSERT INTO t2 VALUES (23);
+INSERT INTO t2 VALUES (24);
+INSERT INTO t2 VALUES (25);
+INSERT INTO t2 VALUES (26);
+INSERT INTO t2 VALUES (27);
+INSERT INTO t2 VALUES (28);
+INSERT INTO t2 VALUES (29);
+COMMIT;
+include/start_slave.inc
+SELECT * FROM t2 ORDER BY a;
+a
+1
+10
+11
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+SET GLOBAL debug_dbug= @old_debug;
+DROP TABLE t1, t2;
+include/rpl_end.inc
--- a/mysql-test/suite/rpl/t/rpl_gtid_reconnect.test
+++ b/mysql-test/suite/rpl/t/rpl_gtid_reconnect.test
+--let $rpl_topology=1->2
+--source include/rpl_init.inc
+--source include/have_innodb.inc
+--source include/have_debug.inc
+
+
+--connection server_2
+--source include/stop_slave.inc
+CHANGE MASTER TO master_use_gtid= current_pos;
+--source include/start_slave.inc
+
+--connection server_1
+CREATE TABLE t1 (a INT);
+FLUSH LOGS;
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+
+
+# Prepare a string of events and have the slave replicate all of it.
+--connection server_1
+SET gtid_domain_id=10;
+INSERT INTO t1 VALUES (1);
+INSERT INTO t1 VALUES (2);
+SET gtid_seq_no=100;
+INSERT INTO t1 VALUES (3);
+INSERT INTO t1 VALUES (4);
+INSERT INTO t1 VALUES (5);
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+--source include/stop_slave.inc
+SELECT * FROM t1 ORDER BY a;
+
+# Now start the slave again, but force a reconnect. There was a bug that this
+# reconnect would cause duplicate events.
+
+--connection server_1
+# Make sure to get rid of any old binlog dump thread so it does not
+# interfere with our DBUG error injection.
+--source include/kill_binlog_dump_threads.inc
+INSERT INTO t1 VALUES (10);
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
+--save_master_pos
+
+--connection server_2
+--source include/start_slave.inc
+--sync_with_master
+SELECT * FROM t1 ORDER BY a;
+
+--source include/stop_slave.inc
+TRUNCATE t1;
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= "";
+
+--connection server_1
+SET GLOBAL debug_dbug= @old_debug;
+TRUNCATE t1;
+RESET MASTER;
+
+# A1 B1 A2 B2 A3 B3, slave reached A1 and B3 and stopped. Slave starts,
+# reconnects at A2. There was a bug that B2 would be duplicated.
+
+SET gtid_domain_id=10;
+SET gtid_seq_no=50;
+INSERT INTO t1 VALUES (1);
+SET gtid_domain_id=11;
+INSERT INTO t1 VALUES (11);
+SET gtid_domain_id=10;
+SET gtid_seq_no=100;
+INSERT INTO t1 VALUES (2);
+SET gtid_domain_id=11;
+INSERT INTO t1 VALUES (12);
+SET gtid_domain_id=10;
+INSERT INTO t1 VALUES (3);
+SET gtid_domain_id=11;
+SET gtid_seq_no=200;
+INSERT INTO t1 VALUES (13);
+
+--connection server_2
+START SLAVE UNTIL master_gtid_pos="10-1-50,11-1-200";
+--source include/wait_for_slave_to_stop.inc
+SELECT * FROM t1 ORDER BY a;
+
+--connection server_1
+--source include/kill_binlog_dump_threads.inc
+INSERT INTO t1 VALUES (20);
+SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
+--save_master_pos
+
+--connection server_2
+--source include/start_slave.inc
+--sync_with_master
+SELECT * FROM t1 ORDER BY a;
+
+--source include/stop_slave.inc
+TRUNCATE t1;
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= "";
+
+--connection server_1
+SET GLOBAL debug_dbug= @old_debug;
+TRUNCATE t1;
+RESET MASTER;
+
+# A1 B1 A2 B2 A3 B3. START SLAVE UNTIL A1,B3, gets reconnect at B2.
+# There was a bug that the UNTIL would be ignored, and A2 would be lost.
+
+--source include/kill_binlog_dump_threads.inc
+SET gtid_domain_id= 9;
+SET gtid_seq_no= 50;
+INSERT INTO t1 VALUES (1);
+SET gtid_domain_id= 10;
+INSERT INTO t1 VALUES (11);
+SET gtid_domain_id= 9;
+INSERT INTO t1 VALUES (2);
+SET gtid_domain_id= 10;
+SET gtid_seq_no= 100;
+INSERT INTO t1 VALUES (12);
+SET gtid_domain_id= 9;
+INSERT INTO t1 VALUES (3);
+SET gtid_domain_id= 10;
+SET gtid_seq_no= 200;
+INSERT INTO t1 VALUES (13);
+SET gtid_domain_id= 10;
+
+SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
+
+--connection server_2
+START SLAVE UNTIL master_gtid_pos="9-1-50,10-1-200";
+--source include/wait_for_slave_to_stop.inc
+SELECT * FROM t1 ORDER BY a;
+
+--connection server_1
+SET GLOBAL debug_dbug= @old_debug;
+INSERT INTO t1 VALUES (20);
+--save_master_pos
+
+--connection server_2
+--source include/start_slave.inc
+SELECT * FROM t1 ORDER BY a;
+
+
+--echo *** Test when slave IO thread needs to reconnect in the middle of an event group. ***
+--connection server_2
+--source include/stop_slave.inc
+
+TRUNCATE t1;
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= "";
+
+--connection server_1
+SET GLOBAL debug_dbug= @old_debug;
+TRUNCATE t1;
+RESET MASTER;
+
+--source include/kill_binlog_dump_threads.inc
+SET GLOBAL debug_dbug="+d,binlog_force_reconnect_after_22_events";
+
+# 4 events for FD, fake rotate, gtid list, binlog checkpoint.
+# 2 events for GTID, create table
+CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB;
+# 3 events for BEGIN/query/COMMIT
+INSERT INTO t2 VALUES (1);
+# 4 events for BEGIN/query/query/COMMIT
+BEGIN;
+INSERT INTO t2 VALUES (10);
+INSERT INTO t2 VALUES (11);
+COMMIT;
+# So this event group starts after 4+2+4+3=13 events. Or 16 in row-based.
+BEGIN;
+INSERT INTO t2 VALUES (20);
+INSERT INTO t2 VALUES (21);
+INSERT INTO t2 VALUES (22);
+INSERT INTO t2 VALUES (23);
+INSERT INTO t2 VALUES (24);
+INSERT INTO t2 VALUES (25);
+INSERT INTO t2 VALUES (26);
+INSERT INTO t2 VALUES (27);
+INSERT INTO t2 VALUES (28);
+INSERT INTO t2 VALUES (29);
+COMMIT;
+--save_master_pos
+
+--connection server_2
+--source include/start_slave.inc
+--sync_with_master
+SELECT * FROM t2 ORDER BY a;
+
+--connection server_1
+SET GLOBAL debug_dbug= @old_debug;
+
+
+# Clean up.
+--connection server_1
+DROP TABLE t1, t2;
+
+--source include/rpl_end.inc
--- a/sql/rpl_gtid.cc
+++ b/sql/rpl_gtid.cc
@@ -482,26 +482,10 @@ rpl_slave_state_tostring_helper(String *dest, const rpl_gtid *gtid, bool *first)
 }


-/*
-  Prepare the current slave state as a string, suitable for sending to the
-  master to request to receive binlog events starting from that GTID state.
-
-  The state consists of the most recently applied GTID for each domain_id,
-  ie. the one with the highest sub_id within each domain_id.
-
-  Optinally, extra_gtids is a list of GTIDs from the binlog. This is used when
-  a server was previously a master and now needs to connect to a new master as
-  a slave. For each domain_id, if the GTID in the binlog was logged with our
-  own server_id _and_ has a higher seq_no than what is in the slave state,
-  then this should be used as the position to start replicating at. This
-  allows to promote a slave as new master, and connect the old master as a
-  slave with MASTER_GTID_POS=AUTO.
-*/
-
 int
-rpl_slave_state::tostring(String *dest, rpl_gtid *extra_gtids, uint32 num_extra)
+rpl_slave_state::iterate(int (*cb)(rpl_gtid *, void *), void *data,
+                         rpl_gtid *extra_gtids, uint32 num_extra)
 {
-  bool first= true;
  uint32 i;
  HASH gtid_hash;
  uchar *rec;
@@ -555,7 +539,7 @@ rpl_slave_state::tostring(String *dest, rpl_gtid *extra_gtids, uint32 num_extra)
      }
    }

-    if (rpl_slave_state_tostring_helper(dest, &best_gtid, &first))
+    if ((res= (*cb)(&best_gtid, data)))
    {
      unlock();
      goto err;
@@ -568,7 +552,7 @@ rpl_slave_state::tostring(String *dest, rpl_gtid *extra_gtids, uint32 num_extra)
  for (i= 0; i < gtid_hash.records; ++i)
  {
    gtid= (rpl_gtid *)my_hash_element(&gtid_hash, i);
-    if (rpl_slave_state_tostring_helper(dest, gtid, &first))
+    if ((res= (*cb)(gtid, data)))
      goto err;
  }

@@ -581,6 +565,44 @@ err:
 }


+struct rpl_slave_state_tostring_data {
+  String *dest;
+  bool first;
+};
+static int
+rpl_slave_state_tostring_cb(rpl_gtid *gtid, void *data)
+{
+  rpl_slave_state_tostring_data *p= (rpl_slave_state_tostring_data *)data;
+  return rpl_slave_state_tostring_helper(p->dest, gtid, &p->first);
+}
+
+
+/*
+  Prepare the current slave state as a string, suitable for sending to the
+  master to request to receive binlog events starting from that GTID state.
+
+  The state consists of the most recently applied GTID for each domain_id,
+  ie. the one with the highest sub_id within each domain_id.
+
+  Optinally, extra_gtids is a list of GTIDs from the binlog. This is used when
+  a server was previously a master and now needs to connect to a new master as
+  a slave. For each domain_id, if the GTID in the binlog was logged with our
+  own server_id _and_ has a higher seq_no than what is in the slave state,
+  then this should be used as the position to start replicating at. This
+  allows to promote a slave as new master, and connect the old master as a
+  slave with MASTER_GTID_POS=AUTO.
+*/
+int
+rpl_slave_state::tostring(String *dest, rpl_gtid *extra_gtids, uint32 num_extra)
+{
+  struct rpl_slave_state_tostring_data data;
+  data.first= true;
+  data.dest= dest;
+
+  return iterate(rpl_slave_state_tostring_cb, &data, extra_gtids, num_extra);
+}
+
+
 /*
  Lookup a domain_id in the current replication slave state.

@@ -626,9 +648,6 @@ rpl_slave_state::domain_to_gtid(uint32 domain_id, rpl_gtid *out_gtid)
  Parse a GTID at the start of a string, and update the pointer to point
  at the first character after the parsed GTID.

-  GTID can be in short form with domain_id=0 implied, SERVERID-SEQNO.
-  Or long form, DOMAINID-SERVERID-SEQNO.
-
  Returns 0 on ok, non-zero on parse error.
 */
 static int
@@ -1217,7 +1236,7 @@ slave_connection_state::load(char *slave_request, size_t len)
  rpl_gtid *gtid;
  const rpl_gtid *gtid2;

-  my_hash_reset(&hash);
+  reset();
  p= slave_request;
  end= slave_request + len;
  if (p == end)
@@ -1270,7 +1289,7 @@ slave_connection_state::load(const rpl_gtid *gtid_list, uint32 count)
 {
  uint32 i;

-  my_hash_reset(&hash);
+  reset();
  for (i= 0; i < count; ++i)
    if (update(&gtid_list[i]))
      return 1;
@@ -1278,6 +1297,28 @@ slave_connection_state::load(const rpl_gtid *gtid_list, uint32 count)
 }


+static int
+slave_connection_state_load_cb(rpl_gtid *gtid, void *data)
+{
+  slave_connection_state *state= (slave_connection_state *)data;
+  return state->update(gtid);
+}
+
+
+/*
+  Same as rpl_slave_state::tostring(), but populates a slave_connection_state
+  instead.
+*/
+int
+slave_connection_state::load(rpl_slave_state *state,
+                             rpl_gtid *extra_gtids, uint32 num_extra)
+{
+  reset();
+  return state->iterate(slave_connection_state_load_cb, this,
+                        extra_gtids, num_extra);
+}
+
+
 rpl_gtid *
 slave_connection_state::find(uint32 domain_id)
 {

--- a/sql/rpl_gtid.h
+++ b/sql/rpl_gtid.h
@@ -93,6 +93,8 @@ struct rpl_slave_state
  int record_gtid(THD *thd, const rpl_gtid *gtid, uint64 sub_id,
                  bool in_transaction, bool in_statement);
  uint64 next_subid(uint32 domain_id);
+  int iterate(int (*cb)(rpl_gtid *, void *), void *data,
+              rpl_gtid *extra_gtids, uint32 num_extra);
  int tostring(String *dest, rpl_gtid *extra_gtids, uint32 num_extra);
  bool domain_to_gtid(uint32 domain_id, rpl_gtid *out_gtid);
  int load(THD *thd, char *state_from_master, size_t len, bool reset,
@@ -178,8 +180,10 @@ struct slave_connection_state
  slave_connection_state();
  ~slave_connection_state();

+  void reset() { my_hash_reset(&hash); }
  int load(char *slave_request, size_t len);
  int load(const rpl_gtid *gtid_list, uint32 count);
+  int load(rpl_slave_state *state, rpl_gtid *extra_gtids, uint32 num_extra);
  rpl_gtid *find(uint32 domain_id);
  int update(const rpl_gtid *in_gtid);
  void remove(const rpl_gtid *gtid);

--- a/sql/rpl_mi.cc
+++ b/sql/rpl_mi.cc
@@ -38,7 +38,8 @@ Master_info::Master_info(LEX_STRING *connection_name_arg,
   connect_retry(DEFAULT_CONNECT_RETRY), inited(0), abort_slave(0),
   slave_running(0), slave_run_id(0), sync_counter(0),
   heartbeat_period(0), received_heartbeats(0), master_id(0),
-   using_gtid(USE_GTID_NO)
+   using_gtid(USE_GTID_NO), events_queued_since_last_gtid(0),
+   gtid_reconnect_event_skip_count(0), gtid_event_seen(false)
 {
  host[0] = 0; user[0] = 0; password[0] = 0;
  ssl_ca[0]= 0; ssl_capath[0]= 0; ssl_cert[0]= 0;
@@ -147,6 +148,23 @@ void Master_info::clear_in_memory_info(bool all)
  }
 }

+
+const char *
+Master_info::using_gtid_astext(enum enum_using_gtid arg)
+{
+  switch (arg)
+  {
+  case USE_GTID_NO:
+    return "No";
+  case USE_GTID_SLAVE_POS:
+    return "Slave_Pos";
+  default:
+    DBUG_ASSERT(arg == USE_GTID_CURRENT_POS);
+    return "Current_Pos";
+  }
+}
+
+
 void init_master_log_pos(Master_info* mi)
 {
  DBUG_ENTER("init_master_log_pos");
@@ -154,6 +172,10 @@ void init_master_log_pos(Master_info* mi)
  mi->master_log_name[0] = 0;
  mi->master_log_pos = BIN_LOG_HEADER_SIZE;             // skip magic number
  mi->using_gtid= Master_info::USE_GTID_NO;
+  mi->gtid_current_pos.reset();
+  mi->events_queued_since_last_gtid= 0;
+  mi->gtid_reconnect_event_skip_count= 0;
+  mi->gtid_event_seen= false;

  /* Intentionally init ssl_verify_server_cert to 0, no option available  */
  mi->ssl_verify_server_cert= 0;

--- a/sql/rpl_mi.h
+++ b/sql/rpl_mi.h
@@ -61,6 +61,10 @@ typedef struct st_mysql MYSQL;
 class Master_info : public Slave_reporting_capability
 {
 public:
+  enum enum_using_gtid {
+    USE_GTID_NO= 0, USE_GTID_CURRENT_POS= 1, USE_GTID_SLAVE_POS= 2
+  };
+
  Master_info(LEX_STRING *connection_name, bool is_slave_recovery);
  ~Master_info();
  bool shall_ignore_server_id(ulong s_id);
@@ -70,6 +74,7 @@ class Master_info : public Slave_reporting_capability
    /* If malloc() in initialization failed */
    return connection_name.str == 0;
  }
+  static const char *using_gtid_astext(enum enum_using_gtid arg);

  /* the variables below are needed because we can change masters on the fly */
  char master_log_name[FN_REFLEN+6]; /* Room for multi-*/
@@ -135,9 +140,35 @@ class Master_info : public Slave_reporting_capability
    Note that you can not change the numeric values of these, they are used
    in master.info.
  */
-  enum {
-    USE_GTID_NO= 0, USE_GTID_CURRENT_POS= 1, USE_GTID_SLAVE_POS= 2
-  } using_gtid;
+  enum enum_using_gtid using_gtid;
+
+  /*
+    This GTID position records how far we have fetched into the relay logs.
+    This is used to continue fetching when the IO thread reconnects to the
+    master.
+
+    (Full slave stop/start does not use it, as it resets the relay logs).
+  */
+  slave_connection_state gtid_current_pos;
+  /*
+    If events_queued_since_last_gtid is non-zero, it is the number of events
+    queued so far in the relaylog of a GTID-prefixed event group.
+    It is zero when no partial event group has been queued at the moment.
+  */
+  uint64 events_queued_since_last_gtid;
+  /*
+    The GTID of the partially-queued event group, when
+    events_queued_since_last_gtid is non-zero.
+  */
+  rpl_gtid last_queued_gtid;
+  /*
+    When slave IO thread needs to reconnect, gtid_reconnect_event_skip_count
+    counts number of events to skip from the first GTID-prefixed event group,
+    to avoid duplicating events in the relay log.
+  */
+  uint64 gtid_reconnect_event_skip_count;
+  /* gtid_event_seen is false until we receive first GTID event from master. */
+  bool gtid_event_seen;
 };
 int init_master_info(Master_info* mi, const char* master_info_fname,
 		     const char* slave_info_fname,

--- a/sql/share/errmsg-utf8.txt
+++ b/sql/share/errmsg-utf8.txt
@@ -6549,3 +6549,5 @@ ER_GTID_STRICT_OUT_OF_ORDER
 	eng "An attempt was made to binlog GTID %u-%u-%llu which would create an out-of-order sequence number with existing GTID %u-%u-%llu, and gtid strict mode is enabled."
 ER_GTID_START_FROM_BINLOG_HOLE
 	eng "The binlog on the master is missing the GTID %u-%u-%llu requested by the slave (even though both a prior and a subsequent sequence number does exist), and GTID strict mode is enabled"
+ER_SLAVE_UNEXPECTED_MASTER_SWITCH
+	eng "Unexpected GTID received from master after reconnect. This normally indicates that the master server was replaced without restarting the slave threads. %s"
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -114,7 +114,7 @@ static const char *reconnect_messages[SLAVE_RECON_ACT_MAX][SLAVE_RECON_MSG_MAX]=
 registration on master",
    "Reconnecting after a failed registration on master",
    "failed registering on master, reconnecting to try again, \
-log '%s' at position %s",
+log '%s' at position %s%s",
    "COM_REGISTER_SLAVE",
    "Slave I/O thread killed during or after reconnect"
  },
@@ -122,7 +122,7 @@ log '%s' at position %s",
    "Waiting to reconnect after a failed binlog dump request",
    "Slave I/O thread killed while retrying master dump",
    "Reconnecting after a failed binlog dump request",
-    "failed dump request, reconnecting to try again, log '%s' at position %s",
+    "failed dump request, reconnecting to try again, log '%s' at position %s%s",
    "COM_BINLOG_DUMP",
    "Slave I/O thread killed during or after reconnect"
  },
@@ -131,7 +131,7 @@ log '%s' at position %s",
    "Slave I/O thread killed while waiting to reconnect after a failed read",
    "Reconnecting after a failed master event read",
    "Slave I/O thread: Failed reading log event, reconnecting to retry, \
-log '%s' at position %s",
+log '%s' at position %s%s",
    "",
    "Slave I/O thread killed during or after a reconnect done to recover from \
 failed read"
@@ -879,9 +879,13 @@ int start_slave_threads(bool need_slave_mutex, bool wait_for_start,
    purge_relay_logs(&mi->rli, NULL, 0, &errmsg);
    mi->master_log_name[0]= 0;
    mi->master_log_pos= 0;
+    error= rpl_load_gtid_state(&mi->gtid_current_pos, mi->using_gtid ==
+                                             Master_info::USE_GTID_CURRENT_POS);
+    mi->events_queued_since_last_gtid= 0;
+    mi->gtid_reconnect_event_skip_count= 0;
  }

-  if (thread_mask & SLAVE_IO)
+  if (!error && (thread_mask & SLAVE_IO))
    error= start_slave_thread(
 #ifdef HAVE_PSI_INTERFACE
                              key_thread_slave_io,
@@ -1873,18 +1877,9 @@ past_checksum:
 after_set_capability:
 #endif

-  /*
-    Request dump start from slave replication GTID state.
-
-    Only request GTID position the first time we connect after CHANGE MASTER
-    or after starting both IO or SQL thread.
-
-    Otherwise, if the IO thread was ahead of the SQL thread before the
-    restart or reconnect, we might end up re-fetching and hence re-applying
-    the same event(s) again.
-  */
-  if (mi->using_gtid != Master_info::USE_GTID_NO && !mi->master_log_name[0])
+  if (mi->using_gtid != Master_info::USE_GTID_NO)
  {
+    /* Request dump to start from slave replication GTID state. */
    int rc;
    char str_buf[256];
    String query_str(str_buf, sizeof(str_buf), system_charset_info);
@@ -1913,9 +1908,7 @@ after_set_capability:

    query_str.append(STRING_WITH_LEN("SET @slave_connect_state='"),
                     system_charset_info);
-    if (rpl_append_gtid_state(&query_str,
-                              mi->using_gtid ==
-                                  Master_info::USE_GTID_CURRENT_POS))
+    if (mi->gtid_current_pos.append_to_string(&query_str))
    {
      err_code= ER_OUTOFMEMORY;
      errmsg= "The slave I/O thread stops because a fatal out-of-memory "
@@ -2016,7 +2009,7 @@ after_set_capability:
      }
    }
  }
-  if (mi->using_gtid == Master_info::USE_GTID_NO)
+  else
  {
    /*
      If we are not using GTID to connect this time, then instead request
@@ -2588,10 +2581,7 @@ static bool send_show_master_info_data(THD *thd, Master_info *mi, bool full,
    }
    // Master_Server_id
    protocol->store((uint32) mi->master_id);
-    protocol->store((mi->using_gtid==Master_info::USE_GTID_NO ? "No" :
-                     (mi->using_gtid==Master_info::USE_GTID_SLAVE_POS ?
-                      "Slave_Pos" : "Current_Pos")),
-                    &my_charset_bin);
+    protocol->store(mi->using_gtid_astext(mi->using_gtid), &my_charset_bin);
    if (full)
    {
      protocol->store((uint32)    mi->rli.retried_trans);
@@ -3424,8 +3414,22 @@ static int try_to_reconnect(THD *thd, MYSQL *mysql, Master_info *mi,
  if (!suppress_warnings) 
  {
    char buf[256], llbuff[22];
+    String tmp;
+    if (mi->using_gtid != Master_info::USE_GTID_NO)
+    {
+      tmp.append(STRING_WITH_LEN("; GTID position '"));
+      mi->gtid_current_pos.append_to_string(&tmp);
+      if (mi->events_queued_since_last_gtid == 0)
+        tmp.append(STRING_WITH_LEN("'"));
+      else
+      {
+        tmp.append(STRING_WITH_LEN("', GTID event skip "));
+        tmp.append_ulonglong((ulonglong)mi->events_queued_since_last_gtid);
+      }
+    }
    my_snprintf(buf, sizeof(buf), messages[SLAVE_RECON_MSG_FAILED], 
-                IO_RPL_LOG_NAME, llstr(mi->master_log_pos, llbuff));
+                IO_RPL_LOG_NAME, llstr(mi->master_log_pos, llbuff),
+                tmp.c_ptr_safe());
    /* 
      Raise a warining during registering on master/requesting dump.
      Log a message reading event.
@@ -3545,11 +3549,21 @@ pthread_handler_t handle_slave_io(void *arg)
  // we can get killed during safe_connect
  if (!safe_connect(thd, mysql, mi))
  {
-    sql_print_information("Slave I/O thread: connected to master '%s@%s:%d',"
-                          "replication started in log '%s' at position %s",
-                          mi->user, mi->host, mi->port,
-			  IO_RPL_LOG_NAME,
-			  llstr(mi->master_log_pos,llbuff));
+    if (mi->using_gtid == Master_info::USE_GTID_NO)
+      sql_print_information("Slave I/O thread: connected to master '%s@%s:%d',"
+                            "replication started in log '%s' at position %s",
+                            mi->user, mi->host, mi->port,
+                            IO_RPL_LOG_NAME,
+                            llstr(mi->master_log_pos,llbuff));
+    else
+    {
+      String tmp;
+      mi->gtid_current_pos.to_string(&tmp);
+      sql_print_information("Slave I/O thread: connected to master '%s@%s:%d',"
+                            "replication starts at GTID position '%s'",
+                            mi->user, mi->host, mi->port, tmp.c_ptr_safe());
+    }
+
  /*
    Adding MAX_LOG_EVENT_HEADER_LEN to the max_packet_size on the I/O
    thread, since a replication event can become this much larger than
@@ -3566,6 +3580,25 @@ pthread_handler_t handle_slave_io(void *arg)

 connected:

+  if (mi->using_gtid != Master_info::USE_GTID_NO)
+  {
+    /*
+      When the IO thread (re)connects to the master using GTID, it will
+      connect at the start of an event group. But the IO thread may have
+      previously logged part of the following event group to the relay
+      log.
+
+      When the IO and SQL thread are started together, we erase any previous
+      relay logs, but this is not possible/desirable while the SQL thread is
+      running. To avoid duplicating partial event groups in the relay logs in
+      this case, we remember the count of events in any partially logged event
+      group before the reconnect, and then here at connect we set up a counter
+      to skip the already-logged part of the group.
+    */
+    mi->gtid_reconnect_event_skip_count= mi->events_queued_since_last_gtid;
+    mi->gtid_event_seen= false;
+  }
+
 #ifdef ENABLED_DEBUG_SYNC
    DBUG_EXECUTE_IF("dbug.before_get_running_status_yes",
                    {
@@ -3791,8 +3824,19 @@ log space");
  // error = 0;
 err:
  // print the current replication position
-  sql_print_information("Slave I/O thread exiting, read up to log '%s', position %s",
-                  IO_RPL_LOG_NAME, llstr(mi->master_log_pos,llbuff));
+  if (mi->using_gtid == Master_info::USE_GTID_NO)
+    sql_print_information("Slave I/O thread exiting, read up to log '%s', "
+                          "position %s",
+                          IO_RPL_LOG_NAME, llstr(mi->master_log_pos,llbuff));
+  else
+  {
+    String tmp;
+    mi->gtid_current_pos.to_string(&tmp);
+    sql_print_information("Slave I/O thread exiting, read up to log '%s', "
+                          "position %s; GTID position %s",
+                          IO_RPL_LOG_NAME, llstr(mi->master_log_pos,llbuff),
+                          tmp.c_ptr_safe());
+  }
  RUN_HOOK(binlog_relay_io, thread_stop, (thd, mi));
  thd->reset_query();
  thd->reset_db(NULL, 0);
@@ -4059,10 +4103,20 @@ pthread_handler_t handle_slave_sql(void *arg)
                            rli->group_master_log_name,
                            llstr(rli->group_master_log_pos,llbuff)));
  if (global_system_variables.log_warnings)
+  {
+    String tmp;
+    if (mi->using_gtid != Master_info::USE_GTID_NO)
+    {
+      tmp.append(STRING_WITH_LEN("; GTID position '"));
+      rpl_append_gtid_state(&tmp,
+                            mi->using_gtid==Master_info::USE_GTID_CURRENT_POS);
+      tmp.append(STRING_WITH_LEN("'"));
+    }
    sql_print_information("Slave SQL thread initialized, starting replication in \
-log '%s' at position %s, relay log '%s' position: %s", RPL_LOG_NAME,
+log '%s' at position %s, relay log '%s' position: %s%s", RPL_LOG_NAME,
                    llstr(rli->group_master_log_pos,llbuff),rli->group_relay_log_name,
-                    llstr(rli->group_relay_log_pos,llbuff1));
+                    llstr(rli->group_relay_log_pos,llbuff1), tmp.c_ptr_safe());
+  }

  if (check_temp_dir(rli->slave_patternload_file))
  {
@@ -4196,16 +4250,35 @@ log '%s' at position %s, relay log '%s' position: %s", RPL_LOG_NAME,
          sql_print_warning("Slave: %s Error_code: %d", err->get_message_text(), err->get_sql_errno());
        }
        if (udf_error)
+        {
+          String tmp;
+          if (mi->using_gtid != Master_info::USE_GTID_NO)
+          {
+            tmp.append(STRING_WITH_LEN("; GTID position '"));
+            rpl_append_gtid_state(&tmp, false);
+            tmp.append(STRING_WITH_LEN("'"));
+          }
          sql_print_error("Error loading user-defined library, slave SQL "
            "thread aborted. Install the missing library, and restart the "
            "slave SQL thread with \"SLAVE START\". We stopped at log '%s' "
-            "position %s", RPL_LOG_NAME, llstr(rli->group_master_log_pos, 
-            llbuff));
+            "position %s%s", RPL_LOG_NAME, llstr(rli->group_master_log_pos,
+            llbuff), tmp.c_ptr_safe());
+        }
        else
+        {
+          String tmp;
+          if (mi->using_gtid != Master_info::USE_GTID_NO)
+          {
+            tmp.append(STRING_WITH_LEN("; GTID position '"));
+            rpl_append_gtid_state(&tmp, false);
+            tmp.append(STRING_WITH_LEN("'"));
+          }
          sql_print_error("\
 Error running query, slave SQL thread aborted. Fix the problem, and restart \
 the slave SQL thread with \"SLAVE START\". We stopped at log \
-'%s' position %s", RPL_LOG_NAME, llstr(rli->group_master_log_pos, llbuff));
+'%s' position %s%s", RPL_LOG_NAME, llstr(rli->group_master_log_pos, llbuff),
+                          tmp.c_ptr_safe());
+        }
      }
      goto err;
    }
@@ -4213,9 +4286,20 @@ the slave SQL thread with \"SLAVE START\". We stopped at log \
  }

  /* Thread stopped. Print the current replication position to the log */
-  sql_print_information("Slave SQL thread exiting, replication stopped in log "
-                        "'%s' at position %s",
-                        RPL_LOG_NAME, llstr(rli->group_master_log_pos,llbuff));
+  {
+    String tmp;
+    if (mi->using_gtid != Master_info::USE_GTID_NO)
+    {
+      tmp.append(STRING_WITH_LEN("; GTID position '"));
+      rpl_append_gtid_state(&tmp, false);
+      tmp.append(STRING_WITH_LEN("'"));
+    }
+    sql_print_information("Slave SQL thread exiting, replication stopped in "
+                          "log '%s' at position %s%s",
+                          RPL_LOG_NAME,
+                          llstr(rli->group_master_log_pos,llbuff),
+                          tmp.c_ptr_safe());
+  }

 err:

@@ -4690,6 +4774,8 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
  mysql_mutex_t *log_lock= rli->relay_log.get_log_lock();
  ulong s_id;
  bool unlock_data_lock= TRUE;
+  bool gtid_skip_enqueue= false;
+
  /*
    FD_q must have been prepared for the first R_a event
    inside get_master_version_and_clock()
@@ -4876,6 +4962,19 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
    /* installing new value of checksum Alg for relay log */
    mi->rli.relay_log.relay_log_checksum_alg= tmp->checksum_alg;

+    /*
+      Do not queue any format description event that we receive after a
+      reconnect where we are skipping over a partial event group received
+      before the reconnect.
+
+      (If we queued such an event, and it was the first format_description
+      event after master restart, the slave SQL thread would think that
+      the partial event group before it in the relay log was from a
+      previous master crash and should be rolled back.
+    */
+    if (unlikely(mi->gtid_reconnect_event_skip_count))
+        gtid_skip_enqueue= true;
+
    /*
       Though this does some conversion to the slave's format, this will
       preserve the master's binlog format version, and number of event types.
@@ -4971,18 +5070,113 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
                            " UNTIL master_gtid_pos %s", str.c_ptr_safe());
      mi->abort_slave= true;
    }
+    event_pos= glev->log_pos;
    delete glev;

    /*
-      Do not update position for fake Gtid_list event (which has a zero
-      end_log_pos).
+      We use fake Gtid_list events to update the old-style position (among
+      other things).
+
+      Early code created fake Gtid_list events with zero log_pos, those should
+      not modify old-style position.
    */
-    inc_pos= uint4korr(buf+LOG_POS_OFFSET) ? event_len : 0;
+    if (event_pos == 0 || event_pos <= mi->master_log_pos)
+      inc_pos= 0;
+    else
+      inc_pos= event_pos - mi->master_log_pos;
+  }
+  break;
+
+  case GTID_EVENT:
+  {
+    uchar dummy_flag;
+
+    if (mi->using_gtid == Master_info::USE_GTID_NO)
+      goto default_action;
+    if (unlikely(!mi->gtid_event_seen))
+    {
+      mi->gtid_event_seen= true;
+      if (mi->gtid_reconnect_event_skip_count)
+      {
+        rpl_gtid gtid;
+
+        /*
+          If we are reconnecting, and we need to skip a partial event group
+          already queued to the relay log before the reconnect, then we check
+          that we actually get the same event group (same GTID) as before, so
+          we do not end up with half of one group and half another.
+
+          The only way we should be able to receive a different GTID than what
+          we expect is if the binlog on the master (or more likely the whole
+          master server) was replaced with a different one, one the same IP
+          address, _and_ the new master happens to have domains in a different
+          order so we get the GTID from a different domain first. Still, it is
+          best to protect against this case.
+        */
+        if (Gtid_log_event::peek(buf, event_len, checksum_alg,
+                                 &gtid.domain_id, &gtid.server_id,
+                                 &gtid.seq_no, &dummy_flag))
+        {
+          error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
+          goto err;
+        }
+        if (gtid.domain_id != mi->last_queued_gtid.domain_id ||
+            gtid.server_id != mi->last_queued_gtid.server_id ||
+            gtid.seq_no != mi->last_queued_gtid.seq_no)
+        {
+          bool first;
+          error= ER_SLAVE_UNEXPECTED_MASTER_SWITCH;
+          error_msg.append(STRING_WITH_LEN("Expected: "));
+          first= true;
+          rpl_slave_state_tostring_helper(&error_msg, &mi->last_queued_gtid,
+                                          &first);
+          error_msg.append(STRING_WITH_LEN(", received: "));
+          first= true;
+          rpl_slave_state_tostring_helper(&error_msg, &gtid, &first);
+          goto err;
+        }
+      }
+    }
+
+    if (unlikely(mi->gtid_reconnect_event_skip_count))
+    {
+      goto default_action;
+    }
+
+    /*
+      We have successfully queued to relay log everything before this GTID, so
+      in case of reconnect we can start from after any previous GTID.
+    */
+    if (mi->events_queued_since_last_gtid)
+    {
+      mi->gtid_current_pos.update(&mi->last_queued_gtid);
+      mi->events_queued_since_last_gtid= 0;
+    }
+    if (Gtid_log_event::peek(buf, event_len, checksum_alg,
+                             &mi->last_queued_gtid.domain_id,
+                             &mi->last_queued_gtid.server_id,
+                             &mi->last_queued_gtid.seq_no, &dummy_flag))
+    {
+      error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
+      goto err;
+    }
+    ++mi->events_queued_since_last_gtid;
  }
  break;

  default:
  default_action:
+    if (mi->using_gtid != Master_info::USE_GTID_NO && mi->gtid_event_seen)
+    {
+      if (unlikely(mi->gtid_reconnect_event_skip_count))
+      {
+        --mi->gtid_reconnect_event_skip_count;
+        gtid_skip_enqueue= true;
+      }
+      else if (mi->events_queued_since_last_gtid)
+        ++mi->events_queued_since_last_gtid;
+    }
+
    inc_pos= event_len;
    break;
  }
@@ -5067,8 +5261,16 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
  }
  else
  {
-    /* write the event to the relay log */
-    if (likely(!(rli->relay_log.appendv(buf,event_len,0))))
+    /*
+      Write the event to the relay log, unless we reconnected in the middle
+      of an event group and now need to skip the initial part of the group that
+      we already wrote before reconnecting.
+    */
+    if (unlikely(gtid_skip_enqueue))
+    {
+      mi->master_log_pos+= inc_pos;
+    }
+    else if (likely(!(rli->relay_log.appendv(buf,event_len,0))))
    {
      mi->master_log_pos+= inc_pos;
      DBUG_PRINT("info", ("master_log_pos: %lu", (ulong) mi->master_log_pos));

--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -2776,13 +2776,34 @@ end_with_restore_list:
  {
    LEX_MASTER_INFO* lex_mi= &thd->lex->mi;
    Master_info *mi;
+    int load_error;
+
+    load_error= rpl_load_gtid_slave_state(thd);
+
    mysql_mutex_lock(&LOCK_active_mi);

    if ((mi= (master_info_index->
              get_master_info(&lex_mi->connection_name,
                              MYSQL_ERROR::WARN_LEVEL_ERROR))))
+    {
+      if (load_error)
+      {
+        /*
+          We cannot start a slave using GTID if we cannot load the GTID position
+          from the mysql.gtid_slave_pos table. But we can allow non-GTID
+          replication (useful eg. during upgrade).
+        */
+        if (mi->using_gtid != Master_info::USE_GTID_NO)
+        {
+          mysql_mutex_unlock(&LOCK_active_mi);
+          break;
+        }
+        else
+          thd->clear_error();
+      }
      if (!start_slave(thd, mi, 1 /* net report*/))
        my_ok(thd);
+    }
    mysql_mutex_unlock(&LOCK_active_mi);
    break;
  }

--- a/sql/sql_repl.cc
+++ b/sql/sql_repl.cc
@@ -50,7 +50,7 @@ extern TYPELIB binlog_checksum_typelib;
 static int
 fake_event_header(String* packet, Log_event_type event_type, ulong extra_len,
                  my_bool *do_checksum, ha_checksum *crc, const char** errmsg,
-                  uint8 checksum_alg_arg)
+                  uint8 checksum_alg_arg, uint32 end_pos)
 {
  char header[LOG_EVENT_HEADER_LEN];
  ulong event_len;
@@ -70,7 +70,7 @@ fake_event_header(String* packet, Log_event_type event_type, ulong extra_len,
  int4store(header + EVENT_LEN_OFFSET, event_len);
  int2store(header + FLAGS_OFFSET, LOG_EVENT_ARTIFICIAL_F);
  // TODO: check what problems this may cause and fix them
-  int4store(header + LOG_POS_OFFSET, 0);
+  int4store(header + LOG_POS_OFFSET, end_pos);
  if (packet->append(header, sizeof(header)))
  {
    *errmsg= "Failed due to out-of-memory writing event";
@@ -146,7 +146,7 @@ static int fake_rotate_event(NET* net, String* packet, char* log_file_name,

  if ((err= fake_event_header(packet, ROTATE_EVENT,
                              ident_len + ROTATE_HEADER_LEN, &do_checksum, &crc,
-                              errmsg, checksum_alg_arg)))
+                              errmsg, checksum_alg_arg, 0)))
    DBUG_RETURN(err);

  int8store(buf+R_POS_OFFSET,position);
@@ -169,7 +169,7 @@ static int fake_rotate_event(NET* net, String* packet, char* log_file_name,

 static int fake_gtid_list_event(NET* net, String* packet,
                                Gtid_list_log_event *glev, const char** errmsg,
-                                uint8 checksum_alg_arg)
+                                uint8 checksum_alg_arg, uint32 current_pos)
 {
  my_bool do_checksum;
  int err;
@@ -185,7 +185,7 @@ static int fake_gtid_list_event(NET* net, String* packet,
  }
  if ((err= fake_event_header(packet, GTID_LIST_EVENT,
                              str.length(), &do_checksum, &crc,
-                              errmsg, checksum_alg_arg)))
+                              errmsg, checksum_alg_arg, current_pos)))
    return err;

  packet->append(str);
@@ -1406,7 +1406,7 @@ is_until_reached(THD *thd, NET *net, String *packet, ulong *ev_offset,
                 enum_gtid_until_state gtid_until_group,
                 Log_event_type event_type, uint8 current_checksum_alg,
                 ushort flags, const char **errmsg,
-                 rpl_binlog_state *until_binlog_state)
+                 rpl_binlog_state *until_binlog_state, uint32 current_pos)
 {
  switch (gtid_until_group)
  {
@@ -1437,7 +1437,8 @@ is_until_reached(THD *thd, NET *net, String *packet, ulong *ev_offset,
    return true;
  Gtid_list_log_event glev(until_binlog_state,
                           Gtid_list_log_event::FLAG_UNTIL_REACHED);
-  if (fake_gtid_list_event(net, packet, &glev, errmsg, current_checksum_alg))
+  if (fake_gtid_list_event(net, packet, &glev, errmsg, current_checksum_alg,
+                           current_pos))
    return true;
  *errmsg= NULL;
  return true;
@@ -1508,6 +1509,19 @@ send_event_to_slave(THD *thd, NET *net, String* const packet, ushort flags,
        return "Failed to read Gtid_log_event: corrupt binlog";
      }

+      DBUG_EXECUTE_IF("gtid_force_reconnect_at_10_1_100",
+        {
+          rpl_gtid *dbug_gtid;
+          if ((dbug_gtid= until_binlog_state->find(10,1)) &&
+              dbug_gtid->seq_no == 100)
+          {
+            DBUG_SET("-d,gtid_force_reconnect_at_10_1_100");
+            DBUG_SET_INITIAL("-d,gtid_force_reconnect_at_10_1_100");
+            my_errno= ER_UNKNOWN_ERROR;
+            return "DBUG-injected forced reconnect";
+          }
+        });
+
      if (until_binlog_state->update(&event_gtid, false))
      {
        my_errno= ER_MASTER_FATAL_ERROR_READING_BINLOG;
@@ -1527,19 +1541,31 @@ send_event_to_slave(THD *thd, NET *net, String* const packet, ushort flags,
          if (event_gtid.server_id == gtid->server_id &&
              event_gtid.seq_no >= gtid->seq_no)
          {
-            /*
-              In strict mode, it is an error if the slave requests to start in
-              a "hole" in the master's binlog: a GTID that does not exist, even
-              though both the prior and subsequent seq_no exists for same
-              domain_id and server_id.
-            */
-            if (slave_gtid_strict_mode && event_gtid.seq_no > gtid->seq_no)
+            if (event_gtid.seq_no > gtid->seq_no)
+            {
+              /*
+                In strict mode, it is an error if the slave requests to start
+                in a "hole" in the master's binlog: a GTID that does not
+                exist, even though both the prior and subsequent seq_no exists
+                for same domain_id and server_id.
+              */
+              if (slave_gtid_strict_mode)
+              {
+                my_errno= ER_GTID_START_FROM_BINLOG_HOLE;
+                *error_gtid= *gtid;
+                return "The binlog on the master is missing the GTID requested "
+                  "by the slave (even though both a prior and a subsequent "
+                  "sequence number does exist), and GTID strict mode is enabled.";
+              }
+            }
+            else
            {
-              my_errno= ER_GTID_START_FROM_BINLOG_HOLE;
-              *error_gtid= *gtid;
-              return "The binlog on the master is missing the GTID requested "
-                "by the slave (even though both a prior and a subsequent "
-                "sequence number does exist), and GTID strict mode is enabled.";
+              /*
+                Send a fake Gtid_list event to the slave.
+                This allows the slave to update its current binlog position
+                so MASTER_POS_WAIT() and MASTER_GTID_WAIT() can work.
+              */
+//              send_fake_gtid_list_event(until_binlog_state);
            }
            /*
              Delete this entry if we have reached slave start position (so we
@@ -1797,6 +1823,7 @@ void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos,
  int old_max_allowed_packet= thd->variables.max_allowed_packet;
 #ifndef DBUG_OFF
  int left_events = max_binlog_dump_events;
+  uint dbug_reconnect_counter= 0;
 #endif
  DBUG_ENTER("mysql_binlog_send");
  DBUG_PRINT("enter",("log_ident: '%s'  pos: %ld", log_ident, (long) pos));
@@ -1830,6 +1857,13 @@ void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos,
      until_gtid_state= &until_gtid_state_obj;
  }

+  DBUG_EXECUTE_IF("binlog_force_reconnect_after_22_events",
+    {
+      DBUG_SET("-d,binlog_force_reconnect_after_22_events");
+      DBUG_SET_INITIAL("-d,binlog_force_reconnect_after_22_events");
+      dbug_reconnect_counter= 22;
+    });
+
  /*
    We want to corrupt the first event, in Log_event::read_log_event().
    But we do not want the corruption to happen early, eg. when client does
@@ -2176,6 +2210,19 @@ impossible position";
        (*packet)[FLAGS_OFFSET+ev_offset] &= ~LOG_EVENT_BINLOG_IN_USE_F;
      }

+#ifndef DBUG_OFF
+      if (dbug_reconnect_counter > 0)
+      {
+        --dbug_reconnect_counter;
+        if (dbug_reconnect_counter == 0)
+        {
+          errmsg= "DBUG-injected forced reconnect";
+          my_errno= ER_UNKNOWN_ERROR;
+          goto err;
+        }
+      }
+#endif
+
      if ((tmp_msg= send_event_to_slave(thd, net, packet, flags, event_type,
                                        log_file_name, &log,
                                        mariadb_slave_capability, ev_offset,
@@ -2191,7 +2238,7 @@ impossible position";
      if (until_gtid_state &&
          is_until_reached(thd, net, packet, &ev_offset, gtid_until_group,
                           event_type, current_checksum_alg, flags, &errmsg,
-                           &until_binlog_state))
+                           &until_binlog_state, my_b_tell(&log)))
      {
        if (errmsg)
        {
@@ -2373,7 +2420,7 @@ impossible position";
              until_gtid_state &&
              is_until_reached(thd, net, packet, &ev_offset, gtid_until_group,
                               event_type, current_checksum_alg, flags, &errmsg,
-                               &until_binlog_state))
+                               &until_binlog_state, my_b_tell(&log)))
          {
            if (errmsg)
            {
@@ -2970,6 +3017,7 @@ bool change_master(THD* thd, Master_info* mi, bool *master_info_added)
  char saved_host[HOSTNAME_LENGTH + 1];
  uint saved_port;
  char saved_log_name[FN_REFLEN];
+  Master_info::enum_using_gtid saved_using_gtid;
  char master_info_file_tmp[FN_REFLEN];
  char relay_log_info_file_tmp[FN_REFLEN];
  my_off_t saved_log_pos;
@@ -3059,6 +3107,7 @@ bool change_master(THD* thd, Master_info* mi, bool *master_info_added)
  saved_port= mi->port;
  strmake(saved_log_name, mi->master_log_name, FN_REFLEN - 1);
  saved_log_pos= mi->master_log_pos;
+  saved_using_gtid= mi->using_gtid;

  /*
    If the user specified host or port without binlog or position,
@@ -3291,6 +3340,11 @@ bool change_master(THD* thd, Master_info* mi, bool *master_info_added)
    "master_log_pos='%ld'.", saved_host, saved_port, saved_log_name,
    (ulong) saved_log_pos, mi->host, mi->port, mi->master_log_name,
    (ulong) mi->master_log_pos);
+  if (saved_using_gtid != Master_info::USE_GTID_NO ||
+      mi->using_gtid != Master_info::USE_GTID_NO)
+    sql_print_information("Previous Using_Gtid=%s. New Using_Gtid=%s",
+                          mi->using_gtid_astext(saved_using_gtid),
+                          mi->using_gtid_astext(mi->using_gtid));

  /*
    If we don't write new coordinates to disk now, then old will remain in
@@ -3753,11 +3807,11 @@ rpl_deinit_gtid_slave_state()


 /*
-  Format the current GTID state as a string, for use when connecting to a
-  master server with GTID, or for returning the value of @@global.gtid_state.
+  Format the current GTID state as a string, for returning the value of
+  @@global.gtid_slave_pos.

  If the flag use_binlog is true, then the contents of the binary log (if
-  enabled) is merged into the current GTID state.
+  enabled) is merged into the current GTID state (@@global.gtid_current_pos).
 */
 int
 rpl_append_gtid_state(String *dest, bool use_binlog)
@@ -3770,10 +3824,35 @@ rpl_append_gtid_state(String *dest, bool use_binlog)
      (err= mysql_bin_log.get_most_recent_gtid_list(&gtid_list, &num_gtids)))
    return err;

-  rpl_global_gtid_slave_state.tostring(dest, gtid_list, num_gtids);
+  err= rpl_global_gtid_slave_state.tostring(dest, gtid_list, num_gtids);
  my_free(gtid_list);

-  return 0;
+  return err;
+}
+
+
+/*
+  Load the current GITD position into a slave_connection_state, for use when
+  connecting to a master server with GTID.
+
+  If the flag use_binlog is true, then the contents of the binary log (if
+  enabled) is merged into the current GTID state (master_use_gtid=current_pos).
+*/
+int
+rpl_load_gtid_state(slave_connection_state *state, bool use_binlog)
+{
+  int err;
+  rpl_gtid *gtid_list= NULL;
+  uint32 num_gtids= 0;
+
+  if (use_binlog && opt_bin_log &&
+      (err= mysql_bin_log.get_most_recent_gtid_list(&gtid_list, &num_gtids)))
+    return err;
+
+  err= state->load(&rpl_global_gtid_slave_state, gtid_list, num_gtids);
+  my_free(gtid_list);
+
+  return err;
 }



--- a/sql/sql_repl.h
+++ b/sql/sql_repl.h
@@ -32,6 +32,8 @@ typedef struct st_slave_info
  THD* thd;
 } SLAVE_INFO;

+class slave_connection_state;
+
 extern my_bool opt_show_slave_auth_info;
 extern char *master_host, *master_info_file;
 extern bool server_id_supplied;
@@ -70,6 +72,7 @@ void rpl_init_gtid_slave_state();
 void rpl_deinit_gtid_slave_state();
 int gtid_state_from_binlog_pos(const char *name, uint32 pos, String *out_str);
 int rpl_append_gtid_state(String *dest, bool use_binlog);
+int rpl_load_gtid_state(slave_connection_state *state, bool use_binlog);
 bool rpl_gtid_pos_check(THD *thd, char *str, size_t len);
 bool rpl_gtid_pos_update(THD *thd, char *str, size_t len);