MDEV-5914: Parallel replication deadlock due to InnoDB lock conflicts

Due to how gap locks work, two transactions could group commit together on the master, but get lock conflicts and then deadlock due to different thread scheduling order on slave. For now, remove these deadlocks by running the parallel slave in READ COMMITTED mode. And let InnoDB/XtraDB allow statement-based binlogging for the parallel slave in READ COMMITTED. We are also investigating a different solution long-term, which is based on relaxing the gap locks only between the transactions running in parallel for one slave, but not against possibly external transactions.

MDEV-5914: Parallel replication deadlock due to InnoDB lock conflicts
Due to how gap locks work, two transactions could group commit together on the master, but get lock conflicts and then deadlock due to different thread scheduling order on slave. For now, remove these deadlocks by running the parallel slave in READ COMMITTED mode. And let InnoDB/XtraDB allow statement-based binlogging for the parallel slave in READ COMMITTED. We are also investigating a different solution long-term, which is based on relaxing the gap locks only between the transactions running in parallel for one slave, but not against possibly external transactions.
b3529691 · unknown · a5418c55 · b3529691 · b3529691 · b3529691
Commit b3529691 authored Mar 21, 2014 by unknown
10 changed files
--- a/include/mysql/plugin.h
+++ b/include/mysql/plugin.h
@@ -622,6 +622,7 @@ void **thd_ha_data(const MYSQL_THD thd, const struct handlerton *hton);
 void thd_storage_lock_wait(MYSQL_THD thd, long long value);
 int thd_tx_isolation(const MYSQL_THD thd);
 int thd_tx_is_read_only(const MYSQL_THD thd);
+int thd_rpl_is_parallel(const MYSQL_THD thd);
 /**
  Create a temporary file.


--- a/include/mysql/plugin_audit.h.pp
+++ b/include/mysql/plugin_audit.h.pp
@@ -303,6 +303,7 @@ void **thd_ha_data(const void* thd, const struct handlerton *hton);
 void thd_storage_lock_wait(void* thd, long long value);
 int thd_tx_isolation(const void* thd);
 int thd_tx_is_read_only(const void* thd);
+int thd_rpl_is_parallel(const void* thd);
 int mysql_tmpfile(const char *prefix);
 unsigned long thd_get_thread_id(const void* thd);
 void thd_get_xid(const void* thd, MYSQL_XID *xid);

--- a/include/mysql/plugin_auth.h.pp
+++ b/include/mysql/plugin_auth.h.pp
@@ -303,6 +303,7 @@ void **thd_ha_data(const void* thd, const struct handlerton *hton);
 void thd_storage_lock_wait(void* thd, long long value);
 int thd_tx_isolation(const void* thd);
 int thd_tx_is_read_only(const void* thd);
+int thd_rpl_is_parallel(const void* thd);
 int mysql_tmpfile(const char *prefix);
 unsigned long thd_get_thread_id(const void* thd);
 void thd_get_xid(const void* thd, MYSQL_XID *xid);

--- a/include/mysql/plugin_ftparser.h.pp
+++ b/include/mysql/plugin_ftparser.h.pp
@@ -256,6 +256,7 @@ void **thd_ha_data(const void* thd, const struct handlerton *hton);
 void thd_storage_lock_wait(void* thd, long long value);
 int thd_tx_isolation(const void* thd);
 int thd_tx_is_read_only(const void* thd);
+int thd_rpl_is_parallel(const void* thd);
 int mysql_tmpfile(const char *prefix);
 unsigned long thd_get_thread_id(const void* thd);
 void thd_get_xid(const void* thd, MYSQL_XID *xid);

--- a/mysql-test/suite/rpl/r/rpl_parallel.result
+++ b/mysql-test/suite/rpl/r/rpl_parallel.result
@@ -761,11 +761,51 @@ a	b
 110	1
 111	2
 112	3
+***MDEV-5914: Parallel replication deadlock due to InnoDB lock conflicts ***
+include/stop_slave.inc
+CREATE TABLE t4 (a INT PRIMARY KEY, b INT, KEY b_idx(b)) ENGINE=InnoDB;
+INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6);
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued1 WAIT_FOR master_cont1';
+UPDATE t4 SET b=NULL WHERE a=6;
+SET debug_sync='now WAIT_FOR master_queued1';
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued2';
+DELETE FROM t4 WHERE b <= 3;
+SET debug_sync='now WAIT_FOR master_queued2';
+SET debug_sync='now SIGNAL master_cont1';
+SET debug_sync='RESET';
+include/start_slave.inc
+include/stop_slave.inc
+SELECT * FROM t4 ORDER BY a;
+a	b
+1	NULL
+3	NULL
+4	4
+5	NULL
+6	NULL
+DELETE FROM t4;
+INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6);
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued1 WAIT_FOR master_cont1';
+INSERT INTO t4 VALUES (7, NULL);
+SET debug_sync='now WAIT_FOR master_queued1';
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued2';
+DELETE FROM t4 WHERE b <= 3;
+SET debug_sync='now WAIT_FOR master_queued2';
+SET debug_sync='now SIGNAL master_cont1';
+SET debug_sync='RESET';
+include/start_slave.inc
+SELECT * FROM t4 ORDER BY a;
+a	b
+1	NULL
+3	NULL
+4	4
+5	NULL
+6	6
+7	NULL
 include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=@old_parallel_threads;
 include/start_slave.inc
 SET DEBUG_SYNC= 'RESET';
 DROP function foo;
-DROP TABLE t1,t2,t3;
+DROP TABLE t1,t2,t3,t4;
 SET DEBUG_SYNC= 'RESET';
 include/rpl_end.inc
--- a/mysql-test/suite/rpl/t/rpl_parallel.test
+++ b/mysql-test/suite/rpl/t/rpl_parallel.test
@@ -1172,6 +1172,84 @@ START SLAVE SQL_THREAD;
 SELECT * FROM t3 WHERE a >= 110 ORDER BY a;


+--echo ***MDEV-5914: Parallel replication deadlock due to InnoDB lock conflicts ***
+--connection server_2
+--source include/stop_slave.inc
+
+--connection server_1
+CREATE TABLE t4 (a INT PRIMARY KEY, b INT, KEY b_idx(b)) ENGINE=InnoDB;
+INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6);
+
+# Create a group commit with UPDATE and DELETE, in that order.
+# The bug was that while the UPDATE's row lock does not block the DELETE, the
+# DELETE's gap lock _does_ block the UPDATE. This could cause a deadlock
+# on the slave.
+--connection con1
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued1 WAIT_FOR master_cont1';
+send UPDATE t4 SET b=NULL WHERE a=6;
+--connection server_1
+SET debug_sync='now WAIT_FOR master_queued1';
+
+--connection con2
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued2';
+send DELETE FROM t4 WHERE b <= 3;
+
+--connection server_1
+SET debug_sync='now WAIT_FOR master_queued2';
+SET debug_sync='now SIGNAL master_cont1';
+
+--connection con1
+REAP;
+--connection con2
+REAP;
+SET debug_sync='RESET';
+--save_master_pos
+
+--connection server_2
+--source include/start_slave.inc
+--sync_with_master
+--source include/stop_slave.inc
+
+SELECT * FROM t4 ORDER BY a;
+
+
+# Another example, this one with INSERT vs. DELETE
+--connection server_1
+DELETE FROM t4;
+INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6);
+
+# Create a group commit with INSERT and DELETE, in that order.
+# The bug was that while the INSERT's insert intention lock does not block
+# the DELETE, the DELETE's gap lock _does_ block the INSERT. This could cause
+# a deadlock on the slave.
+--connection con1
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued1 WAIT_FOR master_cont1';
+send INSERT INTO t4 VALUES (7, NULL);
+--connection server_1
+SET debug_sync='now WAIT_FOR master_queued1';
+
+--connection con2
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued2';
+send DELETE FROM t4 WHERE b <= 3;
+
+--connection server_1
+SET debug_sync='now WAIT_FOR master_queued2';
+SET debug_sync='now SIGNAL master_cont1';
+
+--connection con1
+REAP;
+--connection con2
+REAP;
+SET debug_sync='RESET';
+--save_master_pos
+
+--connection server_2
+--source include/start_slave.inc
+--sync_with_master
+
+SELECT * FROM t4 ORDER BY a;
+
+
 --connection server_2
 --source include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=@old_parallel_threads;
@@ -1180,7 +1258,7 @@ SET DEBUG_SYNC= 'RESET';

 --connection server_1
 DROP function foo;
-DROP TABLE t1,t2,t3;
+DROP TABLE t1,t2,t3,t4;
 SET DEBUG_SYNC= 'RESET';

 --source include/rpl_end.inc
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -242,6 +242,39 @@ handle_rpl_parallel_thread(void *arg)
  thd_proc_info(thd, "Waiting for work from main SQL threads");
  thd->set_time();
  thd->variables.lock_wait_timeout= LONG_TIMEOUT;
+  /*
+    For now, we need to run the replication parallel worker threads in
+    READ COMMITTED. This is needed because gap locks are not symmetric.
+    For example, a gap lock from a DELETE blocks an insert intention lock,
+    but not vice versa. So an INSERT followed by DELETE can group commit
+    on the master, but if we are unlucky with thread scheduling we can
+    then deadlock on the slave because the INSERT ends up waiting for a
+    gap lock from the DELETE (and the DELETE in turn waits for the INSERT
+    in wait_for_prior_commit()). See also MDEV-5914.
+
+    It should be mostly safe to run in READ COMMITTED in the slave anyway.
+    The commit order is already fixed from on the master, so we do not
+    risk logging into the binlog in an incorrect order between worker
+    threads (one that would cause different results if executed on a
+    lower-level slave that uses this slave as a master). The only
+    potential problem is with transactions run in a different master
+    connection (using multi-source replication), or run directly on the
+    slave by an application; when using READ COMMITTED we are not
+    guaranteed serialisability of binlogged statements.
+
+    In practice, this is unlikely to be an issue. In GTID mode, such
+    parallel transactions from multi-source or application must in any
+    case use a different replication domain, in which case binlog order
+    by definition must be independent between the different domain. Even
+    in non-GTID mode, normally one will assume that the external
+    transactions are not conflicting with those applied by the slave, so
+    that isolation level should make no difference. It would be rather
+    strange if the result of applying query events from one master would
+    depend on the timing and nature of other queries executed from
+    different multi-source connections or done directly on the slave by
+    an application. Still, something to be aware of.
+  */
+  thd->variables.tx_isolation= ISO_READ_COMMITTED;

  mysql_mutex_lock(&rpt->LOCK_rpl_thread);
  rpt->thd= thd;
@@ -306,6 +339,7 @@ handle_rpl_parallel_thread(void *arg)
        PSI_stage_info old_stage;
        uint64 wait_count;

+        thd->tx_isolation= (enum_tx_isolation)thd->variables.tx_isolation;
        in_event_group= true;
        /*
          If the standalone flag is set, then this event group consists of a

--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -4234,6 +4234,12 @@ extern "C" int thd_slave_thread(const MYSQL_THD thd)
  return(thd->slave_thread);
 }

+/* Returns true for a worker thread in parallel replication. */
+extern "C" int thd_rpl_is_parallel(const MYSQL_THD thd)
+{
+  return thd->rgi_slave && thd->rgi_slave->is_parallel_exec;
+}
+
 extern "C" int thd_non_transactional_update(const MYSQL_THD thd)
 {
  return(thd->transaction.all.modified_non_trans_table);

--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -4223,11 +4223,14 @@ handler::Table_flags
 ha_innobase::table_flags() const
 /*============================*/
 {
+	THD *thd = ha_thd();
 	/* Need to use tx_isolation here since table flags is (also)
 	called before prebuilt is inited. */
-	ulong const tx_isolation = thd_tx_isolation(ha_thd());
+	ulong const tx_isolation = thd_tx_isolation(thd);

-	if (tx_isolation <= ISO_READ_COMMITTED) {
+	if (tx_isolation <= ISO_READ_COMMITTED &&
+	    !(tx_isolation == ISO_READ_COMMITTED &&
+	      thd_rpl_is_parallel(thd))) {
 		return(int_table_flags);
 	}


--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -4680,11 +4680,14 @@ handler::Table_flags
 ha_innobase::table_flags() const
 /*============================*/
 {
+	THD *thd = ha_thd();
 	/* Need to use tx_isolation here since table flags is (also)
 	called before prebuilt is inited. */
-	ulong const tx_isolation = thd_tx_isolation(ha_thd());
+	ulong const tx_isolation = thd_tx_isolation(thd);

-	if (tx_isolation <= ISO_READ_COMMITTED) {
+	if (tx_isolation <= ISO_READ_COMMITTED &&
+	    !(tx_isolation == ISO_READ_COMMITTED &&
+	      thd_rpl_is_parallel(thd))) {
 		return(int_table_flags);
 	}