Commit a227cf80 authored by Kristian Nielsen's avatar Kristian Nielsen

MDEV-7335: Potential parallel slave deadlock with specific binlog corruption

If somehow the COMMIT or XID event in an event group was missing, the code in
parallel replication to handle this was not sufficient, leading to server
deadlock.
parent 79e9ff44
...@@ -1136,6 +1136,80 @@ SET GLOBAL debug_dbug=@old_dbug; ...@@ -1136,6 +1136,80 @@ SET GLOBAL debug_dbug=@old_dbug;
SET GLOBAL slave_parallel_threads=0; SET GLOBAL slave_parallel_threads=0;
SET GLOBAL slave_parallel_threads=10; SET GLOBAL slave_parallel_threads=10;
include/start_slave.inc include/start_slave.inc
*** MDEV-7335: Potential parallel slave deadlock with specific binlog corruption ***
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=1;
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
INSERT INTO t2 VALUES (101);
INSERT INTO t2 VALUES (102);
INSERT INTO t2 VALUES (103);
INSERT INTO t2 VALUES (104);
INSERT INTO t2 VALUES (105);
SET gtid_seq_no=1000;
INSERT INTO t2 VALUES (106);
INSERT INTO t2 VALUES (107);
INSERT INTO t2 VALUES (108);
INSERT INTO t2 VALUES (109);
INSERT INTO t2 VALUES (110);
INSERT INTO t2 VALUES (111);
INSERT INTO t2 VALUES (112);
INSERT INTO t2 VALUES (113);
INSERT INTO t2 VALUES (114);
INSERT INTO t2 VALUES (115);
INSERT INTO t2 VALUES (116);
INSERT INTO t2 VALUES (117);
INSERT INTO t2 VALUES (118);
INSERT INTO t2 VALUES (119);
INSERT INTO t2 VALUES (120);
INSERT INTO t2 VALUES (121);
INSERT INTO t2 VALUES (122);
INSERT INTO t2 VALUES (123);
INSERT INTO t2 VALUES (124);
INSERT INTO t2 VALUES (125);
INSERT INTO t2 VALUES (126);
INSERT INTO t2 VALUES (127);
INSERT INTO t2 VALUES (128);
INSERT INTO t2 VALUES (129);
INSERT INTO t2 VALUES (130);
include/save_master_gtid.inc
include/start_slave.inc
include/sync_with_master_gtid.inc
SELECT * FROM t2 WHERE a >= 100 ORDER BY a;
a
101
102
103
104
105
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
include/stop_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
SET GLOBAL slave_parallel_threads=10;
include/start_slave.inc
include/stop_slave.inc include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads; SET GLOBAL slave_parallel_threads=@old_parallel_threads;
include/start_slave.inc include/start_slave.inc
......
...@@ -1843,6 +1843,62 @@ SET GLOBAL slave_parallel_threads=10; ...@@ -1843,6 +1843,62 @@ SET GLOBAL slave_parallel_threads=10;
--source include/start_slave.inc --source include/start_slave.inc
--echo *** MDEV-7335: Potential parallel slave deadlock with specific binlog corruption ***
--connection server_2
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=1;
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
--connection server_1
INSERT INTO t2 VALUES (101);
INSERT INTO t2 VALUES (102);
INSERT INTO t2 VALUES (103);
INSERT INTO t2 VALUES (104);
INSERT INTO t2 VALUES (105);
# Inject a partial event group (missing XID at the end). The bug was that such
# partial group was not handled appropriately, leading to server deadlock.
SET gtid_seq_no=1000;
INSERT INTO t2 VALUES (106);
INSERT INTO t2 VALUES (107);
INSERT INTO t2 VALUES (108);
INSERT INTO t2 VALUES (109);
INSERT INTO t2 VALUES (110);
INSERT INTO t2 VALUES (111);
INSERT INTO t2 VALUES (112);
INSERT INTO t2 VALUES (113);
INSERT INTO t2 VALUES (114);
INSERT INTO t2 VALUES (115);
INSERT INTO t2 VALUES (116);
INSERT INTO t2 VALUES (117);
INSERT INTO t2 VALUES (118);
INSERT INTO t2 VALUES (119);
INSERT INTO t2 VALUES (120);
INSERT INTO t2 VALUES (121);
INSERT INTO t2 VALUES (122);
INSERT INTO t2 VALUES (123);
INSERT INTO t2 VALUES (124);
INSERT INTO t2 VALUES (125);
INSERT INTO t2 VALUES (126);
INSERT INTO t2 VALUES (127);
INSERT INTO t2 VALUES (128);
INSERT INTO t2 VALUES (129);
INSERT INTO t2 VALUES (130);
--source include/save_master_gtid.inc
--connection server_2
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc
# The partial event group (a=106) should be rolled back and thus missing.
SELECT * FROM t2 WHERE a >= 100 ORDER BY a;
--source include/stop_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
SET GLOBAL slave_parallel_threads=10;
--source include/start_slave.inc
# Clean up. # Clean up.
--connection server_2 --connection server_2
--source include/stop_slave.inc --source include/stop_slave.inc
......
...@@ -640,7 +640,7 @@ handle_rpl_parallel_thread(void *arg) ...@@ -640,7 +640,7 @@ handle_rpl_parallel_thread(void *arg)
} }
DBUG_ASSERT(qev->typ==rpl_parallel_thread::queued_event::QUEUED_EVENT); DBUG_ASSERT(qev->typ==rpl_parallel_thread::queued_event::QUEUED_EVENT);
thd->rgi_slave= group_rgi= rgi; thd->rgi_slave= rgi;
gco= rgi->gco; gco= rgi->gco;
/* Handle a new event group, which will be initiated by a GTID event. */ /* Handle a new event group, which will be initiated by a GTID event. */
if ((event_type= qev->ev->get_type_code()) == GTID_EVENT) if ((event_type= qev->ev->get_type_code()) == GTID_EVENT)
...@@ -657,6 +657,21 @@ handle_rpl_parallel_thread(void *arg) ...@@ -657,6 +657,21 @@ handle_rpl_parallel_thread(void *arg)
} }
}); });
if(unlikely(thd->wait_for_commit_ptr) && group_rgi != NULL)
{
/*
This indicates that we get a new GTID event in the middle of
a not completed event group. This is corrupt binlog (the master
will never write such binlog), so it does not happen unless
someone tries to inject wrong crafted binlog, but let us still
try to handle it somewhat nicely.
*/
group_rgi->cleanup_context(thd, true);
finish_event_group(rpt, group_rgi->gtid_sub_id,
group_rgi->parallel_entry, group_rgi);
rpt->loc_free_rgi(group_rgi);
}
in_event_group= true; in_event_group= true;
/* /*
If the standalone flag is set, then this event group consists of a If the standalone flag is set, then this event group consists of a
...@@ -742,19 +757,6 @@ handle_rpl_parallel_thread(void *arg) ...@@ -742,19 +757,6 @@ handle_rpl_parallel_thread(void *arg)
unlock_or_exit_cond(thd, &entry->LOCK_parallel_entry, unlock_or_exit_cond(thd, &entry->LOCK_parallel_entry,
&did_enter_cond, &old_stage); &did_enter_cond, &old_stage);
if(thd->wait_for_commit_ptr)
{
/*
This indicates that we get a new GTID event in the middle of
a not completed event group. This is corrupt binlog (the master
will never write such binlog), so it does not happen unless
someone tries to inject wrong crafted binlog, but let us still
try to handle it somewhat nicely.
*/
rgi->cleanup_context(thd, true);
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
thd->wait_for_commit_ptr->wakeup_subsequent_commits(rgi->worker_error);
}
thd->wait_for_commit_ptr= &rgi->commit_orderer; thd->wait_for_commit_ptr= &rgi->commit_orderer;
if (opt_gtid_ignore_duplicates) if (opt_gtid_ignore_duplicates)
...@@ -780,6 +782,7 @@ handle_rpl_parallel_thread(void *arg) ...@@ -780,6 +782,7 @@ handle_rpl_parallel_thread(void *arg)
} }
} }
group_rgi= rgi;
group_ending= is_group_ending(qev->ev, event_type); group_ending= is_group_ending(qev->ev, event_type);
if (group_ending && likely(!rgi->worker_error)) if (group_ending && likely(!rgi->worker_error))
{ {
......
...@@ -5648,6 +5648,18 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len) ...@@ -5648,6 +5648,18 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
} }
break; break;
#ifndef DBUG_OFF
case XID_EVENT:
DBUG_EXECUTE_IF("slave_discard_xid_for_gtid_0_x_1000",
{
/* Inject an event group that is missing its XID commit event. */
if (mi->last_queued_gtid.domain_id == 0 &&
mi->last_queued_gtid.seq_no == 1000)
goto skip_relay_logging;
});
/* Fall through to default case ... */
#endif
default: default:
default_action: default_action:
if (mi->using_gtid != Master_info::USE_GTID_NO && mi->gtid_event_seen) if (mi->using_gtid != Master_info::USE_GTID_NO && mi->gtid_event_seen)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment