Commit ef38f9b7 authored by jonas@eel.(none)'s avatar jonas@eel.(none)

ndb - bug#25984 - more than 7 failed node restarts can cause cluster failure

new behaviour is as follows:
1) the node is refused to start, and should fail with a message in the error log saying that it must be restarted with --initial
2) if the cluster fails in this situation, the node must also be restarted with --initial;
   if not, SR will fail with this message
parent 9f69d9ff
...@@ -1525,10 +1525,26 @@ void Dbdih::ndbStartReqLab(Signal* signal, BlockReference ref) ...@@ -1525,10 +1525,26 @@ void Dbdih::ndbStartReqLab(Signal* signal, BlockReference ref)
*/ */
SYSFILE->lastCompletedGCI[nodePtr.i] = 0; SYSFILE->lastCompletedGCI[nodePtr.i] = 0;
ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE); ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE);
warningEvent("Making filesystem for node %d unusable", warningEvent("Making filesystem for node %d unusable (need --initial)",
nodePtr.i); nodePtr.i);
} }
else if (nodePtr.p->nodeStatus == NodeRecord::ALIVE &&
SYSFILE->lastCompletedGCI[nodePtr.i] == 0)
{
jam();
CRASH_INSERTION(7170);
char buf[255];
BaseString::snprintf(buf, sizeof(buf),
"Cluster requires this node to be started "
" with --initial as partial start has been performed"
" and this filesystem is unusable");
progError(__LINE__,
NDBD_EXIT_SR_RESTARTCONFLICT,
buf);
ndbrequire(false);
}
} }
/** /**
* This set which GCI we will try to restart to * This set which GCI we will try to restart to
*/ */
...@@ -12515,14 +12531,23 @@ void Dbdih::newCrashedReplica(Uint32 nodeId, ReplicaRecordPtr ncrReplicaPtr) ...@@ -12515,14 +12531,23 @@ void Dbdih::newCrashedReplica(Uint32 nodeId, ReplicaRecordPtr ncrReplicaPtr)
/* THAT THE NEW REPLICA IS NOT STARTED YET AND REPLICA_LAST_GCI IS*/ /* THAT THE NEW REPLICA IS NOT STARTED YET AND REPLICA_LAST_GCI IS*/
/* SET TO -1 TO INDICATE THAT IT IS NOT DEAD YET. */ /* SET TO -1 TO INDICATE THAT IT IS NOT DEAD YET. */
/*----------------------------------------------------------------------*/ /*----------------------------------------------------------------------*/
Uint32 lastGCI = SYSFILE->lastCompletedGCI[nodeId];
arrGuardErr(ncrReplicaPtr.p->noCrashedReplicas + 1, 8, arrGuardErr(ncrReplicaPtr.p->noCrashedReplicas + 1, 8,
NDBD_EXIT_MAX_CRASHED_REPLICAS); NDBD_EXIT_MAX_CRASHED_REPLICAS);
ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] = ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] =
SYSFILE->lastCompletedGCI[nodeId]; lastGCI;
ncrReplicaPtr.p->noCrashedReplicas = ncrReplicaPtr.p->noCrashedReplicas + 1; ncrReplicaPtr.p->noCrashedReplicas = ncrReplicaPtr.p->noCrashedReplicas + 1;
ncrReplicaPtr.p->createGci[ncrReplicaPtr.p->noCrashedReplicas] = 0; ncrReplicaPtr.p->createGci[ncrReplicaPtr.p->noCrashedReplicas] = 0;
ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] = ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] =
(Uint32)-1; (Uint32)-1;
if (ncrReplicaPtr.p->noCrashedReplicas == 7 && lastGCI)
{
jam();
SYSFILE->lastCompletedGCI[nodeId] = 0;
warningEvent("Making filesystem for node %d unusable (need --initial)",
nodeId);
}
}//Dbdih::newCrashedReplica() }//Dbdih::newCrashedReplica()
/*************************************************************************/ /*************************************************************************/
......
...@@ -1178,6 +1178,101 @@ int runBug25554(NDBT_Context* ctx, NDBT_Step* step){ ...@@ -1178,6 +1178,101 @@ int runBug25554(NDBT_Context* ctx, NDBT_Step* step){
return NDBT_OK; return NDBT_OK;
} }
/**
 * Regression test for bug#25984 (a node whose restart has failed repeatedly
 * must be refused and later restarted with an initial start).
 *
 * Steps, all driven through NdbRestarter:
 *  1. Skip (return NDBT_OK) when the cluster has fewer than 2 data nodes.
 *  2. Restart the whole cluster and wait until it is started.
 *  3. Choose a victim node: a random node in another node group than the
 *     master, or, if there is none, a random node in the master's group.
 *  4. Restart the victim into "no start" and, six times in a row, arm error
 *     insert 7016 on it, start it and wait for start phase 2.
 *     NOTE(review): 7016 presumably makes the node restart fail; its
 *     definition is not visible here -- confirm.
 *  5. Arm error insert 7170 on the victim (the CRASH_INSERTION point next to
 *     the "started with --initial" error in Dbdih::ndbStartReqLab), start it
 *     and wait for it to return to "no start".
 *  6. Restart all nodes into "no start", arm error insert 932 on all nodes
 *     and 7170 on the master, start the master first and then the rest, and
 *     wait for the cluster to end up in "no start" again.
 *  7. Restart the victim with its first flag set (presumably an initial
 *     restart -- confirm against NdbRestarter), start all nodes and wait for
 *     the cluster to be started.
 *
 * @param ctx   test context (unused)
 * @param step  test step (unused)
 * @return NDBT_OK on success or when skipped, NDBT_FAILED as soon as a
 *         checked restarter operation reports an error.
 */
int runBug25984(NDBT_Context* ctx, NDBT_Step* step){
  NdbRestarter restarter;

  if (restarter.getNumDbNodes() < 2)
    return NDBT_OK;

  if (restarter.restartAll(true, true, true))
    return NDBT_FAILED;

  if (restarter.waitClusterNoStart())
    return NDBT_FAILED;

  if (restarter.startAll())
    return NDBT_FAILED;

  if (restarter.waitClusterStarted())
    return NDBT_FAILED;

  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
  int master = restarter.getMasterNodeId();
  int victim = restarter.getRandomNodeOtherNodeGroup(master, rand());
  if (victim == -1)
    victim = restarter.getRandomNodeSameNodeGroup(master, rand());

  // Fail fast if the victim cannot be restarted; the same call is checked
  // further down as well.
  if (restarter.restartOneDbNode(victim, false, true, true))
    return NDBT_FAILED;

  for (Uint32 i = 0; i<6; i++)
  {
    ndbout_c("Loop: %u", i);
    if (restarter.waitNodesNoStart(&victim, 1))
      return NDBT_FAILED;

    if (restarter.dumpStateOneNode(victim, val2, 2))
      return NDBT_FAILED;

    if (restarter.insertErrorInNode(victim, 7016))
      return NDBT_FAILED;

    if (restarter.startNodes(&victim, 1))
      return NDBT_FAILED;

    if (restarter.waitNodesStartPhase(&victim, 1, 2))
      return NDBT_FAILED;
  }

  if (restarter.waitNodesNoStart(&victim, 1))
    return NDBT_FAILED;

  if (restarter.dumpStateOneNode(victim, val2, 2))
    return NDBT_FAILED;

  if (restarter.insertErrorInNode(victim, 7170))
    return NDBT_FAILED;

  if (restarter.startNodes(&victim, 1))
    return NDBT_FAILED;

  if (restarter.waitNodesNoStart(&victim, 1))
    return NDBT_FAILED;

  if (restarter.restartAll(false, true, true))
    return NDBT_FAILED;

  if (restarter.insertErrorInAllNodes(932))
    return NDBT_FAILED;

  if (restarter.insertErrorInNode(master, 7170))
    return NDBT_FAILED;

  if (restarter.dumpStateAllNodes(val2, 2))
    return NDBT_FAILED;

  // The results of these two start requests are left unchecked, as in the
  // original; the outcome is verified by waitClusterNoStart() below.
  // NOTE(review): presumably the nodes are expected to go down again because
  // of the armed error inserts -- confirm.
  restarter.startNodes(&master, 1);
  NdbSleep_MilliSleep(3000);
  restarter.startAll();

  if (restarter.waitClusterNoStart())
    return NDBT_FAILED;

  if (restarter.restartOneDbNode(victim, true, true, true))
    return NDBT_FAILED;

  if (restarter.startAll())
    return NDBT_FAILED;

  if (restarter.waitClusterStarted())
    return NDBT_FAILED;

  return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart); NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad", TESTCASE("NoLoad",
...@@ -1514,6 +1609,9 @@ TESTCASE("Bug25468", ""){ ...@@ -1514,6 +1609,9 @@ TESTCASE("Bug25468", ""){
TESTCASE("Bug25554", ""){ TESTCASE("Bug25554", ""){
INITIALIZER(runBug25554); INITIALIZER(runBug25554);
} }
// Test case "Bug25984": hooks runBug25984 in through the INITIALIZER macro.
// The description string is left empty, like the neighbouring Bug25554 entry.
TESTCASE("Bug25984", ""){
INITIALIZER(runBug25984);
}
NDBT_TESTSUITE_END(testNodeRestart); NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){ int main(int argc, const char** argv){
......
...@@ -525,6 +525,10 @@ max-time: 1000 ...@@ -525,6 +525,10 @@ max-time: 1000
cmd: testNodeRestart cmd: testNodeRestart
args: -n Bug25554 T1 args: -n Bug25554 T1
max-time: 1000
cmd: testNodeRestart
args: -n Bug25984
# #
# DICT TESTS # DICT TESTS
max-time: 1500 max-time: 1500
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment