ndb - bug#25364

  On master node failure during Qmgr COMMIT_FAILREQ handling,
    make sure to remove all committed failed nodes from the failed/prepfailed arrays.
parent b4d323c0
......@@ -68,6 +68,7 @@ public:
// 100-105 TUP and ACC
// 200-240 UTIL
// 300-305 TRIX
QmgrErr935 = 935,
NdbfsDumpFileStat = 400,
NdbfsDumpAllFiles = 401,
NdbfsDumpOpenFiles = 402,
......
......@@ -21,6 +21,9 @@ Crash president when he starts to run in ArbitState 1-9.
910: Crash new president after node crash
935: Crash master on node failure (delayed)
     and skip sending GSN_COMMIT_FAILREQ to the specified node
ERROR CODES FOR TESTING NODE FAILURE, GLOBAL CHECKPOINT HANDLING:
-----------------------------------------------------------------
......
......@@ -426,6 +426,10 @@ private:
StopReq c_stopReq;
bool check_multi_node_shutdown(Signal* signal);
#ifdef ERROR_INSERT
Uint32 c_error_insert_extra;
#endif
};
#endif
......@@ -3110,6 +3110,18 @@ Qmgr::sendCommitFailReq(Signal* signal)
for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
jam();
ptrAss(nodePtr, nodeRec);
#ifdef ERROR_INSERT
if (ERROR_INSERTED(935) && nodePtr.i == c_error_insert_extra)
{
ndbout_c("skipping node %d", c_error_insert_extra);
CLEAR_ERROR_INSERT_VALUE;
signal->theData[0] = 9999;
sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
continue;
}
#endif
if (nodePtr.p->phase == ZRUNNING) {
jam();
nodePtr.p->sendCommitFailReqStatus = Q_ACTIVE;
......@@ -3180,6 +3192,33 @@ void Qmgr::execPREP_FAILREF(Signal* signal)
return;
}//Qmgr::execPREP_FAILREF()
/**
 * Remove from dst[0..dstcnt) every node id that also occurs in
 * src[0..srccnt). Surviving entries are compacted to the front of dst,
 * keeping their original relative order.
 *
 * @param dstcnt  number of valid entries in dst
 * @param dst     array that is filtered in place
 * @param srccnt  number of valid entries in src
 * @param src     node ids that must be removed from dst
 * @return        number of entries remaining in dst
 */
static
Uint32
clear_nodes(Uint32 dstcnt, Uint16 dst[], Uint32 srccnt, const Uint16 src[])
{
  // Nothing to remove: dst is left untouched.
  if (srccnt == 0)
    return dstcnt;

  Uint32 pos = 0;
  for (Uint32 i = 0; i<dstcnt; i++)
  {
    const Uint16 node = dst[i];

    // Look the node up in the removal list (src), not in dst itself.
    bool remove = false;
    for (Uint32 j = 0; j<srccnt; j++)
    {
      if (node == src[j])
      {
        remove = true;
        break;
      }
    }

    // pos <= i always holds, so writing here never clobbers an unread entry.
    if (!remove)
    {
      dst[pos++] = node;
    }
  }
  return pos;
}
/*---------------------------------------------------------------------------*/
/* THE PRESIDENT IS NOW COMMITTING THE PREVIOUSLY PREPARED NODE FAILURE. */
/*---------------------------------------------------------------------------*/
......@@ -3267,19 +3306,18 @@ void Qmgr::execCOMMIT_FAILREQ(Signal* signal)
NodeFailRep::SignalLength, JBB);
}//if
}//for
if (cpresident != getOwnNodeId()) {
jam();
cnoFailedNodes = cnoCommitFailedNodes - cnoFailedNodes;
if (cnoFailedNodes > 0) {
jam();
guard0 = cnoFailedNodes - 1;
arrGuard(guard0 + cnoCommitFailedNodes, MAX_NDB_NODES);
for (Tj = 0; Tj <= guard0; Tj++) {
jam();
cfailedNodes[Tj] = cfailedNodes[Tj + cnoCommitFailedNodes];
}//for
}//if
}//if
/**
* Remove committed nodes from failed/prepared
*/
cnoFailedNodes = clear_nodes(cnoFailedNodes,
cfailedNodes,
cnoCommitFailedNodes,
ccommitFailedNodes);
cnoPrepFailedNodes = clear_nodes(cnoPrepFailedNodes,
cprepFailedNodes,
cnoCommitFailedNodes,
ccommitFailedNodes);
cnoCommitFailedNodes = 0;
}//if
/**-----------------------------------------------------------------------
......@@ -4658,6 +4696,14 @@ Qmgr::execDUMP_STATE_ORD(Signal* signal)
default:
;
}//switch
#ifdef ERROR_INSERT
if (signal->theData[0] == 935 && signal->getLength() == 2)
{
SET_ERROR_INSERT_VALUE(935);
c_error_insert_extra = signal->theData[1];
}
#endif
}//Qmgr::execDUMP_STATE_ORD()
void Qmgr::execSET_VAR_REQ(Signal* signal)
......
......@@ -955,6 +955,46 @@ int runBug24717(NDBT_Context* ctx, NDBT_Step* step){
return NDBT_OK;
}
/**
 * Regression test for bug#25364.
 *
 * Returns NDBT_OK without doing anything when the cluster has fewer than
 * four data nodes. Otherwise, for each loop requested by the context:
 *  - picks the current master, a "victim" node from a node group other than
 *    the master's, and a "second" node from the victim's node group;
 *  - arms error insert 935 on the master with the victim as extra argument
 *    (935: crash master on node failure, delayed, and skip sending
 *    GSN_COMMIT_FAILREQ to the specified node);
 *  - sends the CmvmiSetRestartOnErrorInsert dump to the master;
 *  - restarts "second", then waits for both the master and "second" to reach
 *    the no-start state, starts them again and waits until they are started.
 *
 * @param ctx   test context; only the loop count is read from it
 * @param step  test step (unused)
 * @return NDBT_OK on success, NDBT_FAILED as soon as any restarter call fails
 */
int runBug25364(NDBT_Context* ctx, NDBT_Step* step){
  NdbRestarter restarter;
  int loops = ctx->getNumLoops();

  if (restarter.getNumDbNodes() < 4)
    return NDBT_OK;

  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };

  for (; loops; loops --)
  {
    int master = restarter.getMasterNodeId();
    int victim = restarter.getRandomNodeOtherNodeGroup(master, rand());
    int second = restarter.getRandomNodeSameNodeGroup(victim, rand());

    // Arm error insert 935 on the master, naming the victim as the node
    // that must not receive GSN_COMMIT_FAILREQ.
    int dump[] = { 935, victim } ;
    if (restarter.dumpStateOneNode(master, dump, 2))
      return NDBT_FAILED;

    if (restarter.dumpStateOneNode(master, val2, 2))
      return NDBT_FAILED;

    // NOTE(review): flag meanings of (false, true, true) are not visible
    // here - confirm against NdbRestarter::restartOneDbNode.
    if (restarter.restartOneDbNode(second, false, true, true))
      return NDBT_FAILED;

    // Both the restarted node and the master are expected to stop.
    int nodes[2] = { master, second };
    if (restarter.waitNodesNoStart(nodes, 2))
      return NDBT_FAILED;

    // Check the start request too, consistent with every other restarter
    // call in this test, instead of silently ignoring a failure.
    if (restarter.startNodes(nodes, 2))
      return NDBT_FAILED;

    if (restarter.waitNodesStarted(nodes, 2))
      return NDBT_FAILED;
  }

  return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
......@@ -1271,6 +1311,9 @@ TESTCASE("Bug20185",
TESTCASE("Bug24717", ""){
INITIALIZER(runBug24717);
}
// Registers test case "Bug25364" (empty description string) with
// runBug25364 as its INITIALIZER.
TESTCASE("Bug25364", ""){
INITIALIZER(runBug25364);
}
NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){
......
......@@ -469,6 +469,10 @@ max-time: 1000
cmd: testNodeRestart
args: -n Bug24717 T1
max-time: 1000
cmd: testNodeRestart
args: -n Bug25364 T1
# OLD FLEX
max-time: 500
cmd: flexBench
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment