ndb - bug#25468

  handle partially transferred LCP_FRAG_REP after node failure
  recommit to 51-work
parent a1fdeba3
......@@ -637,6 +637,7 @@ private:
void execTCGETOPSIZECONF(Signal *);
void execTC_CLOPSIZECONF(Signal *);
int handle_invalid_lcp_no(const class LcpFragRep*, ReplicaRecordPtr);
void execLCP_FRAG_REP(Signal *);
void execLCP_COMPLETE_REP(Signal *);
void execSTART_LCP_REQ(Signal *);
......
......@@ -4046,6 +4046,11 @@ void Dbdih::execNODE_FAILREP(Signal* signal)
Uint32 newMasterId = nodeFail->masterNodeId;
const Uint32 noOfFailedNodes = nodeFail->noOfNodes;
if (ERROR_INSERTED(7179))
{
CLEAR_ERROR_INSERT_VALUE;
}
/*-------------------------------------------------------------------------*/
// The first step is to convert from a bit mask to an array of failed nodes.
/*-------------------------------------------------------------------------*/
......@@ -10257,6 +10262,36 @@ void Dbdih::execLCP_FRAG_REP(Signal* signal)
jamEntry();
if (ERROR_INSERTED(7178) && nodeId != getOwnNodeId())
{
jam();
Uint32 owng =Sysfile::getNodeGroup(getOwnNodeId(), SYSFILE->nodeGroups);
Uint32 nodeg = Sysfile::getNodeGroup(nodeId, SYSFILE->nodeGroups);
if (owng == nodeg)
{
jam();
ndbout_c("throwing away LCP_FRAG_REP from (and killing) %d", nodeId);
SET_ERROR_INSERT_VALUE(7179);
signal->theData[0] = 9999;
sendSignal(numberToRef(CMVMI, nodeId),
GSN_NDB_TAMPER, signal, 1, JBA);
return;
}
}
if (ERROR_INSERTED(7179) && nodeId != getOwnNodeId())
{
jam();
Uint32 owng =Sysfile::getNodeGroup(getOwnNodeId(), SYSFILE->nodeGroups);
Uint32 nodeg = Sysfile::getNodeGroup(nodeId, SYSFILE->nodeGroups);
if (owng == nodeg)
{
jam();
ndbout_c("throwing away LCP_FRAG_REP from %d", nodeId);
return;
}
}
CRASH_INSERTION2(7025, isMaster());
CRASH_INSERTION2(7016, !isMaster());
......@@ -10463,6 +10498,37 @@ void Dbdih::findReplica(ReplicaRecordPtr& replicaPtr,
ndbrequire(false);
}//Dbdih::findReplica()
int
Dbdih::handle_invalid_lcp_no(const LcpFragRep* rep,
ReplicaRecordPtr replicaPtr)
{
ndbrequire(!isMaster());
Uint32 lcpNo = rep->lcpNo;
Uint32 lcpId = rep->lcpId;
Uint32 replicaLcpNo = replicaPtr.p->nextLcp;
Uint32 prevReplicaLcpNo = prevLcpNo(replicaLcpNo);
warningEvent("Detected previous node failure of %d during lcp",
rep->nodeId);
replicaPtr.p->nextLcp = lcpNo;
replicaPtr.p->lcpId[lcpNo] = 0;
replicaPtr.p->lcpStatus[lcpNo] = ZINVALID;
for (Uint32 i = lcpNo; i != lcpNo; i = nextLcpNo(i))
{
jam();
if (replicaPtr.p->lcpStatus[i] == ZVALID &&
replicaPtr.p->lcpId[i] >= lcpId)
{
ndbout_c("i: %d lcpId: %d", i, replicaPtr.p->lcpId[i]);
ndbrequire(false);
}
}
return 0;
}
/**
* Return true if all fragment replicas of the table have been checkpointed
* to disk (in all LQHs)
......@@ -10491,10 +10557,13 @@ Dbdih::reportLcpCompletion(const LcpFragRep* lcpReport)
ndbrequire(replicaPtr.p->lcpOngoingFlag == true);
if(lcpNo != replicaPtr.p->nextLcp){
if (handle_invalid_lcp_no(lcpReport, replicaPtr))
{
ndbout_c("lcpNo = %d replicaPtr.p->nextLcp = %d",
lcpNo, replicaPtr.p->nextLcp);
ndbrequire(false);
}
}
ndbrequire(lcpNo == replicaPtr.p->nextLcp);
ndbrequire(lcpNo < MAX_LCP_STORED);
ndbrequire(replicaPtr.p->lcpId[lcpNo] != lcpId);
......
......@@ -1073,6 +1073,63 @@ int runBug25364(NDBT_Context* ctx, NDBT_Step* step){
return NDBT_OK;
}
/**
 * Regression test for bug#25468 (partially transferred LCP_FRAG_REP after
 * node failure).
 *
 * Each loop picks two nodes in the same node group, rotating over which of
 * them (if any) is the master. Error insert 7178 on node1 makes it discard
 * an LCP_FRAG_REP from a node in its own node group and kill that node;
 * the test then waits for node2 to stop, restarts everything and waits for
 * the cluster to come back.
 *
 * @param ctx   test context; only the loop count is used
 * @param step  unused
 * @return NDBT_OK on success, NDBT_FAILED if any restarter call fails or
 *         no suitable node pair could be selected
 */
int runBug25468(NDBT_Context* ctx, NDBT_Step* step){
  const int loops = ctx->getNumLoops();
  NdbRestarter restarter;

  for (int i = 0; i < loops; i++)
  {
    const int master = restarter.getMasterNodeId();
    int node1 = -1;
    int node2 = -1;
    switch (i % 5) {
    case 0:
      // master drops the report, a peer in its node group is killed
      node1 = master;
      node2 = restarter.getRandomNodeSameNodeGroup(master, rand());
      break;
    case 1:
      // a peer of the master drops the report, the master is killed
      node1 = restarter.getRandomNodeSameNodeGroup(master, rand());
      node2 = master;
      break;
    case 2:
    case 3:
    case 4:
    default:
      // prefer a pair outside the master's node group; fall back to the
      // master when there is no other node group
      node1 = restarter.getRandomNodeOtherNodeGroup(master, rand());
      if (node1 == -1)
        node1 = master;
      node2 = restarter.getRandomNodeSameNodeGroup(node1, rand());
      break;
    }
    ndbout_c("node1: %d node2: %d master: %d", node1, node2, master);

    // NOTE(review): assumes getRandomNodeSameNodeGroup() reports "no such
    // node" as -1, like getRandomNodeOtherNodeGroup() above - confirm.
    // Fail explicitly instead of sending dumps/error inserts to node -1.
    if (node1 == -1 || node2 == -1)
    {
      ndbout_c("runBug25468: could not select two nodes in the same node group");
      return NDBT_FAILED;
    }

    // Configure node2's behaviour on error insert before it gets killed
    // (presumably so it stays in no-start state instead of restarting).
    int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
    if (restarter.dumpStateOneNode(node2, val2, 2))
      return NDBT_FAILED;

    if (restarter.insertErrorInNode(node1, 7178))
      return NDBT_FAILED;

    // Dump code 7099 on the master - presumably forces an LCP to start so
    // that LCP_FRAG_REP signals are generated; TODO confirm.
    int val1 = 7099;
    if (restarter.dumpStateOneNode(master, &val1, 1))
      return NDBT_FAILED;

    if (restarter.waitNodesNoStart(&node2, 1))
      return NDBT_FAILED;

    if (restarter.startAll())
      return NDBT_FAILED;

    if (restarter.waitClusterStarted())
      return NDBT_FAILED;
  }
  return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
......@@ -1403,6 +1460,9 @@ TESTCASE("Bug24717", ""){
TESTCASE("Bug25364", ""){
INITIALIZER(runBug25364);
}
// Regression test case for bug#25468; selected with "-n Bug25468".
// The whole scenario is driven by runBug25468, registered as an INITIALIZER.
TESTCASE("Bug25468", ""){
INITIALIZER(runBug25468);
}
NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){
......
......@@ -768,6 +768,10 @@ max-time: 1500
cmd: testSystemRestart
args: -n Bug24664
max-time: 1000
cmd: testNodeRestart
args: -n Bug25468 T1
# OLD FLEX
max-time: 500
cmd: flexBench
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment