Commit 4b6e6da6 authored by unknown's avatar unknown

ndb - bug#20895

  Fix occasional LCP hang!!!
  Make sure only to consider alive nodes in startNextChkpt


ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Make sure only to consider alive nodes in startNextChkpt
parent 931af319
...@@ -9561,73 +9561,84 @@ void Dbdih::startNextChkpt(Signal* signal) ...@@ -9561,73 +9561,84 @@ void Dbdih::startNextChkpt(Signal* signal)
nodePtr.i = replicaPtr.p->procNode; nodePtr.i = replicaPtr.p->procNode;
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord); ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
if (replicaPtr.p->lcpOngoingFlag && if (c_lcpState.m_participatingLQH.get(nodePtr.i))
replicaPtr.p->lcpIdStarted < lcpId) { {
jam(); if (replicaPtr.p->lcpOngoingFlag &&
//------------------------------------------------------------------- replicaPtr.p->lcpIdStarted < lcpId)
// We have found a replica on a node that performs local checkpoint {
// that is alive and that have not yet been started. jam();
//------------------------------------------------------------------- //-------------------------------------------------------------------
// We have found a replica on a node that performs local checkpoint
if (nodePtr.p->noOfStartedChkpt < 2) { // that is alive and that have not yet been started.
jam(); //-------------------------------------------------------------------
/**
* Send LCP_FRAG_ORD to LQH
*/
/**
* Mark the replica so with lcpIdStarted == true
*/
replicaPtr.p->lcpIdStarted = lcpId;
Uint32 i = nodePtr.p->noOfStartedChkpt;
nodePtr.p->startedChkpt[i].tableId = tabPtr.i;
nodePtr.p->startedChkpt[i].fragId = curr.fragmentId;
nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i;
nodePtr.p->noOfStartedChkpt = i + 1;
sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]);
} else if (nodePtr.p->noOfQueuedChkpt < 2) {
jam();
/**
* Put LCP_FRAG_ORD "in queue"
*/
/**
* Mark the replica so with lcpIdStarted == true
*/
replicaPtr.p->lcpIdStarted = lcpId;
Uint32 i = nodePtr.p->noOfQueuedChkpt; if (nodePtr.p->noOfStartedChkpt < 2)
nodePtr.p->queuedChkpt[i].tableId = tabPtr.i; {
nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId; jam();
nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i; /**
nodePtr.p->noOfQueuedChkpt = i + 1; * Send LCP_FRAG_ORD to LQH
} else { */
jam();
/**
* Mark the replica so with lcpIdStarted == true
*/
replicaPtr.p->lcpIdStarted = lcpId;
if(save){ Uint32 i = nodePtr.p->noOfStartedChkpt;
nodePtr.p->startedChkpt[i].tableId = tabPtr.i;
nodePtr.p->startedChkpt[i].fragId = curr.fragmentId;
nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i;
nodePtr.p->noOfStartedChkpt = i + 1;
sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]);
}
else if (nodePtr.p->noOfQueuedChkpt < 2)
{
jam();
/** /**
* Stop increasing value on first that was "full" * Put LCP_FRAG_ORD "in queue"
*/ */
c_lcpState.currentFragment = curr;
save = false;
}
busyNodes.set(nodePtr.i);
if(busyNodes.count() == lcpNodes){
/** /**
* There were no possibility to start the local checkpoint * Mark the replica so with lcpIdStarted == true
* and it was not possible to queue it up. In this case we
* stop the start of local checkpoints until the nodes with a
* backlog have performed more checkpoints. We will return and
* will not continue the process of starting any more checkpoints.
*/ */
return; replicaPtr.p->lcpIdStarted = lcpId;
Uint32 i = nodePtr.p->noOfQueuedChkpt;
nodePtr.p->queuedChkpt[i].tableId = tabPtr.i;
nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId;
nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i;
nodePtr.p->noOfQueuedChkpt = i + 1;
}
else
{
jam();
if(save)
{
/**
* Stop increasing value on first that was "full"
*/
c_lcpState.currentFragment = curr;
save = false;
}
busyNodes.set(nodePtr.i);
if(busyNodes.count() == lcpNodes)
{
/**
* There were no possibility to start the local checkpoint
* and it was not possible to queue it up. In this case we
* stop the start of local checkpoints until the nodes with a
* backlog have performed more checkpoints. We will return and
* will not continue the process of starting any more checkpoints.
*/
return;
}//if
}//if }//if
}//if }
} }//while
}//while }
curr.fragmentId++; curr.fragmentId++;
if (curr.fragmentId >= tabPtr.p->totalfragments) { if (curr.fragmentId >= tabPtr.p->totalfragments) {
jam(); jam();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment