Commit 9ed1b843 authored by unknown's avatar unknown

Bug#26293 cluster mgmt node sometimes doesn't receive events from all nodes on restart

- signals where sometimes sent too early when setting up subscriptions


ndb/include/kernel/signaldata/DumpStateOrd.hpp:
  added dump for active subscriptions in cmvmi
ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp:
  added dump for active subscriptions in cmvmi
ndb/src/mgmsrv/MgmtSrvr.cpp:
  bug in that signals where sent prior to api reg conf arrived, causing thrown away signals and subsequent hangs in mgmtserver
  also add retry if node connected but not yet received api reg conf
ndb/src/ndbapi/ClusterMgr.cpp:
  added status variable m_api_reg_conf in cluster manager to correctly be able to determine if a node is sendable
ndb/src/ndbapi/ClusterMgr.hpp:
  added status variable m_api_reg_conf in cluster manager to correctly be able to determine if a node is sendable
ndb/src/ndbapi/SignalSender.cpp:
  assert to see that node is sendable when signal is sent
ndb/src/ndbapi/SignalSender.hpp:
  manke metchd const
parent 2ed7eaf5
......@@ -107,6 +107,10 @@ public:
CmvmiDumpLongSignalMemory = 2601,
CmvmiSetRestartOnErrorInsert = 2602,
CmvmiTestLongSigWithDelay = 2603,
CmvmiDumpSubscriptions = 2604, /* note: done to respective outfile
to be able to debug if events
for some reason does not end up
in clusterlog */
// 7000 DIH
// 7001 DIH
// 7002 DIH
......
......@@ -897,7 +897,7 @@ void Cmvmi::execSET_VAR_REQ(Signal* signal)
case TimeToWaitAlive:
// QMGR
case HeartbeatIntervalDbDb: // TODO ev till Ndbcnt ocks
case HeartbeatIntervalDbDb: // TODO possibly Ndbcnt too
case HeartbeatIntervalDbApi:
case ArbitTimeout:
sendSignal(QMGR_REF, GSN_SET_VAR_REQ, signal, 3, JBB);
......@@ -1105,6 +1105,24 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal)
}
}
if (arg == DumpStateOrd::CmvmiDumpSubscriptions)
{
SubscriberPtr ptr;
subscribers.first(ptr);
g_eventLogger.info("List subscriptions:");
while(ptr.i != RNIL)
{
g_eventLogger.info("Subscription: %u, nodeId: %u, ref: 0x%x",
ptr.i, refToNode(ptr.p->blockRef), ptr.p->blockRef);
for(Uint32 i = 0; i < LogLevel::LOGLEVEL_CATEGORIES; i++)
{
Uint32 level = ptr.p->logLevel.getLogLevel((LogLevel::EventCategory)i);
g_eventLogger.info("Category %u Level %u", i, level);
}
subscribers.next(ptr);
}
}
if (arg == DumpStateOrd::CmvmiDumpLongSignalMemory){
infoEvent("Cmvmi: g_sectionSegmentPool size: %d free: %d",
g_sectionSegmentPool.getSize(),
......
......@@ -704,7 +704,7 @@ int MgmtSrvr::okToSendTo(NodeId nodeId, bool unCond)
return WRONG_PROCESS_TYPE;
// Check if we have contact with it
if(unCond){
if(theFacade->theClusterMgr->getNodeInfo(nodeId).connected)
if(theFacade->theClusterMgr->getNodeInfo(nodeId).m_api_reg_conf)
return 0;
}
else if (theFacade->get_node_alive(nodeId) == true)
......@@ -1562,32 +1562,85 @@ MgmtSrvr::status(int nodeId,
}
int
MgmtSrvr::setEventReportingLevelImpl(int nodeId,
MgmtSrvr::setEventReportingLevelImpl(int nodeId_arg,
const EventSubscribeReq& ll)
{
SignalSender ss(theFacade);
ss.lock();
SimpleSignal ssig;
EventSubscribeReq * dst =
CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend());
ssig.set(ss,TestOrd::TraceAPI, CMVMI, GSN_EVENT_SUBSCRIBE_REQ,
EventSubscribeReq::SignalLength);
*dst = ll;
NodeBitmask nodes;
NdbNodeBitmask nodes;
int retries = 30;
nodes.clear();
Uint32 max = (nodeId == 0) ? (nodeId = 1, MAX_NDB_NODES) : nodeId;
for(; (Uint32) nodeId <= max; nodeId++)
while (1)
{
if (nodeTypes[nodeId] != NODE_TYPE_DB)
continue;
if (okToSendTo(nodeId, true))
continue;
if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
Uint32 nodeId, max;
ss.lock();
SimpleSignal ssig;
EventSubscribeReq * dst =
CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend());
ssig.set(ss,TestOrd::TraceAPI, CMVMI, GSN_EVENT_SUBSCRIBE_REQ,
EventSubscribeReq::SignalLength);
*dst = ll;
if (nodeId_arg == 0)
{
nodes.set(nodeId);
// all nodes
nodeId = 1;
max = MAX_NDB_NODES;
}
else
{
// only one node
max = nodeId = nodeId_arg;
}
// first make sure nodes are sendable
for(; nodeId <= max; nodeId++)
{
if (nodeTypes[nodeId] != NODE_TYPE_DB)
continue;
if (okToSendTo(nodeId, true))
{
if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false)
{
// node not connected we can safely skip this one
continue;
}
// api_reg_conf not recevied yet, need to retry
break;
}
}
if (nodeId <= max)
{
if (--retries)
{
ss.unlock();
NdbSleep_MilliSleep(100);
continue;
}
return SEND_OR_RECEIVE_FAILED;
}
if (nodeId_arg == 0)
{
// all nodes
nodeId = 1;
max = MAX_NDB_NODES;
}
else
{
// only one node
max = nodeId = nodeId_arg;
}
// now send to all sendable nodes nodes
// note, lock is held, so states have not changed
for(; (Uint32) nodeId <= max; nodeId++)
{
if (nodeTypes[nodeId] != NODE_TYPE_DB)
continue;
if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false)
continue; // node is not connected, skip
if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
nodes.set(nodeId);
}
break;
}
if (nodes.isclear())
......@@ -1598,6 +1651,7 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId,
int error = 0;
while (!nodes.isclear())
{
Uint32 nodeId;
SimpleSignal *signal = ss.waitFor();
int gsn = signal->readSignalNumber();
nodeId = refToNode(signal->header.theSendersBlockRef);
......
......@@ -327,7 +327,7 @@ ClusterMgr::showState(NodeId nodeId){
ClusterMgr::Node::Node()
: m_state(NodeState::SL_NOTHING) {
compatible = nfCompleteRep = true;
connected = defined = m_alive = false;
connected = defined = m_alive = m_api_reg_conf = false;
m_state.m_connected_nodes.clear();
}
......@@ -401,6 +401,8 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){
node.m_info.m_version);
}
node.m_api_reg_conf = true;
node.m_state = apiRegConf->nodeState;
if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED ||
node.m_state.startLevel == NodeState::SL_SINGLEUSER)){
......@@ -519,6 +521,7 @@ ClusterMgr::reportDisconnected(NodeId nodeId){
noOfConnectedNodes--;
theNodes[nodeId].connected = false;
theNodes[nodeId].m_api_reg_conf = false;
theNodes[nodeId].m_state.m_connected_nodes.clear();
reportNodeFailed(nodeId, true);
......
......@@ -65,6 +65,7 @@ public:
bool compatible; // Version is compatible
bool nfCompleteRep; // NF Complete Rep has arrived
bool m_alive; // Node is alive
bool m_api_reg_conf;// API_REGCONF has arrived
NodeInfo m_info;
NodeState m_state;
......
......@@ -140,6 +140,8 @@ SignalSender::getNoOfConnectedNodes() const {
SendStatus
SignalSender::sendSignal(Uint16 nodeId, const SimpleSignal * s){
assert(getNodeInfo(nodeId).m_api_reg_conf == true ||
s->readSignalNumber() == GSN_API_REGREQ);
return theFacade->theTransporterRegistry->prepareSend(&s->header,
1, // JBB
&s->theData[0],
......
......@@ -32,7 +32,7 @@ public:
Uint32 theData[25];
LinearSectionPtr ptr[3];
int readSignalNumber() {return header.theVerId_signalNumber; }
int readSignalNumber() const {return header.theVerId_signalNumber; }
Uint32 *getDataPtrSend() { return theData; }
const Uint32 *getDataPtr() const { return theData; }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment