Commit 9ed1b843 authored by unknown's avatar unknown

Bug#26293 cluster mgmt node sometimes doesn't receive events from all nodes on restart

- signals were sometimes sent too early when setting up subscriptions


ndb/include/kernel/signaldata/DumpStateOrd.hpp:
  added dump for active subscriptions in cmvmi
ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp:
  added dump for active subscriptions in cmvmi
ndb/src/mgmsrv/MgmtSrvr.cpp:
  fixed a bug where signals were sent before API_REGCONF had arrived, causing the signals to be thrown away and the management server to subsequently hang
  also added a retry for the case where a node is connected but API_REGCONF has not yet been received
ndb/src/ndbapi/ClusterMgr.cpp:
  added status variable m_api_reg_conf in cluster manager to correctly be able to determine if a node is sendable
ndb/src/ndbapi/ClusterMgr.hpp:
  added status variable m_api_reg_conf in cluster manager to correctly be able to determine if a node is sendable
ndb/src/ndbapi/SignalSender.cpp:
  assert to see that node is sendable when signal is sent
ndb/src/ndbapi/SignalSender.hpp:
  make readSignalNumber() method const
parent 2ed7eaf5
...@@ -107,6 +107,10 @@ public: ...@@ -107,6 +107,10 @@ public:
CmvmiDumpLongSignalMemory = 2601, CmvmiDumpLongSignalMemory = 2601,
CmvmiSetRestartOnErrorInsert = 2602, CmvmiSetRestartOnErrorInsert = 2602,
CmvmiTestLongSigWithDelay = 2603, CmvmiTestLongSigWithDelay = 2603,
CmvmiDumpSubscriptions = 2604, /* note: done to respective outfile
to be able to debug if events
for some reason does not end up
in clusterlog */
// 7000 DIH // 7000 DIH
// 7001 DIH // 7001 DIH
// 7002 DIH // 7002 DIH
......
...@@ -897,7 +897,7 @@ void Cmvmi::execSET_VAR_REQ(Signal* signal) ...@@ -897,7 +897,7 @@ void Cmvmi::execSET_VAR_REQ(Signal* signal)
case TimeToWaitAlive: case TimeToWaitAlive:
// QMGR // QMGR
case HeartbeatIntervalDbDb: // TODO ev till Ndbcnt ocks case HeartbeatIntervalDbDb: // TODO possibly Ndbcnt too
case HeartbeatIntervalDbApi: case HeartbeatIntervalDbApi:
case ArbitTimeout: case ArbitTimeout:
sendSignal(QMGR_REF, GSN_SET_VAR_REQ, signal, 3, JBB); sendSignal(QMGR_REF, GSN_SET_VAR_REQ, signal, 3, JBB);
...@@ -1105,6 +1105,24 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal) ...@@ -1105,6 +1105,24 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal)
} }
} }
if (arg == DumpStateOrd::CmvmiDumpSubscriptions)
{
SubscriberPtr ptr;
subscribers.first(ptr);
g_eventLogger.info("List subscriptions:");
while(ptr.i != RNIL)
{
g_eventLogger.info("Subscription: %u, nodeId: %u, ref: 0x%x",
ptr.i, refToNode(ptr.p->blockRef), ptr.p->blockRef);
for(Uint32 i = 0; i < LogLevel::LOGLEVEL_CATEGORIES; i++)
{
Uint32 level = ptr.p->logLevel.getLogLevel((LogLevel::EventCategory)i);
g_eventLogger.info("Category %u Level %u", i, level);
}
subscribers.next(ptr);
}
}
if (arg == DumpStateOrd::CmvmiDumpLongSignalMemory){ if (arg == DumpStateOrd::CmvmiDumpLongSignalMemory){
infoEvent("Cmvmi: g_sectionSegmentPool size: %d free: %d", infoEvent("Cmvmi: g_sectionSegmentPool size: %d free: %d",
g_sectionSegmentPool.getSize(), g_sectionSegmentPool.getSize(),
......
...@@ -704,7 +704,7 @@ int MgmtSrvr::okToSendTo(NodeId nodeId, bool unCond) ...@@ -704,7 +704,7 @@ int MgmtSrvr::okToSendTo(NodeId nodeId, bool unCond)
return WRONG_PROCESS_TYPE; return WRONG_PROCESS_TYPE;
// Check if we have contact with it // Check if we have contact with it
if(unCond){ if(unCond){
if(theFacade->theClusterMgr->getNodeInfo(nodeId).connected) if(theFacade->theClusterMgr->getNodeInfo(nodeId).m_api_reg_conf)
return 0; return 0;
} }
else if (theFacade->get_node_alive(nodeId) == true) else if (theFacade->get_node_alive(nodeId) == true)
...@@ -1562,12 +1562,17 @@ MgmtSrvr::status(int nodeId, ...@@ -1562,12 +1562,17 @@ MgmtSrvr::status(int nodeId,
} }
int int
MgmtSrvr::setEventReportingLevelImpl(int nodeId, MgmtSrvr::setEventReportingLevelImpl(int nodeId_arg,
const EventSubscribeReq& ll) const EventSubscribeReq& ll)
{ {
SignalSender ss(theFacade); SignalSender ss(theFacade);
NdbNodeBitmask nodes;
int retries = 30;
nodes.clear();
while (1)
{
Uint32 nodeId, max;
ss.lock(); ss.lock();
SimpleSignal ssig; SimpleSignal ssig;
EventSubscribeReq * dst = EventSubscribeReq * dst =
CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend()); CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend());
...@@ -1575,19 +1580,67 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId, ...@@ -1575,19 +1580,67 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId,
EventSubscribeReq::SignalLength); EventSubscribeReq::SignalLength);
*dst = ll; *dst = ll;
NodeBitmask nodes; if (nodeId_arg == 0)
nodes.clear(); {
Uint32 max = (nodeId == 0) ? (nodeId = 1, MAX_NDB_NODES) : nodeId; // all nodes
for(; (Uint32) nodeId <= max; nodeId++) nodeId = 1;
max = MAX_NDB_NODES;
}
else
{
// only one node
max = nodeId = nodeId_arg;
}
// first make sure nodes are sendable
for(; nodeId <= max; nodeId++)
{ {
if (nodeTypes[nodeId] != NODE_TYPE_DB) if (nodeTypes[nodeId] != NODE_TYPE_DB)
continue; continue;
if (okToSendTo(nodeId, true)) if (okToSendTo(nodeId, true))
{
if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false)
{
// node not connected we can safely skip this one
continue; continue;
if (ss.sendSignal(nodeId, &ssig) == SEND_OK) }
// api_reg_conf not recevied yet, need to retry
break;
}
}
if (nodeId <= max)
{ {
if (--retries)
{
ss.unlock();
NdbSleep_MilliSleep(100);
continue;
}
return SEND_OR_RECEIVE_FAILED;
}
if (nodeId_arg == 0)
{
// all nodes
nodeId = 1;
max = MAX_NDB_NODES;
}
else
{
// only one node
max = nodeId = nodeId_arg;
}
// now send to all sendable nodes nodes
// note, lock is held, so states have not changed
for(; (Uint32) nodeId <= max; nodeId++)
{
if (nodeTypes[nodeId] != NODE_TYPE_DB)
continue;
if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false)
continue; // node is not connected, skip
if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
nodes.set(nodeId); nodes.set(nodeId);
} }
break;
} }
if (nodes.isclear()) if (nodes.isclear())
...@@ -1598,6 +1651,7 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId, ...@@ -1598,6 +1651,7 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId,
int error = 0; int error = 0;
while (!nodes.isclear()) while (!nodes.isclear())
{ {
Uint32 nodeId;
SimpleSignal *signal = ss.waitFor(); SimpleSignal *signal = ss.waitFor();
int gsn = signal->readSignalNumber(); int gsn = signal->readSignalNumber();
nodeId = refToNode(signal->header.theSendersBlockRef); nodeId = refToNode(signal->header.theSendersBlockRef);
......
...@@ -327,7 +327,7 @@ ClusterMgr::showState(NodeId nodeId){ ...@@ -327,7 +327,7 @@ ClusterMgr::showState(NodeId nodeId){
ClusterMgr::Node::Node() ClusterMgr::Node::Node()
: m_state(NodeState::SL_NOTHING) { : m_state(NodeState::SL_NOTHING) {
compatible = nfCompleteRep = true; compatible = nfCompleteRep = true;
connected = defined = m_alive = false; connected = defined = m_alive = m_api_reg_conf = false;
m_state.m_connected_nodes.clear(); m_state.m_connected_nodes.clear();
} }
...@@ -401,6 +401,8 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){ ...@@ -401,6 +401,8 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){
node.m_info.m_version); node.m_info.m_version);
} }
node.m_api_reg_conf = true;
node.m_state = apiRegConf->nodeState; node.m_state = apiRegConf->nodeState;
if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED || if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED ||
node.m_state.startLevel == NodeState::SL_SINGLEUSER)){ node.m_state.startLevel == NodeState::SL_SINGLEUSER)){
...@@ -519,6 +521,7 @@ ClusterMgr::reportDisconnected(NodeId nodeId){ ...@@ -519,6 +521,7 @@ ClusterMgr::reportDisconnected(NodeId nodeId){
noOfConnectedNodes--; noOfConnectedNodes--;
theNodes[nodeId].connected = false; theNodes[nodeId].connected = false;
theNodes[nodeId].m_api_reg_conf = false;
theNodes[nodeId].m_state.m_connected_nodes.clear(); theNodes[nodeId].m_state.m_connected_nodes.clear();
reportNodeFailed(nodeId, true); reportNodeFailed(nodeId, true);
......
...@@ -65,6 +65,7 @@ public: ...@@ -65,6 +65,7 @@ public:
bool compatible; // Version is compatible bool compatible; // Version is compatible
bool nfCompleteRep; // NF Complete Rep has arrived bool nfCompleteRep; // NF Complete Rep has arrived
bool m_alive; // Node is alive bool m_alive; // Node is alive
bool m_api_reg_conf;// API_REGCONF has arrived
NodeInfo m_info; NodeInfo m_info;
NodeState m_state; NodeState m_state;
......
...@@ -140,6 +140,8 @@ SignalSender::getNoOfConnectedNodes() const { ...@@ -140,6 +140,8 @@ SignalSender::getNoOfConnectedNodes() const {
SendStatus SendStatus
SignalSender::sendSignal(Uint16 nodeId, const SimpleSignal * s){ SignalSender::sendSignal(Uint16 nodeId, const SimpleSignal * s){
assert(getNodeInfo(nodeId).m_api_reg_conf == true ||
s->readSignalNumber() == GSN_API_REGREQ);
return theFacade->theTransporterRegistry->prepareSend(&s->header, return theFacade->theTransporterRegistry->prepareSend(&s->header,
1, // JBB 1, // JBB
&s->theData[0], &s->theData[0],
......
...@@ -32,7 +32,7 @@ public: ...@@ -32,7 +32,7 @@ public:
Uint32 theData[25]; Uint32 theData[25];
LinearSectionPtr ptr[3]; LinearSectionPtr ptr[3];
int readSignalNumber() {return header.theVerId_signalNumber; } int readSignalNumber() const {return header.theVerId_signalNumber; }
Uint32 *getDataPtrSend() { return theData; } Uint32 *getDataPtrSend() { return theData; }
const Uint32 *getDataPtr() const { return theData; } const Uint32 *getDataPtr() const { return theData; }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment