Bug#26293 cluster mgmt node sometimes doesn't receive events from all nodes on restart

- signals were sometimes sent too early when setting up subscriptions
parent de1572d6
...@@ -107,6 +107,10 @@ public: ...@@ -107,6 +107,10 @@ public:
CmvmiDumpLongSignalMemory = 2601, CmvmiDumpLongSignalMemory = 2601,
CmvmiSetRestartOnErrorInsert = 2602, CmvmiSetRestartOnErrorInsert = 2602,
CmvmiTestLongSigWithDelay = 2603, CmvmiTestLongSigWithDelay = 2603,
CmvmiDumpSubscriptions = 2604, /* note: done to respective outfile
to be able to debug if events
for some reason do not end up
in clusterlog */
// 7000 DIH // 7000 DIH
// 7001 DIH // 7001 DIH
// 7002 DIH // 7002 DIH
......
...@@ -897,7 +897,7 @@ void Cmvmi::execSET_VAR_REQ(Signal* signal) ...@@ -897,7 +897,7 @@ void Cmvmi::execSET_VAR_REQ(Signal* signal)
case TimeToWaitAlive: case TimeToWaitAlive:
// QMGR // QMGR
case HeartbeatIntervalDbDb: // TODO ev till Ndbcnt ocks case HeartbeatIntervalDbDb: // TODO possibly Ndbcnt too
case HeartbeatIntervalDbApi: case HeartbeatIntervalDbApi:
case ArbitTimeout: case ArbitTimeout:
sendSignal(QMGR_REF, GSN_SET_VAR_REQ, signal, 3, JBB); sendSignal(QMGR_REF, GSN_SET_VAR_REQ, signal, 3, JBB);
...@@ -1105,6 +1105,24 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal) ...@@ -1105,6 +1105,24 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal)
} }
} }
if (arg == DumpStateOrd::CmvmiDumpSubscriptions)
{
SubscriberPtr ptr;
subscribers.first(ptr);
g_eventLogger.info("List subscriptions:");
while(ptr.i != RNIL)
{
g_eventLogger.info("Subscription: %u, nodeId: %u, ref: 0x%x",
ptr.i, refToNode(ptr.p->blockRef), ptr.p->blockRef);
for(Uint32 i = 0; i < LogLevel::LOGLEVEL_CATEGORIES; i++)
{
Uint32 level = ptr.p->logLevel.getLogLevel((LogLevel::EventCategory)i);
g_eventLogger.info("Category %u Level %u", i, level);
}
subscribers.next(ptr);
}
}
if (arg == DumpStateOrd::CmvmiDumpLongSignalMemory){ if (arg == DumpStateOrd::CmvmiDumpLongSignalMemory){
infoEvent("Cmvmi: g_sectionSegmentPool size: %d free: %d", infoEvent("Cmvmi: g_sectionSegmentPool size: %d free: %d",
g_sectionSegmentPool.getSize(), g_sectionSegmentPool.getSize(),
......
...@@ -704,7 +704,7 @@ int MgmtSrvr::okToSendTo(NodeId nodeId, bool unCond) ...@@ -704,7 +704,7 @@ int MgmtSrvr::okToSendTo(NodeId nodeId, bool unCond)
return WRONG_PROCESS_TYPE; return WRONG_PROCESS_TYPE;
// Check if we have contact with it // Check if we have contact with it
if(unCond){ if(unCond){
if(theFacade->theClusterMgr->getNodeInfo(nodeId).connected) if(theFacade->theClusterMgr->getNodeInfo(nodeId).m_api_reg_conf)
return 0; return 0;
} }
else if (theFacade->get_node_alive(nodeId) == true) else if (theFacade->get_node_alive(nodeId) == true)
...@@ -1562,12 +1562,17 @@ MgmtSrvr::status(int nodeId, ...@@ -1562,12 +1562,17 @@ MgmtSrvr::status(int nodeId,
} }
int int
MgmtSrvr::setEventReportingLevelImpl(int nodeId, MgmtSrvr::setEventReportingLevelImpl(int nodeId_arg,
const EventSubscribeReq& ll) const EventSubscribeReq& ll)
{ {
SignalSender ss(theFacade); SignalSender ss(theFacade);
NdbNodeBitmask nodes;
int retries = 30;
nodes.clear();
while (1)
{
Uint32 nodeId, max;
ss.lock(); ss.lock();
SimpleSignal ssig; SimpleSignal ssig;
EventSubscribeReq * dst = EventSubscribeReq * dst =
CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend()); CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend());
...@@ -1575,19 +1580,67 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId, ...@@ -1575,19 +1580,67 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId,
EventSubscribeReq::SignalLength); EventSubscribeReq::SignalLength);
*dst = ll; *dst = ll;
NodeBitmask nodes; if (nodeId_arg == 0)
nodes.clear(); {
Uint32 max = (nodeId == 0) ? (nodeId = 1, MAX_NDB_NODES) : nodeId; // all nodes
for(; (Uint32) nodeId <= max; nodeId++) nodeId = 1;
max = MAX_NDB_NODES;
}
else
{
// only one node
max = nodeId = nodeId_arg;
}
// first make sure nodes are sendable
for(; nodeId <= max; nodeId++)
{ {
if (nodeTypes[nodeId] != NODE_TYPE_DB) if (nodeTypes[nodeId] != NODE_TYPE_DB)
continue; continue;
if (okToSendTo(nodeId, true)) if (okToSendTo(nodeId, true))
{
if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false)
{
// node not connected we can safely skip this one
continue; continue;
if (ss.sendSignal(nodeId, &ssig) == SEND_OK) }
// api_reg_conf not received yet, need to retry
break;
}
}
if (nodeId <= max)
{ {
if (--retries)
{
ss.unlock();
NdbSleep_MilliSleep(100);
continue;
}
return SEND_OR_RECEIVE_FAILED;
}
if (nodeId_arg == 0)
{
// all nodes
nodeId = 1;
max = MAX_NDB_NODES;
}
else
{
// only one node
max = nodeId = nodeId_arg;
}
// now send to all sendable nodes
// note, lock is held, so states have not changed
for(; (Uint32) nodeId <= max; nodeId++)
{
if (nodeTypes[nodeId] != NODE_TYPE_DB)
continue;
if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false)
continue; // node is not connected, skip
if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
nodes.set(nodeId); nodes.set(nodeId);
} }
break;
} }
if (nodes.isclear()) if (nodes.isclear())
...@@ -1598,6 +1651,7 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId, ...@@ -1598,6 +1651,7 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId,
int error = 0; int error = 0;
while (!nodes.isclear()) while (!nodes.isclear())
{ {
Uint32 nodeId;
SimpleSignal *signal = ss.waitFor(); SimpleSignal *signal = ss.waitFor();
int gsn = signal->readSignalNumber(); int gsn = signal->readSignalNumber();
nodeId = refToNode(signal->header.theSendersBlockRef); nodeId = refToNode(signal->header.theSendersBlockRef);
......
...@@ -327,7 +327,7 @@ ClusterMgr::showState(NodeId nodeId){ ...@@ -327,7 +327,7 @@ ClusterMgr::showState(NodeId nodeId){
ClusterMgr::Node::Node() ClusterMgr::Node::Node()
: m_state(NodeState::SL_NOTHING) { : m_state(NodeState::SL_NOTHING) {
compatible = nfCompleteRep = true; compatible = nfCompleteRep = true;
connected = defined = m_alive = false; connected = defined = m_alive = m_api_reg_conf = false;
m_state.m_connected_nodes.clear(); m_state.m_connected_nodes.clear();
} }
...@@ -401,6 +401,8 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){ ...@@ -401,6 +401,8 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){
node.m_info.m_version); node.m_info.m_version);
} }
node.m_api_reg_conf = true;
node.m_state = apiRegConf->nodeState; node.m_state = apiRegConf->nodeState;
if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED || if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED ||
node.m_state.startLevel == NodeState::SL_SINGLEUSER)){ node.m_state.startLevel == NodeState::SL_SINGLEUSER)){
...@@ -519,6 +521,7 @@ ClusterMgr::reportDisconnected(NodeId nodeId){ ...@@ -519,6 +521,7 @@ ClusterMgr::reportDisconnected(NodeId nodeId){
noOfConnectedNodes--; noOfConnectedNodes--;
theNodes[nodeId].connected = false; theNodes[nodeId].connected = false;
theNodes[nodeId].m_api_reg_conf = false;
theNodes[nodeId].m_state.m_connected_nodes.clear(); theNodes[nodeId].m_state.m_connected_nodes.clear();
reportNodeFailed(nodeId, true); reportNodeFailed(nodeId, true);
......
...@@ -65,6 +65,7 @@ public: ...@@ -65,6 +65,7 @@ public:
bool compatible; // Version is compatible bool compatible; // Version is compatible
bool nfCompleteRep; // NF Complete Rep has arrived bool nfCompleteRep; // NF Complete Rep has arrived
bool m_alive; // Node is alive bool m_alive; // Node is alive
bool m_api_reg_conf;// API_REGCONF has arrived
NodeInfo m_info; NodeInfo m_info;
NodeState m_state; NodeState m_state;
......
...@@ -140,6 +140,8 @@ SignalSender::getNoOfConnectedNodes() const { ...@@ -140,6 +140,8 @@ SignalSender::getNoOfConnectedNodes() const {
SendStatus SendStatus
SignalSender::sendSignal(Uint16 nodeId, const SimpleSignal * s){ SignalSender::sendSignal(Uint16 nodeId, const SimpleSignal * s){
assert(getNodeInfo(nodeId).m_api_reg_conf == true ||
s->readSignalNumber() == GSN_API_REGREQ);
return theFacade->theTransporterRegistry->prepareSend(&s->header, return theFacade->theTransporterRegistry->prepareSend(&s->header,
1, // JBB 1, // JBB
&s->theData[0], &s->theData[0],
......
...@@ -32,7 +32,7 @@ public: ...@@ -32,7 +32,7 @@ public:
Uint32 theData[25]; Uint32 theData[25];
LinearSectionPtr ptr[3]; LinearSectionPtr ptr[3];
int readSignalNumber() {return header.theVerId_signalNumber; } int readSignalNumber() const {return header.theVerId_signalNumber; }
Uint32 *getDataPtrSend() { return theData; } Uint32 *getDataPtrSend() { return theData; }
const Uint32 *getDataPtr() const { return theData; } const Uint32 *getDataPtr() const { return theData; }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment