MySQL Bugs: #16772: Starting node joins cluster too early, workaround avoiding...

MySQL Bugs: #16772: Starting node joins cluster too early, workaround avoiding the issue for dynamically allocated nodeid's
parent 8188f77c
......@@ -111,9 +111,9 @@ extern const GlobalSignalNumber NO_OF_SIGNAL_NAMES;
/* 57 unused */
/* 58 unused */
/* 59 unused */
/* 60 unused */
/* 61 unused */
/* 62 unused */
/* Node id allocation signals: request, confirm and refuse (slots 60-62) */
#define GSN_ALLOC_NODEID_REQ 60
#define GSN_ALLOC_NODEID_CONF 61
#define GSN_ALLOC_NODEID_REF 62
/* 63 unused */
/* 64 unused */
/* 65 unused */
......
/* Copyright (C) 2003 MySQL AB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
#ifndef ALLOC_NODE_ID_HPP
#define ALLOC_NODE_ID_HPP
#include "SignalData.hpp"
#include <NodeBitmask.hpp>
/**
* Request to allocate node id
*
* Built by MgmtSrvr::alloc_node_id_req() and addressed to QMGR; the QMGR
* that accepts it re-sends the same signal to the QMGR blocks of the
* cluster nodes with senderRef replaced by its own reference.
*/
class AllocNodeIdReq {
public:
// Matches the three Uint32 members below
STATIC_CONST( SignalLength = 3 );
// Reference of the sender; CONF/REF replies are sent to this reference
Uint32 senderRef;
// Requester-chosen value, copied into the final CONF/REF by QMGR
Uint32 senderData;
// Node id the requester wants to allocate
Uint32 nodeId;
};
/**
* Confirmation that a node id may be allocated
*
* Sent by a QMGR that has no objection to the request, and by the
* coordinating QMGR to the original requester once all replies are in
* and none of them was a refusal.
*/
class AllocNodeIdConf {
public:
// Matches the three Uint32 members below
STATIC_CONST( SignalLength = 3 );
// Reference of the QMGR block sending the confirmation
Uint32 senderRef;
// senderData of the request being confirmed
Uint32 senderData;
// Node id the confirmation refers to
Uint32 nodeId;
};
/**
* Refusal of a node id allocation request
*/
class AllocNodeIdRef {
public:
// Matches the five Uint32 members below
STATIC_CONST( SignalLength = 5 );
enum ErrorCodes {
NoError = 0,
Undefined = 1,
// A REF carrying this code is passed to RequestTracker::ignoreRef() by
// QMGR and is not recorded as the error of the request
NF_FakeErrorREF = 11,
// The receiving QMGR still has an earlier allocation request outstanding;
// MgmtSrvr::alloc_node_id_req() retries on this code
Busy = 701,
// The receiving node is not the president; masterRef says where to retry
NotMaster = 702,
// Listed as "Node already reserved" in the error message table
NodeReserved = 1701,
// The requested node id is in the receiver's set of connected nodes
NodeConnected = 1702,
// The receiver's node record for the id is not in failState NORMAL
NodeFailureHandlingNotCompleted = 1703
};
// Reference of the QMGR block sending the refusal
Uint32 senderRef;
// senderData of the request being refused
Uint32 senderData;
// Node id the refusal refers to
Uint32 nodeId;
// One of ErrorCodes
Uint32 errorCode;
// QMGR reference on the node the sender considers president
Uint32 masterRef;
};
#endif
......@@ -1509,6 +1509,9 @@ void Ndbcntr::execNODE_FAILREP(Signal* signal)
sendSignal(SUMA_REF, GSN_NODE_FAILREP, signal,
NodeFailRep::SignalLength, JBB);
sendSignal(QMGR_REF, GSN_NODE_FAILREP, signal,
NodeFailRep::SignalLength, JBB);
Uint32 nodeId = 0;
while(!allFailed.isclear()){
nodeId = allFailed.find(nodeId + 1);
......
......@@ -29,6 +29,10 @@
#include <signaldata/CmRegSignalData.hpp>
#include <signaldata/ApiRegSignalData.hpp>
#include <signaldata/FailRep.hpp>
#include <signaldata/AllocNodeId.hpp>
#include <SafeCounter.hpp>
#include <RequestTracker.hpp>
#include "timer.hpp"
......@@ -222,6 +226,12 @@ private:
void execAPI_VERSION_REQ(Signal* signal);
void execAPI_BROADCAST_REP(Signal* signal);
void execNODE_FAILREP(Signal *);
void execALLOC_NODEID_REQ(Signal *);
void execALLOC_NODEID_CONF(Signal *);
void execALLOC_NODEID_REF(Signal *);
void completeAllocNodeIdReq(Signal *);
// Arbitration signals
void execARBIT_CFG(Signal* signal);
void execARBIT_PREPREQ(Signal* signal);
......@@ -388,6 +398,14 @@ private:
Uint16 cprepFailedNodes[MAX_NDB_NODES];
Uint16 ccommitFailedNodes[MAX_NDB_NODES];
/**
* State of the single node id allocation request that this QMGR may be
* coordinating at any time (a new request is refused with Busy while
* m_tracker is not done).
*/
struct OpAllocNodeIdReq {
// Tracks CONF/REF replies to the ALLOC_NODEID_REQ sent to the cluster nodes
RequestTracker m_tracker;
// Copy of the request as received from the original requester
AllocNodeIdReq m_req;
// Connect count of the requesting node when the request was accepted;
// compared again before the final reply is sent
Uint32 m_connectCount;
// First error code reported by a non-ignored REF, 0 if none so far
Uint32 m_error;
};
struct OpAllocNodeIdReq opAllocNodeIdReq;
};
#endif
......@@ -85,6 +85,11 @@ Qmgr::Qmgr(const class Configuration & conf)
addRecSignal(GSN_READ_NODESREQ, &Qmgr::execREAD_NODESREQ);
addRecSignal(GSN_SET_VAR_REQ, &Qmgr::execSET_VAR_REQ);
addRecSignal(GSN_API_BROADCAST_REP, &Qmgr::execAPI_BROADCAST_REP);
addRecSignal(GSN_NODE_FAILREP, &Qmgr::execNODE_FAILREP);
addRecSignal(GSN_ALLOC_NODEID_REQ, &Qmgr::execALLOC_NODEID_REQ);
addRecSignal(GSN_ALLOC_NODEID_CONF, &Qmgr::execALLOC_NODEID_CONF);
addRecSignal(GSN_ALLOC_NODEID_REF, &Qmgr::execALLOC_NODEID_REF);
// Arbitration signals
addRecSignal(GSN_ARBIT_PREPREQ, &Qmgr::execARBIT_PREPREQ);
......
......@@ -3984,3 +3984,167 @@ Qmgr::execAPI_BROADCAST_REP(Signal* signal)
NodeReceiverGroup rg(API_CLUSTERMGR, mask);
sendSignal(rg, api.gsn, signal, len, JBB); // forward sections
}
/**
* Handle NODE_FAILREP in QMGR.
*
* The only action is to hand the report to the counter manager that the
* node id allocation request tracker is registered with.
*/
void
Qmgr::execNODE_FAILREP(Signal * signal)
{
jamEntry();
// make sure any distributed signals get acknowledged
// NOTE: the call is destructive of the signal, so its contents must not be
// relied on after this point
c_counterMgr.execNODE_FAILREP(signal);
}
/**
 * Handle ALLOC_NODEID_REQ.
 *
 * The signal arrives in one of two roles, distinguished by the block number
 * of the sender:
 *
 *  - sender is not QMGR (request from the management server): this node acts
 *    as coordinator ("master"). The request is refused with NotMaster if this
 *    node is not the president, with Busy if an earlier request is still
 *    outstanding, and with NodeConnected if the node id is already connected.
 *    Otherwise the request is stored in opAllocNodeIdReq and re-sent to the
 *    QMGR blocks of all cluster nodes; the final reply is produced by
 *    completeAllocNodeIdReq().
 *
 *  - sender is QMGR: this node acts as participant and answers CONF, or REF
 *    with NodeConnected / NodeFailureHandlingNotCompleted.
 *
 * Every outgoing signal has all of its fields written explicitly, so the
 * replies do not depend on what the send buffer happens to contain.
 *
 * @param signal  the received ALLOC_NODEID_REQ; reused for the reply
 */
void
Qmgr::execALLOC_NODEID_REQ(Signal * signal)
{
  jamEntry();
  const AllocNodeIdReq * req = (AllocNodeIdReq*)signal->getDataPtr();

  // Take copies of everything needed from the request before the signal
  // object is reused for building a reply.
  const Uint32 senderRef = req->senderRef;
  const Uint32 senderData = req->senderData;
  const Uint32 nodeId = req->nodeId;
  Uint32 error = 0;

  if (refToBlock(senderRef) != QMGR) // request from management server
  {
    /* master */

    if (getOwnNodeId() != cpresident)
      error = AllocNodeIdRef::NotMaster;
    else if (!opAllocNodeIdReq.m_tracker.done())
      error = AllocNodeIdRef::Busy;
    else if (c_connectedNodes.get(nodeId))
      error = AllocNodeIdRef::NodeConnected;

    if (error)
    {
      jam();
      AllocNodeIdRef * ref = (AllocNodeIdRef*)signal->getDataPtrSend();
      ref->senderRef = reference();
      ref->senderData = senderData;
      ref->nodeId = nodeId;
      ref->errorCode = error;
      // Lets the requester redirect its retry to the president
      ref->masterRef = numberToRef(QMGR, cpresident);
      sendSignal(senderRef, GSN_ALLOC_NODEID_REF, signal,
                 AllocNodeIdRef::SignalLength, JBB);
      return;
    }

    // Remember the request; the connect count is re-checked before the
    // final reply is sent in completeAllocNodeIdReq().
    opAllocNodeIdReq.m_req = *req;
    opAllocNodeIdReq.m_error = 0;
    opAllocNodeIdReq.m_connectCount =
      getNodeInfo(refToNode(senderRef)).m_connectCount;

    jam();
    // Forward the request with this block as the sender so that the
    // participants reply here.
    AllocNodeIdReq * fwd = (AllocNodeIdReq*)signal->getDataPtrSend();
    fwd->senderRef = reference();
    fwd->senderData = senderData;
    fwd->nodeId = nodeId;

    NodeReceiverGroup rg(QMGR, c_clusterNodes);
    RequestTracker & p = opAllocNodeIdReq.m_tracker;
    p.init<AllocNodeIdRef>(c_counterMgr, rg, GSN_ALLOC_NODEID_REF, 0);

    sendSignal(rg, GSN_ALLOC_NODEID_REQ, signal,
               AllocNodeIdReq::SignalLength, JBB);
    return;
  }

  /* participant */

  if (c_connectedNodes.get(nodeId))
    error = AllocNodeIdRef::NodeConnected;
  else
  {
    // NOTE(review): nodeId is used as an index without a range check here;
    // confirm that the sender or ptrAss guarantees it is valid.
    NodeRecPtr nodePtr;
    nodePtr.i = nodeId;
    ptrAss(nodePtr, nodeRec);
    if (nodePtr.p->failState != NORMAL)
      error = AllocNodeIdRef::NodeFailureHandlingNotCompleted;
  }

  if (error)
  {
    AllocNodeIdRef * ref = (AllocNodeIdRef*)signal->getDataPtrSend();
    ref->senderRef = reference();
    ref->senderData = senderData;
    ref->nodeId = nodeId;
    ref->errorCode = error;
    // The signal is sent with the full REF length, so give the last word a
    // defined value instead of whatever the buffer held.
    ref->masterRef = numberToRef(QMGR, cpresident);
    sendSignal(senderRef, GSN_ALLOC_NODEID_REF, signal,
               AllocNodeIdRef::SignalLength, JBB);
    return;
  }

  AllocNodeIdConf * conf = (AllocNodeIdConf*)signal->getDataPtrSend();
  conf->senderRef = reference();
  conf->senderData = senderData;
  conf->nodeId = nodeId;
  sendSignal(senderRef, GSN_ALLOC_NODEID_CONF, signal,
             AllocNodeIdConf::SignalLength, JBB);
}
/**
 * Handle ALLOC_NODEID_CONF from a participant (coordinator side).
 *
 * Records the confirmation for the sending node in the request tracker and
 * then lets completeAllocNodeIdReq() decide whether the final reply can go
 * out.
 */
void
Qmgr::execALLOC_NODEID_CONF(Signal * signal)
{
  /* master */
  jamEntry();

  const AllocNodeIdConf * const confSig =
    (AllocNodeIdConf*)signal->getDataPtr();

  RequestTracker & tracker = opAllocNodeIdReq.m_tracker;
  tracker.reportConf(c_counterMgr, refToNode(confSig->senderRef));

  completeAllocNodeIdReq(signal);
}
/**
 * Handle ALLOC_NODEID_REF from a participant (coordinator side).
 *
 * A REF carrying NF_FakeErrorREF is handed to the tracker via ignoreRef()
 * and leaves the stored error untouched. Any other REF is reported to the
 * tracker, and the first such error code is kept as the error of the whole
 * request. In both cases completeAllocNodeIdReq() is called afterwards.
 */
void
Qmgr::execALLOC_NODEID_REF(Signal * signal)
{
  /* master */
  jamEntry();

  const AllocNodeIdRef * const refSig =
    (AllocNodeIdRef*)signal->getDataPtr();
  RequestTracker & tracker = opAllocNodeIdReq.m_tracker;

  if (refSig->errorCode != AllocNodeIdRef::NF_FakeErrorREF)
  {
    tracker.reportRef(c_counterMgr, refToNode(refSig->senderRef));

    // Only the first real error is remembered
    if (opAllocNodeIdReq.m_error == 0)
      opAllocNodeIdReq.m_error = refSig->errorCode;
  }
  else
  {
    tracker.ignoreRef(c_counterMgr, refToNode(refSig->senderRef));
  }

  completeAllocNodeIdReq(signal);
}
/**
 * Send the final reply for the stored node id allocation request, if it is
 * time to do so (coordinator side).
 *
 * Nothing is sent while the tracker still waits for replies, nor when the
 * connect count of the requesting node differs from the one saved when the
 * request was accepted. Otherwise the requester gets a REF carrying the
 * stored error if any participant refused, or a CONF.
 */
void
Qmgr::completeAllocNodeIdReq(Signal *signal)
{
  /* master */
  OpAllocNodeIdReq & op = opAllocNodeIdReq;

  if (!op.m_tracker.done())
  {
    // Still waiting for replies
    jam();
    return;
  }

  const Uint32 requesterRef = op.m_req.senderRef;

  if (op.m_connectCount !=
      getNodeInfo(refToNode(requesterRef)).m_connectCount)
  {
    // The connect count of the requesting node has changed since the request
    // was accepted, so it is no longer the connection that asked: drop the
    // reply.
    jam();
    return;
  }

  if (op.m_tracker.hasRef())
  {
    jam();
    AllocNodeIdRef * ref = (AllocNodeIdRef*)signal->getDataPtrSend();
    ref->senderRef = reference();
    ref->senderData = op.m_req.senderData;
    ref->nodeId = op.m_req.nodeId;
    ref->errorCode = op.m_error;
    ref->masterRef = numberToRef(QMGR, cpresident);
    ndbassert(AllocNodeIdRef::SignalLength == 5);
    sendSignal(requesterRef, GSN_ALLOC_NODEID_REF, signal,
               AllocNodeIdRef::SignalLength, JBB);
  }
  else
  {
    jam();
    AllocNodeIdConf * conf = (AllocNodeIdConf*)signal->getDataPtrSend();
    conf->senderRef = reference();
    conf->senderData = op.m_req.senderData;
    conf->nodeId = op.m_req.nodeId;
    ndbassert(AllocNodeIdConf::SignalLength == 3);
    sendSignal(requesterRef, GSN_ALLOC_NODEID_CONF, signal,
               AllocNodeIdConf::SignalLength, JBB);
  }
}
......@@ -40,6 +40,7 @@
#include <signaldata/ManagementServer.hpp>
#include <signaldata/NFCompleteRep.hpp>
#include <signaldata/NodeFailRep.hpp>
#include <signaldata/AllocNodeId.hpp>
#include <NdbSleep.h>
#include <EventLogger.hpp>
#include <DebuggerNames.hpp>
......@@ -1712,6 +1713,88 @@ MgmtSrvr::get_connected_nodes(NodeBitmask &connected_nodes) const
}
}
/**
 * Ask the cluster whether a node id may be handed out.
 *
 * Sends ALLOC_NODEID_REQ to QMGR on a data node that the facade reports as
 * alive and waits for the answer. A REF with NotMaster or Busy causes the
 * request to be re-sent to the node named in the REF's masterRef. If the
 * node currently being asked completes failure handling (NF_COMPLETEREP),
 * a new alive data node is looked up and the request is sent again.
 *
 * @param free_node_id  the node id to ask about
 * @return 0 when the cluster confirmed,
 *         NO_CONTACT_WITH_DB_NODES when no alive data node was found,
 *         SEND_OR_RECEIVE_FAILED when sending failed or an unexpected
 *         signal arrived,
 *         otherwise the error code of the REF (see AllocNodeIdRef).
 */
int
MgmtSrvr::alloc_node_id_req(Uint32 free_node_id)
{
  SignalSender ss(theFacade);
  ss.lock(); // lock will be released on exit

  SimpleSignal ssig;
  AllocNodeIdReq* req = CAST_PTR(AllocNodeIdReq, ssig.getDataPtrSend());
  ssig.set(ss, TestOrd::TraceAPI, QMGR, GSN_ALLOC_NODEID_REQ,
           AllocNodeIdReq::SignalLength);

  req->senderRef = ss.getOwnRef();
  // The senderData of the reply is never inspected in this function
  req->senderData = 19;
  req->nodeId = free_node_id;

  int do_send = 1;
  NodeId nodeId = 0;   // node currently being asked, 0 = none selected
  while (1)
  {
    if (nodeId == 0)
    {
      // Pick the next data node that is alive
      bool next;
      while((next = getNextNodeId(&nodeId, NDB_MGM_NODE_TYPE_NDB)) == true &&
            theFacade->get_node_alive(nodeId) == false);
      if (!next)
        return NO_CONTACT_WITH_DB_NODES;
      do_send = 1;
    }
    if (do_send)
    {
      if (ss.sendSignal(nodeId, &ssig) != SEND_OK) {
        return SEND_OR_RECEIVE_FAILED;
      }
      do_send = 0;
    }

    // NOTE(review): waitFor() is called without a timeout argument; confirm
    // that it cannot block indefinitely if no reply ever arrives.
    SimpleSignal *signal = ss.waitFor();

    int gsn = signal->readSignalNumber();
    switch (gsn) {
    case GSN_ALLOC_NODEID_CONF:
    {
      // Nothing in the CONF payload is needed; its arrival is the answer
      return 0;
    }
    case GSN_ALLOC_NODEID_REF:
    {
      const AllocNodeIdRef * const ref =
        CAST_CONSTPTR(AllocNodeIdRef, signal->getDataPtr());
      if (ref->errorCode == AllocNodeIdRef::NotMaster ||
          ref->errorCode == AllocNodeIdRef::Busy)
      {
        // Retry against the node the REF names as master.
        // NOTE(review): the Busy retry is immediate, with no back-off;
        // consider a short sleep before re-sending.
        do_send = 1;
        nodeId = refToNode(ref->masterRef);
        continue;
      }
      return ref->errorCode;
    }
    case GSN_NF_COMPLETEREP:
    {
      const NFCompleteRep * const rep =
        CAST_CONSTPTR(NFCompleteRep, signal->getDataPtr());
#ifdef VM_TRACE
      ndbout_c("Node %d fail completed", rep->failedNodeId);
#endif
      // The node we were waiting on is gone: select another one
      if (rep->failedNodeId == nodeId)
        nodeId = 0;
      continue;
    }
    case GSN_NODE_FAILREP:{
      // Ignored: an NF_COMPLETEREP will follow and is handled above
      continue;
    }
    default:
      report_unknown_signal(signal);
      return SEND_OR_RECEIVE_FAILED;
    }
  }
  return 0; // not reached
}
bool
MgmtSrvr::alloc_node_id(NodeId * nodeId,
enum ndb_mgm_node_type type,
......@@ -1836,6 +1919,39 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
}
NdbMutex_Unlock(m_configMutex);
if (id_found && client_addr != 0)
{
int res = alloc_node_id_req(id_found);
unsigned save_id_found = id_found;
switch (res)
{
case 0:
// ok continue
break;
case NO_CONTACT_WITH_DB_NODES:
// ok continue
break;
default:
// something wrong
id_found = 0;
break;
}
if (id_found == 0)
{
char buf[128];
ndb_error_string(res, buf, sizeof(buf));
error_string.appfmt("Cluster refused allocation of id %d. Error: %d (%s).",
save_id_found, res, buf);
g_eventLogger.warning("Cluster refused allocation of id %d. "
"Connection from ip %s. "
"Returned error string \"%s\"", save_id_found,
inet_ntoa(((struct sockaddr_in *)(client_addr))->sin_addr),
error_string.c_str());
DBUG_RETURN(false);
}
}
if (id_found)
{
*nodeId= id_found;
......
......@@ -506,7 +506,8 @@ private:
* @return -1 if block not found, otherwise block number
*/
int getBlockNumber(const BaseString &blockName);
int alloc_node_id_req(Uint32 free_node_id);
//**************************************************************************
int _blockNumber;
......
......@@ -81,6 +81,7 @@ static const char* empty_string = "";
* 1400 - SUMA
* 1500 - LGMAN
* 1600 - TSMAN
* 1700 - QMGR
* 4000 - API
* 4100 - ""
* 4200 - ""
......@@ -450,6 +451,15 @@ ErrorBundle ErrorCodes[] = {
{ 1348, DMEC, AE, "Backup failed to allocate file record (check configuration)" },
{ 1349, DMEC, AE, "Backup failed to allocate attribute record (check configuration)" },
{ 1329, DMEC, AE, "Backup during software upgrade not supported" },
/**
* Node id allocation error codes
*/
{ 1700, DMEC, IE, "Undefined error" },
{ 1701, DMEC, AE, "Node already reserved" },
{ 1702, DMEC, AE, "Node already connected" },
{ 1703, DMEC, AE, "Node failure handling not completed" },
/**
* Still uncategorized
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment