Commit 5c733394 authored by unknown's avatar unknown

Bug#32025 ndb_waiter does too many roundtrips to ndb_mgmd


ndb/tools/waiter.cpp:
  - Only contact ndb_mgmd once per loop
  - Program only cares about ndbd nodes -> remove the api and mgm vectors
  - Program can not wait for "starting" -> remove that code
  - Remove unused includes
  - Protect against SIGPIPE(writing to a socket where the mgmsrv was
    gone, silently killed the program)
  - Don't sleep one second if if all nodes are in the wanted state  
  - Use 100 milliseconds sleep between each poll
parent 5aa9c1bd
......@@ -21,13 +21,11 @@
#include <NdbMain.h>
#include <NdbOut.hpp>
#include <NdbSleep.h>
#include <kernel/ndb_limits.h>
#include <NDBT.hpp>
int
waitClusterStatus(const char* _addr, ndb_mgm_node_status _status,
unsigned int _timeout);
static int
waitClusterStatus(const char* _addr, ndb_mgm_node_status _status);
enum ndb_waiter_options {
OPT_WAIT_STATUS_NOT_STARTED = NDB_STD_OPTIONS_LAST,
......@@ -55,12 +53,13 @@ static struct my_option my_long_options[] =
"Wait for cluster to enter single user mode",
(gptr*) &_single_user, (gptr*) &_single_user, 0,
GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0 },
{ "timeout", 't', "Timeout to wait",
{ "timeout", 't', "Timeout to wait in seconds",
(gptr*) &_timeout, (gptr*) &_timeout, 0,
GET_INT, REQUIRED_ARG, 120, 0, 0, 0, 0, 0 },
{ 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
};
static void usage()
{
ndb_std_print_version();
......@@ -70,16 +69,18 @@ static void usage()
my_print_variables(my_long_options);
}
int main(int argc, char** argv){
NDB_INIT(argv[0]);
load_defaults("my",load_default_groups,&argc,&argv);
const char* _hostName = NULL;
int ho_error;
#ifndef DBUG_OFF
opt_debug= "d:t:O,/tmp/ndb_waiter.trace";
#endif
if ((ho_error=handle_options(&argc, &argv, my_long_options,
ndb_std_get_one_option)))
if (handle_options(&argc, &argv, my_long_options,
ndb_std_get_one_option))
return NDBT_ProgramExit(NDBT_WRONGARGS);
_hostName = argv[0];
......@@ -105,7 +106,7 @@ int main(int argc, char** argv){
wait_status= NDB_MGM_NODE_STATUS_STARTED;
}
if (waitClusterStatus(_hostName, wait_status, _timeout) != 0)
if (waitClusterStatus(_hostName, wait_status) != 0)
return NDBT_ProgramExit(NDBT_FAILED);
return NDBT_ProgramExit(NDBT_OK);
}
......@@ -118,8 +119,6 @@ int main(int argc, char** argv){
NdbMgmHandle handle= NULL;
Vector<ndb_mgm_node_state> ndbNodes;
Vector<ndb_mgm_node_state> mgmNodes;
Vector<ndb_mgm_node_state> apiNodes;
int
getStatus(){
......@@ -128,8 +127,6 @@ getStatus(){
struct ndb_mgm_node_state * node;
ndbNodes.clear();
mgmNodes.clear();
apiNodes.clear();
while(retries < 10){
status = ndb_mgm_get_status(handle);
......@@ -153,18 +150,16 @@ getStatus(){
ndbNodes.push_back(*node);
break;
case NDB_MGM_NODE_TYPE_MGM:
mgmNodes.push_back(*node);
/* Don't care about MGM nodes */
break;
case NDB_MGM_NODE_TYPE_API:
apiNodes.push_back(*node);
/* Don't care about API nodes */
break;
default:
if(node->node_status == NDB_MGM_NODE_STATUS_UNKNOWN ||
node->node_status == NDB_MGM_NODE_STATUS_NO_CONTACT){
retries++;
ndbNodes.clear();
mgmNodes.clear();
apiNodes.clear();
free(status);
status = NULL;
count = 0;
......@@ -184,23 +179,21 @@ getStatus(){
return 0;
}
g_err << "getStatus failed" << endl;
return -1;
}
int
static int
waitClusterStatus(const char* _addr,
ndb_mgm_node_status _status,
unsigned int _timeout)
ndb_mgm_node_status _status)
{
int _startphase = -1;
int _nodes[MAX_NDB_NODES];
int _num_nodes = 0;
/* Ignore SIGPIPE */
signal(SIGPIPE, SIG_IGN);
handle = ndb_mgm_create_handle();
if (handle == NULL){
g_err << "handle == NULL" << endl;
g_err << "Could not create ndb_mgm handle" << endl;
return -1;
}
g_info << "Connecting to mgmsrv at " << _addr << endl;
......@@ -216,19 +209,11 @@ waitClusterStatus(const char* _addr,
return -1;
}
if (getStatus() != 0)
return -1;
// Collect all nodes into nodes
for (size_t i = 0; i < ndbNodes.size(); i++){
_nodes[i] = ndbNodes[i].node_id;
_num_nodes++;
}
unsigned int attempts = 0;
unsigned int resetAttempts = 0;
const unsigned int MAX_RESET_ATTEMPTS = 10;
int attempts = 0;
int resetAttempts = 0;
const int MAX_RESET_ATTEMPTS = 10;
bool allInState = false;
int timeout_ms= _timeout * 10; /* In number of 100 milliseconds */
while (allInState == false){
if (_timeout > 0 && attempts > _timeout){
/**
......@@ -269,62 +254,34 @@ waitClusterStatus(const char* _addr,
<< resetAttempts << endl;
attempts = 0;
resetAttempts++;
}
allInState = true;
if (getStatus() != 0){
g_err << "getStatus != 0" << endl;
return -1;
}
// ndbout << "waitNodeState; _num_nodes = " << _num_nodes << endl;
// for (int i = 0; i < _num_nodes; i++)
// ndbout << " node["<<i<<"] =" <<_nodes[i] << endl;
for (int i = 0; i < _num_nodes; i++){
ndb_mgm_node_state* ndbNode = NULL;
for (size_t n = 0; n < ndbNodes.size(); n++){
if (ndbNodes[n].node_id == _nodes[i])
ndbNode = &ndbNodes[n];
}
if(ndbNode == NULL){
allInState = false;
continue;
}
/* Assume all nodes are in state(if there is any) */
allInState = (ndbNodes.size() > 0);
g_info << "State node " << ndbNode->node_id << " "
<< ndb_mgm_get_node_status_string(ndbNode->node_status)<< endl;
/* Loop through all nodes and check their state */
for (size_t n = 0; n < ndbNodes.size(); n++) {
ndb_mgm_node_state* ndbNode = &ndbNodes[n];
assert(ndbNode != NULL);
if(_status == NDB_MGM_NODE_STATUS_STARTING &&
((ndbNode->node_status == NDB_MGM_NODE_STATUS_STARTING &&
ndbNode->start_phase >= _startphase) ||
(ndbNode->node_status == NDB_MGM_NODE_STATUS_STARTED)))
continue;
g_info << "Node " << ndbNode->node_id << ": "
<< ndb_mgm_get_node_status_string(ndbNode->node_status)<< endl;
if (_status == NDB_MGM_NODE_STATUS_STARTING){
g_info << "status = "
<< ndb_mgm_get_node_status_string(ndbNode->node_status)
<<", start_phase="<<ndbNode->start_phase<<endl;
if (ndbNode->node_status != _status) {
if (ndbNode->node_status < _status)
allInState = false;
else
g_info << "node_status(" << (unsigned)ndbNode->node_status
<< ") != _status("<< (unsigned)_status << ")" <<endl;
} else if (ndbNode->start_phase < _startphase)
allInState = false;
} else {
if (ndbNode->node_status != _status)
allInState = false;
}
}
if (!allInState) {
g_info << "Waiting for cluster enter state "
<< ndb_mgm_get_node_status_string(_status)<< endl;
NdbSleep_SecSleep(1);
NdbSleep_MilliSleep(100);
}
attempts++;
}
return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment