ndb - bug#18781 (5.0) handle rolling upgrade, minor fixes, logging, docs

ada3df70 · pekka@clam.ndb.mysql.com · cdc421dc · ada3df70 · ada3df70 · ada3df70
Commit ada3df70 authored Jun 11, 2006 by pekka@clam.ndb.mysql.com
8 changed files
--- a/ndb/include/kernel/signaldata/DictLock.hpp
+++ b/ndb/include/kernel/signaldata/DictLock.hpp
@@ -55,7 +55,9 @@ public:
  enum ErrorCode {
    NotMaster = 1,
    InvalidLockType = 2,
-    TooManyRequests = 3
+    BadUserRef = 3,  // userRef is not the sender's block ref, or does not belong to a DB node
+    TooLate = 4,  // requesting node is already in the master's alive-node set
+    TooManyRequests = 5
  };
 private:
  Uint32 userPtr;

--- a/ndb/include/ndb_version.h.in
+++ b/ndb/include/ndb_version.h.in
@@ -60,5 +60,7 @@ char ndb_version_string_buf[NDB_VERSION_STRING_BUF_SZ];
 #define NDBD_INCL_NODECONF_VERSION_4 MAKE_VERSION(4,1,17)
 #define NDBD_INCL_NODECONF_VERSION_5 MAKE_VERSION(5,0,18)

+#define NDBD_DICT_LOCK_VERSION_5 MAKE_VERSION(5,0,23)
+
 #endif
 
--- a/ndb/src/kernel/blocks/ERROR_codes.txt
+++ b/ndb/src/kernel/blocks/ERROR_codes.txt
@@ -5,7 +5,7 @@ Next DBACC 3002
 Next DBTUP 4013
 Next DBLQH 5043
 Next DBDICT 6007
-Next DBDIH 7175
+Next DBDIH 7177
 Next DBTC 8037
 Next CMVMI 9000
 Next BACKUP 10022
@@ -312,7 +312,9 @@ Test Crashes in handling node restarts

 7170: Crash when receiving START_PERMREF (InitialStartRequired)

-7174: Send one fake START_PERMREF (ZNODE_ALREADY_STARTING_ERROR)
+7174: Crash starting node before sending DICT_LOCK_REQ
+7175: Master sends one fake START_PERMREF (ZNODE_ALREADY_STARTING_ERROR)
+7176: Slave NR pretends master does not support DICT lock (rolling upgrade)

 DICT:
 6000  Crash during NR when receiving DICTSTARTREQ

--- a/ndb/src/kernel/blocks/dbdict/Dbdict.cpp
+++ b/ndb/src/kernel/blocks/dbdict/Dbdict.cpp
@@ -205,7 +205,7 @@ void Dbdict::execCONTINUEB(Signal* signal)

  case ZDICT_LOCK_POLL:
    jam();
-    checkDictLockQueue(signal);
+    checkDictLockQueue(signal, true);
    break;

  default :
@@ -2836,7 +2836,6 @@ void Dbdict::execNODE_FAILREP(Signal* signal)
  case BS_NODE_RESTART:
    jam();
    ok = true;
-    removeStaleDictLocks(signal, theFailedNodes);
    break;
  }
  ndbrequire(ok);
@@ -2860,6 +2859,15 @@ void Dbdict::execNODE_FAILREP(Signal* signal)
    }//if
  }//for

+  /*
+   * NODE_FAILREP guarantees that no "in flight" signal from
+   * a dead node is accepted, and also that the job buffer contains
+   * no such (un-executed) signals.  Therefore no DICT_UNLOCK_ORD
+   * from a dead node (leading to master crash) is possible after
+   * this clean-up removes the lock record.
+   */
+  removeStaleDictLocks(signal, theFailedNodes);
+
 }//execNODE_FAILREP()


@@ -12210,7 +12218,7 @@ Dbdict::getIndexAttrMask(TableRecordPtr indexPtr, AttributeMask& mask)
 const Dbdict::DictLockType*
 Dbdict::getDictLockType(Uint32 lockType)
 {
-  static DictLockType lt[] = {
+  static const DictLockType lt[] = {
    { DictLockReq::NodeRestartLock, BS_NODE_RESTART, "NodeRestart" }
  };
  for (int i = 0; i < sizeof(lt)/sizeof(lt[0]); i++) {
@@ -12220,12 +12228,40 @@ Dbdict::getDictLockType(Uint32 lockType)
  return NULL;
 }

+void  // Log one infoEvent line with DICT lock state and the lock queue contents.
+Dbdict::sendDictLockInfoEvent(Uint32 pollCount)  // pollCount is only printed (as "cnt:")
+{
+  DictLockPtr loopPtr;
+  c_dictLockQueue.first(loopPtr);
+  unsigned count = 0;  // number of queue entries formatted so far
+
+  char queue_buf[100];  // text form of the queue, built up entry by entry
+  char *p = &queue_buf[0];  // current write position
+  const char *const q = &queue_buf[sizeof(queue_buf)];  // one past the end, so q-p is the space left
+  *p = 0;  // empty string in case the queue has no entries
+
+  while (loopPtr.i != RNIL) {
+    jam();
+    my_snprintf(p, q-p, "%s%u%s",  // NOTE(review): relies on my_snprintf truncating and NUL-terminating when space runs out - confirm
+                ++count == 1 ? "" : " ",  // space separator before every entry except the first
+                (unsigned)refToNode(loopPtr.p->req.userRef),  // node id of the requester
+                loopPtr.p->locked ? "L" : "");  // "L" marks an entry whose locked flag is set
+    p += strlen(p);  // step to the end of what was just written
+    c_dictLockQueue.next(loopPtr);
+  }
+
+  infoEvent("DICT: lock bs: %d ops: %d poll: %d cnt: %d queue: %s",
+      (int)c_blockState,
+      c_opRecordPool.getSize() - c_opRecordPool.getNoOfFree(),  // pool size minus free count
+      c_dictLockPoll, (int)pollCount, queue_buf);
+}
+
 void
 Dbdict::sendDictLockInfoEvent(DictLockPtr lockPtr, const char* text)
 {
  infoEvent("DICT: %s %u for %s",
      text,
-      (unsigned int)refToNode(lockPtr.p->req.userRef), lockPtr.p->lt->text);
+      (unsigned)refToNode(lockPtr.p->req.userRef), lockPtr.p->lt->text);  // requester's node id, then the lock type's text
 }

 void
@@ -12234,6 +12270,8 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal)
  jamEntry();
  const DictLockReq* req = (const DictLockReq*)&signal->theData[0];

+  // make sure bad request crashes slave, not master (us)
+
  if (getOwnNodeId() != c_masterNodeId) {
    jam();
    sendDictLockRef(signal, *req, DictLockRef::NotMaster);
@@ -12247,6 +12285,19 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal)
    return;
  }

+  if (req->userRef != signal->getSendersBlockRef() ||
+      getNodeInfo(refToNode(req->userRef)).m_type != NodeInfo::DB) {
+    jam();
+    sendDictLockRef(signal, *req, DictLockRef::BadUserRef);
+    return;
+  }
+
+  if (c_aliveNodes.get(refToNode(req->userRef))) {
+    jam();
+    sendDictLockRef(signal, *req, DictLockRef::TooLate);
+    return;
+  }
+
  DictLockPtr lockPtr;
  if (! c_dictLockQueue.seize(lockPtr)) {
    jam();
@@ -12258,21 +12309,23 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal)
  lockPtr.p->locked = false;
  lockPtr.p->lt = lt;

-  checkDictLockQueue(signal);
+  checkDictLockQueue(signal, false);

  if (! lockPtr.p->locked)
    sendDictLockInfoEvent(lockPtr, "lock request by node");
 }

 void
-Dbdict::checkDictLockQueue(Signal* signal)
+Dbdict::checkDictLockQueue(Signal* signal, bool poll)
 {
+  Uint32 pollCount = ! poll ? 0 : signal->theData[1];
+
  DictLockPtr lockPtr;

  do {
    if (! c_dictLockQueue.first(lockPtr)) {
      jam();
-      setDictLockPoll(signal, false);
+      setDictLockPoll(signal, false, pollCount);
      return;
    }

@@ -12299,7 +12352,7 @@ Dbdict::checkDictLockQueue(Signal* signal)
  // this routine is called again when it is removed for any reason

  bool on = ! lockPtr.p->locked;
-  setDictLockPoll(signal, on);
+  setDictLockPoll(signal, on, pollCount);
 }

 void
@@ -12326,7 +12379,7 @@ Dbdict::execDICT_UNLOCK_ORD(Signal* signal)

  c_dictLockQueue.release(lockPtr);

-  checkDictLockQueue(signal);
+  checkDictLockQueue(signal, false);
 }

 void
@@ -12359,21 +12412,32 @@ Dbdict::sendDictLockRef(Signal* signal, DictLockReq req, Uint32 errorCode)
 // control polling

 void
-Dbdict::setDictLockPoll(Signal* signal, bool on)
+Dbdict::setDictLockPoll(Signal* signal, bool on, Uint32 pollCount)  // switch lock polling on/off and log state; pollCount = polls so far
 {
  if (on) {
    jam();
    signal->theData[0] = ZDICT_LOCK_POLL;
-    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1);
+    signal->theData[1] = pollCount + 1;  // checkDictLockQueue reads this back as pollCount on the next poll
+    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 2);  // last argument grew from 1 to 2 along with theData[1] - presumably the signal length
  }

-  if (c_dictLockPoll != on) {
+  bool change = (c_dictLockPoll != on);  // true when polling is being switched on or off
+
+  if (change) {
    jam();
-#ifdef VM_TRACE
-    infoEvent("DICT: lock polling %s", on ? "On" : "Off");
-#endif
    c_dictLockPoll = on;
  }
+
+  // avoid too many messages if master is stuck busy (BS_NODE_FAILURE)
+  bool periodic =  // every poll below 8, then every 8th, 64th, 512th, finally every 4096th (&& binds tighter than ||)
+    pollCount < 8 ||
+    pollCount < 64 && pollCount % 8 == 0 ||
+    pollCount < 512 && pollCount % 64 == 0 ||
+    pollCount < 4096 && pollCount % 512 == 0 ||
+    pollCount % 4096 == 0; // about every 6 minutes
+
+  if (change || periodic)  // always report a switch; otherwise only at the thinned-out intervals
+    sendDictLockInfoEvent(pollCount);
 }

 // NF handling
@@ -12384,6 +12448,11 @@ Dbdict::removeStaleDictLocks(Signal* signal, const Uint32* theFailedNodes)
  DictLockPtr loopPtr;
  c_dictLockQueue.first(loopPtr);

+  if (getOwnNodeId() != c_masterNodeId) {
+    ndbrequire(loopPtr.i == RNIL);
+    return;
+  }
+
  while (loopPtr.i != RNIL) {
    jam();
    DictLockPtr lockPtr = loopPtr;
@@ -12409,7 +12478,7 @@ Dbdict::removeStaleDictLocks(Signal* signal, const Uint32* theFailedNodes)
    }
  }

-  checkDictLockQueue(signal);
+  checkDictLockQueue(signal, false);
 }



--- a/ndb/src/kernel/blocks/dbdict/Dbdict.hpp
+++ b/ndb/src/kernel/blocks/dbdict/Dbdict.hpp
@@ -1804,14 +1804,15 @@ private:
  bool c_dictLockPoll;

  static const DictLockType* getDictLockType(Uint32 lockType);
+  void sendDictLockInfoEvent(Uint32 pollCount);
  void sendDictLockInfoEvent(DictLockPtr lockPtr, const char* text);

-  void checkDictLockQueue(Signal* signal);
+  void checkDictLockQueue(Signal* signal, bool poll);
  void sendDictLockConf(Signal* signal, DictLockPtr lockPtr);
  void sendDictLockRef(Signal* signal, DictLockReq req, Uint32 errorCode);

  // control polling i.e. continueB loop
-  void setDictLockPoll(Signal* signal, bool on);
+  void setDictLockPoll(Signal* signal, bool on, Uint32 pollCount);

  // NF handling
  void removeStaleDictLocks(Signal* signal, const Uint32* theFailedNodes);

--- a/ndb/src/kernel/blocks/dbdict/DictLock.txt
+++ b/ndb/src/kernel/blocks/dbdict/DictLock.txt
+Lock master DICT against schema operations
+
+Implementation
+--------------
+
+[ see comments in Dbdict.hpp ]
+
+Use case: Node startup INR / NR
+-------------------------------
+
+Master DICT (like any block) keeps a list of alive nodes (c_aliveNodes).
+These nodes are the participants in schema ops.
+
+(1) c_aliveNodes is initialized when DICT starts
+    in sp3 in READ_NODESCONF from CNTR
+
+(2) when slave node fails (in any sp of the slave node)
+    it is removed from c_aliveNodes in NODE_FAILREP
+
+(3) when slave starts, it is added to c_aliveNodes
+    in sp4 of the starting node in INCL_NODEREQ
+
+Slave DIH locks master DICT in sp2 and releases the lock when the node has started.
+The timing is based on these constraints:
+
+- the lock is taken when master DICT is known
+  DIH reads this in sp2 in READ_NODESCONF
+
+- the lock is taken before (3)
+
+- the lock is taken before copying starts and held until it is done
+  in sp4 DIH meta, DICT meta, tuple data
+
+- on INR in sp2 in START_PERMREQ the LCP info of the slave is erased
+  in all DIH in invalidateNodeLCP() - not safe under schema ops
+
+Signals:
+
+All signals except the DICT lock signals (DICT_LOCK_REQ, DICT_LOCK_CONF, DICT_UNLOCK_ORD) are standard v5.0 signals.
+s=starting node, m=master, a=all participants, l=local block.
+
+* sp2 - DICT_LOCK and START_PERM
+
+DIH/s
+    DICT_LOCK_REQ
+        DICT/m
+    DICT_LOCK_CONF
+DIH/s
+    START_PERMREQ
+        DIH/m
+            START_INFOREQ
+                DIH/a
+                    invalidateNodeLCP() if INR
+                DIH/a
+            START_INFOCONF
+        DIH/m
+    START_PERMCONF
+DIH/s
+
+* sp4 - START_ME (copy metadata, no changes)
+
+DIH/s
+    START_MEREQ
+        DIH/m
+            COPY_TABREQ
+                DIH/s
+            COPY_TABCONF
+        DIH/m
+            DICTSTARTREQ
+                DICT/s
+                    GET_SCHEMA_INFOREQ
+                        DICT/m
+                    SCHEMA_INFO
+                DICT/s
+            DICTSTARTCONF
+        DIH/m
+            INCL_NODEREQ
+                DIH/a
+                    INCL_NODEREQ
+                        ANY/l
+                    INCL_NODECONF
+                DIH/a
+            INCL_NODECONF
+        DIH/m
+    START_MECONF
+DIH/s
+
+* sp7 - release DICT lock
+
+DIH/s
+    DICT_UNLOCK_ORD
+        DICT/m
+
+# vim: set et sw=4:
--- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
+++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
@@ -1594,6 +1594,9 @@ void Dbdih::nodeRestartPh2Lab(Signal* signal)
   */
  ndbrequire(c_dictLockSlavePtrI_nodeRestart == RNIL);

+  // check that we are not yet taking part in schema ops
+  CRASH_INSERTION(7174);
+
  Uint32 lockType = DictLockReq::NodeRestartLock;
  Callback c = { safe_cast(&Dbdih::recvDictLockConf_nodeRestart), 0 };
  sendDictLockReq(signal, lockType, c);
@@ -1746,7 +1749,7 @@ void Dbdih::execSTART_PERMREQ(Signal* signal)
  ndbrequire(refToNode(retRef) == nodeId);
  if ((c_nodeStartMaster.activeState) ||
      (c_nodeStartMaster.wait != ZFALSE) ||
-      ERROR_INSERTED_CLEAR(7174)) {
+      ERROR_INSERTED_CLEAR(7175)) {
    jam();
    signal->theData[0] = nodeId;
    signal->theData[1] = StartPermRef::ZNODE_ALREADY_STARTING_ERROR;
@@ -14709,6 +14712,34 @@ Dbdih::sendDictLockReq(Signal* signal, Uint32 lockType, Callback c)
  lockPtr.p->locked = false;
  lockPtr.p->callback = c;

+  // handle rolling upgrade
+  {
+    Uint32 masterVersion = getNodeInfo(cmasterNodeId).m_version;
+
+    unsigned int get_major = getMajor(masterVersion);
+    unsigned int get_minor = getMinor(masterVersion);
+    unsigned int get_build = getBuild(masterVersion);
+
+    ndbrequire(get_major == 4 || get_major == 5);
+
+    if (masterVersion < NDBD_DICT_LOCK_VERSION_5 ||
+        ERROR_INSERTED(7176)) {
+      jam();
+
+      infoEvent("DIH: detect upgrade: master node %u old version %u.%u.%u",
+        (unsigned int)cmasterNodeId, get_major, get_minor, get_build);
+
+      DictLockConf* conf = (DictLockConf*)&signal->theData[0];
+      conf->userPtr = lockPtr.i;
+      conf->lockType = lockType;
+      conf->lockPtr = ZNIL;
+
+      sendSignal(reference(), GSN_DICT_LOCK_CONF, signal,
+          DictLockConf::SignalLength, JBB);
+      return;
+    }
+  }
+
  BlockReference dictMasterRef = calcDictBlockRef(cmasterNodeId);
  sendSignal(dictMasterRef, GSN_DICT_LOCK_REQ, signal,
      DictLockReq::SignalLength, JBB);
@@ -14758,6 +14789,19 @@ Dbdih::sendDictUnlockOrd(Signal* signal, Uint32 lockSlavePtrI)

  c_dictLockSlavePool.release(lockPtr);

+  // handle rolling upgrade
+  {
+    Uint32 masterVersion = getNodeInfo(cmasterNodeId).m_version;
+
+    unsigned int get_major = getMajor(masterVersion);
+    ndbrequire(get_major == 4 || get_major == 5);
+
+    if (masterVersion < NDBD_DICT_LOCK_VERSION_5 ||
+        ERROR_INSERTED(7176)) {
+      return;
+    }
+  }
+
  BlockReference dictMasterRef = calcDictBlockRef(cmasterNodeId);
  sendSignal(dictMasterRef, GSN_DICT_UNLOCK_ORD, signal,
      DictUnlockOrd::SignalLength, JBB);

--- a/ndb/test/ndbapi/testDict.cpp
+++ b/ndb/test/ndbapi/testDict.cpp
@@ -1590,17 +1590,18 @@ recv_dict_ops_run(NDBT_Context* ctx)
 int
 runRestarts(NDBT_Context* ctx, NDBT_Step* step)
 {
-  static int err_master[] = {   // non-crashing
-    0,
-    7174        // send one fake START_PERMREF
+  static int errlst_master[] = {   // non-crashing
+    7175,       // send one fake START_PERMREF
+    0 
  };
-  static int err_node[] = {
-    0,
-    7121,       // crash on START_PERMCONF
-    7130        // crash on START_MECONF
+  static int errlst_node[] = {
+    7174,       // crash before sending DICT_LOCK_REQ
+    7176,       // pretend master does not support DICT lock
+    7121,       // crash at receive START_PERMCONF
+    0
  };
-  const uint err_master_cnt = sizeof(err_master)/sizeof(err_master[0]);
-  const uint err_node_cnt = sizeof(err_node)/sizeof(err_node[0]);
+  const uint errcnt_master = sizeof(errlst_master)/sizeof(errlst_master[0]);
+  const uint errcnt_node = sizeof(errlst_node)/sizeof(errlst_node[0]);

  myRandom48Init(NdbTick_CurrentMillisecond());
  NdbRestarter restarter;
@@ -1632,7 +1633,7 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step)
      nodeIdList[nodeIdCnt++] = nodeId;
    }

-    if (numnodes >= 4) {
+    if (numnodes >= 4 && myRandom48(2) == 0) {
      int rand = myRandom48(numnodes);
      int nodeId = restarter.getRandomNodeOtherNodeGroup(nodeIdList[0], rand);
      CHECK(nodeId != -1);
@@ -1642,6 +1643,7 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step)

    g_info << "1: master=" << masterNodeId << " nodes=" << nodeIdList[0] << "," << nodeIdList[1] << endl;

+    const uint timeout = 60; //secs for node wait
    const unsigned maxsleep = 2000; //ms

    bool NF_ops = ctx->getProperty("Restart_NF_ops");
@@ -1655,9 +1657,8 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step)
    NdbSleep_MilliSleep(myRandom48(maxsleep));

    {
-      int i = 0;
-      while (i < nodeIdCnt) {
-        int nodeId = nodeIdList[i++];
+      for (int i = 0; i < nodeIdCnt; i++) {
+        int nodeId = nodeIdList[i];

        bool nostart = true;
        bool abort = NF_type == 0 ? myRandom48(2) : (NF_type == 2);
@@ -1676,9 +1677,31 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step)
    }

    g_info << "1: wait for nostart" << endl;
-    CHECK(restarter.waitNodesNoStart(nodeIdList, nodeIdCnt) == 0);
+    CHECK(restarter.waitNodesNoStart(nodeIdList, nodeIdCnt, timeout) == 0);
    NdbSleep_MilliSleep(myRandom48(maxsleep));

+    int err_master = 0;
+    int err_node[2] = { 0, 0 };
+
+    if (NR_error) {
+      err_master = errlst_master[l % errcnt_master];
+
+      // limitation: cannot have 2 node restarts and crash_insert
+      // one node may die for real (NF during startup)
+
+      for (int i = 0; i < nodeIdCnt && nodeIdCnt == 1; i++) {
+        err_node[i] = errlst_node[l % errcnt_node];
+
+        // 7176 - no DICT lock protection
+
+        if (err_node[i] == 7176) {
+          g_info << "1: no dict ops due to error insert "
+                 << err_node[i] << endl;
+          NR_ops = false;
+        }
+      }
+    }
+
    g_info << "1: " << (NR_ops ? "run" : "pause") << " dict ops" << endl;
    if (! send_dict_ops_cmd(ctx, NR_ops ? 1 : 2))
      break;
@@ -1689,23 +1712,17 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step)

    if (NR_error) {
      {
-        int rand = myRandom48(err_master_cnt);
-        int err = err_master[rand];
+        int err = err_master;
        if (err != 0) {
          g_info << "1: insert master error " << err << endl;
          CHECK(restarter.insertErrorInNode(masterNodeId, err) == 0);
        }
      }

-      // limitation: cannot have 2 node restarts and crash_insert
-      // one node may die for real (NF during startup)
+      for (int i = 0; i < nodeIdCnt; i++) {
+        int nodeId = nodeIdList[i];

-      int i = 0;
-      while (i < nodeIdCnt && nodeIdCnt == 1) {
-        int nodeId = nodeIdList[i++];
-
-        int rand = myRandom48(err_node_cnt);
-        int err = err_node[rand];
+        int err = err_node[i];
        if (err != 0) {
          g_info << "1: insert node " << nodeId << " error " << err << endl;
          CHECK(restarter.insertErrorInNode(nodeId, err) == 0);
@@ -1715,7 +1732,7 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step)
    NdbSleep_MilliSleep(myRandom48(maxsleep));

    g_info << "1: wait cluster started" << endl;
-    CHECK(restarter.waitClusterStarted() == 0);
+    CHECK(restarter.waitClusterStarted(timeout) == 0);
    NdbSleep_MilliSleep(myRandom48(maxsleep));

    g_info << "1: restart done" << endl;