Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
M
mariadb
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
mariadb
Commits
0e753bbb
Commit
0e753bbb
authored
Jun 26, 2007
by
jonas@perch.ndb.mysql.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ndb - bug#29331 (51)
Add better handling of GCP Stop Only kill "offending" node
parent
dfd8deeb
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
167 additions
and
38 deletions
+167
-38
storage/ndb/src/kernel/blocks/ERROR_codes.txt
storage/ndb/src/kernel/blocks/ERROR_codes.txt
+5
-1
storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp
storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp
+1
-1
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
+161
-36
No files found.
storage/ndb/src/kernel/blocks/ERROR_codes.txt
View file @
0e753bbb
...
@@ -5,7 +5,7 @@ Next DBACC 3002
...
@@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4029
Next DBTUP 4029
Next DBLQH 5045
Next DBLQH 5045
Next DBDICT 6007
Next DBDICT 6007
Next DBDIH 718
3
Next DBDIH 718
6
Next DBTC 8040
Next DBTC 8040
Next CMVMI 9000
Next CMVMI 9000
Next BACKUP 10038
Next BACKUP 10038
...
@@ -75,6 +75,10 @@ Delay GCP_SAVEREQ by 10 secs
...
@@ -75,6 +75,10 @@ Delay GCP_SAVEREQ by 10 secs
7180: Crash master during master-take-over in execMASTER_LCPCONF
7180: Crash master during master-take-over in execMASTER_LCPCONF
7184: Crash before starting next GCP after a node failure
7185: Dont reply to COPY_GCI_REQ where reason == GCP
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
-----------------------------------------------------------------
-----------------------------------------------------------------
...
...
storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp
View file @
0e753bbb
...
@@ -899,7 +899,7 @@ private:
...
@@ -899,7 +899,7 @@ private:
void
ndbsttorry10Lab
(
Signal
*
,
Uint32
_line
);
void
ndbsttorry10Lab
(
Signal
*
,
Uint32
_line
);
void
createMutexes
(
Signal
*
signal
,
Uint32
no
);
void
createMutexes
(
Signal
*
signal
,
Uint32
no
);
void
createMutex_done
(
Signal
*
signal
,
Uint32
no
,
Uint32
retVal
);
void
createMutex_done
(
Signal
*
signal
,
Uint32
no
,
Uint32
retVal
);
void
crashSystemAtGcpStop
(
Signal
*
);
void
crashSystemAtGcpStop
(
Signal
*
,
bool
);
void
sendFirstDictfragsreq
(
Signal
*
,
TabRecordPtr
regTabPtr
);
void
sendFirstDictfragsreq
(
Signal
*
,
TabRecordPtr
regTabPtr
);
void
addtabrefuseLab
(
Signal
*
,
ConnectRecordPtr
regConnectPtr
,
Uint32
errorCode
);
void
addtabrefuseLab
(
Signal
*
,
ConnectRecordPtr
regConnectPtr
,
Uint32
errorCode
);
void
GCP_SAVEhandling
(
Signal
*
,
Uint32
nodeId
);
void
GCP_SAVEhandling
(
Signal
*
,
Uint32
nodeId
);
...
...
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
View file @
0e753bbb
...
@@ -747,6 +747,13 @@ done:
...
@@ -747,6 +747,13 @@ done:
}
}
ndbrequire
(
ok
);
ndbrequire
(
ok
);
if
(
ERROR_INSERTED
(
7185
)
&&
reason
==
CopyGCIReq
::
GLOBAL_CHECKPOINT
)
{
jam
();
return
;
}
/* ----------------------------------------------------------------------- */
/* ----------------------------------------------------------------------- */
/* WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE. */
/* WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE. */
/* ----------------------------------------------------------------------- */
/* ----------------------------------------------------------------------- */
...
@@ -4071,6 +4078,11 @@ void Dbdih::execNODE_FAILREP(Signal* signal)
...
@@ -4071,6 +4078,11 @@ void Dbdih::execNODE_FAILREP(Signal* signal)
CLEAR_ERROR_INSERT_VALUE
;
CLEAR_ERROR_INSERT_VALUE
;
}
}
if
(
ERROR_INSERTED
(
7184
))
{
SET_ERROR_INSERT_VALUE
(
7000
);
}
/*-------------------------------------------------------------------------*/
/*-------------------------------------------------------------------------*/
// The first step is to convert from a bit mask to an array of failed nodes.
// The first step is to convert from a bit mask to an array of failed nodes.
/*-------------------------------------------------------------------------*/
/*-------------------------------------------------------------------------*/
...
@@ -7745,7 +7757,7 @@ void Dbdih::checkGcpStopLab(Signal* signal)
...
@@ -7745,7 +7757,7 @@ void Dbdih::checkGcpStopLab(Signal* signal)
g_eventLogger
.
error
(
"System crash due to GCP Stop in state = %u"
,
g_eventLogger
.
error
(
"System crash due to GCP Stop in state = %u"
,
(
Uint32
)
cgcpStatus
);
(
Uint32
)
cgcpStatus
);
#endif
#endif
crashSystemAtGcpStop
(
signal
);
crashSystemAtGcpStop
(
signal
,
false
);
return
;
return
;
}
//if
}
//if
}
else
{
}
else
{
...
@@ -7759,7 +7771,7 @@ void Dbdih::checkGcpStopLab(Signal* signal)
...
@@ -7759,7 +7771,7 @@ void Dbdih::checkGcpStopLab(Signal* signal)
g_eventLogger
.
error
(
"System crash due to GCP Stop in state = %u"
,
g_eventLogger
.
error
(
"System crash due to GCP Stop in state = %u"
,
(
Uint32
)
cgcpStatus
);
(
Uint32
)
cgcpStatus
);
#endif
#endif
crashSystemAtGcpStop
(
signal
);
crashSystemAtGcpStop
(
signal
,
false
);
return
;
return
;
}
//if
}
//if
}
else
{
}
else
{
...
@@ -11117,41 +11129,132 @@ void Dbdih::tableCloseLab(Signal* signal, FileRecordPtr filePtr)
...
@@ -11117,41 +11129,132 @@ void Dbdih::tableCloseLab(Signal* signal, FileRecordPtr filePtr)
* GCP stop detected,
* GCP stop detected,
* send SYSTEM_ERROR to all other alive nodes
* send SYSTEM_ERROR to all other alive nodes
*/
*/
void
Dbdih
::
crashSystemAtGcpStop
(
Signal
*
signal
)
void
Dbdih
::
crashSystemAtGcpStop
(
Signal
*
signal
,
bool
local
)
{
{
if
(
local
)
goto
dolocal
;
switch
(
cgcpStatus
){
switch
(
cgcpStatus
){
case
GCP_PREPARE_SENT
:
{
jam
();
/**
* We're waiting for a GCP PREPARE CONF
*/
infoEvent
(
"Detected GCP stop(%d)...sending kill to %s"
,
cgcpStatus
,
c_GCP_PREPARE_Counter
.
getText
());
ndbout_c
(
"Detected GCP stop(%d)...sending kill to %s"
,
cgcpStatus
,
c_GCP_PREPARE_Counter
.
getText
());
{
NodeReceiverGroup
rg
(
DBDIH
,
c_GCP_PREPARE_Counter
);
signal
->
theData
[
0
]
=
7022
;
sendSignal
(
rg
,
GSN_DUMP_STATE_ORD
,
signal
,
1
,
JBA
);
}
{
NodeReceiverGroup
rg
(
NDBCNTR
,
c_GCP_PREPARE_Counter
);
SystemError
*
const
sysErr
=
(
SystemError
*
)
&
signal
->
theData
[
0
];
sysErr
->
errorCode
=
SystemError
::
GCPStopDetected
;
sysErr
->
errorRef
=
reference
();
sysErr
->
data1
=
cgcpStatus
;
sysErr
->
data2
=
cgcpOrderBlocked
;
sendSignal
(
rg
,
GSN_SYSTEM_ERROR
,
signal
,
SystemError
::
SignalLength
,
JBA
);
}
ndbrequire
(
!
c_GCP_PREPARE_Counter
.
done
());
return
;
}
case
GCP_COMMIT_SENT
:
{
jam
();
/**
* We're waiting for a GCP_NODEFINISH
*/
infoEvent
(
"Detected GCP stop(%d)...sending kill to %s"
,
cgcpStatus
,
c_GCP_COMMIT_Counter
.
getText
());
ndbout_c
(
"Detected GCP stop(%d)...sending kill to %s"
,
cgcpStatus
,
c_GCP_COMMIT_Counter
.
getText
());
{
NodeReceiverGroup
rg
(
DBDIH
,
c_GCP_COMMIT_Counter
);
signal
->
theData
[
0
]
=
7022
;
sendSignal
(
rg
,
GSN_DUMP_STATE_ORD
,
signal
,
1
,
JBA
);
}
{
NodeReceiverGroup
rg
(
NDBCNTR
,
c_GCP_COMMIT_Counter
);
SystemError
*
const
sysErr
=
(
SystemError
*
)
&
signal
->
theData
[
0
];
sysErr
->
errorCode
=
SystemError
::
GCPStopDetected
;
sysErr
->
errorRef
=
reference
();
sysErr
->
data1
=
cgcpStatus
;
sysErr
->
data2
=
cgcpOrderBlocked
;
sendSignal
(
rg
,
GSN_SYSTEM_ERROR
,
signal
,
SystemError
::
SignalLength
,
JBA
);
}
ndbrequire
(
!
c_GCP_COMMIT_Counter
.
done
());
return
;
}
case
GCP_NODE_FINISHED
:
case
GCP_NODE_FINISHED
:
{
{
jam
();
/**
/**
* We're waiting for a GCP save conf
* We're waiting for a GCP save conf
*/
*/
ndbrequire
(
!
c_GCP_SAVEREQ_Counter
.
done
());
NodeReceiverGroup
rg
(
DBLQH
,
c_GCP_SAVEREQ_Counter
);
NodeReceiverGroup
rg
(
DBLQH
,
c_GCP_SAVEREQ_Counter
);
signal
->
theData
[
0
]
=
2305
;
signal
->
theData
[
0
]
=
2305
;
sendSignal
(
rg
,
GSN_DUMP_STATE_ORD
,
signal
,
1
,
JBB
);
sendSignal
(
rg
,
GSN_DUMP_STATE_ORD
,
signal
,
1
,
JBB
);
infoEvent
(
"Detected GCP stop...sending kill to %s"
,
infoEvent
(
"Detected GCP stop(%d)...sending kill to %s"
,
c_GCP_SAVEREQ_Counter
.
getText
());
cgcpStatus
,
c_GCP_SAVEREQ_Counter
.
getText
());
g_eventLogger
.
error
(
"Detected GCP stop...sending kill to %s"
,
ndbout_c
(
"Detected GCP stop(%d)...sending kill to %s"
,
c_GCP_SAVEREQ_Counter
.
getText
());
cgcpStatus
,
c_GCP_SAVEREQ_Counter
.
getText
());
ndbrequire
(
!
c_GCP_SAVEREQ_Counter
.
done
());
return
;
return
;
}
}
case
GCP_SAVE_LQH_FINISHED
:
case
GCP_SAVE_LQH_FINISHED
:
g_eventLogger
.
error
(
"m_copyReason: %d m_waiting: %d"
,
{
c_copyGCIMaster
.
m_copyReason
,
jam
();
c_copyGCIMaster
.
m_waiting
);
/**
break
;
* We're waiting for a COPY_GCICONF
case
GCP_READY
:
// shut up lint
*/
case
GCP_PREPARE_SENT
:
infoEvent
(
"Detected GCP stop(%d)...sending kill to %s"
,
case
GCP_COMMIT_SENT
:
cgcpStatus
,
c_COPY_GCIREQ_Counter
.
getText
());
break
;
ndbout_c
(
"Detected GCP stop(%d)...sending kill to %s"
,
cgcpStatus
,
c_COPY_GCIREQ_Counter
.
getText
());
{
NodeReceiverGroup
rg
(
DBDIH
,
c_COPY_GCIREQ_Counter
);
signal
->
theData
[
0
]
=
7022
;
sendSignal
(
rg
,
GSN_DUMP_STATE_ORD
,
signal
,
1
,
JBA
);
}
{
NodeReceiverGroup
rg
(
NDBCNTR
,
c_COPY_GCIREQ_Counter
);
SystemError
*
const
sysErr
=
(
SystemError
*
)
&
signal
->
theData
[
0
];
sysErr
->
errorCode
=
SystemError
::
GCPStopDetected
;
sysErr
->
errorRef
=
reference
();
sysErr
->
data1
=
cgcpStatus
;
sysErr
->
data2
=
cgcpOrderBlocked
;
sendSignal
(
rg
,
GSN_SYSTEM_ERROR
,
signal
,
SystemError
::
SignalLength
,
JBA
);
}
ndbrequire
(
!
c_COPY_GCIREQ_Counter
.
done
());
return
;
}
case
GCP_READY
:
(
void
)
1
;
}
}
dolocal:
ndbout_c
(
"m_copyReason: %d m_waiting: %d"
,
c_copyGCIMaster
.
m_copyReason
,
c_copyGCIMaster
.
m_waiting
);
g_eventLogger
.
error
(
"c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d"
,
ndbout_c
(
"c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d"
,
c_copyGCISlave
.
m_senderData
,
c_copyGCISlave
.
m_senderData
,
c_copyGCISlave
.
m_senderRef
,
c_copyGCISlave
.
m_senderRef
,
c_copyGCISlave
.
m_copyReason
,
c_copyGCISlave
.
m_copyReason
,
c_copyGCISlave
.
m_expectedNextWord
);
c_copyGCISlave
.
m_expectedNextWord
);
FileRecordPtr
file0Ptr
;
FileRecordPtr
file0Ptr
;
file0Ptr
.
i
=
crestartInfoFile
[
0
];
file0Ptr
.
i
=
crestartInfoFile
[
0
];
...
@@ -11202,23 +11305,39 @@ void Dbdih::crashSystemAtGcpStop(Signal* signal)
...
@@ -11202,23 +11305,39 @@ void Dbdih::crashSystemAtGcpStop(Signal* signal)
c_TCGETOPSIZEREQ_Counter
.
getText
());
c_TCGETOPSIZEREQ_Counter
.
getText
());
ndbout_c
(
"c_UPDATE_TOREQ_Counter = %s"
,
c_UPDATE_TOREQ_Counter
.
getText
());
ndbout_c
(
"c_UPDATE_TOREQ_Counter = %s"
,
c_UPDATE_TOREQ_Counter
.
getText
());
NodeRecordPtr
nodePtr
;
if
(
local
==
false
)
for
(
nodePtr
.
i
=
1
;
nodePtr
.
i
<
MAX_NDB_NODES
;
nodePtr
.
i
++
)
{
{
jam
();
jam
();
ptrAss
(
nodePtr
,
nodeRecord
)
;
NodeRecordPtr
nodePtr
;
if
(
nodePtr
.
p
->
nodeStatus
==
NodeRecord
::
ALIVE
)
{
for
(
nodePtr
.
i
=
1
;
nodePtr
.
i
<
MAX_NDB_NODES
;
nodePtr
.
i
++
)
{
jam
();
jam
();
const
BlockReference
ref
=
ptrAss
(
nodePtr
,
nodeRecord
);
numberToRef
(
refToBlock
(
cntrlblockref
),
nodePtr
.
i
);
if
(
nodePtr
.
p
->
nodeStatus
==
NodeRecord
::
ALIVE
)
{
SystemError
*
const
sysErr
=
(
SystemError
*
)
&
signal
->
theData
[
0
];
jam
();
sysErr
->
errorCode
=
SystemError
::
GCPStopDetected
;
const
BlockReference
ref
=
sysErr
->
errorRef
=
reference
();
numberToRef
(
refToBlock
(
cntrlblockref
),
nodePtr
.
i
);
sysErr
->
data1
=
cgcpStatus
;
SystemError
*
const
sysErr
=
(
SystemError
*
)
&
signal
->
theData
[
0
];
sysErr
->
data2
=
cgcpOrderBlocked
;
sysErr
->
errorCode
=
SystemError
::
GCPStopDetected
;
sendSignal
(
ref
,
GSN_SYSTEM_ERROR
,
signal
,
sysErr
->
errorRef
=
reference
();
SystemError
::
SignalLength
,
JBA
);
sysErr
->
data1
=
cgcpStatus
;
}
//if
sysErr
->
data2
=
cgcpOrderBlocked
;
}
//for
sendSignal
(
ref
,
GSN_SYSTEM_ERROR
,
signal
,
SystemError
::
SignalLength
,
JBA
);
}
//if
}
//for
}
else
{
jam
();
SystemError
*
const
sysErr
=
(
SystemError
*
)
&
signal
->
theData
[
0
];
sysErr
->
errorCode
=
SystemError
::
GCPStopDetected
;
sysErr
->
errorRef
=
reference
();
sysErr
->
data1
=
cgcpStatus
;
sysErr
->
data2
=
cgcpOrderBlocked
;
EXECUTE_DIRECT
(
NDBCNTR
,
GSN_SYSTEM_ERROR
,
signal
,
SystemError
::
SignalLength
);
ndbrequire
(
false
);
}
return
;
return
;
}
//Dbdih::crashSystemAtGcpStop()
}
//Dbdih::crashSystemAtGcpStop()
...
@@ -14304,6 +14423,12 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal)
...
@@ -14304,6 +14423,12 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal)
infoEvent
(
buf
);
infoEvent
(
buf
);
}
}
}
}
if
(
arg
==
7022
)
{
jam
();
crashSystemAtGcpStop
(
signal
,
true
);
}
}
//Dbdih::execDUMP_STATE_ORD()
}
//Dbdih::execDUMP_STATE_ORD()
void
void
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment