Kirill Smelkov / neo / Commits / 0beb359a

Commit 0beb359a authored Aug 16, 2017 by Kirill Smelkov

Commit message: .

Parent: ff17df41
Showing 2 changed files with 96 additions and 41 deletions (+96 -41):

    go/neo/server/cluster_test.go   +10 -9
    go/neo/server/master.go         +86 -32
go/neo/server/cluster_test.go  (view file @ 0beb359a)
...
@@ -322,21 +322,22 @@ func TestMasterStorage(t *testing.T) {
 		LastTid: lastTid,
 	}))
 
-	// ? M -> S ClusterInformation(VERIFICATION)
-	// TODO S leave at verify
-	// TODO S join at verify
-	// TODO M.Stop() while verify
-	// verification ok; M start service
-	// TODO
+	// XXX M -> S ClusterInformation(VERIFICATION) ?
+
+	// expect:
+	// TODO there is actually txn to finish
+	// + TODO S leave at verify
+	// + TODO S join at verify
+	// + TODO M.Stop() while verify
 
 	// expect:
 	// M.clusterState <- RUNNING	+ TODO it should be sent to S
 
-	// + TODO S leave while service
-	// + TODO S join while service
-	// + TODO M.Stop while service
+	// TODO S leave while service
+	// TODO S join while service
+	// TODO M.Stop while service
 
 	// + TODO Client connects here ?
...
go/neo/server/master.go  (view file @ 0beb359a)
...
@@ -296,7 +296,7 @@ func (m *Master) recovery(ctx context.Context) (err error) {
 	recovery := make(chan storRecovery)
 	inprogress := 0 // in-progress stor recoveries
-	wg := sync.WaitGroup{}
+	wg := &sync.WaitGroup{}
 
 	// start recovery on all storages we are currently in touch with
 	// XXX close links to clients
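
Note: the switch from sync.WaitGroup{} to &sync.WaitGroup{} matters because the wait group is subsequently handed to helper functions (see goreject further down). A minimal standalone sketch of why the pointer form is needed; the worker names are illustrative, not part of the project:

package main

import "sync"

// workerByValue is the problematic shape: sync.WaitGroup is copied at the call
// boundary, so Done() decrements the copy and the caller's Wait() never returns.
// `go vet` (copylocks) flags such signatures with "passes lock by value".
func workerByValue(wg sync.WaitGroup) { defer wg.Done() }

// workerByPointer shares the single counter with the caller, which is what
// constructing the wait group as &sync.WaitGroup{} enables.
func workerByPointer(wg *sync.WaitGroup) { defer wg.Done() }

func main() {
	wg := &sync.WaitGroup{}
	wg.Add(1)
	go workerByPointer(wg)
	wg.Wait() // returns once the worker calls Done on the shared counter
}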
...
@@ -320,11 +320,7 @@ loop:
 			// XXX set node.State = PENDING
 			if node == nil {
-				wg.Add(1)
-				go func() {
-					defer wg.Done()
-					m.reject(ctx, n.conn, resp)
-				}()
+				goreject(ctx, wg, n.conn, resp)
 				return
 			}
...
@@ -336,7 +332,6 @@ loop:
 			err := m.accept(ctx, n.conn, resp)
 			if err != nil {
-				// XXX move this m.nodeLeave <- to accept() ?
 				recovery <- storRecovery{stor: node, err: err}
 				return
 			}
...
@@ -347,6 +342,9 @@ loop:
 		// a storage node came through recovery - let's see whether
 		// ptid ↑ and if so we should take partition table from there
+		//
+		// FIXME after a storage recovers, it could go away while
+		// recovery for others is in progress -> monitor this.
 		case r := <-recovery:
 			inprogress--
...
@@ -354,6 +352,7 @@ loop:
 				log.Error(ctx, r.err)
 				if !xcontext.Canceled(errors.Cause(r.err)) {
+					// XXX dup wrt vvv (loop2)
 					log.Infof(ctx, "%v: closing link", r.stor.Link)
 
 					// close stor link / update .nodeTab
...
@@ -391,7 +390,6 @@ loop:
 				readyToStart = ready
 				traceMasterStartReady(m, ready)
 			}
 
-			// XXX -> create new parttab for new-cluster case
 
 		// request to start the cluster - if ok we exit replying ok
...
@@ -442,8 +440,12 @@ loop2:
 			log.Error(ctx, r.err)
 			if !xcontext.Canceled(errors.Cause(r.err)) {
-				// XXX not so ok
-				// XXX -> r.stor.CloseLink(ctx) ?
-				// FIXME log / close node link; update NT
+				log.Infof(ctx, "%v: closing link", r.stor.Link)
+
+				// close stor link / update .nodeTab
+				lclose(ctx, r.stor.Link)
+				m.nodeTab.SetNodeState(r.stor, neo.DOWN)
 			}
 
 		case <-done:
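
Note: both the recovery and verify error paths only close the storage link and mark the node DOWN when the error is not a cancellation; the project uses xcontext.Canceled(errors.Cause(err)) for that test. The sketch below approximates the same discrimination using only the standard library; all names here are illustrative, not the project's API:

package main

import (
	"context"
	"errors"
	"fmt"
)

// isCanceled tells "the phase was cancelled" apart from "the node really failed".
// errors.Is walks wrapped errors, similar in spirit to errors.Cause plus compare.
func isCanceled(err error) bool {
	return errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)
}

func handleStorError(err error) {
	if isCanceled(err) {
		return // phase shutdown in progress - nothing is wrong with the node itself
	}
	// a real failure: log it, close the link, mark the node DOWN in the node table
	fmt.Println("closing link:", err)
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	cancel()
	handleStorError(ctx.Err())               // ignored: cancellation
	handleStorError(errors.New("read: EOF")) // handled: real failure
}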
...
@@ -542,7 +544,7 @@ var errClusterDegraded = stderrors.New("cluster became non-operatonal")
 // - once we are done without loosing too much storages in the process (so that
 //   partition table is still operational) we are ready to enter servicing state.
 
-// verify drives cluster via verification phase
+// verify drives cluster during verification phase
 //
 // when verify finishes error indicates:
 // - nil: verification completed ok; cluster is ready to enter running state
...
@@ -553,14 +555,12 @@ func (m *Master) verify(ctx context.Context) (err error) {
 	defer running(&ctx, "verify")(&err)
 	m.setClusterState(neo.ClusterVerifying)
 
-	vctx, vcancel := context.WithCancel(ctx)
+	ctx, vcancel := context.WithCancel(ctx)
 	defer vcancel()
 
 	verify := make(chan storVerify)
 	inprogress := 0
+	wg := &sync.WaitGroup{}
 
-	// verification = ask every storage to verify and wait for all them to complete
-	// XXX "all them" -> "enough of them"?
 
 	// NOTE we don't reset m.lastOid / m.lastTid to 0 in the beginning of verification
 	// XXX (= py), rationale=?
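
Note: replacing vctx with a shadowed ctx keeps everything started inside verify() bound to the lifetime of the verification phase. A minimal sketch of that phase-scoped cancellation idiom, with purely illustrative names:

package main

import (
	"context"
	"time"
)

// doWork stands in for storCtlVerify-style per-node work; purely illustrative.
func doWork(ctx context.Context) error {
	select {
	case <-time.After(10 * time.Millisecond):
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// runPhase shows the idiom: shadowing ctx means everything started inside the
// phase inherits the phase's lifetime, and defer cancel() tears it down on
// every return path.
func runPhase(ctx context.Context) error {
	ctx, cancel := context.WithCancel(ctx) // shadows the caller's ctx on purpose
	defer cancel()
	return doWork(ctx)
}

func main() { _ = runPhase(context.Background()) }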
...
@@ -569,7 +569,11 @@ func (m *Master) verify(ctx context.Context) (err error) {
 	for _, stor := range m.nodeTab.StorageList() {
 		if stor.State > neo.DOWN {	// XXX state cmp ok ? XXX or stor.Link != nil ?
 			inprogress++
-			go storCtlVerify(vctx, stor, m.partTab, verify)
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				storCtlVerify(ctx, stor, m.partTab, verify)
+			}()
 		}
 	}
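
Note: the new form spawns a goroutine that captures the range variable stor from the enclosing loop. As a general reminder (not a claim about this particular code), under pre-Go 1.22 semantics such closures share one loop variable unless it is re-declared or passed as an argument; a minimal sketch:

package main

import (
	"fmt"
	"sync"
)

func main() {
	items := []string{"S1", "S2", "S3"}
	wg := &sync.WaitGroup{}

	for _, it := range items {
		it := it // before Go 1.22: re-declare (or pass as an argument) so each
		         // goroutine sees its own copy of the range variable
		wg.Add(1)
		go func() {
			defer wg.Done()
			fmt.Println("verify", it)
		}()
	}
	wg.Wait()
}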
...
@@ -578,16 +582,26 @@ loop:
 		select {
 		case n := <-m.nodeCome:
 			node, resp := m.identify(ctx, n, /* XXX only accept storages -> known ? RUNNING : PENDING */)
-			// XXX handle resp ^^^ like in recover
-			_, ok := resp.(*neo.AcceptIdentification)
-			if !ok {
-				break
+			if node == nil {
+				goreject(ctx, wg, n.conn, resp)
+				return
 			}
 
 			// new storage arrived - start verification on it too
+			// XXX ok? or it must first go through recovery check?
 			inprogress++
-			go storCtlVerify(vctx, node, m.partTab, verify)
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+
+				err := m.accept(ctx, n.conn, resp)
+				if err != nil {
+					verify <- storVerify{stor: node, err: err}
+					return
+				}
+
+				storCtlVerify(ctx, node, m.partTab, verify)
+			}()
 
 		case n := <-m.nodeLeave:
 			m.nodeTab.SetNodeState(n.node, neo.DOWN)
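
Note: the reworked nodeCome handler increments inprogress before spawning the worker and then guarantees exactly one send on the verify channel, whether accept() fails or storCtlVerify runs to completion, so the driver's bookkeeping stays balanced. A standalone sketch of that accounting pattern, with generic names rather than the project's API:

package main

import (
	"errors"
	"fmt"
)

// result mirrors the role of storVerify: exactly one is sent per started worker.
type result struct{ err error }

// startWorker delivers exactly one result, whether the accept-style setup fails
// or the real work runs. setup/work are stand-ins, not the project's API.
func startWorker(results chan<- result, setup, work func() error) {
	go func() {
		if err := setup(); err != nil {
			results <- result{err: err} // setup failure goes through the same path
			return
		}
		results <- result{err: work()}
	}()
}

func main() {
	results := make(chan result)
	inprogress := 0

	inprogress++
	startWorker(results, func() error { return nil }, func() error { return nil })
	inprogress++
	startWorker(results, func() error { return errors.New("accept failed") }, nil)

	for ; inprogress > 0; inprogress-- {
		fmt.Println(<-results)
	}
}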
...
@@ -603,14 +617,22 @@ loop:
 		// a storage node came through verification - adjust our last{Oid,Tid} if ok
 		// on error check - whether cluster became non-operational and stop verification if so
+		//
+		// FIXME actually implement logic to decide to finish/rollback transactions
 		case v := <-verify:
 			inprogress--
 
 			if v.err != nil {
 				log.Error(ctx, v.err)
-				// mark storage as non-working in nodeTab
-				m.nodeTab.SetNodeState(v.node, neo.DOWN)
+
+				if !xcontext.Canceled(errors.Cause(v.err)) {
+					// XXX dup wrt recovery ^^^
+					log.Infof(ctx, "%s: closing link", v.stor.Link)
+
+					// mark storage as non-working in nodeTab
+					lclose(ctx, v.stor.Link)
+					m.nodeTab.SetNodeState(v.stor, neo.DOWN)
+				}
 
 				// check partTab is still operational
 				// if not -> cancel to go back to recovery
...
@@ -628,9 +650,6 @@ loop:
 				}
 			}
 
-			// XXX if m.partTab.OperationalWith(&.nodeTab, RUNNING) -> break (ok)
-
 		case ech := <-m.ctlStart:
 			ech <- nil // we are already starting
...
@@ -645,20 +664,48 @@ loop:
 		}
 	}
 
+/*
 	// consume left verify responses (which should come without delay since it was cancelled)
 	for ; inprogress > 0; inprogress-- {
 		<-verify
 	}
+*/
+
+	// wait all workers to finish (which should come without delay since it was cancelled)
+	done := make(chan struct{})
+	go func() {
+		wg.Wait()
+		close(done)
+	}()
+
+loop2:
+	for {
+		select {
+		case v := <-verify:
+			// XXX dup wrt <-verify handler above
+			log.Error(ctx, v.err)
+			if !xcontext.Canceled(errors.Cause(v.err)) {
+				log.Infof(ctx, "%v: closing link", v.stor.Link)
+
+				// close stor link / update .nodeTab
+				lclose(ctx, v.stor.Link)
+				m.nodeTab.SetNodeState(v.stor, neo.DOWN)
+			}
+
+		case <-done:
+			break loop2
+		}
+	}
+
 	return err
 }
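
Note: the commented-out consume loop is replaced by a shutdown sequence that waits on the WaitGroup in a helper goroutine while the main goroutine keeps draining verify; without the draining loop, a cancelled worker blocked on the unbuffered send could never reach Done() and Wait() would deadlock. A self-contained sketch of the idiom, with illustrative names:

package main

import (
	"fmt"
	"sync"
)

// drainUntilDone waits for all workers via wg while still receiving from the
// results channel, so no worker stays blocked on its final send.
func drainUntilDone(results <-chan error, wg *sync.WaitGroup) {
	done := make(chan struct{})
	go func() {
		wg.Wait()
		close(done)
	}()

loop:
	for {
		select {
		case err := <-results:
			fmt.Println("late result:", err) // handle results still in flight
		case <-done:
			break loop
		}
	}
}

func main() {
	results := make(chan error)
	wg := &sync.WaitGroup{}

	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			results <- fmt.Errorf("worker %d cancelled", i)
		}(i)
	}
	drainUntilDone(results, wg)
}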
 // storVerify is result of a storage node passing verification phase
 type storVerify struct {
-	node    *neo.Node
+	stor    *neo.Node
 	lastOid zodb.Oid
 	lastTid zodb.Tid
-//	link    *neo.NodeLink	// XXX -> Node
 	err     error
 }
...
@@ -670,7 +717,7 @@ func storCtlVerify(ctx context.Context, stor *neo.Node, pt *neo.PartitionTable,
 	var err error
 	defer func() {
 		if err != nil {
-			res <- storVerify{node: stor, err: err}
+			res <- storVerify{stor: stor, err: err}
 		}
 	}()
 	defer runningf(&ctx, "%s: stor verify", stor.Link)(&err)
...
@@ -705,7 +752,7 @@ func storCtlVerify(ctx context.Context, stor *neo.Node, pt *neo.PartitionTable,
 	}
 
 	// send results to driver
-	res <- storVerify{node: stor, lastOid: last.LastOid, lastTid: last.LastTid}
+	res <- storVerify{stor: stor, lastOid: last.LastOid, lastTid: last.LastTid}
 }
...
@@ -984,7 +1031,7 @@ func (m *Master) identify(ctx context.Context, n nodeCome) (node *neo.Node, resp
 }
 
 // reject sends rejective identification response and closes associated link
-func (m *Master) reject(ctx context.Context, conn *neo.Conn, resp neo.Msg) {
+func reject(ctx context.Context, conn *neo.Conn, resp neo.Msg) {
 	// XXX cancel on ctx?
 	// XXX log?
 	err1 := conn.Send(resp)
...
@@ -996,6 +1043,13 @@ func (m *Master) reject(ctx context.Context, conn *neo.Conn, resp neo.Msg) {
 	}
 }
 
+// goreject spawns reject in separate goroutine properly added/done on wg
+func goreject(ctx context.Context, wg *sync.WaitGroup, conn *neo.Conn, resp neo.Msg) {
+	wg.Add(1)
+	defer wg.Done()
+	go reject(ctx, conn, resp)
+}
+
 // accept sends acceptive identification response and closes conn
 // XXX if problem -> .nodeLeave
 // XXX spawn ping goroutine from here?
...
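
Note: a goreject-style helper centralizes the "reject in a goroutine, accounted on the wait group" pattern so call sites stay one line. For reference, a self-contained sketch of one way to write such a helper so that the wait group also covers the spawned send; sendReject and goReject are hypothetical names, not the project's API:

package main

import (
	"context"
	"fmt"
	"sync"
)

// sendReject is a stand-in for reject(): deliver a rejection response, then close.
func sendReject(ctx context.Context, who string) {
	fmt.Println("rejecting", who)
}

// goReject spawns sendReject in a separate goroutine; doing wg.Add before the
// spawn and wg.Done inside the goroutine makes wg.Wait() cover the send itself,
// so a later shutdown can wait for in-flight rejections to finish.
func goReject(ctx context.Context, wg *sync.WaitGroup, who string) {
	wg.Add(1)
	go func() {
		defer wg.Done()
		sendReject(ctx, who)
	}()
}

func main() {
	wg := &sync.WaitGroup{}
	goReject(context.Background(), wg, "unknown-node")
	wg.Wait()
}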