Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
N
neo
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Stefane Fermigier
neo
Commits
4d71333f
Commit
4d71333f
authored
7 years ago
by
Kirill Smelkov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
.
parent
06158f50
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
55 additions
and
31 deletions
+55
-31
go/neo/master.go
go/neo/master.go
+48
-28
go/neo/nodetab.go
go/neo/nodetab.go
+1
-2
go/neo/parttab.go
go/neo/parttab.go
+5
-0
go/neo/server.go
go/neo/server.go
+1
-1
No files found.
go/neo/master.go
View file @
4d71333f
...
...
@@ -162,9 +162,11 @@ func (m *Master) run(ctx context.Context) {
// recovery drives cluster during recovery phase
//
// when recovery finishes error indicates:
// - nil: recovery was ok and a command came for cluster to start
XXX or autostart
// - nil: recovery was ok and a command came for cluster to start
// - !nil: recovery was cancelled
func
(
m
*
Master
)
recovery
(
ctx
context
.
Context
)
(
err
error
)
{
defer
errcontextf
(
&
err
,
"master: recovery"
)
m
.
setClusterState
(
ClusterRecovering
)
rctx
,
rcancel
:=
context
.
WithCancel
(
ctx
)
defer
rcancel
()
...
...
@@ -184,7 +186,7 @@ loop:
for
{
select
{
case
n
:=
<-
m
.
nodeCome
:
node
,
ok
:=
m
.
accept
(
n
,
/* XXX
do not accept clients
*/
)
node
,
ok
:=
m
.
accept
(
n
,
/* XXX
only accept storages -> PENDING
*/
)
if
!
ok
{
break
}
...
...
@@ -227,8 +229,7 @@ loop:
// XXX ok? we want to retrieve all recovery information first?
// XXX or initially S is in PENDING state and
// transitions to RUNNING only after successful
// recovery?
// transitions to RUNNING only after successful recovery?
rcancel
()
defer
func
()
{
...
...
@@ -326,18 +327,24 @@ func storCtlRecovery(ctx context.Context, link *NodeLink, res chan storRecovery)
}
var
errStopRequested
=
errors
.
New
(
"stop requested"
)
var
errClusterDegraded
=
errors
.
New
(
"cluster became non-operational"
)
// Cluster Verification
// --------------------
//
// - starts with operational part
tab
// - starts with operational part
ition table
// - tell all storages to perform data verification (TODO) and retrieve last ids
// - once we are done without losing too many storages in the process (so that
// part
tab
is still operational) we are ready to enter servicing state.
// part
ition table
is still operational) we are ready to enter servicing state.
// verify drives cluster via verification phase
//
// prerequisite for start: .partTab is operational wrt .nodeTab
func
(
m
*
Master
)
verify
(
ctx
context
.
Context
)
(
err
error
)
{
defer
errcontextf
(
&
err
,
"master: verify"
)
m
.
setClusterState
(
ClusterVerifying
)
vctx
,
vcancel
:=
context
.
WithCancel
(
ctx
)
defer
vcancel
()
...
...
@@ -345,39 +352,57 @@ func (m *Master) verify(ctx context.Context) (err error) {
verify
:=
make
(
chan
storVerify
)
inprogress
:=
0
//
XXX ask every storage for verify and wait for _all_ them to complete?
//
XXX do we need to reset m.lastOid / m.lastTid to 0 in the beginning?
//
NOTE we don't reset m.lastOid / m.lastTid to 0 in the beginning of verification
//
with the idea that XXX
// XXX ask every storage to verify and wait for _all_ them to complete?
// start verification on all storages we are currently in touch with
for
_
,
stor
:=
range
m
.
nodeTab
.
StorageList
()
{
// XXX check state > DOWN
inprogress
++
go
storCtlVerify
(
vctx
,
stor
.
Link
,
verify
)
if
stor
.
NodeState
>
DOWN
{
// XXX state cmp ok ? XXX or stor.Link != nil ?
inprogress
++
go
storCtlVerify
(
vctx
,
stor
.
Link
,
verify
)
}
}
loop
:
for
inprogress
>
0
{
select
{
case
n
:=
<-
m
.
nodeCome
:
// TODO
_
=
n
node
,
ok
:=
m
.
accept
(
n
,
/* XXX only accept storages -> known ? RUNNING : PENDING */
)
if
!
ok
{
break
}
// new storage arrived - start verification on it too
// XXX ok? or it must first go through recovery check?
inprogress
++
go
storCtlVerify
(
vctx
,
node
.
Link
,
verify
)
case
n
:=
<-
m
.
nodeLeave
:
// TODO
_
=
n
m
.
nodeTab
.
UpdateLinkDown
(
n
.
link
)
// if cluster became non-operational - we cancel verification
if
!
m
.
partTab
.
OperationalWith
(
&
m
.
nodeTab
)
{
// XXX ok to instantly cancel? or better
// graceful shutdown in-flight verifications?
vcancel
()
err
=
errClusterDegraded
break
loop
}
// a storage node came through verification - TODO
case
v
:=
<-
verify
:
inprogress
--
if
v
.
err
!=
nil
{
fmt
.
Printf
(
"master:
%v
\n
"
,
v
.
err
)
// XXX err ctx
fmt
.
Printf
(
"master:
verify: %v
\n
"
,
v
.
err
)
// XXX mark S as non-working in nodeTab
// check partTab is still operational
// if not -> cancel to go back to recovery
if
m
.
partTab
.
OperationalWith
(
&
m
.
nodeTab
)
{
vcancel
()
err
=
fmt
.
Errorf
(
"cluster became non-operational in the process"
)
err
=
errClusterDegraded
break
loop
}
}
else
{
...
...
@@ -395,7 +420,7 @@ loop:
case
ech
:=
<-
m
.
ctlStop
:
ech
<-
nil
// ok
err
=
fmt
.
Errorf
(
"stop requested"
)
err
=
errStopRequested
break
loop
case
<-
ctx
.
Done
()
:
...
...
@@ -404,17 +429,11 @@ loop:
}
}
if
err
!=
nil
{
// XXX -> err = fmt.Errorf("... %v", err)
fmt
.
Printf
(
"master: verify: %v
\n
"
,
err
)
// consume left verify responses (which should come without delay since it was cancelled)
for
;
inprogress
>
0
;
inprogress
--
{
<-
verify
}
// consume left verify responses (which should come without delay since it was cancelled)
for
;
inprogress
>
0
;
inprogress
--
{
<-
verify
}
// XXX -> return via channel ?
return
err
}
...
...
@@ -475,6 +494,7 @@ func storCtlVerify(ctx context.Context, link *NodeLink, res chan storVerify) {
// service drives cluster during running state
//
// prerequisite for start: .partTab is operational wrt .nodeTab and verification passed (XXX)
func
(
m
*
Master
)
service
(
ctx
context
.
Context
)
(
err
error
)
{
m
.
setClusterState
(
ClusterRunning
)
...
...
@@ -864,7 +884,7 @@ func (m *Master) DriveStorage(ctx context.Context, link *NodeLink) {
// # (via changing m.clusterState and relying on broadcast ?)
// >NotifyClusterInformation (cluster_state=VERIFYING)
//
// # (via changing partTab and relying on broadcast ?)
// # (via changing partTab and relying on broadcast ?)
-> no sends whole PT initially
// >NotifyPartitionTable (ptid=1, `node 0: S1, R`)
// # S saves PT info locally XXX -> after StartOperation ?
//
...
...
This diff is collapsed.
Click to expand it.
go/neo/nodetab.go
View file @
4d71333f
...
...
@@ -81,8 +81,7 @@ type NodeTable struct {
// Node represents a node entry in NodeTable
type
Node
struct
{
//Info NodeInfo // XXX extract ? XXX -> embed
NodeInfo
NodeInfo
// XXX good idea to embed ?
Link
*
NodeLink
// link to this node; =nil if not connected XXX do we need it here ?
// XXX identified or not ?
...
...
This diff is collapsed.
Click to expand it.
go/neo/parttab.go
View file @
4d71333f
...
...
@@ -151,6 +151,11 @@ func (pt *PartitionTable) OperationalWith(nt *NodeTable) bool {
switch
cell
.
CellState
{
case
UP_TO_DATE
,
FEEDING
:
// XXX cell.isReadble in py
// cell says it is readable. let's check whether corresponding node is up
// FIXME checking whether it is up is not really enough -
// - what is needed to check is that data on that node is up
// to last_tid.
//
// We leave it as is for now.
node
:=
nt
.
Get
(
cell
.
NodeUUID
)
if
node
==
nil
||
node
.
NodeState
!=
RUNNING
{
// XXX PENDING is also ok ?
continue
...
...
This diff is collapsed.
Click to expand it.
go/neo/server.go
View file @
4d71333f
...
...
@@ -263,7 +263,7 @@ func Expect(conn *Conn, msg NEODecoder) error {
return
errDecode
(
&
errResp
)
// XXX err ctx
}
return
fmt
.
Errorf
(
"unexpected packet: %T"
,
msgType
)
// XXX err ctx
return
fmt
.
Errorf
(
"unexpected packet: %T"
,
msgType
)
// XXX err ctx
-> + conn ?
}
_
,
err
=
msg
.
NEODecode
(
pkt
.
Payload
())
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment