Commit 7aa13998 authored by Kirill Smelkov

.

parent 06685e0b
...@@ -181,7 +181,7 @@ func NewMaster(clusterName string, net xnet.Networker) *Master { ...@@ -181,7 +181,7 @@ func NewMaster(clusterName string, net xnet.Networker) *Master {
// NOTE upon successful return cluster is not yet in running state - the transition will // NOTE upon successful return cluster is not yet in running state - the transition will
// take time and could be also automatically aborted due to cluster environment change (e.g. // take time and could be also automatically aborted due to cluster environment change (e.g.
// a storage node goes down). // a storage node goes down).
func (m *Master) _Start() error { func (m *Master) Start() error {
ech := make(chan error) ech := make(chan error)
m.ctlStart <- ech m.ctlStart <- ech
return <-ech return <-ech
...@@ -392,7 +392,19 @@ func (m *Master) recovery(ctx context.Context) (err error) { ...@@ -392,7 +392,19 @@ func (m *Master) recovery(ctx context.Context) (err error) {
inprogress := 0 // in-progress stor recoveries inprogress := 0 // in-progress stor recoveries
// wg := &sync.WaitGroup{} // wg := &sync.WaitGroup{}
start := false // whether we were instructed to start // requests to .ctlStart received when readyToStart
// on success answered when full recovery completes
startReqv := []chan error{}
errStartNonOperational := fmt.Errorf("start: cluster is non-operational")
defer func() {
errStart := errStartNonOperational
if err == nil {
errStart = nil
}
for _, ech := range startReqv {
ech <- errStart
}
}()
//trace:event traceMasterStartReady(m *Master, ready bool) //trace:event traceMasterStartReady(m *Master, ready bool)
readyToStart := false // whether cluster currently can be operational or not readyToStart := false // whether cluster currently can be operational or not
updateReadyToStart := func() { updateReadyToStart := func() {
...@@ -409,12 +421,26 @@ func (m *Master) recovery(ctx context.Context) (err error) { ...@@ -409,12 +421,26 @@ func (m *Master) recovery(ctx context.Context) (err error) {
ready = (nup > 0 && inprogress == 0) ready = (nup > 0 && inprogress == 0)
} else { } else {
ready = m.node.State.PartTab.OperationalWith(m.node.State.NodeTab) // XXX + node state ready = m.node.State.PartTab.OperationalWith(m.node.State.NodeTab)
} }
if readyToStart != ready { if readyToStart != ready {
state := "ready"
if !ready {
state = "not ready"
}
log.Info(ctx, "cluster is %s to start", state)
readyToStart = ready readyToStart = ready
traceMasterStartReady(m, ready) traceMasterStartReady(m, ready)
// cluster became non-operational - cancel previously queued start requests
if !ready {
for _, ech := range startReqv {
ech <- errStartNonOperational
}
startReqv = startReqv[:0]
}
} }
} }
...@@ -473,9 +499,9 @@ func (m *Master) recovery(ctx context.Context) (err error) { ...@@ -473,9 +499,9 @@ func (m *Master) recovery(ctx context.Context) (err error) {
ctlStop = nil ctlStop = nil
nodeComeq = nil nodeComeq = nil
} }
loop:
for inprogress > 0 || !( for inprogress > 0 || !(
/*start*/(readyToStart && start) || /*stop*/(err != nil)) { /*start*/(readyToStart && len(startReqv) > 0) || /*stop*/(err != nil)) {
select { select {
case <-ctxDone: case <-ctxDone:
...@@ -486,25 +512,16 @@ loop: ...@@ -486,25 +512,16 @@ loop:
case ech := <-ctlStart: case ech := <-ctlStart:
if readyToStart { if readyToStart {
log.Infof(ctx, "start command - we are ready") log.Infof(ctx, "start command - we are ready")
// reply "ok to start" after whole recovery finishes // queue start request. Right now we believe we can
// satisfy it, but during completion of spawned recovery
// XXX ok? we want to retrieve all recovery information first? // tasks, the cluster might become non-operational again.
// XXX or initially S is in PENDING state and // If it will - queued start requests will be canceled.
// transitions to RUNNING only after successful recovery? startReqv = append(startReqv, ech)
} else {
rcancel() log.Infof(ctx, "start command - err - we are not ready")
defer func() { ech <- errStartNonOperational
// XXX can situation change while we are shutting down?
// XXX -> recheck logic with checking PT operational ^^^
// XXX (depending on storages state)
ech <- nil
}()
break loop // FIXME
} }
log.Infof(ctx, "start command - err - we are not ready")
ech <- fmt.Errorf("start: cluster is non-operational")
case ech := <-ctlStop: case ech := <-ctlStop:
close(ech) // ok; we are already recovering close(ech) // ok; we are already recovering
......
...@@ -83,7 +83,7 @@ type tNode struct { ...@@ -83,7 +83,7 @@ type tNode struct {
// ITestMaster represents tested master node. // ITestMaster represents tested master node.
type ITestMaster interface { type ITestMaster interface {
_Start() error Start() error
} }
// ITestStorage represents tested storage node. // ITestStorage represents tested storage node.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment