14325: Don't shutdown busy VMs even if boot probe fails.
authorTom Clegg <tclegg@veritasgenetics.com>
Thu, 24 Jan 2019 21:10:48 +0000 (16:10 -0500)
committerTom Clegg <tclegg@veritasgenetics.com>
Thu, 24 Jan 2019 21:10:48 +0000 (16:10 -0500)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

lib/dispatchcloud/test/stub_driver.go
lib/dispatchcloud/worker/pool_test.go
lib/dispatchcloud/worker/worker.go
lib/dispatchcloud/worker/worker_test.go [new file with mode: 0644]

index 64afa3acf3ed5040c567e2bfd7b67ecab5f32dd3..2277a3f235400c0f9b4944edfd1d8b58febe6e1d 100644 (file)
@@ -50,9 +50,7 @@ type StubDriver struct {
 }
 
 // InstanceSet returns a new *StubInstanceSet.
-func (sd *StubDriver) InstanceSet(params map[string]interface{}, id cloud.InstanceSetID,
-       logger logrus.FieldLogger) (cloud.InstanceSet, error) {
-
+func (sd *StubDriver) InstanceSet(params map[string]interface{}, id cloud.InstanceSetID, logger logrus.FieldLogger) (cloud.InstanceSet, error) {
        sis := StubInstanceSet{
                driver:  sd,
                servers: map[cloud.InstanceID]*StubVM{},
index 26087193f3ca32efa57279642385db1dc8bb2d25..aa81958434dd73792f399a61f5f1b0be6e2cfc1b 100644 (file)
@@ -5,7 +5,6 @@
 package worker
 
 import (
-       "io"
        "time"
 
        "git.curoverse.com/arvados.git/lib/cloud"
@@ -50,7 +49,7 @@ func (suite *PoolSuite) TestCreateUnallocShutdown(c *check.C) {
        type3 := arvados.InstanceType{Name: "a2l", ProviderType: "a2.large", VCPUs: 4, RAM: 4 * GiB, Price: .04}
        pool := &Pool{
                logger:      logrus.StandardLogger(),
-               newExecutor: func(cloud.Instance) Executor { return &stubExecutor{} },
+               newExecutor: func(cloud.Instance) Executor { return stubExecutor{} },
                instanceSet: &throttledInstanceSet{InstanceSet: lameInstanceSet},
                instanceTypes: arvados.InstanceTypeMap{
                        type1.Name: type1,
@@ -186,13 +185,3 @@ func (suite *PoolSuite) wait(c *check.C, pool *Pool, notify <-chan struct{}, rea
        }
        c.Check(ready(), check.Equals, true)
 }
-
-type stubExecutor struct{}
-
-func (*stubExecutor) SetTarget(cloud.ExecutorTarget) {}
-
-func (*stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) ([]byte, []byte, error) {
-       return nil, nil, nil
-}
-
-func (*stubExecutor) Close() {}
index 0872e594f9a8175973cc11032ca7942a8aa455d5..a75d2bbb88611225c709a98f151dded697b5974c 100644 (file)
@@ -6,6 +6,7 @@ package worker
 
 import (
        "bytes"
+       "fmt"
        "strings"
        "sync"
        "time"
@@ -144,38 +145,65 @@ func (wkr *worker) ProbeAndUpdate() {
        }
 }
 
-// should be called in a new goroutine
+// probeAndUpdate calls probeBooted and/or probeRunning if needed, and
+// updates state accordingly.
+//
+// In StateUnknown: Call both probeBooted and probeRunning.
+// In StateBooting: Call probeBooted; if successful, call probeRunning.
+// In StateRunning: Call probeRunning.
+// In StateIdle: Call probeRunning.
+// In StateShutdown: Do nothing.
+//
+// If both probes succeed, wkr.state changes to
+// StateIdle/StateRunning.
+//
+// If probeRunning succeeds, wkr.running is updated. (This means
+// wkr.running might be non-empty even in StateUnknown, if the boot
+// probe failed.)
+//
+// probeAndUpdate should be called in a new goroutine.
 func (wkr *worker) probeAndUpdate() {
        wkr.mtx.Lock()
        updated := wkr.updated
-       needProbeRunning := wkr.state == StateRunning || wkr.state == StateIdle
-       needProbeBooted := wkr.state == StateUnknown || wkr.state == StateBooting
+       initialState := wkr.state
        wkr.mtx.Unlock()
-       if !needProbeBooted && !needProbeRunning {
-               return
-       }
 
        var (
+               booted   bool
                ctrUUIDs []string
                ok       bool
                stderr   []byte
        )
-       if needProbeBooted {
-               ok, stderr = wkr.probeBooted()
-               wkr.mtx.Lock()
-               if ok || wkr.state == StateRunning || wkr.state == StateIdle {
+
+       switch initialState {
+       case StateShutdown:
+               return
+       case StateIdle, StateRunning:
+               booted = true
+       case StateUnknown, StateBooting:
+       default:
+               panic(fmt.Sprintf("unknown state %s", initialState))
+       }
+
+       if !booted {
+               booted, stderr = wkr.probeBooted()
+               if !booted {
+                       // Pretend this probe succeeded if another
+                       // concurrent attempt succeeded.
+                       wkr.mtx.Lock()
+                       booted = wkr.state == StateRunning || wkr.state == StateIdle
+                       wkr.mtx.Unlock()
+               } else {
                        wkr.logger.Info("instance booted; will try probeRunning")
-                       needProbeRunning = true
                }
-               wkr.mtx.Unlock()
        }
-       if needProbeRunning {
+       if booted || wkr.state == StateUnknown {
                ctrUUIDs, ok, stderr = wkr.probeRunning()
        }
        logger := wkr.logger.WithField("stderr", string(stderr))
        wkr.mtx.Lock()
        defer wkr.mtx.Unlock()
-       if !ok {
+       if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) {
                if wkr.state == StateShutdown && wkr.updated.After(updated) {
                        // Skip the logging noise if shutdown was
                        // initiated during probe.
@@ -186,10 +214,10 @@ func (wkr *worker) probeAndUpdate() {
                        "Duration": dur,
                        "State":    wkr.state,
                })
-               if wkr.state == StateBooting && !needProbeRunning {
-                       // If we know the instance has never passed a
-                       // boot probe, it's not noteworthy that it
-                       // hasn't passed this probe.
+               if !booted {
+                       // While we're polling the VM to see if it's
+                       // finished booting, failures are not
+                       // noteworthy, so we log at Debug level.
                        logger.Debug("new instance not responding")
                } else {
                        logger.Info("instance not responding")
@@ -234,11 +262,16 @@ func (wkr *worker) probeAndUpdate() {
                        changed = true
                }
        }
-       if wkr.state == StateUnknown || wkr.state == StateBooting {
+       if booted && (wkr.state == StateUnknown || wkr.state == StateBooting) {
                // Note: this will change again below if
                // len(wkr.starting)+len(wkr.running) > 0.
                wkr.state = StateIdle
                changed = true
+       } else if wkr.state == StateUnknown && len(running) != len(wkr.running) {
+               logger.WithFields(logrus.Fields{
+                       "RunningContainers": len(running),
+                       "State":             wkr.state,
+               }).Info("crunch-run probe succeeded, but boot probe is still failing")
        }
        if !changed {
                return
@@ -251,7 +284,7 @@ func (wkr *worker) probeAndUpdate() {
                wkr.state = StateIdle
        }
        wkr.updated = updateTime
-       if needProbeBooted {
+       if booted && (initialState == StateUnknown || initialState == StateBooting) {
                logger.WithFields(logrus.Fields{
                        "RunningContainers": len(running),
                        "State":             wkr.state,
diff --git a/lib/dispatchcloud/worker/worker_test.go b/lib/dispatchcloud/worker/worker_test.go
new file mode 100644 (file)
index 0000000..570d3b5
--- /dev/null
@@ -0,0 +1,240 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package worker
+
+import (
+       "errors"
+       "io"
+       "time"
+
+       "git.curoverse.com/arvados.git/lib/cloud"
+       "git.curoverse.com/arvados.git/lib/dispatchcloud/test"
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+       "github.com/sirupsen/logrus"
+       check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&WorkerSuite{})
+
+type WorkerSuite struct{}
+
+func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
+       logger := logrus.StandardLogger()
+       bootTimeout := time.Minute
+       probeTimeout := time.Second
+
+       is, err := (&test.StubDriver{}).InstanceSet(nil, "", logger)
+       c.Assert(err, check.IsNil)
+       inst, err := is.Create(arvados.InstanceType{}, "", nil, nil)
+       c.Assert(err, check.IsNil)
+
+       type trialT struct {
+               testCaseComment string // displayed in test output to help identify failure case
+               age             time.Duration
+               state           State
+               running         int
+               starting        int
+               respBoot        stubResp // zero value is success
+               respRun         stubResp // zero value is success + nothing running
+               expectState     State
+               expectRunning   int
+       }
+
+       errFail := errors.New("failed")
+       respFail := stubResp{"", "command failed\n", errFail}
+       respContainerRunning := stubResp{"zzzzz-dz642-abcdefghijklmno\n", "", nil}
+       for _, trial := range []trialT{
+               {
+                       testCaseComment: "Unknown, probes fail",
+                       state:           StateUnknown,
+                       respBoot:        respFail,
+                       respRun:         respFail,
+                       expectState:     StateUnknown,
+               },
+               {
+                       testCaseComment: "Unknown, boot probe fails, but one container is running",
+                       state:           StateUnknown,
+                       respBoot:        respFail,
+                       respRun:         respContainerRunning,
+                       expectState:     StateUnknown,
+                       expectRunning:   1,
+               },
+               {
+                       testCaseComment: "Unknown, boot probe fails, previously running container has exited",
+                       state:           StateUnknown,
+                       running:         1,
+                       respBoot:        respFail,
+                       expectState:     StateUnknown,
+                       expectRunning:   0,
+               },
+               {
+                       testCaseComment: "Unknown, boot timeout exceeded, boot probe fails",
+                       state:           StateUnknown,
+                       age:             bootTimeout + time.Second,
+                       respBoot:        respFail,
+                       respRun:         respFail,
+                       expectState:     StateShutdown,
+               },
+               {
+                       testCaseComment: "Unknown, boot timeout exceeded, boot probe succeeds but crunch-run fails",
+                       state:           StateUnknown,
+                       age:             bootTimeout * 2,
+                       respRun:         respFail,
+                       expectState:     StateShutdown,
+               },
+               {
+                       testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but crunch-run succeeds",
+                       state:           StateUnknown,
+                       age:             bootTimeout * 2,
+                       respBoot:        respFail,
+                       expectState:     StateShutdown,
+               },
+               {
+                       testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but container is running",
+                       state:           StateUnknown,
+                       age:             bootTimeout * 2,
+                       respBoot:        respFail,
+                       respRun:         respContainerRunning,
+                       expectState:     StateUnknown,
+                       expectRunning:   1,
+               },
+               {
+                       testCaseComment: "Booting, boot probe fails, run probe fails",
+                       state:           StateBooting,
+                       respBoot:        respFail,
+                       respRun:         respFail,
+                       expectState:     StateBooting,
+               },
+               {
+                       testCaseComment: "Booting, boot probe fails, run probe succeeds (but isn't expected to be called)",
+                       state:           StateBooting,
+                       respBoot:        respFail,
+                       expectState:     StateBooting,
+               },
+               {
+                       testCaseComment: "Booting, boot probe succeeds, run probe fails",
+                       state:           StateBooting,
+                       respRun:         respFail,
+                       expectState:     StateBooting,
+               },
+               {
+                       testCaseComment: "Booting, boot probe succeeds, run probe succeeds",
+                       state:           StateBooting,
+                       expectState:     StateIdle,
+               },
+               {
+                       testCaseComment: "Booting, boot probe succeeds, run probe succeeds, container is running",
+                       state:           StateBooting,
+                       respRun:         respContainerRunning,
+                       expectState:     StateRunning,
+                       expectRunning:   1,
+               },
+               {
+                       testCaseComment: "Booting, boot timeout exceeded",
+                       state:           StateBooting,
+                       age:             bootTimeout * 2,
+                       respRun:         respFail,
+                       expectState:     StateShutdown,
+               },
+               {
+                       testCaseComment: "Idle, probe timeout exceeded, one container running",
+                       state:           StateIdle,
+                       age:             probeTimeout * 2,
+                       respRun:         respContainerRunning,
+                       expectState:     StateRunning,
+                       expectRunning:   1,
+               },
+               {
+                       testCaseComment: "Idle, probe timeout exceeded, one container running, probe fails",
+                       state:           StateIdle,
+                       age:             probeTimeout * 2,
+                       running:         1,
+                       respRun:         respFail,
+                       expectState:     StateShutdown,
+                       expectRunning:   1,
+               },
+               {
+                       testCaseComment: "Idle, probe timeout exceeded, nothing running, probe fails",
+                       state:           StateIdle,
+                       age:             probeTimeout * 2,
+                       respRun:         respFail,
+                       expectState:     StateShutdown,
+               },
+               {
+                       testCaseComment: "Running, one container still running",
+                       state:           StateRunning,
+                       running:         1,
+                       respRun:         respContainerRunning,
+                       expectState:     StateRunning,
+                       expectRunning:   1,
+               },
+               {
+                       testCaseComment: "Running, container has exited",
+                       state:           StateRunning,
+                       running:         1,
+                       expectState:     StateIdle,
+                       expectRunning:   0,
+               },
+               {
+                       testCaseComment: "Running, probe timeout exceeded, nothing running, new container being started",
+                       state:           StateRunning,
+                       age:             probeTimeout * 2,
+                       starting:        1,
+                       expectState:     StateRunning,
+               },
+       } {
+               c.Logf("------- %#v", trial)
+               ctime := time.Now().Add(-trial.age)
+               exr := stubExecutor{
+                       "bootprobe":         trial.respBoot,
+                       "crunch-run --list": trial.respRun,
+               }
+               wp := &Pool{
+                       newExecutor:      func(cloud.Instance) Executor { return exr },
+                       bootProbeCommand: "bootprobe",
+                       timeoutBooting:   bootTimeout,
+                       timeoutProbe:     probeTimeout,
+                       exited:           map[string]time.Time{},
+               }
+               wkr := &worker{
+                       logger:   logger,
+                       executor: exr,
+                       wp:       wp,
+                       mtx:      &wp.mtx,
+                       state:    trial.state,
+                       instance: inst,
+                       appeared: ctime,
+                       busy:     ctime,
+                       probed:   ctime,
+                       updated:  ctime,
+               }
+               if trial.running > 0 {
+                       wkr.running = map[string]struct{}{"zzzzz-dz642-abcdefghijklmno": struct{}{}}
+               }
+               if trial.starting > 0 {
+                       wkr.starting = map[string]struct{}{"zzzzz-dz642-abcdefghijklmno": struct{}{}}
+               }
+               wkr.probeAndUpdate()
+               c.Check(wkr.state, check.Equals, trial.expectState)
+               c.Check(len(wkr.running), check.Equals, trial.expectRunning)
+       }
+}
+
+type stubResp struct {
+       stdout string
+       stderr string
+       err    error
+}
+type stubExecutor map[string]stubResp
+
+func (se stubExecutor) SetTarget(cloud.ExecutorTarget) {}
+func (se stubExecutor) Close()                         {}
+func (se stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error) {
+       resp, ok := se[cmd]
+       if !ok {
+               return nil, []byte("command not found\n"), errors.New("command not found")
+       }
+       return []byte(resp.stdout), []byte(resp.stderr), resp.err
+}