14325: Don't shutdown busy VMs even if boot probe fails.

author Tom Clegg <tclegg@veritasgenetics.com>

Thu, 24 Jan 2019 21:10:48 +0000 (16:10 -0500)

committer Tom Clegg <tclegg@veritasgenetics.com>

Thu, 24 Jan 2019 21:10:48 +0000 (16:10 -0500)
author Tom Clegg <tclegg@veritasgenetics.com>
Thu, 24 Jan 2019 21:10:48 +0000 (16:10 -0500)
committer Tom Clegg <tclegg@veritasgenetics.com>
Thu, 24 Jan 2019 21:10:48 +0000 (16:10 -0500)
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go

index 64afa3acf3ed5040c567e2bfd7b67ecab5f32dd3..2277a3f235400c0f9b4944edfd1d8b58febe6e1d 100644 (file)
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -50,9 +50,7 @@ type StubDriver struct {
  }
  
  // InstanceSet returns a new *StubInstanceSet.
-func (sd *StubDriver) InstanceSet(params map[string]interface{}, id cloud.InstanceSetID,
-       logger logrus.FieldLogger) (cloud.InstanceSet, error) {
-
+func (sd *StubDriver) InstanceSet(params map[string]interface{}, id cloud.InstanceSetID, logger logrus.FieldLogger) (cloud.InstanceSet, error) {
         sis := StubInstanceSet{
                 driver:  sd,
                 servers: map[cloud.InstanceID]*StubVM{},
diff --git a/lib/dispatchcloud/worker/pool_test.go b/lib/dispatchcloud/worker/pool_test.go

index 26087193f3ca32efa57279642385db1dc8bb2d25..aa81958434dd73792f399a61f5f1b0be6e2cfc1b 100644 (file)
--- a/lib/dispatchcloud/worker/pool_test.go
+++ b/lib/dispatchcloud/worker/pool_test.go
@@ -5,7 +5,6 @@
  package worker
  
  import (
-       "io"
         "time"
  
         "git.curoverse.com/arvados.git/lib/cloud"
@@ -50,7 +49,7 @@ func (suite *PoolSuite) TestCreateUnallocShutdown(c *check.C) {
         type3 := arvados.InstanceType{Name: "a2l", ProviderType: "a2.large", VCPUs: 4, RAM: 4 * GiB, Price: .04}
         pool := &Pool{
                 logger:      logrus.StandardLogger(),
-               newExecutor: func(cloud.Instance) Executor { return &stubExecutor{} },
+               newExecutor: func(cloud.Instance) Executor { return stubExecutor{} },
                 instanceSet: &throttledInstanceSet{InstanceSet: lameInstanceSet},
                 instanceTypes: arvados.InstanceTypeMap{
                         type1.Name: type1,
@@ -186,13 +185,3 @@ func (suite *PoolSuite) wait(c *check.C, pool *Pool, notify <-chan struct{}, rea
         }
         c.Check(ready(), check.Equals, true)
  }
-
-type stubExecutor struct{}
-
-func (*stubExecutor) SetTarget(cloud.ExecutorTarget) {}
-
-func (*stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) ([]byte, []byte, error) {
-       return nil, nil, nil
-}
-
-func (*stubExecutor) Close() {}
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go

index 0872e594f9a8175973cc11032ca7942a8aa455d5..a75d2bbb88611225c709a98f151dded697b5974c 100644 (file)
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -6,6 +6,7 @@ package worker
  
  import (
         "bytes"
+       "fmt"
         "strings"
         "sync"
         "time"
@@ -144,38 +145,65 @@ func (wkr *worker) ProbeAndUpdate() {
         }
  }
  
-// should be called in a new goroutine
+// probeAndUpdate calls probeBooted and/or probeRunning if needed, and
+// updates state accordingly.
+//
+// In StateUnknown: Call both probeBooted and probeRunning.
+// In StateBooting: Call probeBooted; if successful, call probeRunning.
+// In StateRunning: Call probeRunning.
+// In StateIdle: Call probeRunning.
+// In StateShutdown: Do nothing.
+//
+// If both probes succeed, wkr.state changes to
+// StateIdle/StateRunning.
+//
+// If probeRunning succeeds, wkr.running is updated. (This means
+// wkr.running might be non-empty even in StateUnknown, if the boot
+// probe failed.)
+//
+// probeAndUpdate should be called in a new goroutine.
  func (wkr *worker) probeAndUpdate() {
         wkr.mtx.Lock()
         updated := wkr.updated
-       needProbeRunning := wkr.state == StateRunning || wkr.state == StateIdle
-       needProbeBooted := wkr.state == StateUnknown || wkr.state == StateBooting
+       initialState := wkr.state
         wkr.mtx.Unlock()
-       if !needProbeBooted && !needProbeRunning {
-               return
-       }
  
         var (
+               booted   bool
                 ctrUUIDs []string
                 ok       bool
                 stderr   []byte
         )
-       if needProbeBooted {
-               ok, stderr = wkr.probeBooted()
-               wkr.mtx.Lock()
-               if ok || wkr.state == StateRunning || wkr.state == StateIdle {
+
+       switch initialState {
+       case StateShutdown:
+               return
+       case StateIdle, StateRunning:
+               booted = true
+       case StateUnknown, StateBooting:
+       default:
+               panic(fmt.Sprintf("unknown state %s", initialState))
+       }
+
+       if !booted {
+               booted, stderr = wkr.probeBooted()
+               if !booted {
+                       // Pretend this probe succeeded if another
+                       // concurrent attempt succeeded.
+                       wkr.mtx.Lock()
+                       booted = wkr.state == StateRunning || wkr.state == StateIdle
+                       wkr.mtx.Unlock()
+               } else {
                         wkr.logger.Info("instance booted; will try probeRunning")
-                       needProbeRunning = true
                 }
-               wkr.mtx.Unlock()
         }
-       if needProbeRunning {
+       if booted || wkr.state == StateUnknown {
                 ctrUUIDs, ok, stderr = wkr.probeRunning()
         }
         logger := wkr.logger.WithField("stderr", string(stderr))
         wkr.mtx.Lock()
         defer wkr.mtx.Unlock()
-       if !ok {
+       if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) {
                 if wkr.state == StateShutdown && wkr.updated.After(updated) {
                         // Skip the logging noise if shutdown was
                         // initiated during probe.
@@ -186,10 +214,10 @@ func (wkr *worker) probeAndUpdate() {
                         "Duration": dur,
                         "State":    wkr.state,
                 })
-               if wkr.state == StateBooting && !needProbeRunning {
-                       // If we know the instance has never passed a
-                       // boot probe, it's not noteworthy that it
-                       // hasn't passed this probe.
+               if !booted {
+                       // While we're polling the VM to see if it's
+                       // finished booting, failures are not
+                       // noteworthy, so we log at Debug level.
                         logger.Debug("new instance not responding")
                 } else {
                         logger.Info("instance not responding")
@@ -234,11 +262,16 @@ func (wkr *worker) probeAndUpdate() {
                         changed = true
                 }
         }
-       if wkr.state == StateUnknown || wkr.state == StateBooting {
+       if booted && (wkr.state == StateUnknown || wkr.state == StateBooting) {
                 // Note: this will change again below if
                 // len(wkr.starting)+len(wkr.running) > 0.
                 wkr.state = StateIdle
                 changed = true
+       } else if wkr.state == StateUnknown && len(running) != len(wkr.running) {
+               logger.WithFields(logrus.Fields{
+                       "RunningContainers": len(running),
+                       "State":             wkr.state,
+               }).Info("crunch-run probe succeeded, but boot probe is still failing")
         }
         if !changed {
                 return
@@ -251,7 +284,7 @@ func (wkr *worker) probeAndUpdate() {
                 wkr.state = StateIdle
         }
         wkr.updated = updateTime
-       if needProbeBooted {
+       if booted && (initialState == StateUnknown || initialState == StateBooting) {
                 logger.WithFields(logrus.Fields{
                         "RunningContainers": len(running),
                         "State":             wkr.state,
diff --git a/lib/dispatchcloud/worker/worker_test.go b/lib/dispatchcloud/worker/worker_test.go

new file mode 100644 (file)

index 0000000..570d3b5
--- /dev/null
+++ b/lib/dispatchcloud/worker/worker_test.go
@@ -0,0 +1,240 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package worker
+
+import (
+       "errors"
+       "io"
+       "time"
+
+       "git.curoverse.com/arvados.git/lib/cloud"
+       "git.curoverse.com/arvados.git/lib/dispatchcloud/test"
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+       "github.com/sirupsen/logrus"
+       check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&WorkerSuite{})
+
+type WorkerSuite struct{}
+
+func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
+       logger := logrus.StandardLogger()
+       bootTimeout := time.Minute
+       probeTimeout := time.Second
+
+       is, err := (&test.StubDriver{}).InstanceSet(nil, "", logger)
+       c.Assert(err, check.IsNil)
+       inst, err := is.Create(arvados.InstanceType{}, "", nil, nil)
+       c.Assert(err, check.IsNil)
+
+       type trialT struct {
+               testCaseComment string // displayed in test output to help identify failure case
+               age             time.Duration
+               state           State
+               running         int
+               starting        int
+               respBoot        stubResp // zero value is success
+               respRun         stubResp // zero value is success + nothing running
+               expectState     State
+               expectRunning   int
+       }
+
+       errFail := errors.New("failed")
+       respFail := stubResp{"", "command failed\n", errFail}
+       respContainerRunning := stubResp{"zzzzz-dz642-abcdefghijklmno\n", "", nil}
+       for _, trial := range []trialT{
+               {
+                       testCaseComment: "Unknown, probes fail",
+                       state:           StateUnknown,
+                       respBoot:        respFail,
+                       respRun:         respFail,
+                       expectState:     StateUnknown,
+               },
+               {
+                       testCaseComment: "Unknown, boot probe fails, but one container is running",
+                       state:           StateUnknown,
+                       respBoot:        respFail,
+                       respRun:         respContainerRunning,
+                       expectState:     StateUnknown,
+                       expectRunning:   1,
+               },
+               {
+                       testCaseComment: "Unknown, boot probe fails, previously running container has exited",
+                       state:           StateUnknown,
+                       running:         1,
+                       respBoot:        respFail,
+                       expectState:     StateUnknown,
+                       expectRunning:   0,
+               },
+               {
+                       testCaseComment: "Unknown, boot timeout exceeded, boot probe fails",
+                       state:           StateUnknown,
+                       age:             bootTimeout + time.Second,
+                       respBoot:        respFail,
+                       respRun:         respFail,
+                       expectState:     StateShutdown,
+               },
+               {
+                       testCaseComment: "Unknown, boot timeout exceeded, boot probe succeeds but crunch-run fails",
+                       state:           StateUnknown,
+                       age:             bootTimeout * 2,
+                       respRun:         respFail,
+                       expectState:     StateShutdown,
+               },
+               {
+                       testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but crunch-run succeeds",
+                       state:           StateUnknown,
+                       age:             bootTimeout * 2,
+                       respBoot:        respFail,
+                       expectState:     StateShutdown,
+               },
+               {
+                       testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but container is running",
+                       state:           StateUnknown,
+                       age:             bootTimeout * 2,
+                       respBoot:        respFail,
+                       respRun:         respContainerRunning,
+                       expectState:     StateUnknown,
+                       expectRunning:   1,
+               },
+               {
+                       testCaseComment: "Booting, boot probe fails, run probe fails",
+                       state:           StateBooting,
+                       respBoot:        respFail,
+                       respRun:         respFail,
+                       expectState:     StateBooting,
+               },
+               {
+                       testCaseComment: "Booting, boot probe fails, run probe succeeds (but isn't expected to be called)",
+                       state:           StateBooting,
+                       respBoot:        respFail,
+                       expectState:     StateBooting,
+               },
+               {
+                       testCaseComment: "Booting, boot probe succeeds, run probe fails",
+                       state:           StateBooting,
+                       respRun:         respFail,
+                       expectState:     StateBooting,
+               },
+               {
+                       testCaseComment: "Booting, boot probe succeeds, run probe succeeds",
+                       state:           StateBooting,
+                       expectState:     StateIdle,
+               },
+               {
+                       testCaseComment: "Booting, boot probe succeeds, run probe succeeds, container is running",
+                       state:           StateBooting,
+                       respRun:         respContainerRunning,
+                       expectState:     StateRunning,
+                       expectRunning:   1,
+               },
+               {
+                       testCaseComment: "Booting, boot timeout exceeded",
+                       state:           StateBooting,
+                       age:             bootTimeout * 2,
+                       respRun:         respFail,
+                       expectState:     StateShutdown,
+               },
+               {
+                       testCaseComment: "Idle, probe timeout exceeded, one container running",
+                       state:           StateIdle,
+                       age:             probeTimeout * 2,
+                       respRun:         respContainerRunning,
+                       expectState:     StateRunning,
+                       expectRunning:   1,
+               },
+               {
+                       testCaseComment: "Idle, probe timeout exceeded, one container running, probe fails",
+                       state:           StateIdle,
+                       age:             probeTimeout * 2,
+                       running:         1,
+                       respRun:         respFail,
+                       expectState:     StateShutdown,
+                       expectRunning:   1,
+               },
+               {
+                       testCaseComment: "Idle, probe timeout exceeded, nothing running, probe fails",
+                       state:           StateIdle,
+                       age:             probeTimeout * 2,
+                       respRun:         respFail,
+                       expectState:     StateShutdown,
+               },
+               {
+                       testCaseComment: "Running, one container still running",
+                       state:           StateRunning,
+                       running:         1,
+                       respRun:         respContainerRunning,
+                       expectState:     StateRunning,
+                       expectRunning:   1,
+               },
+               {
+                       testCaseComment: "Running, container has exited",
+                       state:           StateRunning,
+                       running:         1,
+                       expectState:     StateIdle,
+                       expectRunning:   0,
+               },
+               {
+                       testCaseComment: "Running, probe timeout exceeded, nothing running, new container being started",
+                       state:           StateRunning,
+                       age:             probeTimeout * 2,
+                       starting:        1,
+                       expectState:     StateRunning,
+               },
+       } {
+               c.Logf("------- %#v", trial)
+               ctime := time.Now().Add(-trial.age)
+               exr := stubExecutor{
+                       "bootprobe":         trial.respBoot,
+                       "crunch-run --list": trial.respRun,
+               }
+               wp := &Pool{
+                       newExecutor:      func(cloud.Instance) Executor { return exr },
+                       bootProbeCommand: "bootprobe",
+                       timeoutBooting:   bootTimeout,
+                       timeoutProbe:     probeTimeout,
+                       exited:           map[string]time.Time{},
+               }
+               wkr := &worker{
+                       logger:   logger,
+                       executor: exr,
+                       wp:       wp,
+                       mtx:      &wp.mtx,
+                       state:    trial.state,
+                       instance: inst,
+                       appeared: ctime,
+                       busy:     ctime,
+                       probed:   ctime,
+                       updated:  ctime,
+               }
+               if trial.running > 0 {
+                       wkr.running = map[string]struct{}{"zzzzz-dz642-abcdefghijklmno": struct{}{}}
+               }
+               if trial.starting > 0 {
+                       wkr.starting = map[string]struct{}{"zzzzz-dz642-abcdefghijklmno": struct{}{}}
+               }
+               wkr.probeAndUpdate()
+               c.Check(wkr.state, check.Equals, trial.expectState)
+               c.Check(len(wkr.running), check.Equals, trial.expectRunning)
+       }
+}
+
+type stubResp struct {
+       stdout string
+       stderr string
+       err    error
+}
+type stubExecutor map[string]stubResp
+
+func (se stubExecutor) SetTarget(cloud.ExecutorTarget) {}
+func (se stubExecutor) Close()                         {}
+func (se stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error) {
+       resp, ok := se[cmd]
+       if !ok {
+               return nil, []byte("command not found\n"), errors.New("command not found")
+       }
+       return []byte(resp.stdout), []byte(resp.stderr), resp.err
+}
author	Tom Clegg <tclegg@veritasgenetics.com>
	Thu, 24 Jan 2019 21:10:48 +0000 (16:10 -0500)
committer	Tom Clegg <tclegg@veritasgenetics.com>
	Thu, 24 Jan 2019 21:10:48 +0000 (16:10 -0500)
lib/dispatchcloud/test/stub_driver.go		patch \| blob \| history
lib/dispatchcloud/worker/pool_test.go		patch \| blob \| history
lib/dispatchcloud/worker/worker.go		patch \| blob \| history
lib/dispatchcloud/worker/worker_test.go	[new file with mode: 0644]	patch \| blob