lib/dispatchcloud/worker/worker_test.go

   1 // Copyright (C) The Arvados Authors. All rights reserved.
   2 //
   3 // SPDX-License-Identifier: AGPL-3.0
   4
   5 package worker
   6
   7 import (
   8         "bytes"
   9         "crypto/md5"
  10         "errors"
  11         "fmt"
  12         "io"
  13         "strings"
  14         "time"
  15
  16         "git.arvados.org/arvados.git/lib/cloud"
  17         "git.arvados.org/arvados.git/lib/dispatchcloud/test"
  18         "git.arvados.org/arvados.git/sdk/go/arvados"
  19         "git.arvados.org/arvados.git/sdk/go/ctxlog"
  20         check "gopkg.in/check.v1"
  21 )
  22
  23 var _ = check.Suite(&WorkerSuite{})
  24
  25 type WorkerSuite struct{}
  26
  27 func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
  28         logger := ctxlog.TestLogger(c)
  29         bootTimeout := time.Minute
  30         probeTimeout := time.Second
  31
  32         ac := arvados.NewClientFromEnv()
  33         is, err := (&test.StubDriver{}).InstanceSet(nil, "test-instance-set-id", nil, logger)
  34         c.Assert(err, check.IsNil)
  35         inst, err := is.Create(arvados.InstanceType{}, "", nil, "echo InitCommand", nil)
  36         c.Assert(err, check.IsNil)
  37
  38         type trialT struct {
  39                 testCaseComment string // displayed in test output to help identify failure case
  40                 age             time.Duration
  41                 state           State
  42                 running         int
  43                 starting        int
  44                 respBoot        stubResp // zero value is success
  45                 respDeploy      stubResp // zero value is success
  46                 respRun         stubResp // zero value is success + nothing running
  47                 respRunDeployed stubResp
  48                 deployRunner    []byte
  49                 expectStdin     []byte
  50                 expectState     State
  51                 expectRunning   int
  52         }
  53
  54         errFail := errors.New("failed")
  55         respFail := stubResp{"", "command failed\n", errFail}
  56         respContainerRunning := stubResp{"zzzzz-dz642-abcdefghijklmno\n", "", nil}
  57         for idx, trial := range []trialT{
  58                 {
  59                         testCaseComment: "Unknown, probes fail",
  60                         state:           StateUnknown,
  61                         respBoot:        respFail,
  62                         respRun:         respFail,
  63                         expectState:     StateUnknown,
  64                 },
  65                 {
  66                         testCaseComment: "Unknown, boot probe fails, but one container is running",
  67                         state:           StateUnknown,
  68                         respBoot:        respFail,
  69                         respRun:         respContainerRunning,
  70                         expectState:     StateUnknown,
  71                         expectRunning:   1,
  72                 },
  73                 {
  74                         testCaseComment: "Unknown, boot probe fails, previously running container has exited",
  75                         state:           StateUnknown,
  76                         running:         1,
  77                         respBoot:        respFail,
  78                         expectState:     StateUnknown,
  79                         expectRunning:   0,
  80                 },
  81                 {
  82                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails",
  83                         state:           StateUnknown,
  84                         age:             bootTimeout + time.Second,
  85                         respBoot:        respFail,
  86                         respRun:         respFail,
  87                         expectState:     StateShutdown,
  88                 },
  89                 {
  90                         testCaseComment: "Unknown, boot timeout exceeded, boot probe succeeds but crunch-run fails",
  91                         state:           StateUnknown,
  92                         age:             bootTimeout * 2,
  93                         respRun:         respFail,
  94                         expectState:     StateShutdown,
  95                 },
  96                 {
  97                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but crunch-run succeeds",
  98                         state:           StateUnknown,
  99                         age:             bootTimeout * 2,
 100                         respBoot:        respFail,
 101                         expectState:     StateShutdown,
 102                 },
 103                 {
 104                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but container is running",
 105                         state:           StateUnknown,
 106                         age:             bootTimeout * 2,
 107                         respBoot:        respFail,
 108                         respRun:         respContainerRunning,
 109                         expectState:     StateUnknown,
 110                         expectRunning:   1,
 111                 },
 112                 {
 113                         testCaseComment: "Booting, boot probe fails, run probe fails",
 114                         state:           StateBooting,
 115                         respBoot:        respFail,
 116                         respRun:         respFail,
 117                         expectState:     StateBooting,
 118                 },
 119                 {
 120                         testCaseComment: "Booting, boot probe fails, run probe succeeds (but isn't expected to be called)",
 121                         state:           StateBooting,
 122                         respBoot:        respFail,
 123                         expectState:     StateBooting,
 124                 },
 125                 {
 126                         testCaseComment: "Booting, boot probe succeeds, run probe fails",
 127                         state:           StateBooting,
 128                         respRun:         respFail,
 129                         expectState:     StateBooting,
 130                 },
 131                 {
 132                         testCaseComment: "Booting, boot probe succeeds, run probe succeeds",
 133                         state:           StateBooting,
 134                         expectState:     StateIdle,
 135                 },
 136                 {
 137                         testCaseComment: "Booting, boot probe succeeds, run probe succeeds, container is running",
 138                         state:           StateBooting,
 139                         respRun:         respContainerRunning,
 140                         expectState:     StateRunning,
 141                         expectRunning:   1,
 142                 },
 143                 {
 144                         testCaseComment: "Booting, boot timeout exceeded",
 145                         state:           StateBooting,
 146                         age:             bootTimeout * 2,
 147                         respRun:         respFail,
 148                         expectState:     StateShutdown,
 149                 },
 150                 {
 151                         testCaseComment: "Idle, probe timeout exceeded, one container running",
 152                         state:           StateIdle,
 153                         age:             probeTimeout * 2,
 154                         respRun:         respContainerRunning,
 155                         expectState:     StateRunning,
 156                         expectRunning:   1,
 157                 },
 158                 {
 159                         testCaseComment: "Idle, probe timeout exceeded, one container running, probe fails",
 160                         state:           StateIdle,
 161                         age:             probeTimeout * 2,
 162                         running:         1,
 163                         respRun:         respFail,
 164                         expectState:     StateShutdown,
 165                         expectRunning:   1,
 166                 },
 167                 {
 168                         testCaseComment: "Idle, probe timeout exceeded, nothing running, probe fails",
 169                         state:           StateIdle,
 170                         age:             probeTimeout * 2,
 171                         respRun:         respFail,
 172                         expectState:     StateShutdown,
 173                 },
 174                 {
 175                         testCaseComment: "Running, one container still running",
 176                         state:           StateRunning,
 177                         running:         1,
 178                         respRun:         respContainerRunning,
 179                         expectState:     StateRunning,
 180                         expectRunning:   1,
 181                 },
 182                 {
 183                         testCaseComment: "Running, container has exited",
 184                         state:           StateRunning,
 185                         running:         1,
 186                         expectState:     StateIdle,
 187                         expectRunning:   0,
 188                 },
 189                 {
 190                         testCaseComment: "Running, probe timeout exceeded, nothing running, new container being started",
 191                         state:           StateRunning,
 192                         age:             probeTimeout * 2,
 193                         starting:        1,
 194                         expectState:     StateRunning,
 195                 },
 196                 {
 197                         testCaseComment: "Booting, boot probe succeeds, deployRunner succeeds, run probe succeeds",
 198                         state:           StateBooting,
 199                         deployRunner:    []byte("ELF"),
 200                         expectStdin:     []byte("ELF"),
 201                         respRun:         respFail,
 202                         respRunDeployed: respContainerRunning,
 203                         expectRunning:   1,
 204                         expectState:     StateRunning,
 205                 },
 206                 {
 207                         testCaseComment: "Booting, boot probe succeeds, deployRunner fails",
 208                         state:           StateBooting,
 209                         deployRunner:    []byte("ELF"),
 210                         respDeploy:      respFail,
 211                         expectStdin:     []byte("ELF"),
 212                         expectState:     StateBooting,
 213                 },
 214                 {
 215                         testCaseComment: "Booting, boot probe succeeds, deployRunner skipped, run probe succeeds",
 216                         state:           StateBooting,
 217                         deployRunner:    nil,
 218                         respDeploy:      respFail,
 219                         expectState:     StateIdle,
 220                 },
 221         } {
 222                 c.Logf("------- trial %d: %#v", idx, trial)
 223                 ctime := time.Now().Add(-trial.age)
 224                 exr := &stubExecutor{
 225                         response: map[string]stubResp{
 226                                 "bootprobe":         trial.respBoot,
 227                                 "crunch-run --list": trial.respRun,
 228                                 "{deploy}":          trial.respDeploy,
 229                         },
 230                 }
 231                 wp := &Pool{
 232                         arvClient:        ac,
 233                         newExecutor:      func(cloud.Instance) Executor { return exr },
 234                         bootProbeCommand: "bootprobe",
 235                         timeoutBooting:   bootTimeout,
 236                         timeoutProbe:     probeTimeout,
 237                         exited:           map[string]time.Time{},
 238                         runnerCmd:        "crunch-run",
 239                         runnerData:       trial.deployRunner,
 240                         runnerMD5:        md5.Sum(trial.deployRunner),
 241                 }
 242                 if trial.deployRunner != nil {
 243                         svHash := md5.Sum(trial.deployRunner)
 244                         wp.runnerCmd = fmt.Sprintf("/var/run/arvados/crunch-run~%x", svHash)
 245                         exr.response[wp.runnerCmd+" --list"] = trial.respRunDeployed
 246                 }
 247                 wkr := &worker{
 248                         logger:   logger,
 249                         executor: exr,
 250                         wp:       wp,
 251                         mtx:      &wp.mtx,
 252                         state:    trial.state,
 253                         instance: inst,
 254                         appeared: ctime,
 255                         busy:     ctime,
 256                         probed:   ctime,
 257                         updated:  ctime,
 258                         running:  map[string]*remoteRunner{},
 259                         starting: map[string]*remoteRunner{},
 260                         probing:  make(chan struct{}, 1),
 261                 }
 262                 if trial.running > 0 {
 263                         uuid := "zzzzz-dz642-abcdefghijklmno"
 264                         wkr.running = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
 265                 }
 266                 if trial.starting > 0 {
 267                         uuid := "zzzzz-dz642-bcdefghijklmnop"
 268                         wkr.starting = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
 269                 }
 270                 wkr.probeAndUpdate()
 271                 c.Check(wkr.state, check.Equals, trial.expectState)
 272                 c.Check(len(wkr.running), check.Equals, trial.expectRunning)
 273                 c.Check(exr.stdin.String(), check.Equals, string(trial.expectStdin))
 274         }
 275 }
 276
 277 type stubResp struct {
 278         stdout string
 279         stderr string
 280         err    error
 281 }
 282
// stubExecutor is a test double that TestProbeAndUpdate supplies in
// place of a real Executor. Its Execute method answers commands from
// the response table instead of running anything.
//
// response maps an exact command string to its canned result; the
// special key "{deploy}" is the fallback Execute uses for unlisted
// commands containing `; cat >"$dstfile"`.
//
// stdin accumulates whatever is passed as stdin to Execute, across
// calls, so a test can assert on the data that was sent.
type stubExecutor struct {
	response map[string]stubResp
	stdin    bytes.Buffer
}
 287
 288 func (se *stubExecutor) SetTarget(cloud.ExecutorTarget) {}
 289 func (se *stubExecutor) Close()                         {}
 290 func (se *stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error) {
 291         if stdin != nil {
 292                 _, err = io.Copy(&se.stdin, stdin)
 293                 if err != nil {
 294                         return nil, []byte(err.Error()), err
 295                 }
 296         }
 297         resp, ok := se.response[cmd]
 298         if !ok && strings.Contains(cmd, `; cat >"$dstfile"`) {
 299                 resp, ok = se.response["{deploy}"]
 300         }
 301         if !ok {
 302                 return nil, []byte(fmt.Sprintf("%s: command not found\n", cmd)), errors.New("command not found")
 303         }
 304         return []byte(resp.stdout), []byte(resp.stderr), resp.err
 305 }