lib/dispatchcloud/worker/worker_test.go

   1 // Copyright (C) The Arvados Authors. All rights reserved.
   2 //
   3 // SPDX-License-Identifier: AGPL-3.0
   4
   5 package worker
   6
   7 import (
   8         "bytes"
   9         "crypto/md5"
  10         "errors"
  11         "fmt"
  12         "io"
  13         "strings"
  14         "time"
  15
  16         "git.arvados.org/arvados.git/lib/cloud"
  17         "git.arvados.org/arvados.git/lib/config"
  18         "git.arvados.org/arvados.git/lib/dispatchcloud/test"
  19         "git.arvados.org/arvados.git/sdk/go/arvados"
  20         "git.arvados.org/arvados.git/sdk/go/ctxlog"
  21         "github.com/prometheus/client_golang/prometheus"
  22         "github.com/sirupsen/logrus"
  23         check "gopkg.in/check.v1"
  24 )
  25
// Register WorkerSuite with the gocheck framework.
var _ = check.Suite(&WorkerSuite{})
  27
// WorkerSuite is the gocheck suite for the worker tests in this file.
// Its logger and testCluster fields are (re)assigned by SetUpTest and
// read by the test methods.
type WorkerSuite struct {
	logger      logrus.FieldLogger
	testCluster *arvados.Cluster
}
  32
  33 func (suite *WorkerSuite) SetUpTest(c *check.C) {
  34         suite.logger = ctxlog.TestLogger(c)
  35         cfg, err := config.NewLoader(nil, suite.logger).Load()
  36         c.Assert(err, check.IsNil)
  37         suite.testCluster, err = cfg.GetCluster("")
  38         c.Assert(err, check.IsNil)
  39 }
  40
// TestProbeAndUpdate is a table-driven test of (*worker).probeAndUpdate.
// For each trial it builds a fresh Pool and worker around a stub
// executor with canned command responses, backdates the worker's
// timestamps by trial.age, calls probeAndUpdate once, and then checks
// the resulting worker state, the number of entries in wkr.running,
// and the data the stub executor received on stdin.
func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
	bootTimeout := time.Minute
	probeTimeout := time.Second

	ac := arvados.NewClientFromEnv()
	// A single stub instance is created here and shared by all trials.
	is, err := (&test.StubDriver{}).InstanceSet(nil, "test-instance-set-id", nil, suite.logger, nil)
	c.Assert(err, check.IsNil)
	inst, err := is.Create(arvados.InstanceType{}, "", nil, "echo InitCommand", nil)
	c.Assert(err, check.IsNil)

	// trialT describes one scenario. age is how far in the past the
	// worker's appeared/busy/probed/updated timestamps are set, and
	// state is the worker's initial state. running and starting are
	// only tested for > 0, in which case exactly one remote runner is
	// pre-populated in the corresponding worker map. deployRunner is
	// passed to the Pool as runnerData; when it is non-nil,
	// respRunDeployed is registered as the response to the deployed
	// runner's "--list" command. expectStdin is compared with
	// everything the stub executor received on stdin, and expectState
	// and expectRunning with the worker's state and len(wkr.running)
	// after the probe.
	type trialT struct {
		testCaseComment string // displayed in test output to help identify failure case
		age             time.Duration
		state           State
		running         int
		starting        int
		respBoot        stubResp // zero value is success
		respDeploy      stubResp // zero value is success
		respRun         stubResp // zero value is success + nothing running
		respRunDeployed stubResp
		deployRunner    []byte
		expectStdin     []byte
		expectState     State
		expectRunning   int
	}

	errFail := errors.New("failed")
	// respFail is a failing command; respContainerRunning is a successful
	// runner "--list" reply that reports one container UUID.
	respFail := stubResp{"", "command failed\n", errFail}
	respContainerRunning := stubResp{"zzzzz-dz642-abcdefghijklmno\n", "", nil}
	for idx, trial := range []trialT{
		{
			testCaseComment: "Unknown, probes fail",
			state:           StateUnknown,
			respBoot:        respFail,
			respRun:         respFail,
			expectState:     StateUnknown,
		},
		{
			testCaseComment: "Unknown, boot probe fails, but one container is running",
			state:           StateUnknown,
			respBoot:        respFail,
			respRun:         respContainerRunning,
			expectState:     StateUnknown,
			expectRunning:   1,
		},
		{
			testCaseComment: "Unknown, boot probe fails, previously running container has exited",
			state:           StateUnknown,
			running:         1,
			respBoot:        respFail,
			expectState:     StateUnknown,
			expectRunning:   0,
		},
		{
			testCaseComment: "Unknown, boot timeout exceeded, boot probe fails",
			state:           StateUnknown,
			age:             bootTimeout + time.Second,
			respBoot:        respFail,
			respRun:         respFail,
			expectState:     StateShutdown,
		},
		{
			testCaseComment: "Unknown, boot timeout exceeded, boot probe succeeds but crunch-run fails",
			state:           StateUnknown,
			age:             bootTimeout * 2,
			respRun:         respFail,
			expectState:     StateShutdown,
		},
		{
			testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but crunch-run succeeds",
			state:           StateUnknown,
			age:             bootTimeout * 2,
			respBoot:        respFail,
			expectState:     StateShutdown,
		},
		{
			testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but container is running",
			state:           StateUnknown,
			age:             bootTimeout * 2,
			respBoot:        respFail,
			respRun:         respContainerRunning,
			expectState:     StateUnknown,
			expectRunning:   1,
		},
		{
			testCaseComment: "Unknown, boot probe fails, deployRunner succeeds, container is running",
			state:           StateUnknown,
			respBoot:        respFail,
			respRun:         respFail,
			respRunDeployed: respContainerRunning,
			deployRunner:    []byte("ELF"),
			expectStdin:     []byte("ELF"),
			expectState:     StateUnknown,
			expectRunning:   1,
		},
		{
			testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but deployRunner succeeds and container is running",
			state:           StateUnknown,
			age:             bootTimeout * 2,
			respBoot:        respFail,
			respRun:         respFail,
			respRunDeployed: respContainerRunning,
			deployRunner:    []byte("ELF"),
			expectStdin:     []byte("ELF"),
			expectState:     StateUnknown,
			expectRunning:   1,
		},
		{
			testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but deployRunner succeeds and no container is running",
			state:           StateUnknown,
			age:             bootTimeout * 2,
			respBoot:        respFail,
			respRun:         respFail,
			deployRunner:    []byte("ELF"),
			expectStdin:     []byte("ELF"),
			expectState:     StateShutdown,
		},
		{
			testCaseComment: "Booting, boot probe fails, run probe fails",
			state:           StateBooting,
			respBoot:        respFail,
			respRun:         respFail,
			expectState:     StateBooting,
		},
		{
			testCaseComment: "Booting, boot probe fails, run probe succeeds (but isn't expected to be called)",
			state:           StateBooting,
			respBoot:        respFail,
			expectState:     StateBooting,
		},
		{
			testCaseComment: "Booting, boot probe succeeds, run probe fails",
			state:           StateBooting,
			respRun:         respFail,
			expectState:     StateBooting,
		},
		{
			testCaseComment: "Booting, boot probe succeeds, run probe succeeds",
			state:           StateBooting,
			expectState:     StateIdle,
		},
		{
			testCaseComment: "Booting, boot probe succeeds, run probe succeeds, container is running",
			state:           StateBooting,
			respRun:         respContainerRunning,
			expectState:     StateRunning,
			expectRunning:   1,
		},
		{
			testCaseComment: "Booting, boot timeout exceeded",
			state:           StateBooting,
			age:             bootTimeout * 2,
			respRun:         respFail,
			expectState:     StateShutdown,
		},
		{
			testCaseComment: "Idle, probe timeout exceeded, one container running",
			state:           StateIdle,
			age:             probeTimeout * 2,
			respRun:         respContainerRunning,
			expectState:     StateRunning,
			expectRunning:   1,
		},
		{
			testCaseComment: "Idle, probe timeout exceeded, one container running, probe fails",
			state:           StateIdle,
			age:             probeTimeout * 2,
			running:         1,
			respRun:         respFail,
			expectState:     StateShutdown,
			expectRunning:   1,
		},
		{
			testCaseComment: "Idle, probe timeout exceeded, nothing running, probe fails",
			state:           StateIdle,
			age:             probeTimeout * 2,
			respRun:         respFail,
			expectState:     StateShutdown,
		},
		{
			testCaseComment: "Running, one container still running",
			state:           StateRunning,
			running:         1,
			respRun:         respContainerRunning,
			expectState:     StateRunning,
			expectRunning:   1,
		},
		{
			testCaseComment: "Running, container has exited",
			state:           StateRunning,
			running:         1,
			expectState:     StateIdle,
			expectRunning:   0,
		},
		{
			testCaseComment: "Running, probe timeout exceeded, nothing running, new container being started",
			state:           StateRunning,
			age:             probeTimeout * 2,
			starting:        1,
			expectState:     StateRunning,
		},
		{
			testCaseComment: "Booting, boot probe succeeds, deployRunner succeeds, run probe succeeds",
			state:           StateBooting,
			deployRunner:    []byte("ELF"),
			expectStdin:     []byte("ELF"),
			respRun:         respFail,
			respRunDeployed: respContainerRunning,
			expectRunning:   1,
			expectState:     StateRunning,
		},
		{
			testCaseComment: "Booting, boot probe succeeds, deployRunner fails",
			state:           StateBooting,
			deployRunner:    []byte("ELF"),
			respDeploy:      respFail,
			expectStdin:     []byte("ELF"),
			expectState:     StateBooting,
		},
		{
			testCaseComment: "Booting, boot probe succeeds, deployRunner skipped, run probe succeeds",
			state:           StateBooting,
			deployRunner:    nil,
			respDeploy:      respFail,
			expectState:     StateIdle,
		},
	} {
		c.Logf("------- trial %d: %#v", idx, trial)
		// Backdate every worker timestamp by trial.age so the boot and
		// probe timeouts can be exceeded without waiting.
		ctime := time.Now().Add(-trial.age)
		// Canned responses, keyed by exact command line. "{deploy}" is
		// a placeholder key that stubExecutor.Execute falls back to for
		// commands containing `; cat >"$dstfile"`.
		exr := &stubExecutor{
			response: map[string]stubResp{
				"bootprobe":         trial.respBoot,
				"crunch-run --list": trial.respRun,
				"{deploy}":          trial.respDeploy,
			},
		}
		// Every worker created from this pool gets the same stub
		// executor, so its recorded stdin can be inspected afterwards.
		wp := &Pool{
			arvClient:        ac,
			newExecutor:      func(cloud.Instance) Executor { return exr },
			cluster:          suite.testCluster,
			bootProbeCommand: "bootprobe",
			timeoutBooting:   bootTimeout,
			timeoutProbe:     probeTimeout,
			exited:           map[string]time.Time{},
			runnerCmdDefault: "crunch-run",
			runnerArgs:       []string{"--args=not used with --list"},
			runnerCmd:        "crunch-run",
			runnerData:       trial.deployRunner,
			runnerMD5:        md5.Sum(trial.deployRunner),
		}
		wp.registerMetrics(prometheus.NewRegistry())
		// When a runner binary is supplied, point runnerCmd at a path
		// derived from the binary's MD5 and register the response for
		// that command's "--list" probe.
		if trial.deployRunner != nil {
			svHash := md5.Sum(trial.deployRunner)
			wp.runnerCmd = fmt.Sprintf("/var/run/arvados/crunch-run~%x", svHash)
			exr.response[wp.runnerCmd+" --list"] = trial.respRunDeployed
		}
		wkr := &worker{
			logger:   suite.logger,
			executor: exr,
			wp:       wp,
			mtx:      &wp.mtx,
			state:    trial.state,
			instance: inst,
			appeared: ctime,
			busy:     ctime,
			probed:   ctime,
			updated:  ctime,
			running:  map[string]*remoteRunner{},
			starting: map[string]*remoteRunner{},
			probing:  make(chan struct{}, 1),
		}
		// The running container uses the same UUID that
		// respContainerRunning reports.
		if trial.running > 0 {
			uuid := "zzzzz-dz642-abcdefghijklmno"
			wkr.running = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
		}
		if trial.starting > 0 {
			uuid := "zzzzz-dz642-bcdefghijklmnop"
			wkr.starting = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
		}
		// Run one probe cycle, then verify the outcome.
		wkr.probeAndUpdate()
		c.Check(wkr.state, check.Equals, trial.expectState)
		c.Check(len(wkr.running), check.Equals, trial.expectRunning)
		c.Check(exr.stdin.String(), check.Equals, string(trial.expectStdin))
	}
}
 326
// stubResp is a canned result for one command run through stubExecutor:
// Execute returns stdout and stderr converted to byte slices, together
// with err. The zero value is an empty, successful response.
type stubResp struct {
	stdout string
	stderr string
	err    error
}
 332
// stubExecutor is a test double that answers commands from a fixed
// table instead of running them. response maps an exact command line
// to its canned result; the special key "{deploy}" is used by Execute
// as a fallback for commands containing `; cat >"$dstfile"`. stdin
// accumulates everything passed to Execute on stdin, across all calls.
type stubExecutor struct {
	response map[string]stubResp
	stdin    bytes.Buffer
}
 337
// SetTarget and Close are no-ops; the stub only holds a response table
// and a stdin buffer. (Presumably both are required by the Executor
// interface, which is declared outside this file.)
func (se *stubExecutor) SetTarget(cloud.ExecutorTarget) {}
func (se *stubExecutor) Close()                         {}
 340 func (se *stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error) {
 341         if stdin != nil {
 342                 _, err = io.Copy(&se.stdin, stdin)
 343                 if err != nil {
 344                         return nil, []byte(err.Error()), err
 345                 }
 346         }
 347         resp, ok := se.response[cmd]
 348         if !ok && strings.Contains(cmd, `; cat >"$dstfile"`) {
 349                 resp, ok = se.response["{deploy}"]
 350         }
 351         if !ok {
 352                 return nil, []byte(fmt.Sprintf("%s: command not found\n", cmd)), errors.New("command not found")
 353         }
 354         return []byte(resp.stdout), []byte(resp.stderr), resp.err
 355 }