1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
16 "git.arvados.org/arvados.git/lib/cloud"
17 "git.arvados.org/arvados.git/lib/config"
18 "git.arvados.org/arvados.git/lib/dispatchcloud/test"
19 "git.arvados.org/arvados.git/sdk/go/arvados"
20 "git.arvados.org/arvados.git/sdk/go/ctxlog"
21 "github.com/prometheus/client_golang/prometheus"
22 "github.com/sirupsen/logrus"
23 check "gopkg.in/check.v1"
// Pass a WorkerSuite instance to check.Suite (gopkg.in/check.v1) so the
// suite's methods are known to the gocheck runner.
26 var _ = check.Suite(&WorkerSuite{})
// WorkerSuite is the gocheck suite for the worker tests. Its fields
// are assigned by SetUpTest.
28 type WorkerSuite struct {
// logger is the test logger obtained from ctxlog.TestLogger.
29 logger logrus.FieldLogger
// testCluster is the cluster config returned by GetCluster("") on the
// loaded configuration.
30 testCluster *arvados.Cluster
// SetUpTest prepares the suite's fixtures: it creates a test logger,
// loads a configuration through config.NewLoader, and stores the
// cluster returned by GetCluster("") in suite.testCluster. A non-nil
// error from either step fails the c.Assert that follows it.
33 func (suite *WorkerSuite) SetUpTest(c *check.C) {
34 suite.logger = ctxlog.TestLogger(c)
// The loader is given nil as its first argument, so no explicit input
// is supplied here.
35 cfg, err := config.NewLoader(nil, suite.logger).Load()
36 c.Assert(err, check.IsNil)
// NOTE(review): presumably "" selects the default/only cluster --
// confirm against config.GetCluster.
37 suite.testCluster, err = cfg.GetCluster("")
38 c.Assert(err, check.IsNil)
// TestProbeAndUpdate is a table-driven test. For each trial it builds
// a stubExecutor with canned responses for the boot probe, the
// "crunch-run --list" probe and the runner deployment command, seeds
// the worker's running/starting maps as the trial requests, and then
// checks the worker's resulting state, its number of running
// containers, and the bytes that were written to the executor's
// stdin.
41 func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
// Timeouts used both in the worker pool config and to derive trial ages.
42 bootTimeout := time.Minute
43 probeTimeout := time.Second
45 ac := arvados.NewClientFromEnv()
// Create one instance from the stub cloud driver.
46 is, err := (&test.StubDriver{}).InstanceSet(nil, "test-instance-set-id", nil, suite.logger)
47 c.Assert(err, check.IsNil)
48 inst, err := is.Create(arvados.InstanceType{}, "", nil, "echo InitCommand", nil)
49 c.Assert(err, check.IsNil)
52 testCaseComment string // displayed in test output to help identify failure case
57 respBoot stubResp // zero value is success
58 respDeploy stubResp // zero value is success
59 respRun stubResp // zero value is success + nothing running
// respRunDeployed is the response registered for the deployed runner's
// "--list" command when trial.deployRunner is non-nil.
60 respRunDeployed stubResp
// Canned responses shared by the trials below: a failing command, and
// a successful probe whose stdout lists one container UUID.
67 errFail := errors.New("failed")
68 respFail := stubResp{"", "command failed\n", errFail}
69 respContainerRunning := stubResp{"zzzzz-dz642-abcdefghijklmno\n", "", nil}
70 for idx, trial := range []trialT{
72 testCaseComment: "Unknown, probes fail",
76 expectState: StateUnknown,
79 testCaseComment: "Unknown, boot probe fails, but one container is running",
82 respRun: respContainerRunning,
83 expectState: StateUnknown,
87 testCaseComment: "Unknown, boot probe fails, previously running container has exited",
91 expectState: StateUnknown,
95 testCaseComment: "Unknown, boot timeout exceeded, boot probe fails",
97 age: bootTimeout + time.Second,
100 expectState: StateShutdown,
103 testCaseComment: "Unknown, boot timeout exceeded, boot probe succeeds but crunch-run fails",
105 age: bootTimeout * 2,
107 expectState: StateShutdown,
110 testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but crunch-run succeeds",
112 age: bootTimeout * 2,
114 expectState: StateShutdown,
117 testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but container is running",
119 age: bootTimeout * 2,
121 respRun: respContainerRunning,
122 expectState: StateUnknown,
126 testCaseComment: "Unknown, boot probe fails, deployRunner succeeds, container is running",
130 respRunDeployed: respContainerRunning,
131 deployRunner: []byte("ELF"),
132 expectStdin: []byte("ELF"),
133 expectState: StateUnknown,
137 testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but deployRunner succeeds and container is running",
139 age: bootTimeout * 2,
142 respRunDeployed: respContainerRunning,
143 deployRunner: []byte("ELF"),
144 expectStdin: []byte("ELF"),
145 expectState: StateUnknown,
149 testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but deployRunner succeeds and no container is running",
151 age: bootTimeout * 2,
154 deployRunner: []byte("ELF"),
155 expectStdin: []byte("ELF"),
156 expectState: StateShutdown,
159 testCaseComment: "Booting, boot probe fails, run probe fails",
163 expectState: StateBooting,
166 testCaseComment: "Booting, boot probe fails, run probe succeeds (but isn't expected to be called)",
169 expectState: StateBooting,
172 testCaseComment: "Booting, boot probe succeeds, run probe fails",
175 expectState: StateBooting,
178 testCaseComment: "Booting, boot probe succeeds, run probe succeeds",
180 expectState: StateIdle,
183 testCaseComment: "Booting, boot probe succeeds, run probe succeeds, container is running",
185 respRun: respContainerRunning,
186 expectState: StateRunning,
190 testCaseComment: "Booting, boot timeout exceeded",
192 age: bootTimeout * 2,
194 expectState: StateShutdown,
197 testCaseComment: "Idle, probe timeout exceeded, one container running",
199 age: probeTimeout * 2,
200 respRun: respContainerRunning,
201 expectState: StateRunning,
205 testCaseComment: "Idle, probe timeout exceeded, one container running, probe fails",
207 age: probeTimeout * 2,
210 expectState: StateShutdown,
214 testCaseComment: "Idle, probe timeout exceeded, nothing running, probe fails",
216 age: probeTimeout * 2,
218 expectState: StateShutdown,
221 testCaseComment: "Running, one container still running",
224 respRun: respContainerRunning,
225 expectState: StateRunning,
229 testCaseComment: "Running, container has exited",
232 expectState: StateIdle,
236 testCaseComment: "Running, probe timeout exceeded, nothing running, new container being started",
238 age: probeTimeout * 2,
240 expectState: StateRunning,
243 testCaseComment: "Booting, boot probe succeeds, deployRunner succeeds, run probe succeeds",
245 deployRunner: []byte("ELF"),
246 expectStdin: []byte("ELF"),
248 respRunDeployed: respContainerRunning,
250 expectState: StateRunning,
253 testCaseComment: "Booting, boot probe succeeds, deployRunner fails",
255 deployRunner: []byte("ELF"),
256 respDeploy: respFail,
257 expectStdin: []byte("ELF"),
258 expectState: StateBooting,
261 testCaseComment: "Booting, boot probe succeeds, deployRunner skipped, run probe succeeds",
264 respDeploy: respFail,
265 expectState: StateIdle,
268 c.Logf("------- trial %d: %#v", idx, trial)
// ctime is trial.age in the past.
269 ctime := time.Now().Add(-trial.age)
// Stub executor: canned responses keyed by the exact command string;
// "{deploy}" is the entry stubExecutor.Execute falls back to for
// commands containing `; cat >"$dstfile"`.
270 exr := &stubExecutor{
271 response: map[string]stubResp{
272 "bootprobe": trial.respBoot,
273 "crunch-run --list": trial.respRun,
274 "{deploy}": trial.respDeploy,
// Every worker created from this config gets the same stub executor.
279 newExecutor: func(cloud.Instance) Executor { return exr },
280 cluster: suite.testCluster,
281 bootProbeCommand: "bootprobe",
282 timeoutBooting: bootTimeout,
283 timeoutProbe: probeTimeout,
284 exited: map[string]time.Time{},
285 runnerCmdDefault: "crunch-run",
286 runnerArgs: []string{"--args=not used with --list"},
287 runnerCmd: "crunch-run",
288 runnerData: trial.deployRunner,
289 runnerMD5: md5.Sum(trial.deployRunner),
291 wp.registerMetrics(prometheus.NewRegistry())
// When the trial supplies a runner binary, runnerCmd becomes a path
// that embeds the binary's MD5, and the "--list" probe for that path
// is answered with trial.respRunDeployed.
292 if trial.deployRunner != nil {
293 svHash := md5.Sum(trial.deployRunner)
294 wp.runnerCmd = fmt.Sprintf("/var/run/arvados/crunch-run~%x", svHash)
295 exr.response[wp.runnerCmd+" --list"] = trial.respRunDeployed
298 logger: suite.logger,
308 running: map[string]*remoteRunner{},
309 starting: map[string]*remoteRunner{},
310 probing: make(chan struct{}, 1),
// Seed a single running container if the trial asks for any.
312 if trial.running > 0 {
313 uuid := "zzzzz-dz642-abcdefghijklmno"
314 wkr.running = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
// Seed a single starting container (with a different UUID) if the
// trial asks for any.
316 if trial.starting > 0 {
317 uuid := "zzzzz-dz642-bcdefghijklmnop"
318 wkr.starting = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
// Verify the worker's state, the number of running containers, and
// everything the executor received on stdin.
321 c.Check(wkr.state, check.Equals, trial.expectState)
322 c.Check(len(wkr.running), check.Equals, trial.expectRunning)
323 c.Check(exr.stdin.String(), check.Equals, string(trial.expectStdin))
// stubResp is one canned command result. stubExecutor.Execute returns
// its stdout, stderr and err values to the caller.
327 type stubResp struct {
// stubExecutor is a test stand-in for Executor: Execute answers each
// command from a table of canned responses and copies any stdin it is
// given into its stdin buffer.
333 type stubExecutor struct {
// response maps a command string to its canned result. The
// "{deploy}" key is the fallback Execute uses for commands containing
// `; cat >"$dstfile"`.
334 response map[string]stubResp
// SetTarget does nothing; the stub ignores its target argument.
338 func (se *stubExecutor) SetTarget(cloud.ExecutorTarget) {}
// Close does nothing.
339 func (se *stubExecutor) Close() {}
// Execute returns the canned response registered for cmd.
//
// Data read from stdin is copied into se.stdin so a test can inspect
// what was sent. The response is looked up by the exact cmd string; if
// there is no such entry and cmd contains `; cat >"$dstfile"`, the
// "{deploy}" entry is used instead. A "command not found" result
// (message on stderr plus a non-nil error) is also produced --
// presumably when neither lookup matched.
340 func (se *stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error) {
// Record whatever the caller sends on stdin.
342 _, err = io.Copy(&se.stdin, stdin)
// NOTE(review): presumably reached only when the copy failed -- the
// guarding condition is not visible here.
344 return nil, []byte(err.Error()), err
347 resp, ok := se.response[cmd]
// Fall back to the generic "{deploy}" entry for commands that contain
// this shell fragment.
348 if !ok && strings.Contains(cmd, `; cat >"$dstfile"`) {
349 resp, ok = se.response["{deploy}"]
352 return nil, []byte(fmt.Sprintf("%s: command not found\n", cmd)), errors.New("command not found")
354 return []byte(resp.stdout), []byte(resp.stderr), resp.err