1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
16 "git.arvados.org/arvados.git/lib/cloud"
17 "git.arvados.org/arvados.git/lib/config"
18 "git.arvados.org/arvados.git/lib/dispatchcloud/test"
19 "git.arvados.org/arvados.git/sdk/go/arvados"
20 "git.arvados.org/arvados.git/sdk/go/ctxlog"
21 "github.com/prometheus/client_golang/prometheus"
22 "github.com/sirupsen/logrus"
23 check "gopkg.in/check.v1"
// Register WorkerSuite with the gocheck framework via check.Suite.
26 var _ = check.Suite(&WorkerSuite{})
// WorkerSuite carries the fixtures shared by the test methods in this
// file: a logger and a cluster configuration. Both fields are assigned
// in SetUpTest.
28 type WorkerSuite struct {
// logger is passed to the config loader, the stub driver and the worker.
29 logger logrus.FieldLogger
// testCluster is the cluster config handed to the pool under test.
30 testCluster *arvados.Cluster
// SetUpTest is the gocheck per-test fixture hook. It creates a test
// logger, loads a configuration with config.NewLoader(nil, logger), and
// stores the cluster returned by cfg.GetCluster("") in suite.testCluster.
// Either error aborts the test through c.Assert.
33 func (suite *WorkerSuite) SetUpTest(c *check.C) {
34 suite.logger = ctxlog.TestLogger(c)
// NOTE(review): the loader is given a nil first argument, so the config
// presumably comes from defaults/environment -- confirm against config.NewLoader.
35 cfg, err := config.NewLoader(nil, suite.logger).Load()
36 c.Assert(err, check.IsNil)
37 suite.testCluster, err = cfg.GetCluster("")
38 c.Assert(err, check.IsNil)
// TestProbeAndUpdate runs a table of trials. Each trial supplies canned
// executor responses (respBoot, respRun, respDeploy, respRunDeployed), an
// instance age and optionally a runner binary to deploy. After the worker
// is exercised, the test checks the worker state, the number of running
// containers and the bytes written to the executor's stdin.
//
// NOTE(review): this excerpt omits several lines (most of the trialT
// declaration, the openers of the pool and worker literals, and the call
// under test -- presumably ProbeAndUpdate, going by the test name). The
// comments below describe only what the visible lines show.
41 func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
// Timeouts given to the pool below; trial ages are expressed relative to them.
42 bootTimeout := time.Minute
43 probeTimeout := time.Second
45 ac := arvados.NewClientFromEnv()
// Build a stub instance set and create one stub instance in it.
46 is, err := (&test.StubDriver{}).InstanceSet(nil, "test-instance-set-id", nil, suite.logger)
47 c.Assert(err, check.IsNil)
48 inst, err := is.Create(arvados.InstanceType{}, "", nil, "echo InitCommand", nil)
49 c.Assert(err, check.IsNil)
// Visible fields of the per-trial description (the trialT type used below).
52 testCaseComment string // displayed in test output to help identify failure case
57 respBoot stubResp // zero value is success
58 respDeploy stubResp // zero value is success
59 respRun stubResp // zero value is success + nothing running
// respRunDeployed is registered below as the response to
// wp.runnerCmd+" --list" when trial.deployRunner is non-nil.
60 respRunDeployed stubResp
// Canned responses reused across trials. NOTE(review): the positional
// fields are presumably (stdout, stderr, err) -- stubResp's declaration
// is not visible here.
67 errFail := errors.New("failed")
68 respFail := stubResp{"", "command failed\n", errFail}
// Stdout consisting of one container UUID, i.e. one container reported.
69 respContainerRunning := stubResp{"zzzzz-dz642-abcdefghijklmno\n", "", nil}
70 for idx, trial := range []trialT{
// --- Trials described as starting in the Unknown state ---
72 testCaseComment: "Unknown, probes fail",
76 expectState: StateUnknown,
79 testCaseComment: "Unknown, boot probe fails, but one container is running",
82 respRun: respContainerRunning,
83 expectState: StateUnknown,
87 testCaseComment: "Unknown, boot probe fails, previously running container has exited",
91 expectState: StateUnknown,
95 testCaseComment: "Unknown, boot timeout exceeded, boot probe fails",
// One second past the boot timeout.
97 age: bootTimeout + time.Second,
100 expectState: StateShutdown,
103 testCaseComment: "Unknown, boot timeout exceeded, boot probe succeeds but crunch-run fails",
105 age: bootTimeout * 2,
107 expectState: StateShutdown,
110 testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but crunch-run succeeds",
112 age: bootTimeout * 2,
114 expectState: StateShutdown,
117 testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but container is running",
119 age: bootTimeout * 2,
121 respRun: respContainerRunning,
122 expectState: StateUnknown,
// --- Trials described as starting in the Booting state ---
126 testCaseComment: "Booting, boot probe fails, run probe fails",
130 expectState: StateBooting,
133 testCaseComment: "Booting, boot probe fails, run probe succeeds (but isn't expected to be called)",
136 expectState: StateBooting,
139 testCaseComment: "Booting, boot probe succeeds, run probe fails",
142 expectState: StateBooting,
145 testCaseComment: "Booting, boot probe succeeds, run probe succeeds",
147 expectState: StateIdle,
150 testCaseComment: "Booting, boot probe succeeds, run probe succeeds, container is running",
152 respRun: respContainerRunning,
153 expectState: StateRunning,
157 testCaseComment: "Booting, boot timeout exceeded",
159 age: bootTimeout * 2,
161 expectState: StateShutdown,
// --- Trials described as starting in the Idle state ---
164 testCaseComment: "Idle, probe timeout exceeded, one container running",
166 age: probeTimeout * 2,
167 respRun: respContainerRunning,
168 expectState: StateRunning,
172 testCaseComment: "Idle, probe timeout exceeded, one container running, probe fails",
174 age: probeTimeout * 2,
177 expectState: StateShutdown,
181 testCaseComment: "Idle, probe timeout exceeded, nothing running, probe fails",
183 age: probeTimeout * 2,
185 expectState: StateShutdown,
// --- Trials described as starting in the Running state ---
188 testCaseComment: "Running, one container still running",
191 respRun: respContainerRunning,
192 expectState: StateRunning,
196 testCaseComment: "Running, container has exited",
199 expectState: StateIdle,
203 testCaseComment: "Running, probe timeout exceeded, nothing running, new container being started",
205 age: probeTimeout * 2,
207 expectState: StateRunning,
// --- Trials covering deployment of the runner binary ---
210 testCaseComment: "Booting, boot probe succeeds, deployRunner succeeds, run probe succeeds",
212 deployRunner: []byte("ELF"),
213 expectStdin: []byte("ELF"),
215 respRunDeployed: respContainerRunning,
217 expectState: StateRunning,
220 testCaseComment: "Booting, boot probe succeeds, deployRunner fails",
222 deployRunner: []byte("ELF"),
223 respDeploy: respFail,
224 expectStdin: []byte("ELF"),
225 expectState: StateBooting,
228 testCaseComment: "Booting, boot probe succeeds, deployRunner skipped, run probe succeeds",
231 respDeploy: respFail,
232 expectState: StateIdle,
235 c.Logf("------- trial %d: %#v", idx, trial)
// Timestamp trial.age in the past.
236 ctime := time.Now().Add(-trial.age)
// Stub executor with canned responses keyed by exact command string.
// "{deploy}" is a placeholder key: stubExecutor.Execute falls back to it
// for unmatched commands containing `; cat >"$dstfile"`.
237 exr := &stubExecutor{
238 response: map[string]stubResp{
239 "bootprobe": trial.respBoot,
240 "crunch-run --list": trial.respRun,
241 "{deploy}": trial.respDeploy,
// Fields of the pool referred to as wp below (the literal's opening line
// is not visible in this excerpt). Every worker gets the same stub executor.
246 newExecutor: func(cloud.Instance) Executor { return exr },
247 cluster: suite.testCluster,
248 bootProbeCommand: "bootprobe",
249 timeoutBooting: bootTimeout,
250 timeoutProbe: probeTimeout,
251 exited: map[string]time.Time{},
252 runnerCmdDefault: "crunch-run",
253 runnerArgs: []string{"--args=not used with --list"},
254 runnerCmd: "crunch-run",
255 runnerData: trial.deployRunner,
256 runnerMD5: md5.Sum(trial.deployRunner),
258 wp.registerMetrics(prometheus.NewRegistry())
// When a runner binary is supplied, point runnerCmd at a path containing
// the hex MD5 of the binary and register the canned response for that
// path's "--list" command.
259 if trial.deployRunner != nil {
260 svHash := md5.Sum(trial.deployRunner)
261 wp.runnerCmd = fmt.Sprintf("/var/run/arvados/crunch-run~%x", svHash)
262 exr.response[wp.runnerCmd+" --list"] = trial.respRunDeployed
// Fields of the worker referred to as wkr below (literal opener not visible).
265 logger: suite.logger,
275 running: map[string]*remoteRunner{},
276 starting: map[string]*remoteRunner{},
277 probing: make(chan struct{}, 1),
// Seed the worker with one running container when the trial asks for any.
279 if trial.running > 0 {
280 uuid := "zzzzz-dz642-abcdefghijklmno"
281 wkr.running = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
// Likewise seed one starting container, under a different UUID.
283 if trial.starting > 0 {
284 uuid := "zzzzz-dz642-bcdefghijklmnop"
285 wkr.starting = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
// Verify the resulting state, the running-container count, and
// everything the executor received on stdin.
288 c.Check(wkr.state, check.Equals, trial.expectState)
289 c.Check(len(wkr.running), check.Equals, trial.expectRunning)
290 c.Check(exr.stdin.String(), check.Equals, string(trial.expectStdin))
// stubResp is a canned command result. stubExecutor.Execute returns its
// stdout, stderr and err fields to the caller. (The field declarations
// are not visible in this excerpt.)
294 type stubResp struct {
// stubExecutor is a fake Executor for tests: rather than running
// commands, Execute looks them up in a table of canned responses.
// Execute also reads and writes a stdin field, which is not declared in
// the visible lines of this excerpt.
300 type stubExecutor struct {
// response maps an exact command string to the result Execute returns.
301 response map[string]stubResp
// SetTarget is a no-op; the stub ignores the target it is given.
305 func (se *stubExecutor) SetTarget(cloud.ExecutorTarget) {}
// Close is a no-op.
306 func (se *stubExecutor) Close() {}
// Execute simulates running cmd. It appends everything read from stdin
// to se.stdin, then returns the canned response registered for cmd. The
// env argument is not referenced in the visible lines.
//
// NOTE(review): the guard conditions around the early returns (and the
// closing braces) are not visible in this excerpt; the comments below
// assume the usual error/lookup checks -- confirm against the full file.
307 func (se *stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error) {
// Record whatever the caller sent on stdin so tests can inspect it.
309 _, err = io.Copy(&se.stdin, stdin)
// Presumably on copy failure: report the error text as stderr.
311 return nil, []byte(err.Error()), err
// Exact-match lookup of the command string.
314 resp, ok := se.response[cmd]
// An unmatched command containing `; cat >"$dstfile"` falls back to the
// response registered under the "{deploy}" placeholder key.
315 if !ok && strings.Contains(cmd, `; cat >"$dstfile"`) {
316 resp, ok = se.response["{deploy}"]
// Presumably when still unmatched: a "command not found" failure.
319 return nil, []byte(fmt.Sprintf("%s: command not found\n", cmd)), errors.New("command not found")
321 return []byte(resp.stdout), []byte(resp.stderr), resp.err