1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
16 "git.arvados.org/arvados.git/lib/cloud"
17 "git.arvados.org/arvados.git/lib/dispatchcloud/test"
18 "git.arvados.org/arvados.git/sdk/go/arvados"
19 "git.arvados.org/arvados.git/sdk/go/ctxlog"
20 "github.com/prometheus/client_golang/prometheus"
21 check "gopkg.in/check.v1"
// Register WorkerSuite with the gocheck runner.
24 var _ = check.Suite(&WorkerSuite{})
// WorkerSuite groups the worker tests in this file; it carries no state.
26 type WorkerSuite struct{}
// TestProbeAndUpdate is a table-driven test. Each trial supplies canned
// responses for the boot probe command, the runner deployment command, and
// the runner's "--list" command, and states the worker state, the number of
// running containers, and the executor stdin expected afterwards.
// NOTE(review): the statement that actually exercises the worker falls on a
// line that is not visible in this excerpt.
28 func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
29 logger := ctxlog.TestLogger(c)
30 bootTimeout := time.Minute
31 probeTimeout := time.Second
33 ac := arvados.NewClientFromEnv()
// Build a stub instance set and create one instance in it; either step
// failing aborts the test via c.Assert.
34 is, err := (&test.StubDriver{}).InstanceSet(nil, "test-instance-set-id", nil, logger)
35 c.Assert(err, check.IsNil)
36 inst, err := is.Create(arvados.InstanceType{}, "", nil, "echo InitCommand", nil)
37 c.Assert(err, check.IsNil)
40 testCaseComment string // displayed in test output to help identify failure case
45 respBoot stubResp // zero value is success
46 respDeploy stubResp // zero value is success
47 respRun stubResp // zero value is success + nothing running
48 respRunDeployed stubResp
// Shared canned responses: respFail is a failing command with stderr output;
// respContainerRunning is a successful response whose stdout is a single
// container UUID (the same UUID used further down to seed wkr.running).
55 errFail := errors.New("failed")
56 respFail := stubResp{"", "command failed\n", errFail}
57 respContainerRunning := stubResp{"zzzzz-dz642-abcdefghijklmno\n", "", nil}
58 for idx, trial := range []trialT{
60 testCaseComment: "Unknown, probes fail",
64 expectState: StateUnknown,
67 testCaseComment: "Unknown, boot probe fails, but one container is running",
70 respRun: respContainerRunning,
71 expectState: StateUnknown,
75 testCaseComment: "Unknown, boot probe fails, previously running container has exited",
79 expectState: StateUnknown,
83 testCaseComment: "Unknown, boot timeout exceeded, boot probe fails",
85 age: bootTimeout + time.Second,
88 expectState: StateShutdown,
91 testCaseComment: "Unknown, boot timeout exceeded, boot probe succeeds but crunch-run fails",
95 expectState: StateShutdown,
98 testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but crunch-run succeeds",
100 age: bootTimeout * 2,
102 expectState: StateShutdown,
105 testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but container is running",
107 age: bootTimeout * 2,
109 respRun: respContainerRunning,
110 expectState: StateUnknown,
114 testCaseComment: "Booting, boot probe fails, run probe fails",
118 expectState: StateBooting,
121 testCaseComment: "Booting, boot probe fails, run probe succeeds (but isn't expected to be called)",
124 expectState: StateBooting,
127 testCaseComment: "Booting, boot probe succeeds, run probe fails",
130 expectState: StateBooting,
133 testCaseComment: "Booting, boot probe succeeds, run probe succeeds",
135 expectState: StateIdle,
138 testCaseComment: "Booting, boot probe succeeds, run probe succeeds, container is running",
140 respRun: respContainerRunning,
141 expectState: StateRunning,
145 testCaseComment: "Booting, boot timeout exceeded",
147 age: bootTimeout * 2,
149 expectState: StateShutdown,
152 testCaseComment: "Idle, probe timeout exceeded, one container running",
154 age: probeTimeout * 2,
155 respRun: respContainerRunning,
156 expectState: StateRunning,
160 testCaseComment: "Idle, probe timeout exceeded, one container running, probe fails",
162 age: probeTimeout * 2,
165 expectState: StateShutdown,
169 testCaseComment: "Idle, probe timeout exceeded, nothing running, probe fails",
171 age: probeTimeout * 2,
173 expectState: StateShutdown,
176 testCaseComment: "Running, one container still running",
179 respRun: respContainerRunning,
180 expectState: StateRunning,
184 testCaseComment: "Running, container has exited",
187 expectState: StateIdle,
191 testCaseComment: "Running, probe timeout exceeded, nothing running, new container being started",
193 age: probeTimeout * 2,
195 expectState: StateRunning,
198 testCaseComment: "Booting, boot probe succeeds, deployRunner succeeds, run probe succeeds",
200 deployRunner: []byte("ELF"),
201 expectStdin: []byte("ELF"),
203 respRunDeployed: respContainerRunning,
205 expectState: StateRunning,
208 testCaseComment: "Booting, boot probe succeeds, deployRunner fails",
210 deployRunner: []byte("ELF"),
211 respDeploy: respFail,
212 expectStdin: []byte("ELF"),
213 expectState: StateBooting,
216 testCaseComment: "Booting, boot probe succeeds, deployRunner skipped, run probe succeeds",
219 respDeploy: respFail,
220 expectState: StateIdle,
// Log the whole trial so a failing check can be matched to its table entry.
223 c.Logf("------- trial %d: %#v", idx, trial)
// ctime lies trial.age in the past.
224 ctime := time.Now().Add(-trial.age)
// Map each command string the stub executor may receive to this trial's
// canned response; "{deploy}" is the key Execute falls back to for commands
// containing `; cat >"$dstfile"`.
225 exr := &stubExecutor{
226 response: map[string]stubResp{
227 "bootprobe": trial.respBoot,
228 "crunch-run --list": trial.respRun,
229 "{deploy}": trial.respDeploy,
// Every executor request is served by exr, regardless of the instance.
234 newExecutor: func(cloud.Instance) Executor { return exr },
235 bootProbeCommand: "bootprobe",
236 timeoutBooting: bootTimeout,
237 timeoutProbe: probeTimeout,
238 exited: map[string]time.Time{},
239 runnerCmdDefault: "crunch-run",
240 runnerArgs: []string{"--args=not used with --list"},
241 runnerCmd: "crunch-run",
242 runnerData: trial.deployRunner,
243 runnerMD5: md5.Sum(trial.deployRunner),
245 wp.registerMetrics(prometheus.NewRegistry())
// When runner data is supplied, the runner command becomes a path embedding
// the MD5 of that data, and the "--list" response for that path is registered
// separately from the plain "crunch-run --list" response.
246 if trial.deployRunner != nil {
247 svHash := md5.Sum(trial.deployRunner)
248 wp.runnerCmd = fmt.Sprintf("/var/run/arvados/crunch-run~%x", svHash)
249 exr.response[wp.runnerCmd+" --list"] = trial.respRunDeployed
262 running: map[string]*remoteRunner{},
263 starting: map[string]*remoteRunner{},
264 probing: make(chan struct{}, 1),
// Seed the running/starting maps with a single runner each when the trial
// asks for any; only one entry is created whatever the requested count.
266 if trial.running > 0 {
267 uuid := "zzzzz-dz642-abcdefghijklmno"
268 wkr.running = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
270 if trial.starting > 0 {
271 uuid := "zzzzz-dz642-bcdefghijklmnop"
272 wkr.starting = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
// Verify the resulting state, the number of running containers, and the
// bytes the stub executor received on stdin.
275 c.Check(wkr.state, check.Equals, trial.expectState)
276 c.Check(len(wkr.running), check.Equals, trial.expectRunning)
277 c.Check(exr.stdin.String(), check.Equals, string(trial.expectStdin))
// stubResp is one canned command result returned by stubExecutor.
// NOTE(review): the field list is not visible in this excerpt; visible uses
// build it positionally as (stdout, stderr, err) and read the fields stdout,
// stderr and err.
281 type stubResp struct {
// stubExecutor answers Execute calls from a fixed table instead of running
// commands.
// NOTE(review): it also has a stdin field (written by Execute, read by the
// test) whose declaration is not visible in this excerpt.
287 type stubExecutor struct {
// response maps an exact command string to its canned result.
288 response map[string]stubResp
// SetTarget ignores its argument; the stub keeps no target.
292 func (se *stubExecutor) SetTarget(cloud.ExecutorTarget) {}
// Close does nothing.
293 func (se *stubExecutor) Close() {}
// Execute returns the canned stdout, stderr and error registered for cmd in
// se.response. A command with no exact entry that contains
// `; cat >"$dstfile"` is answered from the "{deploy}" entry instead.
// Anything still unmatched yields "<cmd>: command not found" on stderr
// together with a "command not found" error.
//
// Data read from stdin is appended to se.stdin; the error-returning line
// below hands back a copy failure as err, with its message as stderr.
// The env argument is not used in the visible lines.
// NOTE(review): the guard conditions around the copy and the not-found
// return are on lines not visible in this excerpt.
294 func (se *stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error) {
296 _, err = io.Copy(&se.stdin, stdin)
298 return nil, []byte(err.Error()), err
// Exact match first, then the deploy fallback.
301 resp, ok := se.response[cmd]
302 if !ok && strings.Contains(cmd, `; cat >"$dstfile"`) {
303 resp, ok = se.response["{deploy}"]
306 return nil, []byte(fmt.Sprintf("%s: command not found\n", cmd)), errors.New("command not found")
308 return []byte(resp.stdout), []byte(resp.stderr), resp.err