14325: Test resuming worker pool state after restart.
[arvados.git] / lib / dispatchcloud / worker / worker_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package worker
6
7 import (
8         "errors"
9         "io"
10         "time"
11
12         "git.curoverse.com/arvados.git/lib/cloud"
13         "git.curoverse.com/arvados.git/lib/dispatchcloud/test"
14         "git.curoverse.com/arvados.git/sdk/go/arvados"
15         "github.com/sirupsen/logrus"
16         check "gopkg.in/check.v1"
17 )
18
// Register WorkerSuite with the gopkg.in/check.v1 framework.
var _ = check.Suite(&WorkerSuite{})
20
// WorkerSuite is the gocheck suite that holds the worker tests in
// this file. It carries no state of its own.
type WorkerSuite struct{}
22
23 func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
24         logger := logrus.StandardLogger()
25         bootTimeout := time.Minute
26         probeTimeout := time.Second
27
28         is, err := (&test.StubDriver{}).InstanceSet(nil, "", logger)
29         c.Assert(err, check.IsNil)
30         inst, err := is.Create(arvados.InstanceType{}, "", nil, nil)
31         c.Assert(err, check.IsNil)
32
33         type trialT struct {
34                 testCaseComment string // displayed in test output to help identify failure case
35                 age             time.Duration
36                 state           State
37                 running         int
38                 starting        int
39                 respBoot        stubResp // zero value is success
40                 respRun         stubResp // zero value is success + nothing running
41                 expectState     State
42                 expectRunning   int
43         }
44
45         errFail := errors.New("failed")
46         respFail := stubResp{"", "command failed\n", errFail}
47         respContainerRunning := stubResp{"zzzzz-dz642-abcdefghijklmno\n", "", nil}
48         for _, trial := range []trialT{
49                 {
50                         testCaseComment: "Unknown, probes fail",
51                         state:           StateUnknown,
52                         respBoot:        respFail,
53                         respRun:         respFail,
54                         expectState:     StateUnknown,
55                 },
56                 {
57                         testCaseComment: "Unknown, boot probe fails, but one container is running",
58                         state:           StateUnknown,
59                         respBoot:        respFail,
60                         respRun:         respContainerRunning,
61                         expectState:     StateUnknown,
62                         expectRunning:   1,
63                 },
64                 {
65                         testCaseComment: "Unknown, boot probe fails, previously running container has exited",
66                         state:           StateUnknown,
67                         running:         1,
68                         respBoot:        respFail,
69                         expectState:     StateUnknown,
70                         expectRunning:   0,
71                 },
72                 {
73                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails",
74                         state:           StateUnknown,
75                         age:             bootTimeout + time.Second,
76                         respBoot:        respFail,
77                         respRun:         respFail,
78                         expectState:     StateShutdown,
79                 },
80                 {
81                         testCaseComment: "Unknown, boot timeout exceeded, boot probe succeeds but crunch-run fails",
82                         state:           StateUnknown,
83                         age:             bootTimeout * 2,
84                         respRun:         respFail,
85                         expectState:     StateShutdown,
86                 },
87                 {
88                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but crunch-run succeeds",
89                         state:           StateUnknown,
90                         age:             bootTimeout * 2,
91                         respBoot:        respFail,
92                         expectState:     StateShutdown,
93                 },
94                 {
95                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but container is running",
96                         state:           StateUnknown,
97                         age:             bootTimeout * 2,
98                         respBoot:        respFail,
99                         respRun:         respContainerRunning,
100                         expectState:     StateUnknown,
101                         expectRunning:   1,
102                 },
103                 {
104                         testCaseComment: "Booting, boot probe fails, run probe fails",
105                         state:           StateBooting,
106                         respBoot:        respFail,
107                         respRun:         respFail,
108                         expectState:     StateBooting,
109                 },
110                 {
111                         testCaseComment: "Booting, boot probe fails, run probe succeeds (but isn't expected to be called)",
112                         state:           StateBooting,
113                         respBoot:        respFail,
114                         expectState:     StateBooting,
115                 },
116                 {
117                         testCaseComment: "Booting, boot probe succeeds, run probe fails",
118                         state:           StateBooting,
119                         respRun:         respFail,
120                         expectState:     StateBooting,
121                 },
122                 {
123                         testCaseComment: "Booting, boot probe succeeds, run probe succeeds",
124                         state:           StateBooting,
125                         expectState:     StateIdle,
126                 },
127                 {
128                         testCaseComment: "Booting, boot probe succeeds, run probe succeeds, container is running",
129                         state:           StateBooting,
130                         respRun:         respContainerRunning,
131                         expectState:     StateRunning,
132                         expectRunning:   1,
133                 },
134                 {
135                         testCaseComment: "Booting, boot timeout exceeded",
136                         state:           StateBooting,
137                         age:             bootTimeout * 2,
138                         respRun:         respFail,
139                         expectState:     StateShutdown,
140                 },
141                 {
142                         testCaseComment: "Idle, probe timeout exceeded, one container running",
143                         state:           StateIdle,
144                         age:             probeTimeout * 2,
145                         respRun:         respContainerRunning,
146                         expectState:     StateRunning,
147                         expectRunning:   1,
148                 },
149                 {
150                         testCaseComment: "Idle, probe timeout exceeded, one container running, probe fails",
151                         state:           StateIdle,
152                         age:             probeTimeout * 2,
153                         running:         1,
154                         respRun:         respFail,
155                         expectState:     StateShutdown,
156                         expectRunning:   1,
157                 },
158                 {
159                         testCaseComment: "Idle, probe timeout exceeded, nothing running, probe fails",
160                         state:           StateIdle,
161                         age:             probeTimeout * 2,
162                         respRun:         respFail,
163                         expectState:     StateShutdown,
164                 },
165                 {
166                         testCaseComment: "Running, one container still running",
167                         state:           StateRunning,
168                         running:         1,
169                         respRun:         respContainerRunning,
170                         expectState:     StateRunning,
171                         expectRunning:   1,
172                 },
173                 {
174                         testCaseComment: "Running, container has exited",
175                         state:           StateRunning,
176                         running:         1,
177                         expectState:     StateIdle,
178                         expectRunning:   0,
179                 },
180                 {
181                         testCaseComment: "Running, probe timeout exceeded, nothing running, new container being started",
182                         state:           StateRunning,
183                         age:             probeTimeout * 2,
184                         starting:        1,
185                         expectState:     StateRunning,
186                 },
187         } {
188                 c.Logf("------- %#v", trial)
189                 ctime := time.Now().Add(-trial.age)
190                 exr := stubExecutor{
191                         "bootprobe":         trial.respBoot,
192                         "crunch-run --list": trial.respRun,
193                 }
194                 wp := &Pool{
195                         newExecutor:      func(cloud.Instance) Executor { return exr },
196                         bootProbeCommand: "bootprobe",
197                         timeoutBooting:   bootTimeout,
198                         timeoutProbe:     probeTimeout,
199                         exited:           map[string]time.Time{},
200                 }
201                 wkr := &worker{
202                         logger:   logger,
203                         executor: exr,
204                         wp:       wp,
205                         mtx:      &wp.mtx,
206                         state:    trial.state,
207                         instance: inst,
208                         appeared: ctime,
209                         busy:     ctime,
210                         probed:   ctime,
211                         updated:  ctime,
212                 }
213                 if trial.running > 0 {
214                         wkr.running = map[string]struct{}{"zzzzz-dz642-abcdefghijklmno": struct{}{}}
215                 }
216                 if trial.starting > 0 {
217                         wkr.starting = map[string]struct{}{"zzzzz-dz642-abcdefghijklmno": struct{}{}}
218                 }
219                 wkr.probeAndUpdate()
220                 c.Check(wkr.state, check.Equals, trial.expectState)
221                 c.Check(len(wkr.running), check.Equals, trial.expectRunning)
222         }
223 }
224
// stubResp is a canned result for one command: the stdout and stderr
// text and the error that stubExecutor.Execute returns for it. The
// zero value is empty output with a nil error.
type stubResp struct {
	stdout string // returned as the command's standard output
	stderr string // returned as the command's standard error
	err    error  // returned as the command's error; nil means success
}
// stubExecutor is a fake executor for tests. It maps an exact command
// string to the stubResp that Execute should return for that command.
type stubExecutor map[string]stubResp
231
// SetTarget and Close do nothing. NOTE(review): presumably they exist
// only so stubExecutor satisfies the Executor interface -- confirm
// against the interface definition.
func (se stubExecutor) SetTarget(cloud.ExecutorTarget) {}
func (se stubExecutor) Close()                         {}
234 func (se stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error) {
235         resp, ok := se[cmd]
236         if !ok {
237                 return nil, []byte("command not found\n"), errors.New("command not found")
238         }
239         return []byte(resp.stdout), []byte(resp.stderr), resp.err
240 }