14325: Don't shut down busy VMs even if boot probe fails.
[arvados.git] / lib / dispatchcloud / worker / pool_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package worker
6
7 import (
8         "time"
9
10         "git.curoverse.com/arvados.git/lib/cloud"
11         "git.curoverse.com/arvados.git/lib/dispatchcloud/test"
12         "git.curoverse.com/arvados.git/sdk/go/arvados"
13         "github.com/sirupsen/logrus"
14         check "gopkg.in/check.v1"
15 )
16
// GiB is one gibibyte (1 << 30 bytes) expressed as an
// arvados.ByteSize, used below to specify instance RAM sizes.
const GiB arvados.ByteSize = 1 << 30
18
// Hand a PoolSuite instance to gocheck's Suite function so its test
// methods are run as part of the package's gocheck tests.
var _ = check.Suite(&PoolSuite{})
20
// lessChecker is a gocheck checker for int values: its Check method
// compares the first parameter against the second with "<". The
// embedded *check.CheckerInfo supplies the checker's name and
// parameter names.
type lessChecker struct {
	*check.CheckerInfo
}
24
25 func (*lessChecker) Check(params []interface{}, names []string) (result bool, error string) {
26         return params[0].(int) < params[1].(int), ""
27 }
28
// less is a lessChecker instance named "less" whose parameters are
// labelled "obtained" and "expected"; intended for use as
// c.Check(obtained, less, expected).
var less = &lessChecker{&check.CheckerInfo{Name: "less", Params: []string{"obtained", "expected"}}}
30
// PoolSuite is the gocheck suite holding the Pool tests in this
// file. It carries no state of its own.
type PoolSuite struct{}
32
33 func (suite *PoolSuite) SetUpSuite(c *check.C) {
34         logrus.StandardLogger().SetLevel(logrus.DebugLevel)
35 }
36
// TestStartContainer is a placeholder: it has no body yet, so it
// makes no assertions.
func (suite *PoolSuite) TestStartContainer(c *check.C) {
	// TODO: use an instanceSet stub with an SSH server
}
40
// TestVerifyHostKey is a placeholder: it has no body yet, so it
// makes no assertions.
func (suite *PoolSuite) TestVerifyHostKey(c *check.C) {
	// TODO: use an instanceSet stub with an SSH server
}
44
// TestCreateUnallocShutdown walks a Pool through creating four
// instances of three types, then checks how Unallocated, Shutdown and
// SetIdleBehavior (hold, run, drain) interact, and finally syncs
// until no instances remain.
//
// The pool is backed by a test.LameInstanceSet constructed with a
// Hold channel; the test calls Release on it to unblock pending
// Create and Destroy calls at chosen points.
func (suite *PoolSuite) TestCreateUnallocShutdown(c *check.C) {
	lameInstanceSet := &test.LameInstanceSet{Hold: make(chan bool)}
	type1 := arvados.InstanceType{Name: "a1s", ProviderType: "a1.small", VCPUs: 1, RAM: 1 * GiB, Price: .01}
	type2 := arvados.InstanceType{Name: "a2m", ProviderType: "a2.medium", VCPUs: 2, RAM: 2 * GiB, Price: .02}
	type3 := arvados.InstanceType{Name: "a2l", ProviderType: "a2.large", VCPUs: 4, RAM: 4 * GiB, Price: .04}
	pool := &Pool{
		logger:      logrus.StandardLogger(),
		newExecutor: func(cloud.Instance) Executor { return stubExecutor{} },
		instanceSet: &throttledInstanceSet{InstanceSet: lameInstanceSet},
		instanceTypes: arvados.InstanceTypeMap{
			type1.Name: type1,
			type2.Name: type2,
			type3.Name: type3,
		},
	}
	// Two independent subscriptions: notify drives suite.wait
	// throughout the test, while notify2 is read only once, after
	// the type1 shutdown below.
	notify := pool.Subscribe()
	defer pool.Unsubscribe(notify)
	notify2 := pool.Subscribe()
	defer pool.Unsubscribe(notify2)

	// Nothing is unallocated before any instance is requested.
	c.Check(pool.Unallocated()[type1], check.Equals, 0)
	c.Check(pool.Unallocated()[type2], check.Equals, 0)
	c.Check(pool.Unallocated()[type3], check.Equals, 0)
	pool.Create(type2)
	pool.Create(type1)
	pool.Create(type2)
	pool.Create(type3)
	// The requested instances are expected to count as
	// unallocated already, before the Create calls are released.
	c.Check(pool.Unallocated()[type1], check.Equals, 1)
	c.Check(pool.Unallocated()[type2], check.Equals, 2)
	c.Check(pool.Unallocated()[type3], check.Equals, 1)

	// Unblock the pending Create calls.
	go lameInstanceSet.Release(4)

	// Wait for each instance to either return from its Create
	// call, or show up in a poll.
	suite.wait(c, pool, notify, func() bool {
		pool.mtx.RLock()
		defer pool.mtx.RUnlock()
		return len(pool.workers) == 4
	})

	// Place type3 node on admin-hold
	ivs := suite.instancesByType(pool, type3)
	c.Assert(ivs, check.HasLen, 1)
	type3instanceID := ivs[0].Instance
	err := pool.SetIdleBehavior(type3instanceID, IdleBehaviorHold)
	c.Check(err, check.IsNil)

	// Check admin-hold behavior: refuse to shut down, and don't
	// report as Unallocated ("available now or soon").
	c.Check(pool.Shutdown(type3), check.Equals, false)
	suite.wait(c, pool, notify, func() bool {
		return pool.Unallocated()[type3] == 0
	})
	c.Check(suite.instancesByType(pool, type3), check.HasLen, 1)

	// Shut down both type2 nodes, one Shutdown call at a time,
	// checking that the type1 count is unaffected.
	c.Check(pool.Shutdown(type2), check.Equals, true)
	suite.wait(c, pool, notify, func() bool {
		return pool.Unallocated()[type1] == 1 && pool.Unallocated()[type2] == 1
	})
	c.Check(pool.Shutdown(type2), check.Equals, true)
	suite.wait(c, pool, notify, func() bool {
		return pool.Unallocated()[type1] == 1 && pool.Unallocated()[type2] == 0
	})
	// With no type2 nodes left, a third Shutdown is expected to
	// report false.
	c.Check(pool.Shutdown(type2), check.Equals, false)
	for {
		// Consume any waiting notifications to ensure the
		// next one we get is from Shutdown.
		select {
		case <-notify:
			continue
		default:
		}
		break
	}

	// Shut down type1 node
	c.Check(pool.Shutdown(type1), check.Equals, true)
	suite.wait(c, pool, notify, func() bool {
		return pool.Unallocated()[type1] == 0 && pool.Unallocated()[type2] == 0 && pool.Unallocated()[type3] == 0
	})
	// The second subscriber must also have been notified; give it
	// up to one second before reporting an error.
	select {
	case <-notify2:
	case <-time.After(time.Second):
		c.Error("notify did not receive")
	}

	// Put type3 node back in service.
	err = pool.SetIdleBehavior(type3instanceID, IdleBehaviorRun)
	c.Check(err, check.IsNil)
	suite.wait(c, pool, notify, func() bool {
		return pool.Unallocated()[type3] == 1
	})

	// Check admin-drain behavior: shut down right away, and don't
	// report as Unallocated.
	err = pool.SetIdleBehavior(type3instanceID, IdleBehaviorDrain)
	c.Check(err, check.IsNil)
	suite.wait(c, pool, notify, func() bool {
		return pool.Unallocated()[type3] == 0
	})
	// The instance should still be listed, now in the shutdown
	// state.
	suite.wait(c, pool, notify, func() bool {
		ivs := suite.instancesByType(pool, type3)
		return len(ivs) == 1 && ivs[0].WorkerState == StateShutdown.String()
	})

	// Unblock all pending Destroy calls. Pool calls Destroy again
	// if a node still appears in the provider list after a
	// previous attempt, so there might be more than 4 Destroy
	// calls to unblock.
	go lameInstanceSet.Release(4444)

	// Sync until all instances disappear from the provider list.
	suite.wait(c, pool, notify, func() bool {
		pool.getInstancesAndSync()
		return len(pool.Instances()) == 0
	})
}
165
166 func (suite *PoolSuite) instancesByType(pool *Pool, it arvados.InstanceType) []InstanceView {
167         var ivs []InstanceView
168         for _, iv := range pool.Instances() {
169                 if iv.ArvadosInstanceType == it.Name {
170                         ivs = append(ivs, iv)
171                 }
172         }
173         return ivs
174 }
175
176 func (suite *PoolSuite) wait(c *check.C, pool *Pool, notify <-chan struct{}, ready func() bool) {
177         timeout := time.NewTimer(time.Second).C
178         for !ready() {
179                 select {
180                 case <-notify:
181                         continue
182                 case <-timeout:
183                 }
184                 break
185         }
186         c.Check(ready(), check.Equals, true)
187 }