15026: Add cloudtest command.
[arvados.git] / lib / cloud / cloudtest / tester.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package cloudtest
6
7 import (
8         "crypto/rand"
9         "encoding/json"
10         "errors"
11         "fmt"
12         "time"
13
14         "git.curoverse.com/arvados.git/lib/cloud"
15         "git.curoverse.com/arvados.git/lib/dispatchcloud/ssh_executor"
16         "git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
17         "git.curoverse.com/arvados.git/sdk/go/arvados"
18         "github.com/sirupsen/logrus"
19         "golang.org/x/crypto/ssh"
20 )
21
22 var (
23         errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
24 )
25
26 // A tester does a sequence of operations to test a cloud driver and
27 // configuration. Run() should be called only once, after assigning
28 // suitable values to public fields.
29 type tester struct {
30         Logger             logrus.FieldLogger
31         Tags               cloud.SharedResourceTags
32         TagKeyPrefix       string
33         SetID              cloud.InstanceSetID
34         DestroyExisting    bool
35         ProbeInterval      time.Duration
36         SyncInterval       time.Duration
37         TimeoutBooting     time.Duration
38         Driver             cloud.Driver
39         DriverParameters   json.RawMessage
40         InstanceType       arvados.InstanceType
41         ImageID            cloud.ImageID
42         SSHKey             ssh.Signer
43         SSHPort            string
44         BootProbeCommand   string
45         ShellCommand       string
46         PauseBeforeDestroy func()
47
48         is              cloud.InstanceSet
49         testInstance    cloud.Instance
50         secret          string
51         executor        *ssh_executor.Executor
52         showedLoginInfo bool
53
54         failed bool
55 }
56
57 // Run the test suite as specified, clean up as needed, and return
58 // true (everything is OK) or false (something went wrong).
59 func (t *tester) Run() bool {
60         // This flag gets set when we encounter a non-fatal error, so
61         // we can continue doing more tests but remember to return
62         // false (failure) at the end.
63         deferredError := false
64
65         var err error
66         t.is, err = t.Driver.InstanceSet(t.DriverParameters, t.SetID, t.Tags, t.Logger)
67         if err != nil {
68                 t.Logger.WithError(err).Info("error initializing driver")
69                 return false
70         }
71         insts, err := t.getInstances(nil)
72         if err != nil {
73                 t.Logger.WithError(err).Info("error getting initial list of instances")
74                 return false
75         }
76         for {
77                 foundExisting := false
78                 for _, i := range insts {
79                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
80                                 continue
81                         }
82                         lgr := t.Logger.WithFields(logrus.Fields{
83                                 "Instance":      i.ID(),
84                                 "InstanceSetID": t.SetID,
85                         })
86                         foundExisting = true
87                         if t.DestroyExisting {
88                                 lgr.Info("destroying existing instance with our InstanceSetID")
89                                 err := i.Destroy()
90                                 if err != nil {
91                                         lgr.WithError(err).Error("error destroying existing instance")
92                                 } else {
93                                         lgr.Info("Destroy() call succeeded")
94                                 }
95                         } else {
96                                 lgr.Error("found existing instance with our InstanceSetID")
97                         }
98                 }
99                 if !foundExisting {
100                         break
101                 } else if t.DestroyExisting {
102                         t.sleepSyncInterval()
103                 } else {
104                         t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
105                         return false
106                 }
107         }
108
109         t.secret = randomHex(40)
110
111         tags := cloud.InstanceTags{}
112         for k, v := range t.Tags {
113                 tags[k] = v
114         }
115         tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
116         tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
117
118         defer t.destroyTestInstance()
119
120         bootDeadline := time.Now().Add(t.TimeoutBooting)
121         initCommand := worker.TagVerifier{nil, t.secret}.InitCommand()
122
123         t.Logger.WithFields(logrus.Fields{
124                 "InstanceType":         t.InstanceType.Name,
125                 "ProviderInstanceType": t.InstanceType.ProviderType,
126                 "ImageID":              t.ImageID,
127                 "Tags":                 tags,
128                 "InitCommand":          initCommand,
129         }).Info("creating instance")
130         inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
131         if err != nil {
132                 deferredError = true
133                 t.Logger.WithError(err).Error("error creating test instance")
134                 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
135                 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
136                         if time.Now().After(bootDeadline) {
137                                 t.Logger.Error("timed out")
138                                 return false
139                         } else {
140                                 t.sleepSyncInterval()
141                         }
142                 }
143                 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
144                 t.showLoginInfo()
145         } else {
146                 t.Logger.WithField("Instance", inst.ID()).Info("created instance")
147                 t.testInstance = inst
148                 t.showLoginInfo()
149                 err = t.refreshTestInstance()
150                 if err == errTestInstanceNotFound {
151                         t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
152                         deferredError = true
153                 } else if err != nil {
154                         t.Logger.WithError(err).Error("error getting list of instances")
155                         return false
156                 }
157         }
158         t.testInstance = worker.TagVerifier{t.testInstance, t.secret}
159
160         if !t.checkTags() {
161                 // checkTags() already logged the errors
162                 return false
163         }
164
165         if !t.waitForBoot(bootDeadline) {
166                 return false
167         }
168
169         if t.ShellCommand != "" {
170                 err = t.runShellCommand(t.ShellCommand)
171                 if err != nil {
172                         t.Logger.WithError(err).Error("shell command failed")
173                         deferredError = true
174                 }
175         }
176
177         if fn := t.PauseBeforeDestroy; fn != nil {
178                 t.showLoginInfo()
179                 fn()
180         }
181
182         return !deferredError
183 }
184
185 func (t *tester) showLoginInfo() {
186         t.updateExecutor()
187         host, port := t.executor.TargetHostPort()
188         if host == "" {
189                 return
190         }
191         user := t.testInstance.RemoteUser()
192         t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
193         t.showedLoginInfo = true
194 }
195
196 // Get the latest instance list from the driver. If our test instance
197 // is found, assign it to t.testIntance.
198 func (t *tester) refreshTestInstance() error {
199         insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
200         if err != nil {
201                 return err
202         }
203         for _, i := range insts {
204                 if t.testInstance == nil {
205                         // Filter by InstanceSetID tag value
206                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
207                                 continue
208                         }
209                 } else {
210                         // Filter by instance ID
211                         if i.ID() != t.testInstance.ID() {
212                                 continue
213                         }
214                 }
215                 t.Logger.WithFields(logrus.Fields{
216                         "Instance": i.ID(),
217                         "Address":  i.Address(),
218                 }).Info("found our instance in returned list")
219                 t.testInstance = worker.TagVerifier{i, t.secret}
220                 if !t.showedLoginInfo {
221                         t.showLoginInfo()
222                 }
223                 return nil
224         }
225         return errTestInstanceNotFound
226 }
227
228 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
229         var ret []cloud.Instance
230         t.Logger.WithField("FilterTags", tags).Info("getting instance list")
231         insts, err := t.is.Instances(tags)
232         if err != nil {
233                 return nil, err
234         }
235         t.Logger.WithField("N", len(insts)).Info("got instance list")
236         for _, i := range insts {
237                 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
238                         ret = append(ret, i)
239                 }
240         }
241         return ret, nil
242 }
243
244 func (t *tester) checkTags() bool {
245         ok := true
246         for k, v := range t.Tags {
247                 if got := t.testInstance.Tags()[k]; got != v {
248                         ok = false
249                         t.Logger.WithFields(logrus.Fields{
250                                 "Key":           k,
251                                 "ExpectedValue": v,
252                                 "GotValue":      got,
253                         }).Error("tag is missing from test instance")
254                 }
255         }
256         if ok {
257                 t.Logger.Info("all expected tags are present")
258         }
259         return ok
260 }
261
262 func (t *tester) waitForBoot(deadline time.Time) bool {
263         for time.Now().Before(deadline) {
264                 err := t.runShellCommand(t.BootProbeCommand)
265                 if err == nil {
266                         return true
267                 }
268                 t.sleepProbeInterval()
269                 t.refreshTestInstance()
270         }
271         t.Logger.Error("timed out")
272         return false
273 }
274
275 func (t *tester) updateExecutor() {
276         if t.executor == nil {
277                 t.executor = ssh_executor.New(t.testInstance)
278                 t.executor.SetTargetPort(t.SSHPort)
279                 t.executor.SetSigners(t.SSHKey)
280         } else {
281                 t.executor.SetTarget(t.testInstance)
282         }
283 }
284
285 func (t *tester) runShellCommand(cmd string) error {
286         t.updateExecutor()
287         t.Logger.WithFields(logrus.Fields{
288                 "Command": cmd,
289         }).Info("executing remote command")
290         stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
291         lgr := t.Logger.WithFields(logrus.Fields{
292                 "Command": cmd,
293                 "stdout":  string(stdout),
294                 "stderr":  string(stderr),
295         })
296         if err != nil {
297                 lgr.WithError(err).Info("remote command failed")
298         } else {
299                 lgr.Info("remote command succeeded")
300         }
301         return err
302 }
303
304 // currently, this tries forever until it can return true (success).
305 func (t *tester) destroyTestInstance() bool {
306         if t.testInstance == nil {
307                 return true
308         }
309         for {
310                 t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroying instance")
311                 err := t.testInstance.Destroy()
312                 if err != nil {
313                         t.Logger.WithError(err).WithField("Instance", t.testInstance.ID()).Error("error destroying instance")
314                 } else {
315                         t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroyed instance")
316                 }
317                 err = t.refreshTestInstance()
318                 if err == errTestInstanceNotFound {
319                         t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance no longer appears in list")
320                         t.testInstance = nil
321                         return true
322                 } else if err == nil {
323                         t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance still exists after calling Destroy")
324                         t.sleepSyncInterval()
325                         continue
326                 } else {
327                         t.Logger.WithError(err).Error("error getting list of instances")
328                         continue
329                 }
330         }
331 }
332
333 func (t *tester) sleepSyncInterval() {
334         t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
335         time.Sleep(t.SyncInterval)
336 }
337
338 func (t *tester) sleepProbeInterval() {
339         t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
340         time.Sleep(t.ProbeInterval)
341 }
342
343 // Return a random string of n hexadecimal digits (n*4 random bits). n
344 // must be even.
345 func randomHex(n int) string {
346         buf := make([]byte, n/2)
347         _, err := rand.Read(buf)
348         if err != nil {
349                 panic(err)
350         }
351         return fmt.Sprintf("%x", buf)
352 }