20755: Run cloudtest suite once per configured subnet, not just 1st.
[arvados.git] / lib / cloud / cloudtest / tester.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package cloudtest
6
7 import (
8         "crypto/rand"
9         "encoding/json"
10         "errors"
11         "fmt"
12         "time"
13
14         "git.arvados.org/arvados.git/lib/cloud"
15         "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
16         "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
17         "git.arvados.org/arvados.git/sdk/go/arvados"
18         "github.com/sirupsen/logrus"
19         "golang.org/x/crypto/ssh"
20 )
21
22 var (
23         errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
24 )
25
26 // A tester does a sequence of operations to test a cloud driver and
27 // configuration. Run() should be called only once, after assigning
28 // suitable values to public fields.
29 type tester struct {
30         Logger              logrus.FieldLogger
31         Tags                cloud.SharedResourceTags
32         TagKeyPrefix        string
33         SetID               cloud.InstanceSetID
34         DestroyExisting     bool
35         ProbeInterval       time.Duration
36         SyncInterval        time.Duration
37         TimeoutBooting      time.Duration
38         Driver              cloud.Driver
39         DriverParameters    json.RawMessage
40         InstanceType        arvados.InstanceType
41         ImageID             cloud.ImageID
42         SSHKey              ssh.Signer
43         SSHPort             string
44         BootProbeCommand    string
45         InstanceInitCommand cloud.InitCommand
46         ShellCommand        string
47         PauseBeforeDestroy  func()
48
49         is              cloud.InstanceSet
50         testInstance    *worker.TagVerifier
51         secret          string
52         executor        *sshexecutor.Executor
53         showedLoginInfo bool
54
55         failed bool
56 }
57
58 // Run the test suite once for each applicable permutation of
59 // DriverParameters.  Return true if everything worked.
60 //
61 // Currently this means run once for each configured SubnetID.
62 func (t *tester) Run() bool {
63         var dp map[string]interface{}
64         err := json.Unmarshal(t.DriverParameters, &dp)
65         if err != nil {
66                 t.Logger.WithError(err).Error("error decoding configured CloudVMs.DriverParameters")
67                 return false
68         }
69         subnets, ok := dp["SubnetID"].([]interface{})
70         if !ok || len(subnets) <= 1 {
71                 // Easy, only one SubnetID to test.
72                 return t.runWithDriverParameters(t.DriverParameters)
73         }
74
75         deferredError := false
76         for i, subnet := range subnets {
77                 subnet, ok := subnet.(string)
78                 if !ok {
79                         t.Logger.Errorf("CloudVMs.DriverParameters.SubnetID[%d] is invalid -- must be a string", i)
80                         deferredError = true
81                         continue
82                 }
83                 dp["SubnetID"] = subnet
84                 t.Logger.Infof("running tests using SubnetID[%d] %q", i, subnet)
85                 dpjson, err := json.Marshal(dp)
86                 if err != nil {
87                         t.Logger.WithError(err).Error("error encoding driver parameters")
88                         deferredError = true
89                         continue
90                 }
91                 ok = t.runWithDriverParameters(dpjson)
92                 if !ok {
93                         t.Logger.Infof("failed tests using SubnetID[%d] %q", i, subnet)
94                         deferredError = true
95                 }
96         }
97         return !deferredError
98 }
99
100 // Run the test suite as specified, clean up as needed, and return
101 // true (everything is OK) or false (something went wrong).
102 func (t *tester) runWithDriverParameters(driverParameters json.RawMessage) bool {
103         // This flag gets set when we encounter a non-fatal error, so
104         // we can continue doing more tests but remember to return
105         // false (failure) at the end.
106         deferredError := false
107
108         var err error
109         t.is, err = t.Driver.InstanceSet(driverParameters, t.SetID, t.Tags, t.Logger, nil)
110         if err != nil {
111                 t.Logger.WithError(err).Info("error initializing driver")
112                 return false
113         }
114
115         for {
116                 // Don't send the driver any filters when getting the
117                 // initial instance list. This way we can log an
118                 // instance count (N=...)  that includes all instances
119                 // in this service account, even if they don't have
120                 // the same InstanceSetID.
121                 insts, err := t.getInstances(nil)
122                 if err != nil {
123                         t.Logger.WithError(err).Info("error getting list of instances")
124                         return false
125                 }
126
127                 foundExisting := false
128                 for _, i := range insts {
129                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
130                                 continue
131                         }
132                         lgr := t.Logger.WithFields(logrus.Fields{
133                                 "Instance":      i.ID(),
134                                 "InstanceSetID": t.SetID,
135                         })
136                         foundExisting = true
137                         if t.DestroyExisting {
138                                 lgr.Info("destroying existing instance with our InstanceSetID")
139                                 t0 := time.Now()
140                                 err := i.Destroy()
141                                 lgr := lgr.WithField("Duration", time.Since(t0))
142                                 if err != nil {
143                                         lgr.WithError(err).Error("error destroying existing instance")
144                                 } else {
145                                         lgr.Info("Destroy() call succeeded")
146                                 }
147                         } else {
148                                 lgr.Error("found existing instance with our InstanceSetID")
149                         }
150                 }
151                 if !foundExisting {
152                         break
153                 } else if t.DestroyExisting {
154                         t.sleepSyncInterval()
155                 } else {
156                         t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
157                         return false
158                 }
159         }
160
161         t.secret = randomHex(40)
162
163         tags := cloud.InstanceTags{}
164         for k, v := range t.Tags {
165                 tags[k] = v
166         }
167         tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
168         tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
169
170         defer t.destroyTestInstance()
171
172         bootDeadline := time.Now().Add(t.TimeoutBooting)
173         initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
174
175         t.Logger.WithFields(logrus.Fields{
176                 "InstanceType":         t.InstanceType.Name,
177                 "ProviderInstanceType": t.InstanceType.ProviderType,
178                 "ImageID":              t.ImageID,
179                 "Tags":                 tags,
180                 "InitCommand":          initCommand,
181         }).Info("creating instance")
182         t0 := time.Now()
183         inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
184         lgrC := t.Logger.WithField("Duration", time.Since(t0))
185         if err != nil {
186                 // Create() might have failed due to a bug or network
187                 // error even though the creation was successful, so
188                 // it's safer to wait a bit for an instance to appear.
189                 deferredError = true
190                 lgrC.WithError(err).Error("error creating test instance")
191                 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
192                 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
193                         if time.Now().After(bootDeadline) {
194                                 t.Logger.Error("timed out")
195                                 return false
196                         }
197                         t.sleepSyncInterval()
198                 }
199                 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
200                 t.showLoginInfo()
201         } else {
202                 // Create() succeeded. Make sure the new instance
203                 // appears right away in the Instances() list.
204                 lgrC.WithField("Instance", inst.ID()).Info("created instance")
205                 t.testInstance = &worker.TagVerifier{Instance: inst, Secret: t.secret, ReportVerified: nil}
206                 t.showLoginInfo()
207                 err = t.refreshTestInstance()
208                 if err == errTestInstanceNotFound {
209                         t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
210                         deferredError = true
211                 } else if err != nil {
212                         t.Logger.WithError(err).Error("error getting list of instances")
213                         return false
214                 }
215         }
216
217         if !t.checkTags() {
218                 // checkTags() already logged the errors
219                 deferredError = true
220         }
221
222         if !t.waitForBoot(bootDeadline) {
223                 deferredError = true
224         }
225
226         if t.ShellCommand != "" {
227                 err = t.runShellCommand(t.ShellCommand)
228                 if err != nil {
229                         t.Logger.WithError(err).Error("shell command failed")
230                         deferredError = true
231                 }
232         }
233
234         if fn := t.PauseBeforeDestroy; fn != nil {
235                 t.showLoginInfo()
236                 fn()
237         }
238
239         return !deferredError
240 }
241
242 // If the test instance has an address, log an "ssh user@host" command
243 // line that the operator can paste into another terminal, and set
244 // t.showedLoginInfo.
245 //
246 // If the test instance doesn't have an address yet, do nothing.
247 func (t *tester) showLoginInfo() {
248         t.updateExecutor()
249         host, port := t.executor.TargetHostPort()
250         if host == "" {
251                 return
252         }
253         user := t.testInstance.RemoteUser()
254         t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
255         t.showedLoginInfo = true
256 }
257
258 // Get the latest instance list from the driver. If our test instance
259 // is found, assign it to t.testIntance.
260 func (t *tester) refreshTestInstance() error {
261         insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
262         if err != nil {
263                 return err
264         }
265         for _, i := range insts {
266                 if t.testInstance == nil {
267                         // Filter by InstanceSetID tag value
268                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
269                                 continue
270                         }
271                 } else {
272                         // Filter by instance ID
273                         if i.ID() != t.testInstance.ID() {
274                                 continue
275                         }
276                 }
277                 t.Logger.WithFields(logrus.Fields{
278                         "Instance": i.ID(),
279                         "Address":  i.Address(),
280                 }).Info("found our instance in returned list")
281                 t.testInstance = &worker.TagVerifier{Instance: i, Secret: t.secret, ReportVerified: nil}
282                 if !t.showedLoginInfo {
283                         t.showLoginInfo()
284                 }
285                 return nil
286         }
287         return errTestInstanceNotFound
288 }
289
290 // Get the list of instances, passing the given tags to the cloud
291 // driver to filter results.
292 //
293 // Return only the instances that have our InstanceSetID tag.
294 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
295         var ret []cloud.Instance
296         t.Logger.WithField("FilterTags", tags).Info("getting instance list")
297         t0 := time.Now()
298         insts, err := t.is.Instances(tags)
299         if err != nil {
300                 return nil, err
301         }
302         t.Logger.WithFields(logrus.Fields{
303                 "Duration": time.Since(t0),
304                 "N":        len(insts),
305         }).Info("got instance list")
306         for _, i := range insts {
307                 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
308                         ret = append(ret, i)
309                 }
310         }
311         return ret, nil
312 }
313
314 // Check that t.testInstance has every tag in t.Tags. If not, log an
315 // error and return false.
316 func (t *tester) checkTags() bool {
317         ok := true
318         for k, v := range t.Tags {
319                 if got := t.testInstance.Tags()[k]; got != v {
320                         ok = false
321                         t.Logger.WithFields(logrus.Fields{
322                                 "Key":           k,
323                                 "ExpectedValue": v,
324                                 "GotValue":      got,
325                         }).Error("tag is missing from test instance")
326                 }
327         }
328         if ok {
329                 t.Logger.Info("all expected tags are present")
330         }
331         return ok
332 }
333
334 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
335 // deadline arrives.
336 func (t *tester) waitForBoot(deadline time.Time) bool {
337         for time.Now().Before(deadline) {
338                 err := t.runShellCommand(t.BootProbeCommand)
339                 if err == nil {
340                         return true
341                 }
342                 t.sleepProbeInterval()
343                 t.refreshTestInstance()
344         }
345         t.Logger.Error("timed out")
346         return false
347 }
348
349 // Create t.executor and/or update its target to t.testInstance's
350 // current address.
351 func (t *tester) updateExecutor() {
352         if t.executor == nil {
353                 t.executor = sshexecutor.New(t.testInstance)
354                 t.executor.SetTargetPort(t.SSHPort)
355                 t.executor.SetSigners(t.SSHKey)
356         } else {
357                 t.executor.SetTarget(t.testInstance)
358         }
359 }
360
361 func (t *tester) runShellCommand(cmd string) error {
362         t.updateExecutor()
363         t.Logger.WithFields(logrus.Fields{
364                 "Command": cmd,
365         }).Info("executing remote command")
366         t0 := time.Now()
367         stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
368         lgr := t.Logger.WithFields(logrus.Fields{
369                 "Duration": time.Since(t0),
370                 "Command":  cmd,
371                 "stdout":   string(stdout),
372                 "stderr":   string(stderr),
373         })
374         if err != nil {
375                 lgr.WithError(err).Info("remote command failed")
376         } else {
377                 lgr.Info("remote command succeeded")
378         }
379         return err
380 }
381
382 // currently, this tries forever until it can return true (success).
383 func (t *tester) destroyTestInstance() bool {
384         if t.testInstance == nil {
385                 return true
386         }
387         for {
388                 lgr := t.Logger.WithField("Instance", t.testInstance.ID())
389                 lgr.Info("destroying instance")
390                 t0 := time.Now()
391
392                 err := t.testInstance.Destroy()
393                 lgrDur := lgr.WithField("Duration", time.Since(t0))
394                 if err != nil {
395                         lgrDur.WithError(err).Error("error destroying instance")
396                 } else {
397                         lgrDur.Info("destroyed instance")
398                 }
399
400                 err = t.refreshTestInstance()
401                 if err == errTestInstanceNotFound {
402                         lgr.Info("instance no longer appears in list")
403                         t.testInstance = nil
404                         return true
405                 } else if err == nil {
406                         lgr.Info("instance still exists after calling Destroy")
407                         t.sleepSyncInterval()
408                         continue
409                 } else {
410                         t.Logger.WithError(err).Error("error getting list of instances")
411                         continue
412                 }
413         }
414 }
415
416 func (t *tester) sleepSyncInterval() {
417         t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
418         time.Sleep(t.SyncInterval)
419 }
420
421 func (t *tester) sleepProbeInterval() {
422         t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
423         time.Sleep(t.ProbeInterval)
424 }
425
426 // Return a random string of n hexadecimal digits (n*4 random bits). n
427 // must be even.
428 func randomHex(n int) string {
429         buf := make([]byte, n/2)
430         _, err := rand.Read(buf)
431         if err != nil {
432                 panic(err)
433         }
434         return fmt.Sprintf("%x", buf)
435 }