lib/cloud/cloudtest/tester.go

   1 // Copyright (C) The Arvados Authors. All rights reserved.
   2 //
   3 // SPDX-License-Identifier: AGPL-3.0
   4
   5 package cloudtest
   6
   7 import (
   8         "crypto/rand"
   9         "encoding/json"
  10         "errors"
  11         "fmt"
  12         "time"
  13
  14         "git.arvados.org/arvados.git/lib/cloud"
  15         "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
  16         "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
  17         "git.arvados.org/arvados.git/sdk/go/arvados"
  18         "github.com/sirupsen/logrus"
  19         "golang.org/x/crypto/ssh"
  20 )
  21
  22 var (
  23         errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
  24 )
  25
  26 // A tester does a sequence of operations to test a cloud driver and
  27 // configuration. Run() should be called only once, after assigning
  28 // suitable values to public fields.
  29 type tester struct {
  30         Logger              logrus.FieldLogger
  31         Tags                cloud.SharedResourceTags
  32         TagKeyPrefix        string
  33         SetID               cloud.InstanceSetID
  34         DestroyExisting     bool
  35         ProbeInterval       time.Duration
  36         SyncInterval        time.Duration
  37         TimeoutBooting      time.Duration
  38         Driver              cloud.Driver
  39         DriverParameters    json.RawMessage
  40         InstanceType        arvados.InstanceType
  41         ImageID             cloud.ImageID
  42         SSHKey              ssh.Signer
  43         SSHPort             string
  44         DeployPublicKey     bool
  45         BootProbeCommand    string
  46         InstanceInitCommand cloud.InitCommand
  47         ShellCommand        string
  48         PauseBeforeDestroy  func()
  49
  50         is              cloud.InstanceSet
  51         testInstance    *worker.TagVerifier
  52         secret          string
  53         executor        *sshexecutor.Executor
  54         showedLoginInfo bool
  55
  56         failed bool
  57 }
  58
  59 // Run the test suite once for each applicable permutation of
  60 // DriverParameters.  Return true if everything worked.
  61 //
  62 // Currently this means run once for each configured SubnetID.
  63 func (t *tester) Run() bool {
  64         var dp map[string]interface{}
  65         if len(t.DriverParameters) > 0 {
  66                 err := json.Unmarshal(t.DriverParameters, &dp)
  67                 if err != nil {
  68                         t.Logger.WithError(err).Error("error decoding configured CloudVMs.DriverParameters")
  69                         return false
  70                 }
  71         }
  72         subnets, ok := dp["SubnetID"].([]interface{})
  73         if !ok || len(subnets) <= 1 {
  74                 // Easy, only one SubnetID to test.
  75                 return t.runWithDriverParameters(t.DriverParameters)
  76         }
  77
  78         deferredError := false
  79         for i, subnet := range subnets {
  80                 subnet, ok := subnet.(string)
  81                 if !ok {
  82                         t.Logger.Errorf("CloudVMs.DriverParameters.SubnetID[%d] is invalid -- must be a string", i)
  83                         deferredError = true
  84                         continue
  85                 }
  86                 dp["SubnetID"] = subnet
  87                 t.Logger.Infof("running tests using SubnetID[%d] %q", i, subnet)
  88                 dpjson, err := json.Marshal(dp)
  89                 if err != nil {
  90                         t.Logger.WithError(err).Error("error encoding driver parameters")
  91                         deferredError = true
  92                         continue
  93                 }
  94                 ok = t.runWithDriverParameters(dpjson)
  95                 if !ok {
  96                         t.Logger.Infof("failed tests using SubnetID[%d] %q", i, subnet)
  97                         deferredError = true
  98                 }
  99         }
 100         return !deferredError
 101 }
 102
 103 // Run the test suite as specified, clean up as needed, and return
 104 // true (everything is OK) or false (something went wrong).
 105 func (t *tester) runWithDriverParameters(driverParameters json.RawMessage) bool {
 106         // This flag gets set when we encounter a non-fatal error, so
 107         // we can continue doing more tests but remember to return
 108         // false (failure) at the end.
 109         deferredError := false
 110
 111         var err error
 112         t.is, err = t.Driver.InstanceSet(driverParameters, t.SetID, t.Tags, t.Logger, nil)
 113         if err != nil {
 114                 t.Logger.WithError(err).Info("error initializing driver")
 115                 return false
 116         }
 117
 118         for {
 119                 // Don't send the driver any filters when getting the
 120                 // initial instance list. This way we can log an
 121                 // instance count (N=...)  that includes all instances
 122                 // in this service account, even if they don't have
 123                 // the same InstanceSetID.
 124                 insts, err := t.getInstances(nil)
 125                 if err != nil {
 126                         t.Logger.WithError(err).Info("error getting list of instances")
 127                         return false
 128                 }
 129
 130                 foundExisting := false
 131                 for _, i := range insts {
 132                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
 133                                 continue
 134                         }
 135                         lgr := t.Logger.WithFields(logrus.Fields{
 136                                 "Instance":      i.ID(),
 137                                 "InstanceSetID": t.SetID,
 138                         })
 139                         foundExisting = true
 140                         if t.DestroyExisting {
 141                                 lgr.Info("destroying existing instance with our InstanceSetID")
 142                                 t0 := time.Now()
 143                                 err := i.Destroy()
 144                                 lgr := lgr.WithField("Duration", time.Since(t0))
 145                                 if err != nil {
 146                                         lgr.WithError(err).Error("error destroying existing instance")
 147                                 } else {
 148                                         lgr.Info("Destroy() call succeeded")
 149                                 }
 150                         } else {
 151                                 lgr.Error("found existing instance with our InstanceSetID")
 152                         }
 153                 }
 154                 if !foundExisting {
 155                         break
 156                 } else if t.DestroyExisting {
 157                         t.sleepSyncInterval()
 158                 } else {
 159                         t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
 160                         return false
 161                 }
 162         }
 163
 164         t.secret = randomHex(40)
 165
 166         tags := cloud.InstanceTags{}
 167         for k, v := range t.Tags {
 168                 tags[k] = v
 169         }
 170         tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
 171         tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
 172
 173         defer t.destroyTestInstance()
 174
 175         bootDeadline := time.Now().Add(t.TimeoutBooting)
 176         initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
 177
 178         installPublicKey := t.SSHKey.PublicKey()
 179         if !t.DeployPublicKey {
 180                 installPublicKey = nil
 181         }
 182
 183         t.Logger.WithFields(logrus.Fields{
 184                 "InstanceType":         t.InstanceType.Name,
 185                 "ProviderInstanceType": t.InstanceType.ProviderType,
 186                 "ImageID":              t.ImageID,
 187                 "Tags":                 tags,
 188                 "InitCommand":          initCommand,
 189                 "DeployPublicKey":      installPublicKey != nil,
 190         }).Info("creating instance")
 191         t0 := time.Now()
 192         inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, installPublicKey)
 193         lgrC := t.Logger.WithField("Duration", time.Since(t0))
 194         if err != nil {
 195                 // Create() might have failed due to a bug or network
 196                 // error even though the creation was successful, so
 197                 // it's safer to wait a bit for an instance to appear.
 198                 deferredError = true
 199                 lgrC.WithError(err).Error("error creating test instance")
 200                 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
 201                 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
 202                         if time.Now().After(bootDeadline) {
 203                                 t.Logger.Error("timed out")
 204                                 return false
 205                         }
 206                         t.sleepSyncInterval()
 207                 }
 208                 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
 209                 t.showLoginInfo()
 210         } else {
 211                 // Create() succeeded. Make sure the new instance
 212                 // appears right away in the Instances() list.
 213                 lgrC.WithField("Instance", inst.ID()).Info("created instance")
 214                 t.testInstance = &worker.TagVerifier{Instance: inst, Secret: t.secret, ReportVerified: nil}
 215                 t.showLoginInfo()
 216                 err = t.refreshTestInstance()
 217                 if err == errTestInstanceNotFound {
 218                         t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
 219                         deferredError = true
 220                 } else if err != nil {
 221                         t.Logger.WithError(err).Error("error getting list of instances")
 222                         return false
 223                 }
 224         }
 225
 226         if !t.checkTags() {
 227                 // checkTags() already logged the errors
 228                 deferredError = true
 229         }
 230
 231         if !t.waitForBoot(bootDeadline) {
 232                 deferredError = true
 233         }
 234
 235         if t.ShellCommand != "" {
 236                 err = t.runShellCommand(t.ShellCommand)
 237                 if err != nil {
 238                         t.Logger.WithError(err).Error("shell command failed")
 239                         deferredError = true
 240                 }
 241         }
 242
 243         if fn := t.PauseBeforeDestroy; fn != nil {
 244                 t.showLoginInfo()
 245                 fn()
 246         }
 247
 248         return !deferredError
 249 }
 250
 251 // If the test instance has an address, log an "ssh user@host" command
 252 // line that the operator can paste into another terminal, and set
 253 // t.showedLoginInfo.
 254 //
 255 // If the test instance doesn't have an address yet, do nothing.
 256 func (t *tester) showLoginInfo() {
 257         t.updateExecutor()
 258         host, port := t.executor.TargetHostPort()
 259         if host == "" {
 260                 return
 261         }
 262         user := t.testInstance.RemoteUser()
 263         t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
 264         t.showedLoginInfo = true
 265 }
 266
 267 // Get the latest instance list from the driver. If our test instance
 268 // is found, assign it to t.testIntance.
 269 func (t *tester) refreshTestInstance() error {
 270         insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
 271         if err != nil {
 272                 return err
 273         }
 274         for _, i := range insts {
 275                 if t.testInstance == nil {
 276                         // Filter by InstanceSetID tag value
 277                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
 278                                 continue
 279                         }
 280                 } else {
 281                         // Filter by instance ID
 282                         if i.ID() != t.testInstance.ID() {
 283                                 continue
 284                         }
 285                 }
 286                 t.Logger.WithFields(logrus.Fields{
 287                         "Instance": i.ID(),
 288                         "Address":  i.Address(),
 289                 }).Info("found our instance in returned list")
 290                 t.testInstance = &worker.TagVerifier{Instance: i, Secret: t.secret, ReportVerified: nil}
 291                 if !t.showedLoginInfo {
 292                         t.showLoginInfo()
 293                 }
 294                 return nil
 295         }
 296         return errTestInstanceNotFound
 297 }
 298
 299 // Get the list of instances, passing the given tags to the cloud
 300 // driver to filter results.
 301 //
 302 // Return only the instances that have our InstanceSetID tag.
 303 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
 304         var ret []cloud.Instance
 305         t.Logger.WithField("FilterTags", tags).Info("getting instance list")
 306         t0 := time.Now()
 307         insts, err := t.is.Instances(tags)
 308         if err != nil {
 309                 return nil, err
 310         }
 311         t.Logger.WithFields(logrus.Fields{
 312                 "Duration": time.Since(t0),
 313                 "N":        len(insts),
 314         }).Info("got instance list")
 315         for _, i := range insts {
 316                 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
 317                         ret = append(ret, i)
 318                 }
 319         }
 320         return ret, nil
 321 }
 322
 323 // Check that t.testInstance has every tag in t.Tags. If not, log an
 324 // error and return false.
 325 func (t *tester) checkTags() bool {
 326         ok := true
 327         for k, v := range t.Tags {
 328                 if got := t.testInstance.Tags()[k]; got != v {
 329                         ok = false
 330                         t.Logger.WithFields(logrus.Fields{
 331                                 "Key":           k,
 332                                 "ExpectedValue": v,
 333                                 "GotValue":      got,
 334                         }).Error("tag is missing from test instance")
 335                 }
 336         }
 337         if ok {
 338                 t.Logger.Info("all expected tags are present")
 339         }
 340         return ok
 341 }
 342
 343 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
 344 // deadline arrives.
 345 func (t *tester) waitForBoot(deadline time.Time) bool {
 346         for time.Now().Before(deadline) {
 347                 err := t.runShellCommand(t.BootProbeCommand)
 348                 if err == nil {
 349                         return true
 350                 }
 351                 t.sleepProbeInterval()
 352                 t.refreshTestInstance()
 353         }
 354         t.Logger.Error("timed out")
 355         return false
 356 }
 357
 358 // Create t.executor and/or update its target to t.testInstance's
 359 // current address.
 360 func (t *tester) updateExecutor() {
 361         if t.executor == nil {
 362                 t.executor = sshexecutor.New(t.testInstance)
 363                 t.executor.SetTargetPort(t.SSHPort)
 364                 t.executor.SetSigners(t.SSHKey)
 365         } else {
 366                 t.executor.SetTarget(t.testInstance)
 367         }
 368 }
 369
 370 func (t *tester) runShellCommand(cmd string) error {
 371         t.updateExecutor()
 372         t.Logger.WithFields(logrus.Fields{
 373                 "Command": cmd,
 374         }).Info("executing remote command")
 375         t0 := time.Now()
 376         stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
 377         lgr := t.Logger.WithFields(logrus.Fields{
 378                 "Duration": time.Since(t0),
 379                 "Command":  cmd,
 380                 "stdout":   string(stdout),
 381                 "stderr":   string(stderr),
 382         })
 383         if err != nil {
 384                 lgr.WithError(err).Info("remote command failed")
 385         } else {
 386                 lgr.Info("remote command succeeded")
 387         }
 388         return err
 389 }
 390
 391 // currently, this tries forever until it can return true (success).
 392 func (t *tester) destroyTestInstance() bool {
 393         if t.testInstance == nil {
 394                 return true
 395         }
 396         for {
 397                 lgr := t.Logger.WithField("Instance", t.testInstance.ID())
 398                 lgr.Info("destroying instance")
 399                 t0 := time.Now()
 400
 401                 err := t.testInstance.Destroy()
 402                 lgrDur := lgr.WithField("Duration", time.Since(t0))
 403                 if err != nil {
 404                         lgrDur.WithError(err).Error("error destroying instance")
 405                 } else {
 406                         lgrDur.Info("destroyed instance")
 407                 }
 408
 409                 err = t.refreshTestInstance()
 410                 if err == errTestInstanceNotFound {
 411                         lgr.Info("instance no longer appears in list")
 412                         t.testInstance = nil
 413                         return true
 414                 } else if err == nil {
 415                         lgr.Info("instance still exists after calling Destroy")
 416                         t.sleepSyncInterval()
 417                         continue
 418                 } else {
 419                         t.Logger.WithError(err).Error("error getting list of instances")
 420                         continue
 421                 }
 422         }
 423 }
 424
 425 func (t *tester) sleepSyncInterval() {
 426         t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
 427         time.Sleep(t.SyncInterval)
 428 }
 429
 430 func (t *tester) sleepProbeInterval() {
 431         t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
 432         time.Sleep(t.ProbeInterval)
 433 }
 434
 435 // Return a random string of n hexadecimal digits (n*4 random bits). n
 436 // must be even.
 437 func randomHex(n int) string {
 438         buf := make([]byte, n/2)
 439         _, err := rand.Read(buf)
 440         if err != nil {
 441                 panic(err)
 442         }
 443         return fmt.Sprintf("%x", buf)
 444 }