lib/cloud/cloudtest/tester.go

   1 // Copyright (C) The Arvados Authors. All rights reserved.
   2 //
   3 // SPDX-License-Identifier: AGPL-3.0
   4
   5 package cloudtest
   6
   7 import (
   8         "crypto/rand"
   9         "encoding/json"
  10         "errors"
  11         "fmt"
  12         "time"
  13
  14         "git.arvados.org/arvados.git/lib/cloud"
  15         "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
  16         "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
  17         "git.arvados.org/arvados.git/sdk/go/arvados"
  18         "github.com/sirupsen/logrus"
  19         "golang.org/x/crypto/ssh"
  20 )
  21
  22 var (
  23         errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
  24 )
  25
  26 // A tester does a sequence of operations to test a cloud driver and
  27 // configuration. Run() should be called only once, after assigning
  28 // suitable values to public fields.
  29 type tester struct {
  30         Logger              logrus.FieldLogger
  31         Tags                cloud.SharedResourceTags
  32         TagKeyPrefix        string
  33         SetID               cloud.InstanceSetID
  34         DestroyExisting     bool
  35         ProbeInterval       time.Duration
  36         SyncInterval        time.Duration
  37         TimeoutBooting      time.Duration
  38         Driver              cloud.Driver
  39         DriverParameters    json.RawMessage
  40         InstanceType        arvados.InstanceType
  41         ImageID             cloud.ImageID
  42         SSHKey              ssh.Signer
  43         SSHPort             string
  44         BootProbeCommand    string
  45         InstanceInitCommand cloud.InitCommand
  46         ShellCommand        string
  47         PauseBeforeDestroy  func()
  48
  49         is              cloud.InstanceSet
  50         testInstance    *worker.TagVerifier
  51         secret          string
  52         executor        *sshexecutor.Executor
  53         showedLoginInfo bool
  54
  55         failed bool
  56 }
  57
  58 // Run the test suite once for each applicable permutation of
  59 // DriverParameters.  Return true if everything worked.
  60 //
  61 // Currently this means run once for each configured SubnetID.
  62 func (t *tester) Run() bool {
  63         var dp map[string]interface{}
  64         err := json.Unmarshal(t.DriverParameters, &dp)
  65         if err != nil {
  66                 t.Logger.WithError(err).Error("error decoding configured CloudVMs.DriverParameters")
  67                 return false
  68         }
  69         subnets, ok := dp["SubnetID"].([]interface{})
  70         if !ok || len(subnets) <= 1 {
  71                 // Easy, only one SubnetID to test.
  72                 return t.runWithDriverParameters(t.DriverParameters)
  73         }
  74
  75         deferredError := false
  76         for i, subnet := range subnets {
  77                 subnet, ok := subnet.(string)
  78                 if !ok {
  79                         t.Logger.Errorf("CloudVMs.DriverParameters.SubnetID[%d] is invalid -- must be a string", i)
  80                         deferredError = true
  81                         continue
  82                 }
  83                 dp["SubnetID"] = subnet
  84                 t.Logger.Infof("running tests using SubnetID[%d] %q", i, subnet)
  85                 dpjson, err := json.Marshal(dp)
  86                 if err != nil {
  87                         t.Logger.WithError(err).Error("error encoding driver parameters")
  88                         deferredError = true
  89                         continue
  90                 }
  91                 ok = t.runWithDriverParameters(dpjson)
  92                 if !ok {
  93                         t.Logger.Infof("failed tests using SubnetID[%d] %q", i, subnet)
  94                         deferredError = true
  95                 }
  96         }
  97         return !deferredError
  98 }
  99
 100 // Run the test suite as specified, clean up as needed, and return
 101 // true (everything is OK) or false (something went wrong).
 102 func (t *tester) runWithDriverParameters(driverParameters json.RawMessage) bool {
 103         // This flag gets set when we encounter a non-fatal error, so
 104         // we can continue doing more tests but remember to return
 105         // false (failure) at the end.
 106         deferredError := false
 107
 108         var err error
 109         t.is, err = t.Driver.InstanceSet(driverParameters, t.SetID, t.Tags, t.Logger, nil)
 110         if err != nil {
 111                 t.Logger.WithError(err).Info("error initializing driver")
 112                 return false
 113         }
 114
 115         for {
 116                 // Don't send the driver any filters when getting the
 117                 // initial instance list. This way we can log an
 118                 // instance count (N=...)  that includes all instances
 119                 // in this service account, even if they don't have
 120                 // the same InstanceSetID.
 121                 insts, err := t.getInstances(nil)
 122                 if err != nil {
 123                         t.Logger.WithError(err).Info("error getting list of instances")
 124                         return false
 125                 }
 126
 127                 foundExisting := false
 128                 for _, i := range insts {
 129                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
 130                                 continue
 131                         }
 132                         lgr := t.Logger.WithFields(logrus.Fields{
 133                                 "Instance":      i.ID(),
 134                                 "InstanceSetID": t.SetID,
 135                         })
 136                         foundExisting = true
 137                         if t.DestroyExisting {
 138                                 lgr.Info("destroying existing instance with our InstanceSetID")
 139                                 t0 := time.Now()
 140                                 err := i.Destroy()
 141                                 lgr := lgr.WithField("Duration", time.Since(t0))
 142                                 if err != nil {
 143                                         lgr.WithError(err).Error("error destroying existing instance")
 144                                 } else {
 145                                         lgr.Info("Destroy() call succeeded")
 146                                 }
 147                         } else {
 148                                 lgr.Error("found existing instance with our InstanceSetID")
 149                         }
 150                 }
 151                 if !foundExisting {
 152                         break
 153                 } else if t.DestroyExisting {
 154                         t.sleepSyncInterval()
 155                 } else {
 156                         t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
 157                         return false
 158                 }
 159         }
 160
 161         t.secret = randomHex(40)
 162
 163         tags := cloud.InstanceTags{}
 164         for k, v := range t.Tags {
 165                 tags[k] = v
 166         }
 167         tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
 168         tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
 169
 170         defer t.destroyTestInstance()
 171
 172         bootDeadline := time.Now().Add(t.TimeoutBooting)
 173         initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
 174
 175         t.Logger.WithFields(logrus.Fields{
 176                 "InstanceType":         t.InstanceType.Name,
 177                 "ProviderInstanceType": t.InstanceType.ProviderType,
 178                 "ImageID":              t.ImageID,
 179                 "Tags":                 tags,
 180                 "InitCommand":          initCommand,
 181         }).Info("creating instance")
 182         t0 := time.Now()
 183         inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
 184         lgrC := t.Logger.WithField("Duration", time.Since(t0))
 185         if err != nil {
 186                 // Create() might have failed due to a bug or network
 187                 // error even though the creation was successful, so
 188                 // it's safer to wait a bit for an instance to appear.
 189                 deferredError = true
 190                 lgrC.WithError(err).Error("error creating test instance")
 191                 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
 192                 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
 193                         if time.Now().After(bootDeadline) {
 194                                 t.Logger.Error("timed out")
 195                                 return false
 196                         }
 197                         t.sleepSyncInterval()
 198                 }
 199                 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
 200                 t.showLoginInfo()
 201         } else {
 202                 // Create() succeeded. Make sure the new instance
 203                 // appears right away in the Instances() list.
 204                 lgrC.WithField("Instance", inst.ID()).Info("created instance")
 205                 t.testInstance = &worker.TagVerifier{Instance: inst, Secret: t.secret, ReportVerified: nil}
 206                 t.showLoginInfo()
 207                 err = t.refreshTestInstance()
 208                 if err == errTestInstanceNotFound {
 209                         t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
 210                         deferredError = true
 211                 } else if err != nil {
 212                         t.Logger.WithError(err).Error("error getting list of instances")
 213                         return false
 214                 }
 215         }
 216
 217         if !t.checkTags() {
 218                 // checkTags() already logged the errors
 219                 deferredError = true
 220         }
 221
 222         if !t.waitForBoot(bootDeadline) {
 223                 deferredError = true
 224         }
 225
 226         if t.ShellCommand != "" {
 227                 err = t.runShellCommand(t.ShellCommand)
 228                 if err != nil {
 229                         t.Logger.WithError(err).Error("shell command failed")
 230                         deferredError = true
 231                 }
 232         }
 233
 234         if fn := t.PauseBeforeDestroy; fn != nil {
 235                 t.showLoginInfo()
 236                 fn()
 237         }
 238
 239         return !deferredError
 240 }
 241
 242 // If the test instance has an address, log an "ssh user@host" command
 243 // line that the operator can paste into another terminal, and set
 244 // t.showedLoginInfo.
 245 //
 246 // If the test instance doesn't have an address yet, do nothing.
 247 func (t *tester) showLoginInfo() {
 248         t.updateExecutor()
 249         host, port := t.executor.TargetHostPort()
 250         if host == "" {
 251                 return
 252         }
 253         user := t.testInstance.RemoteUser()
 254         t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
 255         t.showedLoginInfo = true
 256 }
 257
 258 // Get the latest instance list from the driver. If our test instance
 259 // is found, assign it to t.testIntance.
 260 func (t *tester) refreshTestInstance() error {
 261         insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
 262         if err != nil {
 263                 return err
 264         }
 265         for _, i := range insts {
 266                 if t.testInstance == nil {
 267                         // Filter by InstanceSetID tag value
 268                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
 269                                 continue
 270                         }
 271                 } else {
 272                         // Filter by instance ID
 273                         if i.ID() != t.testInstance.ID() {
 274                                 continue
 275                         }
 276                 }
 277                 t.Logger.WithFields(logrus.Fields{
 278                         "Instance": i.ID(),
 279                         "Address":  i.Address(),
 280                 }).Info("found our instance in returned list")
 281                 t.testInstance = &worker.TagVerifier{Instance: i, Secret: t.secret, ReportVerified: nil}
 282                 if !t.showedLoginInfo {
 283                         t.showLoginInfo()
 284                 }
 285                 return nil
 286         }
 287         return errTestInstanceNotFound
 288 }
 289
 290 // Get the list of instances, passing the given tags to the cloud
 291 // driver to filter results.
 292 //
 293 // Return only the instances that have our InstanceSetID tag.
 294 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
 295         var ret []cloud.Instance
 296         t.Logger.WithField("FilterTags", tags).Info("getting instance list")
 297         t0 := time.Now()
 298         insts, err := t.is.Instances(tags)
 299         if err != nil {
 300                 return nil, err
 301         }
 302         t.Logger.WithFields(logrus.Fields{
 303                 "Duration": time.Since(t0),
 304                 "N":        len(insts),
 305         }).Info("got instance list")
 306         for _, i := range insts {
 307                 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
 308                         ret = append(ret, i)
 309                 }
 310         }
 311         return ret, nil
 312 }
 313
 314 // Check that t.testInstance has every tag in t.Tags. If not, log an
 315 // error and return false.
 316 func (t *tester) checkTags() bool {
 317         ok := true
 318         for k, v := range t.Tags {
 319                 if got := t.testInstance.Tags()[k]; got != v {
 320                         ok = false
 321                         t.Logger.WithFields(logrus.Fields{
 322                                 "Key":           k,
 323                                 "ExpectedValue": v,
 324                                 "GotValue":      got,
 325                         }).Error("tag is missing from test instance")
 326                 }
 327         }
 328         if ok {
 329                 t.Logger.Info("all expected tags are present")
 330         }
 331         return ok
 332 }
 333
 334 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
 335 // deadline arrives.
 336 func (t *tester) waitForBoot(deadline time.Time) bool {
 337         for time.Now().Before(deadline) {
 338                 err := t.runShellCommand(t.BootProbeCommand)
 339                 if err == nil {
 340                         return true
 341                 }
 342                 t.sleepProbeInterval()
 343                 t.refreshTestInstance()
 344         }
 345         t.Logger.Error("timed out")
 346         return false
 347 }
 348
 349 // Create t.executor and/or update its target to t.testInstance's
 350 // current address.
 351 func (t *tester) updateExecutor() {
 352         if t.executor == nil {
 353                 t.executor = sshexecutor.New(t.testInstance)
 354                 t.executor.SetTargetPort(t.SSHPort)
 355                 t.executor.SetSigners(t.SSHKey)
 356         } else {
 357                 t.executor.SetTarget(t.testInstance)
 358         }
 359 }
 360
 361 func (t *tester) runShellCommand(cmd string) error {
 362         t.updateExecutor()
 363         t.Logger.WithFields(logrus.Fields{
 364                 "Command": cmd,
 365         }).Info("executing remote command")
 366         t0 := time.Now()
 367         stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
 368         lgr := t.Logger.WithFields(logrus.Fields{
 369                 "Duration": time.Since(t0),
 370                 "Command":  cmd,
 371                 "stdout":   string(stdout),
 372                 "stderr":   string(stderr),
 373         })
 374         if err != nil {
 375                 lgr.WithError(err).Info("remote command failed")
 376         } else {
 377                 lgr.Info("remote command succeeded")
 378         }
 379         return err
 380 }
 381
 382 // currently, this tries forever until it can return true (success).
 383 func (t *tester) destroyTestInstance() bool {
 384         if t.testInstance == nil {
 385                 return true
 386         }
 387         for {
 388                 lgr := t.Logger.WithField("Instance", t.testInstance.ID())
 389                 lgr.Info("destroying instance")
 390                 t0 := time.Now()
 391
 392                 err := t.testInstance.Destroy()
 393                 lgrDur := lgr.WithField("Duration", time.Since(t0))
 394                 if err != nil {
 395                         lgrDur.WithError(err).Error("error destroying instance")
 396                 } else {
 397                         lgrDur.Info("destroyed instance")
 398                 }
 399
 400                 err = t.refreshTestInstance()
 401                 if err == errTestInstanceNotFound {
 402                         lgr.Info("instance no longer appears in list")
 403                         t.testInstance = nil
 404                         return true
 405                 } else if err == nil {
 406                         lgr.Info("instance still exists after calling Destroy")
 407                         t.sleepSyncInterval()
 408                         continue
 409                 } else {
 410                         t.Logger.WithError(err).Error("error getting list of instances")
 411                         continue
 412                 }
 413         }
 414 }
 415
 416 func (t *tester) sleepSyncInterval() {
 417         t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
 418         time.Sleep(t.SyncInterval)
 419 }
 420
 421 func (t *tester) sleepProbeInterval() {
 422         t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
 423         time.Sleep(t.ProbeInterval)
 424 }
 425
 426 // Return a random string of n hexadecimal digits (n*4 random bits). n
 427 // must be even.
 428 func randomHex(n int) string {
 429         buf := make([]byte, n/2)
 430         _, err := rand.Read(buf)
 431         if err != nil {
 432                 panic(err)
 433         }
 434         return fmt.Sprintf("%x", buf)
 435 }