Merge commit '3b735dd9330e0989f51a76771c3303031154154e' into 21158-wf-page-list
diff --git a/lib/cloud/cloudtest/tester.go b/lib/cloud/cloudtest/tester.go
index 81b14aed88c578b62aaf9c58b6cfab9fa6249e8e..a335278ed6b15a91794bc8927697dec3a6aef1ec 100644
--- a/lib/cloud/cloudtest/tester.go
+++ b/lib/cloud/cloudtest/tester.go
@@ -11,10 +11,10 @@ import (
        "fmt"
        "time"
 
-       "git.curoverse.com/arvados.git/lib/cloud"
-       "git.curoverse.com/arvados.git/lib/dispatchcloud/ssh_executor"
-       "git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
-       "git.curoverse.com/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/lib/cloud"
+       "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
+       "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "github.com/sirupsen/logrus"
        "golang.org/x/crypto/ssh"
 )
@@ -27,53 +27,106 @@ var (
 // configuration. Run() should be called only once, after assigning
 // suitable values to public fields.
 type tester struct {
-       Logger             logrus.FieldLogger
-       Tags               cloud.SharedResourceTags
-       TagKeyPrefix       string
-       SetID              cloud.InstanceSetID
-       DestroyExisting    bool
-       ProbeInterval      time.Duration
-       SyncInterval       time.Duration
-       TimeoutBooting     time.Duration
-       Driver             cloud.Driver
-       DriverParameters   json.RawMessage
-       InstanceType       arvados.InstanceType
-       ImageID            cloud.ImageID
-       SSHKey             ssh.Signer
-       SSHPort            string
-       BootProbeCommand   string
-       ShellCommand       string
-       PauseBeforeDestroy func()
+       Logger              logrus.FieldLogger
+       Tags                cloud.SharedResourceTags
+       TagKeyPrefix        string
+       SetID               cloud.InstanceSetID
+       DestroyExisting     bool
+       ProbeInterval       time.Duration
+       SyncInterval        time.Duration
+       TimeoutBooting      time.Duration
+       Driver              cloud.Driver
+       DriverParameters    json.RawMessage
+       InstanceType        arvados.InstanceType
+       ImageID             cloud.ImageID
+       SSHKey              ssh.Signer
+       SSHPort             string
+       DeployPublicKey     bool
+       BootProbeCommand    string
+       InstanceInitCommand cloud.InitCommand
+       ShellCommand        string
+       PauseBeforeDestroy  func()
 
        is              cloud.InstanceSet
-       testInstance    cloud.Instance
+       testInstance    *worker.TagVerifier
        secret          string
-       executor        *ssh_executor.Executor
+       executor        *sshexecutor.Executor
        showedLoginInfo bool
 
        failed bool
 }
 
+// Run the test suite once for each applicable permutation of
+// DriverParameters.  Return true if everything worked.
+//
+// Currently this means run once for each configured SubnetID.
+func (t *tester) Run() bool {
+       var dp map[string]interface{}
+       if len(t.DriverParameters) > 0 {
+               err := json.Unmarshal(t.DriverParameters, &dp)
+               if err != nil {
+                       t.Logger.WithError(err).Error("error decoding configured CloudVMs.DriverParameters")
+                       return false
+               }
+       }
+       subnets, ok := dp["SubnetID"].([]interface{})
+       if !ok || len(subnets) <= 1 {
+               // Easy, only one SubnetID to test.
+               return t.runWithDriverParameters(t.DriverParameters)
+       }
+
+       deferredError := false
+       for i, subnet := range subnets {
+               subnet, ok := subnet.(string)
+               if !ok {
+                       t.Logger.Errorf("CloudVMs.DriverParameters.SubnetID[%d] is invalid -- must be a string", i)
+                       deferredError = true
+                       continue
+               }
+               dp["SubnetID"] = subnet
+               t.Logger.Infof("running tests using SubnetID[%d] %q", i, subnet)
+               dpjson, err := json.Marshal(dp)
+               if err != nil {
+                       t.Logger.WithError(err).Error("error encoding driver parameters")
+                       deferredError = true
+                       continue
+               }
+               ok = t.runWithDriverParameters(dpjson)
+               if !ok {
+                       t.Logger.Infof("failed tests using SubnetID[%d] %q", i, subnet)
+                       deferredError = true
+               }
+       }
+       return !deferredError
+}
+
 // Run the test suite as specified, clean up as needed, and return
 // true (everything is OK) or false (something went wrong).
-func (t *tester) Run() bool {
+func (t *tester) runWithDriverParameters(driverParameters json.RawMessage) bool {
        // This flag gets set when we encounter a non-fatal error, so
        // we can continue doing more tests but remember to return
        // false (failure) at the end.
        deferredError := false
 
        var err error
-       t.is, err = t.Driver.InstanceSet(t.DriverParameters, t.SetID, t.Tags, t.Logger)
+       t.is, err = t.Driver.InstanceSet(driverParameters, t.SetID, t.Tags, t.Logger, nil)
        if err != nil {
                t.Logger.WithError(err).Info("error initializing driver")
                return false
        }
-       insts, err := t.getInstances(nil)
-       if err != nil {
-               t.Logger.WithError(err).Info("error getting initial list of instances")
-               return false
-       }
+
        for {
+               // Don't send the driver any filters when getting the
+               // initial instance list. This way we can log an
+               // instance count (N=...) that includes all instances
+               // in this service account, even if they don't have
+               // the same InstanceSetID.
+               insts, err := t.getInstances(nil)
+               if err != nil {
+                       t.Logger.WithError(err).Info("error getting list of instances")
+                       return false
+               }
+
                foundExisting := false
                for _, i := range insts {
                        if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
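
The new Run() above expands a configured SubnetID list into one test run per subnet. A standalone, runnable sketch of that expansion; the parameter names and subnet values here are made up for illustration:

    package main

    import (
    	"encoding/json"
    	"fmt"
    )

    func main() {
    	// Hypothetical CloudVMs.DriverParameters with two subnets configured.
    	raw := json.RawMessage(`{"Region": "us-east-1", "SubnetID": ["subnet-aaa", "subnet-bbb"]}`)
    	var dp map[string]interface{}
    	if err := json.Unmarshal(raw, &dp); err != nil {
    		panic(err)
    	}
    	subnets, ok := dp["SubnetID"].([]interface{})
    	if !ok || len(subnets) <= 1 {
    		// Zero or one subnet: pass the parameters through unchanged.
    		fmt.Println("single run:", string(raw))
    		return
    	}
    	for i, subnet := range subnets {
    		dp["SubnetID"] = subnet // one subnet per test run
    		dpjson, err := json.Marshal(dp)
    		if err != nil {
    			panic(err)
    		}
    		fmt.Printf("run %d: %s\n", i, dpjson)
    	}
    }
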
@@ -86,7 +139,9 @@ func (t *tester) Run() bool {
                        foundExisting = true
                        if t.DestroyExisting {
                                lgr.Info("destroying existing instance with our InstanceSetID")
+                               t0 := time.Now()
                                err := i.Destroy()
+                               lgr := lgr.WithField("Duration", time.Since(t0))
                                if err != nil {
                                        lgr.WithError(err).Error("error destroying existing instance")
                                } else {
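
This hunk introduces the timing idiom used throughout the commit: record time.Now() before a slow cloud call, then shadow the logger with a child entry carrying a Duration field, so every later log line in that scope reports the elapsed time. A distilled, runnable sketch (the operation and field values are stand-ins):

    package main

    import (
    	"time"

    	"github.com/sirupsen/logrus"
    )

    func doSlowThing() error {
    	// Stand-in for a cloud API call such as Destroy().
    	time.Sleep(10 * time.Millisecond)
    	return nil
    }

    func main() {
    	lgr := logrus.WithField("Instance", "i-0123")
    	t0 := time.Now()
    	err := doSlowThing()
    	// In the diff this is a shadowing `lgr := ...` in an inner scope.
    	lgr = lgr.WithField("Duration", time.Since(t0))
    	if err != nil {
    		lgr.WithError(err).Error("operation failed")
    	} else {
    		lgr.Info("operation succeeded")
    	}
    }
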
@@ -118,7 +173,12 @@ func (t *tester) Run() bool {
        defer t.destroyTestInstance()
 
        bootDeadline := time.Now().Add(t.TimeoutBooting)
-       initCommand := worker.TagVerifier{nil, t.secret}.InitCommand()
+       initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
+
+       installPublicKey := t.SSHKey.PublicKey()
+       if !t.DeployPublicKey {
+               installPublicKey = nil
+       }
 
        t.Logger.WithFields(logrus.Fields{
                "InstanceType":         t.InstanceType.Name,
@@ -126,25 +186,32 @@ func (t *tester) Run() bool {
                "ImageID":              t.ImageID,
                "Tags":                 tags,
                "InitCommand":          initCommand,
+               "DeployPublicKey":      installPublicKey != nil,
        }).Info("creating instance")
-       inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
+       t0 := time.Now()
+       inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, installPublicKey)
+       lgrC := t.Logger.WithField("Duration", time.Since(t0))
        if err != nil {
+               // Create() can return an error (e.g., a bug or a
+               // network problem) even though the instance was
+               // actually created, so wait for it to appear anyway.
                deferredError = true
-               t.Logger.WithError(err).Error("error creating test instance")
+               lgrC.WithError(err).Error("error creating test instance")
                t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
                for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
                        if time.Now().After(bootDeadline) {
                                t.Logger.Error("timed out")
                                return false
-                       } else {
-                               t.sleepSyncInterval()
                        }
+                       t.sleepSyncInterval()
                }
                t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
                t.showLoginInfo()
        } else {
-               t.Logger.WithField("Instance", inst.ID()).Info("created instance")
-               t.testInstance = inst
+               // Create() succeeded. Make sure the new instance
+               // appears right away in the Instances() list.
+               lgrC.WithField("Instance", inst.ID()).Info("created instance")
+               t.testInstance = &worker.TagVerifier{Instance: inst, Secret: t.secret, ReportVerified: nil}
                t.showLoginInfo()
                err = t.refreshTestInstance()
                if err == errTestInstanceNotFound {
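
The error branch above deliberately keeps waiting: if Create() reported failure but the instance was created anyway, polling until the boot deadline avoids leaking a VM. A distilled, runnable sketch of that poll-until-deadline loop, with a stand-in for refreshTestInstance():

    package main

    import (
    	"errors"
    	"fmt"
    	"time"
    )

    var errTestInstanceNotFound = errors.New("test instance missing from list")

    func main() {
    	deadline := time.Now().Add(2 * time.Second)
    	tries := 0
    	refresh := func() error { // stand-in for refreshTestInstance()
    		tries++
    		if tries < 3 {
    			return errTestInstanceNotFound
    		}
    		return nil
    	}
    	for err := refresh(); err != nil; err = refresh() {
    		if time.Now().After(deadline) {
    			fmt.Println("timed out")
    			return
    		}
    		time.Sleep(100 * time.Millisecond) // cf. t.sleepSyncInterval()
    	}
    	fmt.Println("instance appeared")
    }
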
@@ -155,15 +222,14 @@ func (t *tester) Run() bool {
                        return false
                }
        }
-       t.testInstance = worker.TagVerifier{t.testInstance, t.secret}
 
        if !t.checkTags() {
                // checkTags() already logged the errors
-               return false
+               deferredError = true
        }
 
        if !t.waitForBoot(bootDeadline) {
-               return false
+               deferredError = true
        }
 
        if t.ShellCommand != "" {
@@ -182,6 +248,11 @@ func (t *tester) Run() bool {
        return !deferredError
 }
 
+// If the test instance has an address, log an "ssh user@host" command
+// line that the operator can paste into another terminal, and set
+// t.showedLoginInfo.
+//
+// If the test instance doesn't have an address yet, do nothing.
 func (t *tester) showLoginInfo() {
        t.updateExecutor()
        host, port := t.executor.TargetHostPort()
@@ -216,7 +287,7 @@ func (t *tester) refreshTestInstance() error {
                        "Instance": i.ID(),
                        "Address":  i.Address(),
                }).Info("found our instance in returned list")
-               t.testInstance = worker.TagVerifier{i, t.secret}
+               t.testInstance = &worker.TagVerifier{Instance: i, Secret: t.secret, ReportVerified: nil}
                if !t.showedLoginInfo {
                        t.showLoginInfo()
                }
@@ -225,14 +296,22 @@ func (t *tester) refreshTestInstance() error {
        return errTestInstanceNotFound
 }
 
+// Get the list of instances, passing the given tags to the cloud
+// driver to filter results.
+//
+// Return only the instances that have our InstanceSetID tag.
 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
        var ret []cloud.Instance
        t.Logger.WithField("FilterTags", tags).Info("getting instance list")
+       t0 := time.Now()
        insts, err := t.is.Instances(tags)
        if err != nil {
                return nil, err
        }
-       t.Logger.WithField("N", len(insts)).Info("got instance list")
+       t.Logger.WithFields(logrus.Fields{
+               "Duration": time.Since(t0),
+               "N":        len(insts),
+       }).Info("got instance list")
        for _, i := range insts {
                if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
                        ret = append(ret, i)
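
Note the two filtering layers in getInstances(): the tags argument is passed to the driver, and the results are then re-filtered client-side on the InstanceSetID tag, so callers get a correct list even if the driver ignores the filter. A hypothetical call, assuming cloud.InstanceTags is a string map as its use here suggests:

    // Hypothetical usage; key and value mirror the tag check above.
    filter := cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)}
    insts, err := t.getInstances(filter)
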
@@ -241,6 +320,8 @@ func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error)
        return ret, nil
 }
 
+// Check that t.testInstance has every tag in t.Tags. If not, log an
+// error and return false.
 func (t *tester) checkTags() bool {
        ok := true
        for k, v := range t.Tags {
@@ -259,6 +340,8 @@ func (t *tester) checkTags() bool {
        return ok
 }
 
+// Run t.BootProbeCommand on t.testInstance until it succeeds or the
+// deadline arrives.
 func (t *tester) waitForBoot(deadline time.Time) bool {
        for time.Now().Before(deadline) {
                err := t.runShellCommand(t.BootProbeCommand)
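
waitForBoot() is one instance of a reusable probe-until-deadline shape. A generic version for comparison, assuming a nil error means the probe succeeded (a hypothetical helper, not part of this package; it needs only the time package):

    // waitUntil retries probe at the given interval until it succeeds
    // or the deadline passes, and reports whether it succeeded.
    func waitUntil(deadline time.Time, interval time.Duration, probe func() error) bool {
    	for time.Now().Before(deadline) {
    		if err := probe(); err == nil {
    			return true
    		}
    		time.Sleep(interval)
    	}
    	return false
    }
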
@@ -272,9 +355,11 @@ func (t *tester) waitForBoot(deadline time.Time) bool {
        return false
 }
 
+// Create t.executor and/or update its target to t.testInstance's
+// current address.
 func (t *tester) updateExecutor() {
        if t.executor == nil {
-               t.executor = ssh_executor.New(t.testInstance)
+               t.executor = sshexecutor.New(t.testInstance)
                t.executor.SetTargetPort(t.SSHPort)
                t.executor.SetSigners(t.SSHKey)
        } else {
@@ -287,11 +372,13 @@ func (t *tester) runShellCommand(cmd string) error {
        t.Logger.WithFields(logrus.Fields{
                "Command": cmd,
        }).Info("executing remote command")
+       t0 := time.Now()
        stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
        lgr := t.Logger.WithFields(logrus.Fields{
-               "Command": cmd,
-               "stdout":  string(stdout),
-               "stderr":  string(stderr),
+               "Duration": time.Since(t0),
+               "Command":  cmd,
+               "stdout":   string(stdout),
+               "stderr":   string(stderr),
        })
        if err != nil {
                lgr.WithError(err).Info("remote command failed")
@@ -307,20 +394,25 @@ func (t *tester) destroyTestInstance() bool {
                return true
        }
        for {
-               t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroying instance")
+               lgr := t.Logger.WithField("Instance", t.testInstance.ID())
+               lgr.Info("destroying instance")
+               t0 := time.Now()
+
                err := t.testInstance.Destroy()
+               lgrDur := lgr.WithField("Duration", time.Since(t0))
                if err != nil {
-                       t.Logger.WithError(err).WithField("Instance", t.testInstance.ID()).Error("error destroying instance")
+                       lgrDur.WithError(err).Error("error destroying instance")
                } else {
-                       t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroyed instance")
+                       lgrDur.Info("destroyed instance")
                }
+
                err = t.refreshTestInstance()
                if err == errTestInstanceNotFound {
-                       t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance no longer appears in list")
+                       lgr.Info("instance no longer appears in list")
                        t.testInstance = nil
                        return true
                } else if err == nil {
-                       t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance still exists after calling Destroy")
+                       lgr.Info("instance still exists after calling Destroy")
                        t.sleepSyncInterval()
                        continue
                } else {