1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
14 "git.arvados.org/arvados.git/lib/cloud"
15 "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
16 "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
17 "git.arvados.org/arvados.git/sdk/go/arvados"
18 "github.com/sirupsen/logrus"
19 "golang.org/x/crypto/ssh"
23 errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
26 // A tester does a sequence of operations to test a cloud driver and
27 // configuration. Run() should be called only once, after assigning
28 // suitable values to public fields.
30 Logger logrus.FieldLogger
31 Tags cloud.SharedResourceTags
33 SetID cloud.InstanceSetID
35 ProbeInterval time.Duration
36 SyncInterval time.Duration
37 TimeoutBooting time.Duration
39 DriverParameters json.RawMessage
40 InstanceType arvados.InstanceType
44 BootProbeCommand string
45 InstanceInitCommand cloud.InitCommand
47 PauseBeforeDestroy func()
50 testInstance *worker.TagVerifier
52 executor *sshexecutor.Executor
58 // Run the test suite as specified, clean up as needed, and return
59 // true (everything is OK) or false (something went wrong).
60 func (t *tester) Run() bool {
61 // This flag gets set when we encounter a non-fatal error, so
62 // we can continue doing more tests but remember to return
63 // false (failure) at the end.
64 deferredError := false
67 t.is, err = t.Driver.InstanceSet(t.DriverParameters, t.SetID, t.Tags, t.Logger)
69 t.Logger.WithError(err).Info("error initializing driver")
74 // Don't send the driver any filters when getting the
75 // initial instance list. This way we can log an
76 // instance count (N=...) that includes all instances
77 // in this service account, even if they don't have
78 // the same InstanceSetID.
79 insts, err := t.getInstances(nil)
81 t.Logger.WithError(err).Info("error getting list of instances")
85 foundExisting := false
86 for _, i := range insts {
87 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
90 lgr := t.Logger.WithFields(logrus.Fields{
92 "InstanceSetID": t.SetID,
95 if t.DestroyExisting {
96 lgr.Info("destroying existing instance with our InstanceSetID")
99 lgr := lgr.WithField("Duration", time.Since(t0))
101 lgr.WithError(err).Error("error destroying existing instance")
103 lgr.Info("Destroy() call succeeded")
106 lgr.Error("found existing instance with our InstanceSetID")
111 } else if t.DestroyExisting {
112 t.sleepSyncInterval()
114 t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
119 t.secret = randomHex(40)
121 tags := cloud.InstanceTags{}
122 for k, v := range t.Tags {
125 tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
126 tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
128 defer t.destroyTestInstance()
130 bootDeadline := time.Now().Add(t.TimeoutBooting)
131 initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
133 t.Logger.WithFields(logrus.Fields{
134 "InstanceType": t.InstanceType.Name,
135 "ProviderInstanceType": t.InstanceType.ProviderType,
136 "ImageID": t.ImageID,
138 "InitCommand": initCommand,
139 }).Info("creating instance")
141 inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
142 lgrC := t.Logger.WithField("Duration", time.Since(t0))
144 // Create() might have failed due to a bug or network
145 // error even though the creation was successful, so
146 // it's safer to wait a bit for an instance to appear.
148 lgrC.WithError(err).Error("error creating test instance")
149 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
150 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
151 if time.Now().After(bootDeadline) {
152 t.Logger.Error("timed out")
155 t.sleepSyncInterval()
157 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
160 // Create() succeeded. Make sure the new instance
161 // appears right away in the Instances() list.
162 lgrC.WithField("Instance", inst.ID()).Info("created instance")
163 t.testInstance = &worker.TagVerifier{Instance: inst, Secret: t.secret, ReportVerified: nil}
165 err = t.refreshTestInstance()
166 if err == errTestInstanceNotFound {
167 t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
169 } else if err != nil {
170 t.Logger.WithError(err).Error("error getting list of instances")
176 // checkTags() already logged the errors
180 if !t.waitForBoot(bootDeadline) {
184 if t.ShellCommand != "" {
185 err = t.runShellCommand(t.ShellCommand)
187 t.Logger.WithError(err).Error("shell command failed")
192 if fn := t.PauseBeforeDestroy; fn != nil {
197 return !deferredError
200 // If the test instance has an address, log an "ssh user@host" command
201 // line that the operator can paste into another terminal, and set
202 // t.showedLoginInfo.
204 // If the test instance doesn't have an address yet, do nothing.
205 func (t *tester) showLoginInfo() {
207 host, port := t.executor.TargetHostPort()
211 user := t.testInstance.RemoteUser()
212 t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
213 t.showedLoginInfo = true
216 // Get the latest instance list from the driver. If our test instance
217 // is found, assign it to t.testIntance.
218 func (t *tester) refreshTestInstance() error {
219 insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
223 for _, i := range insts {
224 if t.testInstance == nil {
225 // Filter by InstanceSetID tag value
226 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
230 // Filter by instance ID
231 if i.ID() != t.testInstance.ID() {
235 t.Logger.WithFields(logrus.Fields{
237 "Address": i.Address(),
238 }).Info("found our instance in returned list")
239 t.testInstance = &worker.TagVerifier{Instance: i, Secret: t.secret, ReportVerified: nil}
240 if !t.showedLoginInfo {
245 return errTestInstanceNotFound
248 // Get the list of instances, passing the given tags to the cloud
249 // driver to filter results.
251 // Return only the instances that have our InstanceSetID tag.
252 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
253 var ret []cloud.Instance
254 t.Logger.WithField("FilterTags", tags).Info("getting instance list")
256 insts, err := t.is.Instances(tags)
260 t.Logger.WithFields(logrus.Fields{
261 "Duration": time.Since(t0),
263 }).Info("got instance list")
264 for _, i := range insts {
265 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
272 // Check that t.testInstance has every tag in t.Tags. If not, log an
273 // error and return false.
274 func (t *tester) checkTags() bool {
276 for k, v := range t.Tags {
277 if got := t.testInstance.Tags()[k]; got != v {
279 t.Logger.WithFields(logrus.Fields{
283 }).Error("tag is missing from test instance")
287 t.Logger.Info("all expected tags are present")
292 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
294 func (t *tester) waitForBoot(deadline time.Time) bool {
295 for time.Now().Before(deadline) {
296 err := t.runShellCommand(t.BootProbeCommand)
300 t.sleepProbeInterval()
301 t.refreshTestInstance()
303 t.Logger.Error("timed out")
307 // Create t.executor and/or update its target to t.testInstance's
309 func (t *tester) updateExecutor() {
310 if t.executor == nil {
311 t.executor = sshexecutor.New(t.testInstance)
312 t.executor.SetTargetPort(t.SSHPort)
313 t.executor.SetSigners(t.SSHKey)
315 t.executor.SetTarget(t.testInstance)
319 func (t *tester) runShellCommand(cmd string) error {
321 t.Logger.WithFields(logrus.Fields{
323 }).Info("executing remote command")
325 stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
326 lgr := t.Logger.WithFields(logrus.Fields{
327 "Duration": time.Since(t0),
329 "stdout": string(stdout),
330 "stderr": string(stderr),
333 lgr.WithError(err).Info("remote command failed")
335 lgr.Info("remote command succeeded")
340 // currently, this tries forever until it can return true (success).
341 func (t *tester) destroyTestInstance() bool {
342 if t.testInstance == nil {
346 lgr := t.Logger.WithField("Instance", t.testInstance.ID())
347 lgr.Info("destroying instance")
350 err := t.testInstance.Destroy()
351 lgrDur := lgr.WithField("Duration", time.Since(t0))
353 lgrDur.WithError(err).Error("error destroying instance")
355 lgrDur.Info("destroyed instance")
358 err = t.refreshTestInstance()
359 if err == errTestInstanceNotFound {
360 lgr.Info("instance no longer appears in list")
363 } else if err == nil {
364 lgr.Info("instance still exists after calling Destroy")
365 t.sleepSyncInterval()
368 t.Logger.WithError(err).Error("error getting list of instances")
374 func (t *tester) sleepSyncInterval() {
375 t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
376 time.Sleep(t.SyncInterval)
379 func (t *tester) sleepProbeInterval() {
380 t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
381 time.Sleep(t.ProbeInterval)
384 // Return a random string of n hexadecimal digits (n*4 random bits). n
386 func randomHex(n int) string {
387 buf := make([]byte, n/2)
388 _, err := rand.Read(buf)
392 return fmt.Sprintf("%x", buf)