1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
14 "git.arvados.org/arvados.git/lib/cloud"
15 "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
16 "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
17 "git.arvados.org/arvados.git/sdk/go/arvados"
18 "github.com/sirupsen/logrus"
19 "golang.org/x/crypto/ssh"
23 errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
26 // A tester does a sequence of operations to test a cloud driver and
27 // configuration. Run() should be called only once, after assigning
28 // suitable values to public fields.
30 Logger logrus.FieldLogger
31 Tags cloud.SharedResourceTags
33 SetID cloud.InstanceSetID
35 ProbeInterval time.Duration
36 SyncInterval time.Duration
37 TimeoutBooting time.Duration
39 DriverParameters json.RawMessage
40 InstanceType arvados.InstanceType
44 BootProbeCommand string
45 InstanceInitCommand cloud.InitCommand
47 PauseBeforeDestroy func()
50 testInstance *worker.TagVerifier
52 executor *sshexecutor.Executor
58 // Run the test suite once for each applicable permutation of
59 // DriverParameters. Return true if everything worked.
61 // Currently this means run once for each configured SubnetID.
62 func (t *tester) Run() bool {
63 var dp map[string]interface{}
64 if len(t.DriverParameters) > 0 {
65 err := json.Unmarshal(t.DriverParameters, &dp)
67 t.Logger.WithError(err).Error("error decoding configured CloudVMs.DriverParameters")
71 subnets, ok := dp["SubnetID"].([]interface{})
72 if !ok || len(subnets) <= 1 {
73 // Easy, only one SubnetID to test.
74 return t.runWithDriverParameters(t.DriverParameters)
77 deferredError := false
78 for i, subnet := range subnets {
79 subnet, ok := subnet.(string)
81 t.Logger.Errorf("CloudVMs.DriverParameters.SubnetID[%d] is invalid -- must be a string", i)
85 dp["SubnetID"] = subnet
86 t.Logger.Infof("running tests using SubnetID[%d] %q", i, subnet)
87 dpjson, err := json.Marshal(dp)
89 t.Logger.WithError(err).Error("error encoding driver parameters")
93 ok = t.runWithDriverParameters(dpjson)
95 t.Logger.Infof("failed tests using SubnetID[%d] %q", i, subnet)
102 // Run the test suite as specified, clean up as needed, and return
103 // true (everything is OK) or false (something went wrong).
104 func (t *tester) runWithDriverParameters(driverParameters json.RawMessage) bool {
105 // This flag gets set when we encounter a non-fatal error, so
106 // we can continue doing more tests but remember to return
107 // false (failure) at the end.
108 deferredError := false
111 t.is, err = t.Driver.InstanceSet(driverParameters, t.SetID, t.Tags, t.Logger, nil)
113 t.Logger.WithError(err).Info("error initializing driver")
118 // Don't send the driver any filters when getting the
119 // initial instance list. This way we can log an
120 // instance count (N=...) that includes all instances
121 // in this service account, even if they don't have
122 // the same InstanceSetID.
123 insts, err := t.getInstances(nil)
125 t.Logger.WithError(err).Info("error getting list of instances")
129 foundExisting := false
130 for _, i := range insts {
131 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
134 lgr := t.Logger.WithFields(logrus.Fields{
136 "InstanceSetID": t.SetID,
139 if t.DestroyExisting {
140 lgr.Info("destroying existing instance with our InstanceSetID")
143 lgr := lgr.WithField("Duration", time.Since(t0))
145 lgr.WithError(err).Error("error destroying existing instance")
147 lgr.Info("Destroy() call succeeded")
150 lgr.Error("found existing instance with our InstanceSetID")
155 } else if t.DestroyExisting {
156 t.sleepSyncInterval()
158 t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
163 t.secret = randomHex(40)
165 tags := cloud.InstanceTags{}
166 for k, v := range t.Tags {
169 tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
170 tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
172 defer t.destroyTestInstance()
174 bootDeadline := time.Now().Add(t.TimeoutBooting)
175 initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
177 t.Logger.WithFields(logrus.Fields{
178 "InstanceType": t.InstanceType.Name,
179 "ProviderInstanceType": t.InstanceType.ProviderType,
180 "ImageID": t.ImageID,
182 "InitCommand": initCommand,
183 }).Info("creating instance")
185 inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
186 lgrC := t.Logger.WithField("Duration", time.Since(t0))
188 // Create() might have failed due to a bug or network
189 // error even though the creation was successful, so
190 // it's safer to wait a bit for an instance to appear.
192 lgrC.WithError(err).Error("error creating test instance")
193 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
194 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
195 if time.Now().After(bootDeadline) {
196 t.Logger.Error("timed out")
199 t.sleepSyncInterval()
201 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
204 // Create() succeeded. Make sure the new instance
205 // appears right away in the Instances() list.
206 lgrC.WithField("Instance", inst.ID()).Info("created instance")
207 t.testInstance = &worker.TagVerifier{Instance: inst, Secret: t.secret, ReportVerified: nil}
209 err = t.refreshTestInstance()
210 if err == errTestInstanceNotFound {
211 t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
213 } else if err != nil {
214 t.Logger.WithError(err).Error("error getting list of instances")
220 // checkTags() already logged the errors
224 if !t.waitForBoot(bootDeadline) {
228 if t.ShellCommand != "" {
229 err = t.runShellCommand(t.ShellCommand)
231 t.Logger.WithError(err).Error("shell command failed")
236 if fn := t.PauseBeforeDestroy; fn != nil {
241 return !deferredError
244 // If the test instance has an address, log an "ssh user@host" command
245 // line that the operator can paste into another terminal, and set
246 // t.showedLoginInfo.
248 // If the test instance doesn't have an address yet, do nothing.
249 func (t *tester) showLoginInfo() {
251 host, port := t.executor.TargetHostPort()
255 user := t.testInstance.RemoteUser()
256 t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
257 t.showedLoginInfo = true
260 // Get the latest instance list from the driver. If our test instance
261 // is found, assign it to t.testIntance.
262 func (t *tester) refreshTestInstance() error {
263 insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
267 for _, i := range insts {
268 if t.testInstance == nil {
269 // Filter by InstanceSetID tag value
270 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
274 // Filter by instance ID
275 if i.ID() != t.testInstance.ID() {
279 t.Logger.WithFields(logrus.Fields{
281 "Address": i.Address(),
282 }).Info("found our instance in returned list")
283 t.testInstance = &worker.TagVerifier{Instance: i, Secret: t.secret, ReportVerified: nil}
284 if !t.showedLoginInfo {
289 return errTestInstanceNotFound
292 // Get the list of instances, passing the given tags to the cloud
293 // driver to filter results.
295 // Return only the instances that have our InstanceSetID tag.
296 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
297 var ret []cloud.Instance
298 t.Logger.WithField("FilterTags", tags).Info("getting instance list")
300 insts, err := t.is.Instances(tags)
304 t.Logger.WithFields(logrus.Fields{
305 "Duration": time.Since(t0),
307 }).Info("got instance list")
308 for _, i := range insts {
309 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
316 // Check that t.testInstance has every tag in t.Tags. If not, log an
317 // error and return false.
318 func (t *tester) checkTags() bool {
320 for k, v := range t.Tags {
321 if got := t.testInstance.Tags()[k]; got != v {
323 t.Logger.WithFields(logrus.Fields{
327 }).Error("tag is missing from test instance")
331 t.Logger.Info("all expected tags are present")
336 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
338 func (t *tester) waitForBoot(deadline time.Time) bool {
339 for time.Now().Before(deadline) {
340 err := t.runShellCommand(t.BootProbeCommand)
344 t.sleepProbeInterval()
345 t.refreshTestInstance()
347 t.Logger.Error("timed out")
351 // Create t.executor and/or update its target to t.testInstance's
353 func (t *tester) updateExecutor() {
354 if t.executor == nil {
355 t.executor = sshexecutor.New(t.testInstance)
356 t.executor.SetTargetPort(t.SSHPort)
357 t.executor.SetSigners(t.SSHKey)
359 t.executor.SetTarget(t.testInstance)
363 func (t *tester) runShellCommand(cmd string) error {
365 t.Logger.WithFields(logrus.Fields{
367 }).Info("executing remote command")
369 stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
370 lgr := t.Logger.WithFields(logrus.Fields{
371 "Duration": time.Since(t0),
373 "stdout": string(stdout),
374 "stderr": string(stderr),
377 lgr.WithError(err).Info("remote command failed")
379 lgr.Info("remote command succeeded")
384 // currently, this tries forever until it can return true (success).
385 func (t *tester) destroyTestInstance() bool {
386 if t.testInstance == nil {
390 lgr := t.Logger.WithField("Instance", t.testInstance.ID())
391 lgr.Info("destroying instance")
394 err := t.testInstance.Destroy()
395 lgrDur := lgr.WithField("Duration", time.Since(t0))
397 lgrDur.WithError(err).Error("error destroying instance")
399 lgrDur.Info("destroyed instance")
402 err = t.refreshTestInstance()
403 if err == errTestInstanceNotFound {
404 lgr.Info("instance no longer appears in list")
407 } else if err == nil {
408 lgr.Info("instance still exists after calling Destroy")
409 t.sleepSyncInterval()
412 t.Logger.WithError(err).Error("error getting list of instances")
418 func (t *tester) sleepSyncInterval() {
419 t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
420 time.Sleep(t.SyncInterval)
423 func (t *tester) sleepProbeInterval() {
424 t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
425 time.Sleep(t.ProbeInterval)
428 // Return a random string of n hexadecimal digits (n*4 random bits). n
430 func randomHex(n int) string {
431 buf := make([]byte, n/2)
432 _, err := rand.Read(buf)
436 return fmt.Sprintf("%x", buf)