1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
14 "git.curoverse.com/arvados.git/lib/cloud"
15 "git.curoverse.com/arvados.git/lib/dispatchcloud/ssh_executor"
16 "git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
17 "git.curoverse.com/arvados.git/sdk/go/arvados"
18 "github.com/sirupsen/logrus"
19 "golang.org/x/crypto/ssh"
23 errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
26 // A tester does a sequence of operations to test a cloud driver and
27 // configuration. Run() should be called only once, after assigning
28 // suitable values to public fields.
30 Logger logrus.FieldLogger
31 Tags cloud.SharedResourceTags
33 SetID cloud.InstanceSetID
35 ProbeInterval time.Duration
36 SyncInterval time.Duration
37 TimeoutBooting time.Duration
39 DriverParameters json.RawMessage
40 InstanceType arvados.InstanceType
44 BootProbeCommand string
46 PauseBeforeDestroy func()
49 testInstance *worker.TagVerifier
51 executor *ssh_executor.Executor
57 // Run the test suite as specified, clean up as needed, and return
58 // true (everything is OK) or false (something went wrong).
59 func (t *tester) Run() bool {
60 // This flag gets set when we encounter a non-fatal error, so
61 // we can continue doing more tests but remember to return
62 // false (failure) at the end.
63 deferredError := false
66 t.is, err = t.Driver.InstanceSet(t.DriverParameters, t.SetID, t.Tags, t.Logger)
68 t.Logger.WithError(err).Info("error initializing driver")
72 // Don't send the driver any filters the first time we get the
73 // instance list. This way we can log an instance count
74 // (N=...) that includes all instances in this service
75 // account, even if they don't have the same InstanceSetID.
76 insts, err := t.getInstances(nil)
78 t.Logger.WithError(err).Info("error getting initial list of instances")
83 foundExisting := false
84 for _, i := range insts {
85 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
88 lgr := t.Logger.WithFields(logrus.Fields{
90 "InstanceSetID": t.SetID,
93 if t.DestroyExisting {
94 lgr.Info("destroying existing instance with our InstanceSetID")
97 lgr.WithError(err).Error("error destroying existing instance")
99 lgr.Info("Destroy() call succeeded")
102 lgr.Error("found existing instance with our InstanceSetID")
107 } else if t.DestroyExisting {
108 t.sleepSyncInterval()
110 t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
115 t.secret = randomHex(40)
117 tags := cloud.InstanceTags{}
118 for k, v := range t.Tags {
121 tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
122 tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
124 defer t.destroyTestInstance()
126 bootDeadline := time.Now().Add(t.TimeoutBooting)
127 initCommand := worker.TagVerifier{nil, t.secret}.InitCommand()
129 t.Logger.WithFields(logrus.Fields{
130 "InstanceType": t.InstanceType.Name,
131 "ProviderInstanceType": t.InstanceType.ProviderType,
132 "ImageID": t.ImageID,
134 "InitCommand": initCommand,
135 }).Info("creating instance")
136 inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
138 // Create() might have failed due to a bug or network
139 // error even though the creation was successful, so
140 // it's safer to wait a bit for an instance to appear.
142 t.Logger.WithError(err).Error("error creating test instance")
143 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
144 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
145 if time.Now().After(bootDeadline) {
146 t.Logger.Error("timed out")
149 t.sleepSyncInterval()
152 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
155 // Create() succeeded. Make sure the new instance
156 // appears right away in the Instances() list.
157 t.Logger.WithField("Instance", inst.ID()).Info("created instance")
158 t.testInstance = &worker.TagVerifier{inst, t.secret}
160 err = t.refreshTestInstance()
161 if err == errTestInstanceNotFound {
162 t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
164 } else if err != nil {
165 t.Logger.WithError(err).Error("error getting list of instances")
171 // checkTags() already logged the errors
175 if !t.waitForBoot(bootDeadline) {
179 if t.ShellCommand != "" {
180 err = t.runShellCommand(t.ShellCommand)
182 t.Logger.WithError(err).Error("shell command failed")
187 if fn := t.PauseBeforeDestroy; fn != nil {
192 return !deferredError
195 // If the test instance has an address, log an "ssh user@host" command
196 // line that the operator can paste into another terminal, and set
197 // t.showedLoginInfo.
199 // If the test instance doesn't have an address yet, do nothing.
200 func (t *tester) showLoginInfo() {
202 host, port := t.executor.TargetHostPort()
206 user := t.testInstance.RemoteUser()
207 t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
208 t.showedLoginInfo = true
211 // Get the latest instance list from the driver. If our test instance
212 // is found, assign it to t.testIntance.
213 func (t *tester) refreshTestInstance() error {
214 insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
218 for _, i := range insts {
219 if t.testInstance == nil {
220 // Filter by InstanceSetID tag value
221 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
225 // Filter by instance ID
226 if i.ID() != t.testInstance.ID() {
230 t.Logger.WithFields(logrus.Fields{
232 "Address": i.Address(),
233 }).Info("found our instance in returned list")
234 t.testInstance = &worker.TagVerifier{i, t.secret}
235 if !t.showedLoginInfo {
240 return errTestInstanceNotFound
243 // Get the list of instances, passing the given tags to the cloud
244 // driver to filter results.
246 // Return only the instances that have our InstanceSetID tag.
247 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
248 var ret []cloud.Instance
249 t.Logger.WithField("FilterTags", tags).Info("getting instance list")
250 insts, err := t.is.Instances(tags)
254 t.Logger.WithField("N", len(insts)).Info("got instance list")
255 for _, i := range insts {
256 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
263 // Check that t.testInstance has every tag in t.Tags. If not, log an
264 // error and return false.
265 func (t *tester) checkTags() bool {
267 for k, v := range t.Tags {
268 if got := t.testInstance.Tags()[k]; got != v {
270 t.Logger.WithFields(logrus.Fields{
274 }).Error("tag is missing from test instance")
278 t.Logger.Info("all expected tags are present")
283 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
285 func (t *tester) waitForBoot(deadline time.Time) bool {
286 for time.Now().Before(deadline) {
287 err := t.runShellCommand(t.BootProbeCommand)
291 t.sleepProbeInterval()
292 t.refreshTestInstance()
294 t.Logger.Error("timed out")
298 // Create t.executor and/or update its target to t.testInstance's
300 func (t *tester) updateExecutor() {
301 if t.executor == nil {
302 t.executor = ssh_executor.New(t.testInstance)
303 t.executor.SetTargetPort(t.SSHPort)
304 t.executor.SetSigners(t.SSHKey)
306 t.executor.SetTarget(t.testInstance)
310 func (t *tester) runShellCommand(cmd string) error {
312 t.Logger.WithFields(logrus.Fields{
314 }).Info("executing remote command")
315 stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
316 lgr := t.Logger.WithFields(logrus.Fields{
318 "stdout": string(stdout),
319 "stderr": string(stderr),
322 lgr.WithError(err).Info("remote command failed")
324 lgr.Info("remote command succeeded")
329 // currently, this tries forever until it can return true (success).
330 func (t *tester) destroyTestInstance() bool {
331 if t.testInstance == nil {
335 t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroying instance")
336 err := t.testInstance.Destroy()
338 t.Logger.WithError(err).WithField("Instance", t.testInstance.ID()).Error("error destroying instance")
340 t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroyed instance")
342 err = t.refreshTestInstance()
343 if err == errTestInstanceNotFound {
344 t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance no longer appears in list")
347 } else if err == nil {
348 t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance still exists after calling Destroy")
349 t.sleepSyncInterval()
352 t.Logger.WithError(err).Error("error getting list of instances")
358 func (t *tester) sleepSyncInterval() {
359 t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
360 time.Sleep(t.SyncInterval)
363 func (t *tester) sleepProbeInterval() {
364 t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
365 time.Sleep(t.ProbeInterval)
368 // Return a random string of n hexadecimal digits (n*4 random bits). n
370 func randomHex(n int) string {
371 buf := make([]byte, n/2)
372 _, err := rand.Read(buf)
376 return fmt.Sprintf("%x", buf)