1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
14 "git.curoverse.com/arvados.git/lib/cloud"
15 "git.curoverse.com/arvados.git/lib/dispatchcloud/ssh_executor"
16 "git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
17 "git.curoverse.com/arvados.git/sdk/go/arvados"
18 "github.com/sirupsen/logrus"
19 "golang.org/x/crypto/ssh"
23 errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
26 // A tester does a sequence of operations to test a cloud driver and
27 // configuration. Run() should be called only once, after assigning
28 // suitable values to public fields.
30 Logger logrus.FieldLogger
31 Tags cloud.SharedResourceTags
33 SetID cloud.InstanceSetID
35 ProbeInterval time.Duration
36 SyncInterval time.Duration
37 TimeoutBooting time.Duration
39 DriverParameters json.RawMessage
40 InstanceType arvados.InstanceType
44 BootProbeCommand string
46 PauseBeforeDestroy func()
49 testInstance cloud.Instance
51 executor *ssh_executor.Executor
57 // Run the test suite as specified, clean up as needed, and return
58 // true (everything is OK) or false (something went wrong).
59 func (t *tester) Run() bool {
60 // This flag gets set when we encounter a non-fatal error, so
61 // we can continue doing more tests but remember to return
62 // false (failure) at the end.
63 deferredError := false
66 t.is, err = t.Driver.InstanceSet(t.DriverParameters, t.SetID, t.Tags, t.Logger)
68 t.Logger.WithError(err).Info("error initializing driver")
71 insts, err := t.getInstances(nil)
73 t.Logger.WithError(err).Info("error getting initial list of instances")
77 foundExisting := false
78 for _, i := range insts {
79 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
82 lgr := t.Logger.WithFields(logrus.Fields{
84 "InstanceSetID": t.SetID,
87 if t.DestroyExisting {
88 lgr.Info("destroying existing instance with our InstanceSetID")
91 lgr.WithError(err).Error("error destroying existing instance")
93 lgr.Info("Destroy() call succeeded")
96 lgr.Error("found existing instance with our InstanceSetID")
101 } else if t.DestroyExisting {
102 t.sleepSyncInterval()
104 t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
109 t.secret = randomHex(40)
111 tags := cloud.InstanceTags{}
112 for k, v := range t.Tags {
115 tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
116 tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
118 defer t.destroyTestInstance()
120 bootDeadline := time.Now().Add(t.TimeoutBooting)
121 initCommand := worker.TagVerifier{nil, t.secret}.InitCommand()
123 t.Logger.WithFields(logrus.Fields{
124 "InstanceType": t.InstanceType.Name,
125 "ProviderInstanceType": t.InstanceType.ProviderType,
126 "ImageID": t.ImageID,
128 "InitCommand": initCommand,
129 }).Info("creating instance")
130 inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
133 t.Logger.WithError(err).Error("error creating test instance")
134 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
135 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
136 if time.Now().After(bootDeadline) {
137 t.Logger.Error("timed out")
140 t.sleepSyncInterval()
143 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
146 t.Logger.WithField("Instance", inst.ID()).Info("created instance")
147 t.testInstance = inst
149 err = t.refreshTestInstance()
150 if err == errTestInstanceNotFound {
151 t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
153 } else if err != nil {
154 t.Logger.WithError(err).Error("error getting list of instances")
158 t.testInstance = worker.TagVerifier{t.testInstance, t.secret}
161 // checkTags() already logged the errors
165 if !t.waitForBoot(bootDeadline) {
169 if t.ShellCommand != "" {
170 err = t.runShellCommand(t.ShellCommand)
172 t.Logger.WithError(err).Error("shell command failed")
177 if fn := t.PauseBeforeDestroy; fn != nil {
182 return !deferredError
185 func (t *tester) showLoginInfo() {
187 host, port := t.executor.TargetHostPort()
191 user := t.testInstance.RemoteUser()
192 t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
193 t.showedLoginInfo = true
196 // Get the latest instance list from the driver. If our test instance
197 // is found, assign it to t.testIntance.
198 func (t *tester) refreshTestInstance() error {
199 insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
203 for _, i := range insts {
204 if t.testInstance == nil {
205 // Filter by InstanceSetID tag value
206 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
210 // Filter by instance ID
211 if i.ID() != t.testInstance.ID() {
215 t.Logger.WithFields(logrus.Fields{
217 "Address": i.Address(),
218 }).Info("found our instance in returned list")
219 t.testInstance = worker.TagVerifier{i, t.secret}
220 if !t.showedLoginInfo {
225 return errTestInstanceNotFound
228 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
229 var ret []cloud.Instance
230 t.Logger.WithField("FilterTags", tags).Info("getting instance list")
231 insts, err := t.is.Instances(tags)
235 t.Logger.WithField("N", len(insts)).Info("got instance list")
236 for _, i := range insts {
237 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
244 func (t *tester) checkTags() bool {
246 for k, v := range t.Tags {
247 if got := t.testInstance.Tags()[k]; got != v {
249 t.Logger.WithFields(logrus.Fields{
253 }).Error("tag is missing from test instance")
257 t.Logger.Info("all expected tags are present")
262 func (t *tester) waitForBoot(deadline time.Time) bool {
263 for time.Now().Before(deadline) {
264 err := t.runShellCommand(t.BootProbeCommand)
268 t.sleepProbeInterval()
269 t.refreshTestInstance()
271 t.Logger.Error("timed out")
275 func (t *tester) updateExecutor() {
276 if t.executor == nil {
277 t.executor = ssh_executor.New(t.testInstance)
278 t.executor.SetTargetPort(t.SSHPort)
279 t.executor.SetSigners(t.SSHKey)
281 t.executor.SetTarget(t.testInstance)
285 func (t *tester) runShellCommand(cmd string) error {
287 t.Logger.WithFields(logrus.Fields{
289 }).Info("executing remote command")
290 stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
291 lgr := t.Logger.WithFields(logrus.Fields{
293 "stdout": string(stdout),
294 "stderr": string(stderr),
297 lgr.WithError(err).Info("remote command failed")
299 lgr.Info("remote command succeeded")
304 // currently, this tries forever until it can return true (success).
305 func (t *tester) destroyTestInstance() bool {
306 if t.testInstance == nil {
310 t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroying instance")
311 err := t.testInstance.Destroy()
313 t.Logger.WithError(err).WithField("Instance", t.testInstance.ID()).Error("error destroying instance")
315 t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroyed instance")
317 err = t.refreshTestInstance()
318 if err == errTestInstanceNotFound {
319 t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance no longer appears in list")
322 } else if err == nil {
323 t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance still exists after calling Destroy")
324 t.sleepSyncInterval()
327 t.Logger.WithError(err).Error("error getting list of instances")
333 func (t *tester) sleepSyncInterval() {
334 t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
335 time.Sleep(t.SyncInterval)
338 func (t *tester) sleepProbeInterval() {
339 t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
340 time.Sleep(t.ProbeInterval)
343 // Return a random string of n hexadecimal digits (n*4 random bits). n
345 func randomHex(n int) string {
346 buf := make([]byte, n/2)
347 _, err := rand.Read(buf)
351 return fmt.Sprintf("%x", buf)