1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
6 // How to manually run individual tests against the real cloud:
8 // $ go test -v git.arvados.org/arvados.git/lib/cloud/ec2 -live-ec2-cfg ec2config.yml -check.f=TestCreate
10 // Tests should be run individually and in the order they are listed in the file.
12 // Example ec2config.yml:
14 // ImageIDForTestSuite: ami-xxxxxxxxxxxxxxxxx
16 // AccessKeyID: XXXXXXXXXXXXXX
17 // SecretAccessKey: xxxxxxxxxxxxxxxxxxxx
19 // SecurityGroupIDs: [sg-xxxxxxxx]
20 // SubnetID: subnet-xxxxxxxx
21 // AdminUsername: crunch
34 "git.arvados.org/arvados.git/lib/cloud"
35 "git.arvados.org/arvados.git/lib/dispatchcloud/test"
36 "git.arvados.org/arvados.git/sdk/go/arvados"
37 "git.arvados.org/arvados.git/sdk/go/arvadostest"
38 "git.arvados.org/arvados.git/sdk/go/config"
39 "git.arvados.org/arvados.git/sdk/go/ctxlog"
40 "github.com/aws/aws-sdk-go/aws"
41 "github.com/aws/aws-sdk-go/aws/awserr"
42 "github.com/aws/aws-sdk-go/service/ec2"
43 "github.com/ghodss/yaml"
44 "github.com/prometheus/client_golang/prometheus"
45 "github.com/sirupsen/logrus"
46 check "gopkg.in/check.v1"
// live points at the value of the -live-ec2-cfg flag: the path of a config
// file for running these tests against the real EC2 API. The default is "".
49 var live = flag.String("live-ec2-cfg", "", "Test with real EC2 API, provide config file")
// Test has the standard "go test" entry-point signature.
// NOTE(review): presumably it just hands control to gocheck so that the
// suites registered with check.Suite run — confirm against the body.
51 // Gocheck boilerplate
52 func Test(t *testing.T) {
// sliceOrStringSuite holds the tests for unmarshalling config values of
// type sliceOrSingleString. It carries no state.
56 type sliceOrStringSuite struct{}
// Register the suite with gocheck.
58 var _ = check.Suite(&sliceOrStringSuite{})
// TestUnmarshal feeds each trial input to yaml.Unmarshal as the value of a
// "SubnetID:" entry and checks that conf.SubnetID then equals the expected
// sliceOrSingleString. The trials cover a quoted scalar as well as quoted
// and unquoted lists with one or two entries.
60 func (s *sliceOrStringSuite) TestUnmarshal(c *check.C) {
// conf is declared once and reused as the unmarshal target of every trial.
61 var conf ec2InstanceSetConfig
62 for _, trial := range []struct {
64 output sliceOrSingleString
69 {`"foo"`, sliceOrSingleString{"foo"}},
70 {`["foo"]`, sliceOrSingleString{"foo"}},
71 {`[foo]`, sliceOrSingleString{"foo"}},
72 {`["foo", "bar"]`, sliceOrSingleString{"foo", "bar"}},
73 {`[foo-bar, baz]`, sliceOrSingleString{"foo-bar", "baz"}},
75 c.Logf("trial: %+v", trial)
76 err := yaml.Unmarshal([]byte("SubnetID: "+trial.input+"\n"), &conf)
// The bool result of c.Check is used to branch on an unmarshal failure.
77 if !c.Check(err, check.IsNil) {
80 c.Check(conf.SubnetID, check.DeepEquals, trial.output)
// EC2InstanceSetSuite holds the tests that exercise the EC2 driver. It
// carries no state; most tests obtain their instance set from
// GetInstanceSet.
84 type EC2InstanceSetSuite struct{}
// Register the suite with gocheck.
86 var _ = check.Suite(&EC2InstanceSetSuite{})
// testConfig is the structure that GetInstanceSet loads the -live-ec2-cfg
// file into.
88 type testConfig struct {
// Image ID that GetInstanceSet returns to tests for creating instances.
89 ImageIDForTestSuite string
// Raw driver configuration, passed unmodified to newEC2InstanceSet.
90 DriverParameters json.RawMessage
// Inputs received by the corresponding stub methods, appended in call
// order so that tests can assert on how many calls were made.
96 importKeyPairCalls []*ec2.ImportKeyPairInput
97 describeKeyPairsCalls []*ec2.DescribeKeyPairsInput
98 runInstancesCalls []*ec2.RunInstancesInput
99 // {subnetID => error}: RunInstances returns error if subnetID
// The map is consulted by RunInstances using the subnet ID of the first
// requested network interface.
101 subnetErrorOnRunInstances map[string]error
// ImportKeyPair is the stub for the EC2 ImportKeyPair call. It records
// input in e.importKeyPairCalls so tests can count the calls.
104 func (e *ec2stub) ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error) {
105 e.importKeyPairCalls = append(e.importKeyPairCalls, input)
// DescribeKeyPairs is the stub for the EC2 DescribeKeyPairs call. It
// records input in e.describeKeyPairsCalls and returns an empty output
// with a nil error, i.e. it never reports an existing key pair.
109 func (e *ec2stub) DescribeKeyPairs(input *ec2.DescribeKeyPairsInput) (*ec2.DescribeKeyPairsOutput, error) {
110 e.describeKeyPairsCalls = append(e.describeKeyPairsCalls, input)
111 return &ec2.DescribeKeyPairsOutput{}, nil
// RunInstances is the stub for the EC2 RunInstances call. It records input
// in e.runInstancesCalls, looks up a configured error for the subnet of the
// first network interface (if one is given), and otherwise returns a
// reservation holding one fake instance "i-123" of type "t2.micro".
114 func (e *ec2stub) RunInstances(input *ec2.RunInstancesInput) (*ec2.Reservation, error) {
115 e.runInstancesCalls = append(e.runInstancesCalls, input)
// Only the first network interface is inspected, and only when it names a
// subnet.
116 if len(input.NetworkInterfaces) > 0 && input.NetworkInterfaces[0].SubnetId != nil {
// A subnet without an entry in the map yields a nil error here.
// NOTE(review): presumably a non-nil err is returned to the caller — confirm.
117 err := e.subnetErrorOnRunInstances[*input.NetworkInterfaces[0].SubnetId]
122 return &ec2.Reservation{Instances: []*ec2.Instance{{
123 InstanceId: aws.String("i-123"),
124 InstanceType: aws.String("t2.micro"),
// The fake instance echoes the tags of the first tag specification; this
// indexes TagSpecifications[0] unconditionally, so callers must supply one.
125 Tags: input.TagSpecifications[0].Tags,
// DescribeInstances is the stub for the EC2 DescribeInstances call. It
// returns a fixed reservation describing two "t2.micro" spot instances,
// "i-123" and "i-124", both in state "running" (code 16) with private
// addresses 10.1.2.3 and 10.1.2.4. The visible code does not consult input.
129 func (e *ec2stub) DescribeInstances(input *ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error) {
130 return &ec2.DescribeInstancesOutput{
131 Reservations: []*ec2.Reservation{{
132 Instances: []*ec2.Instance{{
133 InstanceId: aws.String("i-123"),
134 InstanceLifecycle: aws.String("spot"),
135 InstanceType: aws.String("t2.micro"),
136 PrivateIpAddress: aws.String("10.1.2.3"),
137 State: &ec2.InstanceState{Name: aws.String("running"), Code: aws.Int64(16)},
139 InstanceId: aws.String("i-124"),
140 InstanceLifecycle: aws.String("spot"),
141 InstanceType: aws.String("t2.micro"),
142 PrivateIpAddress: aws.String("10.1.2.4"),
143 State: &ec2.InstanceState{Name: aws.String("running"), Code: aws.Int64(16)},
// DescribeInstanceStatusPages is the stub for the paginated EC2
// DescribeInstanceStatus call. It invokes fn with a page of statuses that
// places the fake instances "i-123" and "i-124" in availability zone
// "aa-east-1a". The return value of fn is not used.
149 func (e *ec2stub) DescribeInstanceStatusPages(input *ec2.DescribeInstanceStatusInput, fn func(*ec2.DescribeInstanceStatusOutput, bool) bool) error {
150 fn(&ec2.DescribeInstanceStatusOutput{
151 InstanceStatuses: []*ec2.InstanceStatus{{
152 InstanceId: aws.String("i-123"),
153 AvailabilityZone: aws.String("aa-east-1a"),
155 InstanceId: aws.String("i-124"),
156 AvailabilityZone: aws.String("aa-east-1a"),
// DescribeSpotPriceHistoryPages is the stub for the paginated EC2
// DescribeSpotPriceHistory call. It delivers fixed "t2.micro" prices for
// zone "aa-east-1a" to fn in two pages, with timestamps expressed relative
// to e.reftime.
162 func (e *ec2stub) DescribeSpotPriceHistoryPages(input *ec2.DescribeSpotPriceHistoryInput, fn func(*ec2.DescribeSpotPriceHistoryOutput, bool) bool) error {
// First page: two prices, 9 and 5 minutes before reftime.
// NOTE(review): presumably a false result from fn ends the paging early — confirm.
163 if !fn(&ec2.DescribeSpotPriceHistoryOutput{
164 SpotPriceHistory: []*ec2.SpotPrice{
166 InstanceType: aws.String("t2.micro"),
167 AvailabilityZone: aws.String("aa-east-1a"),
168 SpotPrice: aws.String("0.005"),
169 Timestamp: aws.Time(e.reftime.Add(-9 * time.Minute)),
172 InstanceType: aws.String("t2.micro"),
173 AvailabilityZone: aws.String("aa-east-1a"),
174 SpotPrice: aws.String("0.015"),
175 Timestamp: aws.Time(e.reftime.Add(-5 * time.Minute)),
// Second page: one price, 2 minutes before reftime.
181 fn(&ec2.DescribeSpotPriceHistoryOutput{
182 SpotPriceHistory: []*ec2.SpotPrice{
184 InstanceType: aws.String("t2.micro"),
185 AvailabilityZone: aws.String("aa-east-1a"),
186 SpotPrice: aws.String("0.01"),
187 Timestamp: aws.Time(e.reftime.Add(-2 * time.Minute)),
// CreateTags is the stub for the EC2 CreateTags call.
// NOTE(review): presumably a no-op that reports success — confirm against the body.
194 func (e *ec2stub) CreateTags(input *ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error) {
// TerminateInstances is the stub for the EC2 TerminateInstances call.
// NOTE(review): presumably a no-op that reports success — confirm against the body.
198 func (e *ec2stub) TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error) {
// ec2stubError is a fake API error carrying a code and a message. Tests
// install values of this type in ec2stub.subnetErrorOnRunInstances to
// simulate RunInstances failures.
202 type ec2stubError struct {
// Code returns the stubbed error code.
207 func (err *ec2stubError) Code() string { return err.code }
// Message returns the stubbed error message.
208 func (err *ec2stubError) Message() string { return err.message }
// Error formats the error as "<code>: <message>".
209 func (err *ec2stubError) Error() string { return fmt.Sprintf("%s: %s", err.code, err.message) }
// OrigErr returns a new placeholder error on every call; the stub has no
// real underlying error.
210 func (err *ec2stubError) OrigErr() error { return errors.New("stub OrigErr") }
212 // Ensure ec2stubError satisfies the awserr.Error interface
213 var _ = awserr.Error(&ec2stubError{})
// GetInstanceSet builds the ec2InstanceSet used by the tests, together with
// the image ID to create instances from, a cluster config holding the test
// instance types, and the prometheus registry passed to the driver.
//
// Two construction paths are visible: one loads the file named by the
// -live-ec2-cfg flag and uses its DriverParameters and ImageIDForTestSuite;
// the other builds the driver from conf (a JSON document) and replaces its
// client with an ec2stub, returning the placeholder image ID "blob".
// NOTE(review): presumably the first path is taken only when -live-ec2-cfg
// is non-empty — confirm.
//
// Any failure to load the config or construct the driver aborts the calling
// test through c.Assert.
215 func GetInstanceSet(c *check.C, conf string) (*ec2InstanceSet, cloud.ImageID, arvados.Cluster, *prometheus.Registry) {
216 reg := prometheus.NewRegistry()
// All test instance types map to the "t2.micro" provider type.
217 cluster := arvados.Cluster{
218 InstanceTypes: arvados.InstanceTypeMap(map[string]arvados.InstanceType{
221 ProviderType: "t2.micro",
224 Scratch: 10000000000,
228 "tiny-with-extra-scratch": {
229 Name: "tiny-with-extra-scratch",
230 ProviderType: "t2.micro",
235 AddedScratch: 20000000000,
237 "tiny-preemptible": {
238 Name: "tiny-preemptible",
239 ProviderType: "t2.micro",
242 Scratch: 10000000000,
// Live path: driver parameters and image ID come from the config file.
248 var exampleCfg testConfig
249 err := config.LoadFile(&exampleCfg, *live)
250 c.Assert(err, check.IsNil)
252 is, err := newEC2InstanceSet(exampleCfg.DriverParameters, "test123", nil, logrus.StandardLogger(), reg)
253 c.Assert(err, check.IsNil)
254 return is.(*ec2InstanceSet), cloud.ImageID(exampleCfg.ImageIDForTestSuite), cluster, reg
// Stub path: driver is configured from conf and talks to an ec2stub whose
// reference time is the current UTC time.
256 is, err := newEC2InstanceSet(json.RawMessage(conf), "test123", nil, ctxlog.TestLogger(c), reg)
257 c.Assert(err, check.IsNil)
258 is.(*ec2InstanceSet).client = &ec2stub{c: c, reftime: time.Now().UTC()}
259 return is.(*ec2InstanceSet), cloud.ImageID("blob"), cluster, reg
// TestCreate creates a "tiny" instance with one tag, an init command and a
// public key, then checks that creation succeeded, that the tag is
// reported back, and that the stub saw exactly one DescribeKeyPairs and one
// ImportKeyPair call.
263 func (*EC2InstanceSetSuite) TestCreate(c *check.C) {
264 ap, img, cluster, _ := GetInstanceSet(c, "{}")
265 pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
267 inst, err := ap.Create(cluster.InstanceTypes["tiny"],
268 img, map[string]string{
269 "TestTagName": "test tag value",
270 }, "umask 0600; echo -n test-file-data >/var/run/test-file", pk)
271 c.Assert(err, check.IsNil)
274 c.Check(tags["TestTagName"], check.Equals, "test tag value")
275 c.Logf("inst.String()=%v Address()=%v Tags()=%v", inst.String(), inst.Address(), tags)
// These assertions require ap.client to be an *ec2stub.
278 c.Check(ap.client.(*ec2stub).describeKeyPairsCalls, check.HasLen, 1)
279 c.Check(ap.client.(*ec2stub).importKeyPairCalls, check.HasLen, 1)
// TestCreateWithExtraScratch creates a "tiny-with-extra-scratch" instance
// without a public key, then checks that creation succeeded, that the tag
// is reported back, and that no key pair API was called on the stub.
283 func (*EC2InstanceSetSuite) TestCreateWithExtraScratch(c *check.C) {
284 ap, img, cluster, _ := GetInstanceSet(c, "{}")
285 inst, err := ap.Create(cluster.InstanceTypes["tiny-with-extra-scratch"],
286 img, map[string]string{
287 "TestTagName": "test tag value",
288 }, "umask 0600; echo -n test-file-data >/var/run/test-file", nil)
290 c.Assert(err, check.IsNil)
293 c.Check(tags["TestTagName"], check.Equals, "test tag value")
294 c.Logf("inst.String()=%v Address()=%v Tags()=%v", inst.String(), inst.Address(), tags)
297 // Should not have called key pair APIs, because
298 // publickey arg was nil
299 c.Check(ap.client.(*ec2stub).describeKeyPairsCalls, check.HasLen, 0)
300 c.Check(ap.client.(*ec2stub).importKeyPairCalls, check.HasLen, 0)
// TestCreatePreemptible creates a "tiny-preemptible" instance with one tag
// and a public key, then checks that creation succeeded and that the tag
// is reported back.
304 func (*EC2InstanceSetSuite) TestCreatePreemptible(c *check.C) {
305 ap, img, cluster, _ := GetInstanceSet(c, "{}")
306 pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
308 inst, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"],
309 img, map[string]string{
310 "TestTagName": "test tag value",
311 }, "umask 0600; echo -n test-file-data >/var/run/test-file", pk)
313 c.Assert(err, check.IsNil)
316 c.Check(tags["TestTagName"], check.Equals, "test tag value")
317 c.Logf("inst.String()=%v Address()=%v Tags()=%v", inst.String(), inst.Address(), tags)
// TestCreateFailoverSecondSubnet configures two subnets, makes the stub
// fail RunInstances for "subnet-full", and checks that Create still
// succeeds by using "subnet-good". It also checks the per-subnet
// instance_starts_total metrics, and that a second Create needs only one
// more RunInstances call. The test is skipped in live mode.
321 func (*EC2InstanceSetSuite) TestCreateFailoverSecondSubnet(c *check.C) {
323 c.Skip("not applicable in live mode")
327 ap, img, cluster, reg := GetInstanceSet(c, `{"SubnetID":["subnet-full","subnet-good"]}`)
328 ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
329 "subnet-full": &ec2stubError{
330 code: "InsufficientFreeAddressesInSubnet",
331 message: "subnet is full",
// First Create: two RunInstances calls are expected, one failure counted
// for subnet-full and one success for subnet-good.
334 inst, err := ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
335 c.Check(err, check.IsNil)
336 c.Check(inst, check.NotNil)
337 c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
338 metrics := arvadostest.GatherMetricsAsString(reg)
339 c.Check(metrics, check.Matches, `(?ms).*`+
340 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
341 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
342 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="0"} 0\n`+
343 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="1"} 1\n`+
346 // Next RunInstances call should try the working subnet first
// Hence only one additional call (3 in total) and no new subnet-full failure.
347 inst, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
348 c.Check(err, check.IsNil)
349 c.Check(inst, check.NotNil)
350 c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 3)
351 metrics = arvadostest.GatherMetricsAsString(reg)
352 c.Check(metrics, check.Matches, `(?ms).*`+
353 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
354 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
355 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="0"} 0\n`+
356 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="1"} 2\n`+
// TestCreateAllSubnetsFailing configures two subnets and makes the stub
// fail RunInstances for both. It checks that each Create returns an error,
// that each Create attempts both subnets (2, then 4 RunInstances calls in
// total), and that the per-subnet failure counters grow accordingly. The
// test is skipped in live mode.
360 func (*EC2InstanceSetSuite) TestCreateAllSubnetsFailing(c *check.C) {
362 c.Skip("not applicable in live mode")
366 ap, img, cluster, reg := GetInstanceSet(c, `{"SubnetID":["subnet-full","subnet-broken"]}`)
367 ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
368 "subnet-full": &ec2stubError{
369 code: "InsufficientFreeAddressesInSubnet",
370 message: "subnet is full",
372 "subnet-broken": &ec2stubError{
373 code: "InvalidSubnetId.NotFound",
374 message: "bogus subnet id",
// First Create: the reported error is expected to be the subnet-broken one.
377 _, err := ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
378 c.Check(err, check.NotNil)
379 c.Check(err, check.ErrorMatches, `.*InvalidSubnetId\.NotFound.*`)
380 c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
381 metrics := arvadostest.GatherMetricsAsString(reg)
382 c.Check(metrics, check.Matches, `(?ms).*`+
383 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="0"} 1\n`+
384 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="1"} 0\n`+
385 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
386 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
// Second Create: the reported error is expected to be the subnet-full one.
389 _, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
390 c.Check(err, check.NotNil)
391 c.Check(err, check.ErrorMatches, `.*InsufficientFreeAddressesInSubnet.*`)
392 c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 4)
393 metrics = arvadostest.GatherMetricsAsString(reg)
394 c.Check(metrics, check.Matches, `(?ms).*`+
395 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="0"} 2\n`+
396 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="1"} 0\n`+
397 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 2\n`+
398 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
// TestTagInstances lists all instances and, for each one, adds the tag
// "TestTag2" to its tag set and checks that SetTags succeeds.
402 func (*EC2InstanceSetSuite) TestTagInstances(c *check.C) {
403 ap, _, _, _ := GetInstanceSet(c, "{}")
404 l, err := ap.Instances(nil)
405 c.Assert(err, check.IsNil)
407 for _, i := range l {
409 tg["TestTag2"] = "123 test tag 2"
410 c.Check(i.SetTags(tg), check.IsNil)
// TestListInstances lists all instances, logs each one, and checks that
// the gathered metrics contain an arvados_dispatchcloud_ec2_instances
// sample labelled with a subnet_id.
414 func (*EC2InstanceSetSuite) TestListInstances(c *check.C) {
415 ap, _, _, reg := GetInstanceSet(c, "{}")
416 l, err := ap.Instances(nil)
417 c.Assert(err, check.IsNil)
419 for _, i := range l {
421 c.Logf("%v %v %v", i.String(), i.Address(), tg)
424 metrics := arvadostest.GatherMetricsAsString(reg)
// The pattern accepts any subnet_id value and any non-negative integer count.
425 c.Check(metrics, check.Matches, `(?ms).*`+
426 `arvados_dispatchcloud_ec2_instances{subnet_id="[^"]*"} \d+\n`+
// TestDestroyInstances lists all instances and checks that Destroy
// succeeds on each of them.
430 func (*EC2InstanceSetSuite) TestDestroyInstances(c *check.C) {
431 ap, _, _, _ := GetInstanceSet(c, "{}")
432 l, err := ap.Instances(nil)
433 c.Assert(err, check.IsNil)
435 for _, i := range l {
436 c.Check(i.Destroy(), check.IsNil)
// TestInstancePriceHistory creates two "tiny-preemptible" instances tagged
// arvados-ec2-driver=test, polls until they are reported as running spot
// instances, and then checks each instance's price history: it must be
// non-empty, contain no zero prices, and be strictly cheaper at every
// entry than the history computed with 640 GiB of added scratch.
440 func (*EC2InstanceSetSuite) TestInstancePriceHistory(c *check.C) {
441 ap, img, cluster, _ := GetInstanceSet(c, "{}")
442 pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
443 tags := cloud.InstanceTags{"arvados-ec2-driver": "test"}
// Destroy every instance that carries the test tags.
446 instances, err := ap.Instances(tags)
447 c.Assert(err, check.IsNil)
448 for _, inst := range instances {
449 c.Logf("cleanup: destroy instance %s", inst)
450 c.Check(inst.Destroy(), check.IsNil)
// Driver settings used by the price checks below: a one-hour spot price
// update interval and a non-zero EBS price.
454 ap.ec2config.SpotPriceUpdateInterval = arvados.Duration(time.Hour)
455 ap.ec2config.EBSPrice = 0.1 // $/GiB/month
456 inst1, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"], img, tags, "true", pk)
457 c.Assert(err, check.IsNil)
458 defer inst1.Destroy()
459 inst2, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"], img, tags, "true", pk)
460 c.Assert(err, check.IsNil)
461 defer inst2.Destroy()
463 // in live mode, we need to wait for the instances to reach
464 // running state before we can discover their availability
465 // zones and look up the appropriate prices.
466 var instances []cloud.Instance
// Poll with a 5-minute deadline, sleeping 10 seconds between attempts.
467 for deadline := time.Now().Add(5 * time.Minute); ; {
468 if deadline.Before(time.Now()) {
471 instances, err = ap.Instances(tags)
473 for _, inst := range instances {
474 ec2i := inst.(*ec2Instance).instance
// Matches spot instances whose state code has bit 16 set; the stub reports
// running instances with state code 16. Both pointers are dereferenced
// without a nil check.
475 if *ec2i.InstanceLifecycle == "spot" && *ec2i.State.Code&16 != 0 {
480 c.Logf("instances are running, and identifiable as spot instances")
483 c.Logf("waiting for instances to reach running state so their availability zone becomes visible...")
484 time.Sleep(10 * time.Second)
487 for _, inst := range instances {
// Baseline history, computed for an instance type without added scratch.
488 hist := inst.PriceHistory(arvados.InstanceType{})
489 c.Logf("%s price history: %v", inst.ID(), hist)
490 c.Check(len(hist) > 0, check.Equals, true)
// Same history, computed for an instance type with 640 GiB of added scratch.
492 histWithScratch := inst.PriceHistory(arvados.InstanceType{AddedScratch: 640 << 30})
493 c.Logf("%s price history with 640 GiB scratch: %v", inst.ID(), histWithScratch)
495 for i, ip := range hist {
496 c.Check(ip.Price, check.Not(check.Equals), 0.0)
// Each entry must start earlier than the entry before it, i.e. the history
// is ordered newest first.
498 c.Check(ip.StartTime.Before(hist[i-1].StartTime), check.Equals, true)
// Added scratch must make the entry at the same index strictly more expensive.
500 c.Check(ip.Price < histWithScratch[i].Price, check.Equals, true)
// TestWrapError checks how wrapError classifies AWS errors by code: a
// "Throttling" error must come back as a cloud.RateLimitError, and an
// "InsufficientInstanceCapacity" error as a cloud.QuotaError.
505 func (*EC2InstanceSetSuite) TestWrapError(c *check.C) {
506 retryError := awserr.New("Throttling", "", nil)
507 wrapped := wrapError(retryError, &atomic.Value{})
508 _, ok := wrapped.(cloud.RateLimitError)
509 c.Check(ok, check.Equals, true)
511 quotaError := awserr.New("InsufficientInstanceCapacity", "", nil)
// The second argument is nil for this case.
512 wrapped = wrapError(quotaError, nil)
513 _, ok = wrapped.(cloud.QuotaError)
514 c.Check(ok, check.Equals, true)