20300: Add rails-generated files to .licenseignore.
[arvados.git] / lib / cloud / ec2 / ec2_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4 //
5 //
6 // How to manually run individual tests against the real cloud:
7 //
8 // $ go test -v git.arvados.org/arvados.git/lib/cloud/ec2 -live-ec2-cfg ec2config.yml -check.f=TestCreate
9 //
10 // Tests should be run individually and in the order they are listed in the file.
11 //
12 // Example ec2config.yml:
13 //
14 // ImageIDForTestSuite: ami-xxxxxxxxxxxxxxxxx
15 // DriverParameters:
16 //       AccessKeyID: XXXXXXXXXXXXXX
17 //       SecretAccessKey: xxxxxxxxxxxxxxxxxxxx
18 //       Region: us-east-1
19 //       SecurityGroupIDs: [sg-xxxxxxxx]
20 //       SubnetID: subnet-xxxxxxxx
21 //       AdminUsername: crunch
22
23 package ec2
24
25 import (
26         "encoding/json"
27         "errors"
28         "flag"
29         "fmt"
30         "sync/atomic"
31         "testing"
32         "time"
33
34         "git.arvados.org/arvados.git/lib/cloud"
35         "git.arvados.org/arvados.git/lib/dispatchcloud/test"
36         "git.arvados.org/arvados.git/sdk/go/arvados"
37         "git.arvados.org/arvados.git/sdk/go/arvadostest"
38         "git.arvados.org/arvados.git/sdk/go/config"
39         "git.arvados.org/arvados.git/sdk/go/ctxlog"
40         "github.com/aws/aws-sdk-go/aws"
41         "github.com/aws/aws-sdk-go/aws/awserr"
42         "github.com/aws/aws-sdk-go/service/ec2"
43         "github.com/ghodss/yaml"
44         "github.com/prometheus/client_golang/prometheus"
45         "github.com/sirupsen/logrus"
46         check "gopkg.in/check.v1"
47 )
48
// live is the value of the -live-ec2-cfg flag: the path of a config
// file (see the example at the top of this file). When it is empty,
// which is the default, GetInstanceSet installs an ec2stub instead
// of talking to the real EC2 API.
var live = flag.String(
	"live-ec2-cfg",
	"",
	"Test with real EC2 API, provide config file",
)
50
51 // Gocheck boilerplate
52 func Test(t *testing.T) {
53         check.TestingT(t)
54 }
55
56 type sliceOrStringSuite struct{}
57
58 var _ = check.Suite(&sliceOrStringSuite{})
59
60 func (s *sliceOrStringSuite) TestUnmarshal(c *check.C) {
61         var conf ec2InstanceSetConfig
62         for _, trial := range []struct {
63                 input  string
64                 output sliceOrSingleString
65         }{
66                 {``, nil},
67                 {`""`, nil},
68                 {`[]`, nil},
69                 {`"foo"`, sliceOrSingleString{"foo"}},
70                 {`["foo"]`, sliceOrSingleString{"foo"}},
71                 {`[foo]`, sliceOrSingleString{"foo"}},
72                 {`["foo", "bar"]`, sliceOrSingleString{"foo", "bar"}},
73                 {`[foo-bar, baz]`, sliceOrSingleString{"foo-bar", "baz"}},
74         } {
75                 c.Logf("trial: %+v", trial)
76                 err := yaml.Unmarshal([]byte("SubnetID: "+trial.input+"\n"), &conf)
77                 if !c.Check(err, check.IsNil) {
78                         continue
79                 }
80                 c.Check(conf.SubnetID, check.DeepEquals, trial.output)
81         }
82 }
83
84 type EC2InstanceSetSuite struct{}
85
86 var _ = check.Suite(&EC2InstanceSetSuite{})
87
// testConfig is the structure GetInstanceSet loads from the file
// named by the -live-ec2-cfg flag.
type testConfig struct {
	// ImageIDForTestSuite is the image ID GetInstanceSet returns
	// in live mode.
	ImageIDForTestSuite string

	// DriverParameters is handed, still undecoded, to
	// newEC2InstanceSet as the driver configuration.
	DriverParameters json.RawMessage
}
92
93 type ec2stub struct {
94         c                     *check.C
95         reftime               time.Time
96         importKeyPairCalls    []*ec2.ImportKeyPairInput
97         describeKeyPairsCalls []*ec2.DescribeKeyPairsInput
98         runInstancesCalls     []*ec2.RunInstancesInput
99         // {subnetID => error}: RunInstances returns error if subnetID
100         // matches.
101         subnetErrorOnRunInstances map[string]error
102 }
103
104 func (e *ec2stub) ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error) {
105         e.importKeyPairCalls = append(e.importKeyPairCalls, input)
106         return nil, nil
107 }
108
109 func (e *ec2stub) DescribeKeyPairs(input *ec2.DescribeKeyPairsInput) (*ec2.DescribeKeyPairsOutput, error) {
110         e.describeKeyPairsCalls = append(e.describeKeyPairsCalls, input)
111         return &ec2.DescribeKeyPairsOutput{}, nil
112 }
113
114 func (e *ec2stub) RunInstances(input *ec2.RunInstancesInput) (*ec2.Reservation, error) {
115         e.runInstancesCalls = append(e.runInstancesCalls, input)
116         if len(input.NetworkInterfaces) > 0 && input.NetworkInterfaces[0].SubnetId != nil {
117                 err := e.subnetErrorOnRunInstances[*input.NetworkInterfaces[0].SubnetId]
118                 if err != nil {
119                         return nil, err
120                 }
121         }
122         return &ec2.Reservation{Instances: []*ec2.Instance{{
123                 InstanceId:   aws.String("i-123"),
124                 InstanceType: aws.String("t2.micro"),
125                 Tags:         input.TagSpecifications[0].Tags,
126         }}}, nil
127 }
128
129 func (e *ec2stub) DescribeInstances(input *ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error) {
130         return &ec2.DescribeInstancesOutput{
131                 Reservations: []*ec2.Reservation{{
132                         Instances: []*ec2.Instance{{
133                                 InstanceId:        aws.String("i-123"),
134                                 InstanceLifecycle: aws.String("spot"),
135                                 InstanceType:      aws.String("t2.micro"),
136                                 PrivateIpAddress:  aws.String("10.1.2.3"),
137                                 State:             &ec2.InstanceState{Name: aws.String("running"), Code: aws.Int64(16)},
138                         }, {
139                                 InstanceId:        aws.String("i-124"),
140                                 InstanceLifecycle: aws.String("spot"),
141                                 InstanceType:      aws.String("t2.micro"),
142                                 PrivateIpAddress:  aws.String("10.1.2.4"),
143                                 State:             &ec2.InstanceState{Name: aws.String("running"), Code: aws.Int64(16)},
144                         }},
145                 }},
146         }, nil
147 }
148
149 func (e *ec2stub) DescribeInstanceStatusPages(input *ec2.DescribeInstanceStatusInput, fn func(*ec2.DescribeInstanceStatusOutput, bool) bool) error {
150         fn(&ec2.DescribeInstanceStatusOutput{
151                 InstanceStatuses: []*ec2.InstanceStatus{{
152                         InstanceId:       aws.String("i-123"),
153                         AvailabilityZone: aws.String("aa-east-1a"),
154                 }, {
155                         InstanceId:       aws.String("i-124"),
156                         AvailabilityZone: aws.String("aa-east-1a"),
157                 }},
158         }, true)
159         return nil
160 }
161
162 func (e *ec2stub) DescribeSpotPriceHistoryPages(input *ec2.DescribeSpotPriceHistoryInput, fn func(*ec2.DescribeSpotPriceHistoryOutput, bool) bool) error {
163         if !fn(&ec2.DescribeSpotPriceHistoryOutput{
164                 SpotPriceHistory: []*ec2.SpotPrice{
165                         &ec2.SpotPrice{
166                                 InstanceType:     aws.String("t2.micro"),
167                                 AvailabilityZone: aws.String("aa-east-1a"),
168                                 SpotPrice:        aws.String("0.005"),
169                                 Timestamp:        aws.Time(e.reftime.Add(-9 * time.Minute)),
170                         },
171                         &ec2.SpotPrice{
172                                 InstanceType:     aws.String("t2.micro"),
173                                 AvailabilityZone: aws.String("aa-east-1a"),
174                                 SpotPrice:        aws.String("0.015"),
175                                 Timestamp:        aws.Time(e.reftime.Add(-5 * time.Minute)),
176                         },
177                 },
178         }, false) {
179                 return nil
180         }
181         fn(&ec2.DescribeSpotPriceHistoryOutput{
182                 SpotPriceHistory: []*ec2.SpotPrice{
183                         &ec2.SpotPrice{
184                                 InstanceType:     aws.String("t2.micro"),
185                                 AvailabilityZone: aws.String("aa-east-1a"),
186                                 SpotPrice:        aws.String("0.01"),
187                                 Timestamp:        aws.Time(e.reftime.Add(-2 * time.Minute)),
188                         },
189                 },
190         }, true)
191         return nil
192 }
193
194 func (e *ec2stub) CreateTags(input *ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error) {
195         return nil, nil
196 }
197
198 func (e *ec2stub) TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error) {
199         return nil, nil
200 }
201
// ec2stubError is a canned error with an error code and a message,
// used by the tests as the error the stubbed RunInstances returns.
type ec2stubError struct {
	code    string
	message string
}

// Code returns the error code the stub was built with.
func (e *ec2stubError) Code() string {
	return e.code
}

// Message returns the message the stub was built with.
func (e *ec2stubError) Message() string {
	return e.message
}

// Error formats the error as "<code>: <message>".
func (e *ec2stubError) Error() string {
	return fmt.Sprintf("%s: %s", e.code, e.message)
}

// OrigErr returns a fixed placeholder error.
func (e *ec2stubError) OrigErr() error {
	return errors.New("stub OrigErr")
}
211
212 // Ensure ec2stubError satisfies the aws.Error interface
213 var _ = awserr.Error(&ec2stubError{})
214
215 func GetInstanceSet(c *check.C, conf string) (*ec2InstanceSet, cloud.ImageID, arvados.Cluster, *prometheus.Registry) {
216         reg := prometheus.NewRegistry()
217         cluster := arvados.Cluster{
218                 InstanceTypes: arvados.InstanceTypeMap(map[string]arvados.InstanceType{
219                         "tiny": {
220                                 Name:         "tiny",
221                                 ProviderType: "t2.micro",
222                                 VCPUs:        1,
223                                 RAM:          4000000000,
224                                 Scratch:      10000000000,
225                                 Price:        .02,
226                                 Preemptible:  false,
227                         },
228                         "tiny-with-extra-scratch": {
229                                 Name:         "tiny-with-extra-scratch",
230                                 ProviderType: "t2.micro",
231                                 VCPUs:        1,
232                                 RAM:          4000000000,
233                                 Price:        .02,
234                                 Preemptible:  false,
235                                 AddedScratch: 20000000000,
236                         },
237                         "tiny-preemptible": {
238                                 Name:         "tiny-preemptible",
239                                 ProviderType: "t2.micro",
240                                 VCPUs:        1,
241                                 RAM:          4000000000,
242                                 Scratch:      10000000000,
243                                 Price:        .02,
244                                 Preemptible:  true,
245                         },
246                 })}
247         if *live != "" {
248                 var exampleCfg testConfig
249                 err := config.LoadFile(&exampleCfg, *live)
250                 c.Assert(err, check.IsNil)
251
252                 is, err := newEC2InstanceSet(exampleCfg.DriverParameters, "test123", nil, logrus.StandardLogger(), reg)
253                 c.Assert(err, check.IsNil)
254                 return is.(*ec2InstanceSet), cloud.ImageID(exampleCfg.ImageIDForTestSuite), cluster, reg
255         } else {
256                 is, err := newEC2InstanceSet(json.RawMessage(conf), "test123", nil, ctxlog.TestLogger(c), reg)
257                 c.Assert(err, check.IsNil)
258                 is.(*ec2InstanceSet).client = &ec2stub{c: c, reftime: time.Now().UTC()}
259                 return is.(*ec2InstanceSet), cloud.ImageID("blob"), cluster, reg
260         }
261 }
262
263 func (*EC2InstanceSetSuite) TestCreate(c *check.C) {
264         ap, img, cluster, _ := GetInstanceSet(c, "{}")
265         pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
266
267         inst, err := ap.Create(cluster.InstanceTypes["tiny"],
268                 img, map[string]string{
269                         "TestTagName": "test tag value",
270                 }, "umask 0600; echo -n test-file-data >/var/run/test-file", pk)
271         c.Assert(err, check.IsNil)
272
273         tags := inst.Tags()
274         c.Check(tags["TestTagName"], check.Equals, "test tag value")
275         c.Logf("inst.String()=%v Address()=%v Tags()=%v", inst.String(), inst.Address(), tags)
276
277         if *live == "" {
278                 c.Check(ap.client.(*ec2stub).describeKeyPairsCalls, check.HasLen, 1)
279                 c.Check(ap.client.(*ec2stub).importKeyPairCalls, check.HasLen, 1)
280         }
281 }
282
283 func (*EC2InstanceSetSuite) TestCreateWithExtraScratch(c *check.C) {
284         ap, img, cluster, _ := GetInstanceSet(c, "{}")
285         inst, err := ap.Create(cluster.InstanceTypes["tiny-with-extra-scratch"],
286                 img, map[string]string{
287                         "TestTagName": "test tag value",
288                 }, "umask 0600; echo -n test-file-data >/var/run/test-file", nil)
289
290         c.Assert(err, check.IsNil)
291
292         tags := inst.Tags()
293         c.Check(tags["TestTagName"], check.Equals, "test tag value")
294         c.Logf("inst.String()=%v Address()=%v Tags()=%v", inst.String(), inst.Address(), tags)
295
296         if *live == "" {
297                 // Should not have called key pair APIs, because
298                 // publickey arg was nil
299                 c.Check(ap.client.(*ec2stub).describeKeyPairsCalls, check.HasLen, 0)
300                 c.Check(ap.client.(*ec2stub).importKeyPairCalls, check.HasLen, 0)
301         }
302 }
303
304 func (*EC2InstanceSetSuite) TestCreatePreemptible(c *check.C) {
305         ap, img, cluster, _ := GetInstanceSet(c, "{}")
306         pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
307
308         inst, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"],
309                 img, map[string]string{
310                         "TestTagName": "test tag value",
311                 }, "umask 0600; echo -n test-file-data >/var/run/test-file", pk)
312
313         c.Assert(err, check.IsNil)
314
315         tags := inst.Tags()
316         c.Check(tags["TestTagName"], check.Equals, "test tag value")
317         c.Logf("inst.String()=%v Address()=%v Tags()=%v", inst.String(), inst.Address(), tags)
318
319 }
320
321 func (*EC2InstanceSetSuite) TestCreateFailoverSecondSubnet(c *check.C) {
322         if *live != "" {
323                 c.Skip("not applicable in live mode")
324                 return
325         }
326
327         ap, img, cluster, reg := GetInstanceSet(c, `{"SubnetID":["subnet-full","subnet-good"]}`)
328         ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
329                 "subnet-full": &ec2stubError{
330                         code:    "InsufficientFreeAddressesInSubnet",
331                         message: "subnet is full",
332                 },
333         }
334         inst, err := ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
335         c.Check(err, check.IsNil)
336         c.Check(inst, check.NotNil)
337         c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
338         metrics := arvadostest.GatherMetricsAsString(reg)
339         c.Check(metrics, check.Matches, `(?ms).*`+
340                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
341                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
342                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="0"} 0\n`+
343                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="1"} 1\n`+
344                 `.*`)
345
346         // Next RunInstances call should try the working subnet first
347         inst, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
348         c.Check(err, check.IsNil)
349         c.Check(inst, check.NotNil)
350         c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 3)
351         metrics = arvadostest.GatherMetricsAsString(reg)
352         c.Check(metrics, check.Matches, `(?ms).*`+
353                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
354                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
355                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="0"} 0\n`+
356                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-good",success="1"} 2\n`+
357                 `.*`)
358 }
359
360 func (*EC2InstanceSetSuite) TestCreateAllSubnetsFailing(c *check.C) {
361         if *live != "" {
362                 c.Skip("not applicable in live mode")
363                 return
364         }
365
366         ap, img, cluster, reg := GetInstanceSet(c, `{"SubnetID":["subnet-full","subnet-broken"]}`)
367         ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
368                 "subnet-full": &ec2stubError{
369                         code:    "InsufficientFreeAddressesInSubnet",
370                         message: "subnet is full",
371                 },
372                 "subnet-broken": &ec2stubError{
373                         code:    "InvalidSubnetId.NotFound",
374                         message: "bogus subnet id",
375                 },
376         }
377         _, err := ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
378         c.Check(err, check.NotNil)
379         c.Check(err, check.ErrorMatches, `.*InvalidSubnetId\.NotFound.*`)
380         c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
381         metrics := arvadostest.GatherMetricsAsString(reg)
382         c.Check(metrics, check.Matches, `(?ms).*`+
383                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="0"} 1\n`+
384                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="1"} 0\n`+
385                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 1\n`+
386                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
387                 `.*`)
388
389         _, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
390         c.Check(err, check.NotNil)
391         c.Check(err, check.ErrorMatches, `.*InsufficientFreeAddressesInSubnet.*`)
392         c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 4)
393         metrics = arvadostest.GatherMetricsAsString(reg)
394         c.Check(metrics, check.Matches, `(?ms).*`+
395                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="0"} 2\n`+
396                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="1"} 0\n`+
397                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 2\n`+
398                 `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
399                 `.*`)
400 }
401
402 func (*EC2InstanceSetSuite) TestTagInstances(c *check.C) {
403         ap, _, _, _ := GetInstanceSet(c, "{}")
404         l, err := ap.Instances(nil)
405         c.Assert(err, check.IsNil)
406
407         for _, i := range l {
408                 tg := i.Tags()
409                 tg["TestTag2"] = "123 test tag 2"
410                 c.Check(i.SetTags(tg), check.IsNil)
411         }
412 }
413
414 func (*EC2InstanceSetSuite) TestListInstances(c *check.C) {
415         ap, _, _, reg := GetInstanceSet(c, "{}")
416         l, err := ap.Instances(nil)
417         c.Assert(err, check.IsNil)
418
419         for _, i := range l {
420                 tg := i.Tags()
421                 c.Logf("%v %v %v", i.String(), i.Address(), tg)
422         }
423
424         metrics := arvadostest.GatherMetricsAsString(reg)
425         c.Check(metrics, check.Matches, `(?ms).*`+
426                 `arvados_dispatchcloud_ec2_instances{subnet_id="[^"]*"} \d+\n`+
427                 `.*`)
428 }
429
430 func (*EC2InstanceSetSuite) TestDestroyInstances(c *check.C) {
431         ap, _, _, _ := GetInstanceSet(c, "{}")
432         l, err := ap.Instances(nil)
433         c.Assert(err, check.IsNil)
434
435         for _, i := range l {
436                 c.Check(i.Destroy(), check.IsNil)
437         }
438 }
439
440 func (*EC2InstanceSetSuite) TestInstancePriceHistory(c *check.C) {
441         ap, img, cluster, _ := GetInstanceSet(c, "{}")
442         pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
443         tags := cloud.InstanceTags{"arvados-ec2-driver": "test"}
444
445         defer func() {
446                 instances, err := ap.Instances(tags)
447                 c.Assert(err, check.IsNil)
448                 for _, inst := range instances {
449                         c.Logf("cleanup: destroy instance %s", inst)
450                         c.Check(inst.Destroy(), check.IsNil)
451                 }
452         }()
453
454         ap.ec2config.SpotPriceUpdateInterval = arvados.Duration(time.Hour)
455         ap.ec2config.EBSPrice = 0.1 // $/GiB/month
456         inst1, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"], img, tags, "true", pk)
457         c.Assert(err, check.IsNil)
458         defer inst1.Destroy()
459         inst2, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"], img, tags, "true", pk)
460         c.Assert(err, check.IsNil)
461         defer inst2.Destroy()
462
463         // in live mode, we need to wait for the instances to reach
464         // running state before we can discover their availability
465         // zones and look up the appropriate prices.
466         var instances []cloud.Instance
467         for deadline := time.Now().Add(5 * time.Minute); ; {
468                 if deadline.Before(time.Now()) {
469                         c.Fatal("timed out")
470                 }
471                 instances, err = ap.Instances(tags)
472                 running := 0
473                 for _, inst := range instances {
474                         ec2i := inst.(*ec2Instance).instance
475                         if *ec2i.InstanceLifecycle == "spot" && *ec2i.State.Code&16 != 0 {
476                                 running++
477                         }
478                 }
479                 if running >= 2 {
480                         c.Logf("instances are running, and identifiable as spot instances")
481                         break
482                 }
483                 c.Logf("waiting for instances to reach running state so their availability zone becomes visible...")
484                 time.Sleep(10 * time.Second)
485         }
486
487         for _, inst := range instances {
488                 hist := inst.PriceHistory(arvados.InstanceType{})
489                 c.Logf("%s price history: %v", inst.ID(), hist)
490                 c.Check(len(hist) > 0, check.Equals, true)
491
492                 histWithScratch := inst.PriceHistory(arvados.InstanceType{AddedScratch: 640 << 30})
493                 c.Logf("%s price history with 640 GiB scratch: %v", inst.ID(), histWithScratch)
494
495                 for i, ip := range hist {
496                         c.Check(ip.Price, check.Not(check.Equals), 0.0)
497                         if i > 0 {
498                                 c.Check(ip.StartTime.Before(hist[i-1].StartTime), check.Equals, true)
499                         }
500                         c.Check(ip.Price < histWithScratch[i].Price, check.Equals, true)
501                 }
502         }
503 }
504
505 func (*EC2InstanceSetSuite) TestWrapError(c *check.C) {
506         retryError := awserr.New("Throttling", "", nil)
507         wrapped := wrapError(retryError, &atomic.Value{})
508         _, ok := wrapped.(cloud.RateLimitError)
509         c.Check(ok, check.Equals, true)
510
511         quotaError := awserr.New("InsufficientInstanceCapacity", "", nil)
512         wrapped = wrapError(quotaError, nil)
513         _, ok = wrapped.(cloud.QuotaError)
514         c.Check(ok, check.Equals, true)
515 }