d8322e7af38ab145e9a23b4e01554dae44fc6760
[arvados.git] / lib / cloud / ec2 / ec2_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4 //
5 //
6 // How to manually run individual tests against the real cloud:
7 //
8 // $ go test -v git.arvados.org/arvados.git/lib/cloud/ec2 -live-ec2-cfg ec2config.yml -check.f=TestCreate
9 //
10 // Tests should be run individually and in the order they are listed in the file.
11 //
12 // Example ec2config.yml:
13 //
14 // ImageIDForTestSuite: ami-xxxxxxxxxxxxxxxxx
15 // DriverParameters:
16 //       AccessKeyID: XXXXXXXXXXXXXX
17 //       SecretAccessKey: xxxxxxxxxxxxxxxxxxxx
18 //       Region: us-east-1
19 //       SecurityGroupIDs: [sg-xxxxxxxx]
20 //       SubnetID: subnet-xxxxxxxx
21 //       AdminUsername: crunch
22
23 package ec2
24
25 import (
26         "encoding/json"
27         "errors"
28         "flag"
29         "fmt"
30         "sync/atomic"
31         "testing"
32         "time"
33
34         "git.arvados.org/arvados.git/lib/cloud"
35         "git.arvados.org/arvados.git/lib/dispatchcloud/test"
36         "git.arvados.org/arvados.git/sdk/go/arvados"
37         "git.arvados.org/arvados.git/sdk/go/config"
38         "git.arvados.org/arvados.git/sdk/go/ctxlog"
39         "github.com/aws/aws-sdk-go/aws"
40         "github.com/aws/aws-sdk-go/aws/awserr"
41         "github.com/aws/aws-sdk-go/service/ec2"
42         "github.com/ghodss/yaml"
43         "github.com/sirupsen/logrus"
44         check "gopkg.in/check.v1"
45 )
46
// live is the value of the -live-ec2-cfg flag: the path of a config
// file used to run the tests against the real EC2 API. When it is
// empty, the tests use the ec2stub client instead.
var live = flag.String(
	"live-ec2-cfg",
	"",
	"Test with real EC2 API, provide config file",
)
48
49 // Gocheck boilerplate
50 func Test(t *testing.T) {
51         check.TestingT(t)
52 }
53
54 type sliceOrStringSuite struct{}
55
56 var _ = check.Suite(&sliceOrStringSuite{})
57
58 func (s *sliceOrStringSuite) TestUnmarshal(c *check.C) {
59         var conf ec2InstanceSetConfig
60         for _, trial := range []struct {
61                 input  string
62                 output sliceOrSingleString
63         }{
64                 {``, nil},
65                 {`""`, nil},
66                 {`[]`, nil},
67                 {`"foo"`, sliceOrSingleString{"foo"}},
68                 {`["foo"]`, sliceOrSingleString{"foo"}},
69                 {`[foo]`, sliceOrSingleString{"foo"}},
70                 {`["foo", "bar"]`, sliceOrSingleString{"foo", "bar"}},
71                 {`[foo-bar, baz]`, sliceOrSingleString{"foo-bar", "baz"}},
72         } {
73                 c.Logf("trial: %+v", trial)
74                 err := yaml.Unmarshal([]byte("SubnetID: "+trial.input+"\n"), &conf)
75                 if !c.Check(err, check.IsNil) {
76                         continue
77                 }
78                 c.Check(conf.SubnetID, check.DeepEquals, trial.output)
79         }
80 }
81
82 type EC2InstanceSetSuite struct{}
83
84 var _ = check.Suite(&EC2InstanceSetSuite{})
85
// testConfig is the layout of the config file named by the
// -live-ec2-cfg flag.
type testConfig struct {
	// ImageIDForTestSuite is the image ID handed to tests for
	// creating instances.
	ImageIDForTestSuite string
	// DriverParameters is passed, still encoded, to
	// newEC2InstanceSet.
	DriverParameters json.RawMessage
}
90
91 type ec2stub struct {
92         c                     *check.C
93         reftime               time.Time
94         importKeyPairCalls    []*ec2.ImportKeyPairInput
95         describeKeyPairsCalls []*ec2.DescribeKeyPairsInput
96         runInstancesCalls     []*ec2.RunInstancesInput
97         // {subnetID => error}: RunInstances returns error if subnetID
98         // matches.
99         subnetErrorOnRunInstances map[string]error
100 }
101
102 func (e *ec2stub) ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error) {
103         e.importKeyPairCalls = append(e.importKeyPairCalls, input)
104         return nil, nil
105 }
106
107 func (e *ec2stub) DescribeKeyPairs(input *ec2.DescribeKeyPairsInput) (*ec2.DescribeKeyPairsOutput, error) {
108         e.describeKeyPairsCalls = append(e.describeKeyPairsCalls, input)
109         return &ec2.DescribeKeyPairsOutput{}, nil
110 }
111
112 func (e *ec2stub) RunInstances(input *ec2.RunInstancesInput) (*ec2.Reservation, error) {
113         e.runInstancesCalls = append(e.runInstancesCalls, input)
114         if len(input.NetworkInterfaces) > 0 && input.NetworkInterfaces[0].SubnetId != nil {
115                 err := e.subnetErrorOnRunInstances[*input.NetworkInterfaces[0].SubnetId]
116                 if err != nil {
117                         return nil, err
118                 }
119         }
120         return &ec2.Reservation{Instances: []*ec2.Instance{{
121                 InstanceId:   aws.String("i-123"),
122                 InstanceType: aws.String("t2.micro"),
123                 Tags:         input.TagSpecifications[0].Tags,
124         }}}, nil
125 }
126
127 func (e *ec2stub) DescribeInstances(input *ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error) {
128         return &ec2.DescribeInstancesOutput{
129                 Reservations: []*ec2.Reservation{{
130                         Instances: []*ec2.Instance{{
131                                 InstanceId:        aws.String("i-123"),
132                                 InstanceLifecycle: aws.String("spot"),
133                                 InstanceType:      aws.String("t2.micro"),
134                                 PrivateIpAddress:  aws.String("10.1.2.3"),
135                                 State:             &ec2.InstanceState{Name: aws.String("running"), Code: aws.Int64(16)},
136                         }, {
137                                 InstanceId:        aws.String("i-124"),
138                                 InstanceLifecycle: aws.String("spot"),
139                                 InstanceType:      aws.String("t2.micro"),
140                                 PrivateIpAddress:  aws.String("10.1.2.4"),
141                                 State:             &ec2.InstanceState{Name: aws.String("running"), Code: aws.Int64(16)},
142                         }},
143                 }},
144         }, nil
145 }
146
147 func (e *ec2stub) DescribeInstanceStatusPages(input *ec2.DescribeInstanceStatusInput, fn func(*ec2.DescribeInstanceStatusOutput, bool) bool) error {
148         fn(&ec2.DescribeInstanceStatusOutput{
149                 InstanceStatuses: []*ec2.InstanceStatus{{
150                         InstanceId:       aws.String("i-123"),
151                         AvailabilityZone: aws.String("aa-east-1a"),
152                 }, {
153                         InstanceId:       aws.String("i-124"),
154                         AvailabilityZone: aws.String("aa-east-1a"),
155                 }},
156         }, true)
157         return nil
158 }
159
160 func (e *ec2stub) DescribeSpotPriceHistoryPages(input *ec2.DescribeSpotPriceHistoryInput, fn func(*ec2.DescribeSpotPriceHistoryOutput, bool) bool) error {
161         if !fn(&ec2.DescribeSpotPriceHistoryOutput{
162                 SpotPriceHistory: []*ec2.SpotPrice{
163                         &ec2.SpotPrice{
164                                 InstanceType:     aws.String("t2.micro"),
165                                 AvailabilityZone: aws.String("aa-east-1a"),
166                                 SpotPrice:        aws.String("0.005"),
167                                 Timestamp:        aws.Time(e.reftime.Add(-9 * time.Minute)),
168                         },
169                         &ec2.SpotPrice{
170                                 InstanceType:     aws.String("t2.micro"),
171                                 AvailabilityZone: aws.String("aa-east-1a"),
172                                 SpotPrice:        aws.String("0.015"),
173                                 Timestamp:        aws.Time(e.reftime.Add(-5 * time.Minute)),
174                         },
175                 },
176         }, false) {
177                 return nil
178         }
179         fn(&ec2.DescribeSpotPriceHistoryOutput{
180                 SpotPriceHistory: []*ec2.SpotPrice{
181                         &ec2.SpotPrice{
182                                 InstanceType:     aws.String("t2.micro"),
183                                 AvailabilityZone: aws.String("aa-east-1a"),
184                                 SpotPrice:        aws.String("0.01"),
185                                 Timestamp:        aws.Time(e.reftime.Add(-2 * time.Minute)),
186                         },
187                 },
188         }, true)
189         return nil
190 }
191
192 func (e *ec2stub) CreateTags(input *ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error) {
193         return nil, nil
194 }
195
196 func (e *ec2stub) TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error) {
197         return nil, nil
198 }
199
// ec2stubError is a canned error with an AWS-style error code and
// message, for the stub client to return from API calls.
type ec2stubError struct {
	code    string
	message string
}

// Code returns the error code the stub error was built with.
func (e *ec2stubError) Code() string {
	return e.code
}

// Message returns the message the stub error was built with.
func (e *ec2stubError) Message() string {
	return e.message
}

// Error formats the error as "code: message".
func (e *ec2stubError) Error() string {
	return fmt.Sprintf("%s: %s", e.code, e.message)
}

// OrigErr returns a placeholder underlying error; a new error value
// is created on every call.
func (e *ec2stubError) OrigErr() error {
	return errors.New("stub OrigErr")
}
209
210 // Ensure ec2stubError satisfies the aws.Error interface
211 var _ = awserr.Error(&ec2stubError{})
212
213 func GetInstanceSet(c *check.C) (*ec2InstanceSet, cloud.ImageID, arvados.Cluster) {
214         cluster := arvados.Cluster{
215                 InstanceTypes: arvados.InstanceTypeMap(map[string]arvados.InstanceType{
216                         "tiny": {
217                                 Name:         "tiny",
218                                 ProviderType: "t2.micro",
219                                 VCPUs:        1,
220                                 RAM:          4000000000,
221                                 Scratch:      10000000000,
222                                 Price:        .02,
223                                 Preemptible:  false,
224                         },
225                         "tiny-with-extra-scratch": {
226                                 Name:         "tiny-with-extra-scratch",
227                                 ProviderType: "t2.micro",
228                                 VCPUs:        1,
229                                 RAM:          4000000000,
230                                 Price:        .02,
231                                 Preemptible:  false,
232                                 AddedScratch: 20000000000,
233                         },
234                         "tiny-preemptible": {
235                                 Name:         "tiny-preemptible",
236                                 ProviderType: "t2.micro",
237                                 VCPUs:        1,
238                                 RAM:          4000000000,
239                                 Scratch:      10000000000,
240                                 Price:        .02,
241                                 Preemptible:  true,
242                         },
243                 })}
244         if *live != "" {
245                 var exampleCfg testConfig
246                 err := config.LoadFile(&exampleCfg, *live)
247                 c.Assert(err, check.IsNil)
248
249                 ap, err := newEC2InstanceSet(exampleCfg.DriverParameters, "test123", nil, logrus.StandardLogger(), nil)
250                 c.Assert(err, check.IsNil)
251                 return ap.(*ec2InstanceSet), cloud.ImageID(exampleCfg.ImageIDForTestSuite), cluster
252         }
253         ap := ec2InstanceSet{
254                 instanceSetID: "test123",
255                 logger:        ctxlog.TestLogger(c),
256                 client:        &ec2stub{c: c, reftime: time.Now().UTC()},
257                 keys:          make(map[string]string),
258         }
259         return &ap, cloud.ImageID("blob"), cluster
260 }
261
262 func (*EC2InstanceSetSuite) TestCreate(c *check.C) {
263         ap, img, cluster := GetInstanceSet(c)
264         pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
265
266         inst, err := ap.Create(cluster.InstanceTypes["tiny"],
267                 img, map[string]string{
268                         "TestTagName": "test tag value",
269                 }, "umask 0600; echo -n test-file-data >/var/run/test-file", pk)
270         c.Assert(err, check.IsNil)
271
272         tags := inst.Tags()
273         c.Check(tags["TestTagName"], check.Equals, "test tag value")
274         c.Logf("inst.String()=%v Address()=%v Tags()=%v", inst.String(), inst.Address(), tags)
275
276         if *live == "" {
277                 c.Check(ap.client.(*ec2stub).describeKeyPairsCalls, check.HasLen, 1)
278                 c.Check(ap.client.(*ec2stub).importKeyPairCalls, check.HasLen, 1)
279         }
280 }
281
282 func (*EC2InstanceSetSuite) TestCreateWithExtraScratch(c *check.C) {
283         ap, img, cluster := GetInstanceSet(c)
284         inst, err := ap.Create(cluster.InstanceTypes["tiny-with-extra-scratch"],
285                 img, map[string]string{
286                         "TestTagName": "test tag value",
287                 }, "umask 0600; echo -n test-file-data >/var/run/test-file", nil)
288
289         c.Assert(err, check.IsNil)
290
291         tags := inst.Tags()
292         c.Check(tags["TestTagName"], check.Equals, "test tag value")
293         c.Logf("inst.String()=%v Address()=%v Tags()=%v", inst.String(), inst.Address(), tags)
294
295         if *live == "" {
296                 // Should not have called key pair APIs, because
297                 // publickey arg was nil
298                 c.Check(ap.client.(*ec2stub).describeKeyPairsCalls, check.HasLen, 0)
299                 c.Check(ap.client.(*ec2stub).importKeyPairCalls, check.HasLen, 0)
300         }
301 }
302
303 func (*EC2InstanceSetSuite) TestCreatePreemptible(c *check.C) {
304         ap, img, cluster := GetInstanceSet(c)
305         pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
306
307         inst, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"],
308                 img, map[string]string{
309                         "TestTagName": "test tag value",
310                 }, "umask 0600; echo -n test-file-data >/var/run/test-file", pk)
311
312         c.Assert(err, check.IsNil)
313
314         tags := inst.Tags()
315         c.Check(tags["TestTagName"], check.Equals, "test tag value")
316         c.Logf("inst.String()=%v Address()=%v Tags()=%v", inst.String(), inst.Address(), tags)
317
318 }
319
320 func (*EC2InstanceSetSuite) TestCreateFailoverSecondSubnet(c *check.C) {
321         if *live != "" {
322                 c.Skip("not applicable in live mode")
323                 return
324         }
325
326         ap, img, cluster := GetInstanceSet(c)
327         ap.ec2config.SubnetID = sliceOrSingleString{"subnet-full", "subnet-good"}
328         ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
329                 "subnet-full": &ec2stubError{
330                         code:    "InsufficientFreeAddressesInSubnet",
331                         message: "subnet is full",
332                 },
333         }
334         inst, err := ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
335         c.Check(err, check.IsNil)
336         c.Check(inst, check.NotNil)
337         c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
338
339         // Next RunInstances call should try the working subnet first
340         inst, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
341         c.Check(err, check.IsNil)
342         c.Check(inst, check.NotNil)
343         c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 3)
344 }
345
346 func (*EC2InstanceSetSuite) TestCreateAllSubnetsFailing(c *check.C) {
347         if *live != "" {
348                 c.Skip("not applicable in live mode")
349                 return
350         }
351
352         ap, img, cluster := GetInstanceSet(c)
353         ap.ec2config.SubnetID = sliceOrSingleString{"subnet-full", "subnet-broken"}
354         ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
355                 "subnet-full": &ec2stubError{
356                         code:    "InsufficientFreeAddressesInSubnet",
357                         message: "subnet is full",
358                 },
359                 "subnet-broken": &ec2stubError{
360                         code:    "InvalidSubnetId.NotFound",
361                         message: "bogus subnet id",
362                 },
363         }
364         _, err := ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
365         c.Check(err, check.NotNil)
366         c.Check(err, check.ErrorMatches, `.*InvalidSubnetId\.NotFound.*`)
367         c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 2)
368
369         _, err = ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
370         c.Check(err, check.NotNil)
371         c.Check(err, check.ErrorMatches, `.*InsufficientFreeAddressesInSubnet.*`)
372         c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 4)
373 }
374
375 func (*EC2InstanceSetSuite) TestTagInstances(c *check.C) {
376         ap, _, _ := GetInstanceSet(c)
377         l, err := ap.Instances(nil)
378         c.Assert(err, check.IsNil)
379
380         for _, i := range l {
381                 tg := i.Tags()
382                 tg["TestTag2"] = "123 test tag 2"
383                 c.Check(i.SetTags(tg), check.IsNil)
384         }
385 }
386
387 func (*EC2InstanceSetSuite) TestListInstances(c *check.C) {
388         ap, _, _ := GetInstanceSet(c)
389         l, err := ap.Instances(nil)
390         c.Assert(err, check.IsNil)
391
392         for _, i := range l {
393                 tg := i.Tags()
394                 c.Logf("%v %v %v", i.String(), i.Address(), tg)
395         }
396 }
397
398 func (*EC2InstanceSetSuite) TestDestroyInstances(c *check.C) {
399         ap, _, _ := GetInstanceSet(c)
400         l, err := ap.Instances(nil)
401         c.Assert(err, check.IsNil)
402
403         for _, i := range l {
404                 c.Check(i.Destroy(), check.IsNil)
405         }
406 }
407
408 func (*EC2InstanceSetSuite) TestInstancePriceHistory(c *check.C) {
409         ap, img, cluster := GetInstanceSet(c)
410         pk, _ := test.LoadTestKey(c, "../../dispatchcloud/test/sshkey_dispatch")
411         tags := cloud.InstanceTags{"arvados-ec2-driver": "test"}
412
413         defer func() {
414                 instances, err := ap.Instances(tags)
415                 c.Assert(err, check.IsNil)
416                 for _, inst := range instances {
417                         c.Logf("cleanup: destroy instance %s", inst)
418                         c.Check(inst.Destroy(), check.IsNil)
419                 }
420         }()
421
422         ap.ec2config.SpotPriceUpdateInterval = arvados.Duration(time.Hour)
423         ap.ec2config.EBSPrice = 0.1 // $/GiB/month
424         inst1, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"], img, tags, "true", pk)
425         c.Assert(err, check.IsNil)
426         defer inst1.Destroy()
427         inst2, err := ap.Create(cluster.InstanceTypes["tiny-preemptible"], img, tags, "true", pk)
428         c.Assert(err, check.IsNil)
429         defer inst2.Destroy()
430
431         // in live mode, we need to wait for the instances to reach
432         // running state before we can discover their availability
433         // zones and look up the appropriate prices.
434         var instances []cloud.Instance
435         for deadline := time.Now().Add(5 * time.Minute); ; {
436                 if deadline.Before(time.Now()) {
437                         c.Fatal("timed out")
438                 }
439                 instances, err = ap.Instances(tags)
440                 running := 0
441                 for _, inst := range instances {
442                         ec2i := inst.(*ec2Instance).instance
443                         if *ec2i.InstanceLifecycle == "spot" && *ec2i.State.Code&16 != 0 {
444                                 running++
445                         }
446                 }
447                 if running >= 2 {
448                         c.Logf("instances are running, and identifiable as spot instances")
449                         break
450                 }
451                 c.Logf("waiting for instances to reach running state so their availability zone becomes visible...")
452                 time.Sleep(10 * time.Second)
453         }
454
455         for _, inst := range instances {
456                 hist := inst.PriceHistory(arvados.InstanceType{})
457                 c.Logf("%s price history: %v", inst.ID(), hist)
458                 c.Check(len(hist) > 0, check.Equals, true)
459
460                 histWithScratch := inst.PriceHistory(arvados.InstanceType{AddedScratch: 640 << 30})
461                 c.Logf("%s price history with 640 GiB scratch: %v", inst.ID(), histWithScratch)
462
463                 for i, ip := range hist {
464                         c.Check(ip.Price, check.Not(check.Equals), 0.0)
465                         if i > 0 {
466                                 c.Check(ip.StartTime.Before(hist[i-1].StartTime), check.Equals, true)
467                         }
468                         c.Check(ip.Price < histWithScratch[i].Price, check.Equals, true)
469                 }
470         }
471 }
472
473 func (*EC2InstanceSetSuite) TestWrapError(c *check.C) {
474         retryError := awserr.New("Throttling", "", nil)
475         wrapped := wrapError(retryError, &atomic.Value{})
476         _, ok := wrapped.(cloud.RateLimitError)
477         c.Check(ok, check.Equals, true)
478
479         quotaError := awserr.New("InsufficientInstanceCapacity", "", nil)
480         wrapped = wrapError(quotaError, nil)
481         _, ok = wrapped.(cloud.QuotaError)
482         c.Check(ok, check.Equals, true)
483 }