Merge branch 'main' into 21461-excessive-scrollbars-fix
[arvados.git] / lib / cloud / ec2 / ec2.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package ec2
6
7 import (
8         "crypto/md5"
9         "crypto/rsa"
10         "crypto/sha1"
11         "crypto/x509"
12         "encoding/base64"
13         "encoding/json"
14         "fmt"
15         "math/big"
16         "strconv"
17         "strings"
18         "sync"
19         "sync/atomic"
20         "time"
21
22         "git.arvados.org/arvados.git/lib/cloud"
23         "git.arvados.org/arvados.git/sdk/go/arvados"
24         "github.com/aws/aws-sdk-go/aws"
25         "github.com/aws/aws-sdk-go/aws/awserr"
26         "github.com/aws/aws-sdk-go/aws/credentials"
27         "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds"
28         "github.com/aws/aws-sdk-go/aws/ec2metadata"
29         "github.com/aws/aws-sdk-go/aws/request"
30         "github.com/aws/aws-sdk-go/aws/session"
31         "github.com/aws/aws-sdk-go/service/ec2"
32         "github.com/prometheus/client_golang/prometheus"
33         "github.com/sirupsen/logrus"
34         "golang.org/x/crypto/ssh"
35 )
36
// Driver is the ec2 implementation of the cloud.Driver interface.
var Driver = cloud.DriverFunc(newEC2InstanceSet)

const (
	// throttleDelayMin and throttleDelayMax bound the retry delay
	// that wrapError computes when an API call is throttled: the
	// delay is clamped to this range before being stored and
	// reported to the caller.
	throttleDelayMin = time.Second
	throttleDelayMax = time.Minute
)
44
// ec2InstanceSetConfig holds the driver parameters. It is populated
// by newEC2InstanceSet, which JSON-decodes the driver configuration
// into it.
type ec2InstanceSetConfig struct {
	// AccessKeyID and SecretAccessKey are offered to the AWS SDK as
	// static credentials; the instance role provider is tried after
	// them in the credential chain.
	AccessKeyID     string
	SecretAccessKey string
	// Region is the AWS region passed to the EC2 client.
	Region string
	// SecurityGroupIDs are attached to the network interface of
	// every instance started by Create.
	SecurityGroupIDs arvados.StringSet
	// SubnetID lists the subnets Create may use. It can be given as
	// a JSON array or as a single string. When empty, no subnet is
	// specified in the RunInstances request.
	SubnetID sliceOrSingleString
	// AdminUsername is the value reported by RemoteUser().
	AdminUsername string
	// EBSVolumeType is the volume type of the added scratch volume.
	// newEC2InstanceSet sets it to "gp2" when left empty.
	EBSVolumeType string
	// EBSPrice is multiplied by the scratch volume size in GiB and
	// divided by 30*24 to obtain the hourly surcharge that
	// PriceHistory adds to each price entry.
	EBSPrice float64
	// IAMInstanceProfile, if non-empty, is the name of the instance
	// profile requested for new instances.
	IAMInstanceProfile string
	// SpotPriceUpdateInterval controls spot price tracking: prices
	// are only fetched when it is greater than zero, and cached
	// data older than one interval is considered stale.
	SpotPriceUpdateInterval arvados.Duration
}
57
// sliceOrSingleString is a string slice with a lenient JSON
// representation: see UnmarshalJSON.
type sliceOrSingleString []string

// UnmarshalJSON decodes either a JSON array of strings or a single
// JSON string. An empty array, an empty string, and empty input all
// produce a nil slice; a non-empty string "foo" produces ["foo"].
//
// On a decoding error the receiver is left unmodified and the error
// from encoding/json is returned as is.
func (ss *sliceOrSingleString) UnmarshalJSON(data []byte) error {
	switch {
	case len(data) == 0:
		*ss = nil
		return nil

	case data[0] == '[':
		var list []string
		if err := json.Unmarshal(data, &list); err != nil {
			return err
		}
		if len(list) > 0 {
			*ss = list
		} else {
			// Normalize [] to nil.
			*ss = nil
		}
		return nil

	default:
		var single string
		if err := json.Unmarshal(data, &single); err != nil {
			return err
		}
		if single != "" {
			*ss = []string{single}
		} else {
			// Normalize "" to nil.
			*ss = nil
		}
		return nil
	}
}
90
// ec2Interface lists the EC2 client methods this driver calls. The
// client created by ec2.New in newEC2InstanceSet is stored in a field
// of this type.
//
// NOTE(review): presumably this indirection exists so tests can
// substitute a stub client -- confirm against the test suite.
type ec2Interface interface {
	DescribeKeyPairs(input *ec2.DescribeKeyPairsInput) (*ec2.DescribeKeyPairsOutput, error)
	ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error)
	RunInstances(input *ec2.RunInstancesInput) (*ec2.Reservation, error)
	DescribeInstances(input *ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error)
	DescribeInstanceStatusPages(input *ec2.DescribeInstanceStatusInput, fn func(*ec2.DescribeInstanceStatusOutput, bool) bool) error
	DescribeSpotPriceHistoryPages(input *ec2.DescribeSpotPriceHistoryInput, fn func(*ec2.DescribeSpotPriceHistoryOutput, bool) bool) error
	CreateTags(input *ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error)
	TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error)
}
101
// ec2InstanceSet is the value returned by newEC2InstanceSet. It holds
// the driver configuration, the EC2 client, and the caches and
// metrics shared by all instances it creates or lists.
type ec2InstanceSet struct {
	ec2config ec2InstanceSetConfig
	// currentSubnetIDIndex is the index into ec2config.SubnetID
	// where Create starts its next attempt. It is read and written
	// with sync/atomic.
	currentSubnetIDIndex int32
	instanceSetID        cloud.InstanceSetID
	logger               logrus.FieldLogger
	client               ec2Interface
	// keysMtx is held by getKeyName while it reads or updates keys.
	keysMtx sync.Mutex
	// keys maps an AWS-style MD5 key fingerprint to the name of the
	// corresponding EC2 key pair.
	keys map[string]string
	// throttleDelayCreate and throttleDelayInstances hold the
	// current backoff (a time.Duration) maintained by wrapError for
	// Create() and Instances() respectively.
	throttleDelayCreate    atomic.Value
	throttleDelayInstances atomic.Value

	// prices holds the spot price data collected by
	// updateSpotPrices, and pricesUpdated the time each key last
	// received data. Both are guarded by pricesLock and are
	// allocated on the first call to updateSpotPrices.
	prices        map[priceKey][]cloud.InstancePrice
	pricesLock    sync.Mutex
	pricesUpdated map[priceKey]time.Time

	// mInstances reports the instance count per subnet (updated by
	// Instances); mInstanceStarts counts RunInstances attempts per
	// subnet and outcome (updated by Create).
	mInstances      *prometheus.GaugeVec
	mInstanceStarts *prometheus.CounterVec
}
120
121 func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (prv cloud.InstanceSet, err error) {
122         instanceSet := &ec2InstanceSet{
123                 instanceSetID: instanceSetID,
124                 logger:        logger,
125         }
126         err = json.Unmarshal(config, &instanceSet.ec2config)
127         if err != nil {
128                 return nil, err
129         }
130
131         sess, err := session.NewSession()
132         if err != nil {
133                 return nil, err
134         }
135         // First try any static credentials, fall back to an IAM instance profile/role
136         creds := credentials.NewChainCredentials(
137                 []credentials.Provider{
138                         &credentials.StaticProvider{Value: credentials.Value{AccessKeyID: instanceSet.ec2config.AccessKeyID, SecretAccessKey: instanceSet.ec2config.SecretAccessKey}},
139                         &ec2rolecreds.EC2RoleProvider{Client: ec2metadata.New(sess)},
140                 })
141
142         awsConfig := aws.NewConfig().WithCredentials(creds).WithRegion(instanceSet.ec2config.Region)
143         instanceSet.client = ec2.New(session.Must(session.NewSession(awsConfig)))
144         instanceSet.keys = make(map[string]string)
145         if instanceSet.ec2config.EBSVolumeType == "" {
146                 instanceSet.ec2config.EBSVolumeType = "gp2"
147         }
148
149         // Set up metrics
150         instanceSet.mInstances = prometheus.NewGaugeVec(prometheus.GaugeOpts{
151                 Namespace: "arvados",
152                 Subsystem: "dispatchcloud",
153                 Name:      "ec2_instances",
154                 Help:      "Number of instances running",
155         }, []string{"subnet_id"})
156         instanceSet.mInstanceStarts = prometheus.NewCounterVec(prometheus.CounterOpts{
157                 Namespace: "arvados",
158                 Subsystem: "dispatchcloud",
159                 Name:      "ec2_instance_starts_total",
160                 Help:      "Number of attempts to start a new instance",
161         }, []string{"subnet_id", "success"})
162         // Initialize all of the series we'll be reporting.  Otherwise
163         // the {subnet=A, success=0} series doesn't appear in metrics
164         // at all until there's a failure in subnet A.
165         for _, subnet := range instanceSet.ec2config.SubnetID {
166                 instanceSet.mInstanceStarts.WithLabelValues(subnet, "0").Add(0)
167                 instanceSet.mInstanceStarts.WithLabelValues(subnet, "1").Add(0)
168         }
169         if len(instanceSet.ec2config.SubnetID) == 0 {
170                 instanceSet.mInstanceStarts.WithLabelValues("", "0").Add(0)
171                 instanceSet.mInstanceStarts.WithLabelValues("", "1").Add(0)
172         }
173         if reg != nil {
174                 reg.MustRegister(instanceSet.mInstances)
175                 reg.MustRegister(instanceSet.mInstanceStarts)
176         }
177
178         return instanceSet, nil
179 }
180
181 func awsKeyFingerprint(pk ssh.PublicKey) (md5fp string, sha1fp string, err error) {
182         // AWS key fingerprints don't use the usual key fingerprint
183         // you get from ssh-keygen or ssh.FingerprintLegacyMD5()
184         // (you can get that from md5.Sum(pk.Marshal())
185         //
186         // AWS uses the md5 or sha1 of the PKIX DER encoding of the
187         // public key, so calculate those fingerprints here.
188         var rsaPub struct {
189                 Name string
190                 E    *big.Int
191                 N    *big.Int
192         }
193         if err := ssh.Unmarshal(pk.Marshal(), &rsaPub); err != nil {
194                 return "", "", fmt.Errorf("agent: Unmarshal failed to parse public key: %v", err)
195         }
196         rsaPk := rsa.PublicKey{
197                 E: int(rsaPub.E.Int64()),
198                 N: rsaPub.N,
199         }
200         pkix, _ := x509.MarshalPKIXPublicKey(&rsaPk)
201         md5pkix := md5.Sum([]byte(pkix))
202         sha1pkix := sha1.Sum([]byte(pkix))
203         md5fp = ""
204         sha1fp = ""
205         for i := 0; i < len(md5pkix); i++ {
206                 md5fp += fmt.Sprintf(":%02x", md5pkix[i])
207         }
208         for i := 0; i < len(sha1pkix); i++ {
209                 sha1fp += fmt.Sprintf(":%02x", sha1pkix[i])
210         }
211         return md5fp[1:], sha1fp[1:], nil
212 }
213
// Create starts one new EC2 instance with the given type, image and
// tags, and returns it.
//
// The instance gets a single network interface without a public IP
// address, requires IMDSv2, and runs initCommand from a "#!/bin/sh"
// user-data script. Depending on the arguments and configuration the
// request also includes a key pair (publicKey != nil), an EBS scratch
// volume (instanceType.AddedScratch > 0), a spot market request
// (instanceType.Preemptible), and an IAM instance profile.
//
// If several subnets are configured they are tried in turn, starting
// with the one recorded by the previous call, for as long as the
// failure looks subnet-specific. On failure the returned error has
// been passed through wrapError.
func (instanceSet *ec2InstanceSet) Create(
	instanceType arvados.InstanceType,
	imageID cloud.ImageID,
	newTags cloud.InstanceTags,
	initCommand cloud.InitCommand,
	publicKey ssh.PublicKey) (cloud.Instance, error) {

	ec2tags := []*ec2.Tag{}
	for k, v := range newTags {
		ec2tags = append(ec2tags, &ec2.Tag{
			Key:   aws.String(k),
			Value: aws.String(v),
		})
	}

	var groups []string
	for sg := range instanceSet.ec2config.SecurityGroupIDs {
		groups = append(groups, sg)
	}

	rii := ec2.RunInstancesInput{
		ImageId:      aws.String(string(imageID)),
		InstanceType: &instanceType.ProviderType,
		MaxCount:     aws.Int64(1),
		MinCount:     aws.Int64(1),

		NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
			{
				AssociatePublicIpAddress: aws.Bool(false),
				DeleteOnTermination:      aws.Bool(true),
				DeviceIndex:              aws.Int64(0),
				Groups:                   aws.StringSlice(groups),
			}},
		DisableApiTermination:             aws.Bool(false),
		InstanceInitiatedShutdownBehavior: aws.String("terminate"),
		TagSpecifications: []*ec2.TagSpecification{
			{
				ResourceType: aws.String("instance"),
				Tags:         ec2tags,
			}},
		MetadataOptions: &ec2.InstanceMetadataOptionsRequest{
			// Require IMDSv2, as described at
			// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-IMDS-new-instances.html
			HttpEndpoint: aws.String(ec2.InstanceMetadataEndpointStateEnabled),
			HttpTokens:   aws.String(ec2.HttpTokensStateRequired),
		},
		UserData: aws.String(base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n"))),
	}

	if publicKey != nil {
		// Look up (or import) the EC2 key pair matching publicKey.
		keyname, err := instanceSet.getKeyName(publicKey)
		if err != nil {
			return nil, err
		}
		rii.KeyName = &keyname
	}

	if instanceType.AddedScratch > 0 {
		rii.BlockDeviceMappings = []*ec2.BlockDeviceMapping{{
			DeviceName: aws.String("/dev/xvdt"),
			Ebs: &ec2.EbsBlockDevice{
				DeleteOnTermination: aws.Bool(true),
				// AddedScratch in bytes, rounded up to whole GiB.
				VolumeSize: aws.Int64((int64(instanceType.AddedScratch) + (1<<30 - 1)) >> 30),
				VolumeType: &instanceSet.ec2config.EBSVolumeType,
			}}}
	}

	if instanceType.Preemptible {
		rii.InstanceMarketOptions = &ec2.InstanceMarketOptionsRequest{
			MarketType: aws.String("spot"),
			SpotOptions: &ec2.SpotMarketOptions{
				InstanceInterruptionBehavior: aws.String("terminate"),
				MaxPrice:                     aws.String(fmt.Sprintf("%v", instanceType.Price)),
			}}
	}

	if instanceSet.ec2config.IAMInstanceProfile != "" {
		rii.IamInstanceProfile = &ec2.IamInstanceProfileSpecification{
			Name: aws.String(instanceSet.ec2config.IAMInstanceProfile),
		}
	}

	var rsv *ec2.Reservation
	var errToReturn error
	subnets := instanceSet.ec2config.SubnetID
	currentSubnetIDIndex := int(atomic.LoadInt32(&instanceSet.currentSubnetIDIndex))
	for tryOffset := 0; ; tryOffset++ {
		// With no configured subnets, a single attempt is made
		// without a SubnetId and reported under the "" label.
		tryIndex := 0
		trySubnet := ""
		if len(subnets) > 0 {
			tryIndex = (currentSubnetIDIndex + tryOffset) % len(subnets)
			trySubnet = subnets[tryIndex]
			rii.NetworkInterfaces[0].SubnetId = aws.String(trySubnet)
		}
		var err error
		rsv, err = instanceSet.client.RunInstances(&rii)
		instanceSet.mInstanceStarts.WithLabelValues(trySubnet, boolLabelValue[err == nil]).Add(1)
		if !isErrorCapacity(errToReturn) || isErrorCapacity(err) {
			// We want to return the last capacity error,
			// if any; otherwise the last non-capacity
			// error.
			errToReturn = err
		}
		if isErrorSubnetSpecific(err) &&
			tryOffset < len(subnets)-1 {
			instanceSet.logger.WithError(err).WithField("SubnetID", subnets[tryIndex]).
				Warn("RunInstances failed, trying next subnet")
			continue
		}
		// Succeeded, or exhausted all subnets, or got a
		// non-subnet-related error.
		//
		// We intentionally update currentSubnetIDIndex even
		// in the non-retryable-failure case here to avoid a
		// situation where successive calls to Create() keep
		// returning errors for the same subnet (perhaps
		// "subnet full") and never reveal the errors for the
		// other configured subnets (perhaps "subnet ID
		// invalid").
		atomic.StoreInt32(&instanceSet.currentSubnetIDIndex, int32(tryIndex))
		break
	}
	if rsv == nil || len(rsv.Instances) == 0 {
		// NOTE(review): if RunInstances ever returned no error
		// and no instances, errToReturn could be nil here and
		// the caller would receive (nil, nil) -- confirm this
		// cannot happen.
		return nil, wrapError(errToReturn, &instanceSet.throttleDelayCreate)
	}
	return &ec2Instance{
		provider: instanceSet,
		instance: rsv.Instances[0],
	}, nil
}
344
345 func (instanceSet *ec2InstanceSet) getKeyName(publicKey ssh.PublicKey) (string, error) {
346         instanceSet.keysMtx.Lock()
347         defer instanceSet.keysMtx.Unlock()
348         md5keyFingerprint, sha1keyFingerprint, err := awsKeyFingerprint(publicKey)
349         if err != nil {
350                 return "", fmt.Errorf("Could not make key fingerprint: %v", err)
351         }
352         if keyname, ok := instanceSet.keys[md5keyFingerprint]; ok {
353                 return keyname, nil
354         }
355         keyout, err := instanceSet.client.DescribeKeyPairs(&ec2.DescribeKeyPairsInput{
356                 Filters: []*ec2.Filter{{
357                         Name:   aws.String("fingerprint"),
358                         Values: []*string{&md5keyFingerprint, &sha1keyFingerprint},
359                 }},
360         })
361         if err != nil {
362                 return "", fmt.Errorf("Could not search for keypair: %v", err)
363         }
364         if len(keyout.KeyPairs) > 0 {
365                 return *(keyout.KeyPairs[0].KeyName), nil
366         }
367         keyname := "arvados-dispatch-keypair-" + md5keyFingerprint
368         _, err = instanceSet.client.ImportKeyPair(&ec2.ImportKeyPairInput{
369                 KeyName:           &keyname,
370                 PublicKeyMaterial: ssh.MarshalAuthorizedKey(publicKey),
371         })
372         if err != nil {
373                 return "", fmt.Errorf("Could not import keypair: %v", err)
374         }
375         instanceSet.keys[md5keyFingerprint] = keyname
376         return keyname, nil
377 }
378
379 func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances []cloud.Instance, err error) {
380         var filters []*ec2.Filter
381         for k, v := range tags {
382                 filters = append(filters, &ec2.Filter{
383                         Name:   aws.String("tag:" + k),
384                         Values: []*string{aws.String(v)},
385                 })
386         }
387         needAZs := false
388         dii := &ec2.DescribeInstancesInput{Filters: filters}
389         for {
390                 dio, err := instanceSet.client.DescribeInstances(dii)
391                 err = wrapError(err, &instanceSet.throttleDelayInstances)
392                 if err != nil {
393                         return nil, err
394                 }
395
396                 for _, rsv := range dio.Reservations {
397                         for _, inst := range rsv.Instances {
398                                 if *inst.State.Name != "shutting-down" && *inst.State.Name != "terminated" {
399                                         instances = append(instances, &ec2Instance{
400                                                 provider: instanceSet,
401                                                 instance: inst,
402                                         })
403                                         if aws.StringValue(inst.InstanceLifecycle) == "spot" {
404                                                 needAZs = true
405                                         }
406                                 }
407                         }
408                 }
409                 if dio.NextToken == nil {
410                         break
411                 }
412                 dii.NextToken = dio.NextToken
413         }
414         if needAZs && instanceSet.ec2config.SpotPriceUpdateInterval > 0 {
415                 az := map[string]string{}
416                 err := instanceSet.client.DescribeInstanceStatusPages(&ec2.DescribeInstanceStatusInput{
417                         IncludeAllInstances: aws.Bool(true),
418                 }, func(page *ec2.DescribeInstanceStatusOutput, lastPage bool) bool {
419                         for _, ent := range page.InstanceStatuses {
420                                 az[*ent.InstanceId] = *ent.AvailabilityZone
421                         }
422                         return true
423                 })
424                 if err != nil {
425                         instanceSet.logger.Warnf("error getting instance statuses: %s", err)
426                 }
427                 for _, inst := range instances {
428                         inst := inst.(*ec2Instance)
429                         inst.availabilityZone = az[*inst.instance.InstanceId]
430                 }
431                 instanceSet.updateSpotPrices(instances)
432         }
433
434         // Count instances in each subnet, and report in metrics.
435         subnetInstances := map[string]int{"": 0}
436         for _, subnet := range instanceSet.ec2config.SubnetID {
437                 subnetInstances[subnet] = 0
438         }
439         for _, inst := range instances {
440                 subnet := inst.(*ec2Instance).instance.SubnetId
441                 if subnet != nil {
442                         subnetInstances[*subnet]++
443                 } else {
444                         subnetInstances[""]++
445                 }
446         }
447         for subnet, count := range subnetInstances {
448                 instanceSet.mInstances.WithLabelValues(subnet).Set(float64(count))
449         }
450
451         return instances, err
452 }
453
// priceKey identifies one series of price data in
// ec2InstanceSet.prices and ec2InstanceSet.pricesUpdated.
type priceKey struct {
	instanceType string
	// spot is true for spot pricing. updateSpotPrices only stores
	// entries with spot == true.
	spot             bool
	availabilityZone string
}
459
460 // Refresh recent spot instance pricing data for the given instances,
461 // unless we already have recent pricing data for all relevant types.
462 func (instanceSet *ec2InstanceSet) updateSpotPrices(instances []cloud.Instance) {
463         if len(instances) == 0 {
464                 return
465         }
466
467         instanceSet.pricesLock.Lock()
468         defer instanceSet.pricesLock.Unlock()
469         if instanceSet.prices == nil {
470                 instanceSet.prices = map[priceKey][]cloud.InstancePrice{}
471                 instanceSet.pricesUpdated = map[priceKey]time.Time{}
472         }
473
474         updateTime := time.Now()
475         staleTime := updateTime.Add(-instanceSet.ec2config.SpotPriceUpdateInterval.Duration())
476         needUpdate := false
477         allTypes := map[string]bool{}
478
479         for _, inst := range instances {
480                 ec2inst := inst.(*ec2Instance).instance
481                 if aws.StringValue(ec2inst.InstanceLifecycle) == "spot" {
482                         pk := priceKey{
483                                 instanceType:     *ec2inst.InstanceType,
484                                 spot:             true,
485                                 availabilityZone: inst.(*ec2Instance).availabilityZone,
486                         }
487                         if instanceSet.pricesUpdated[pk].Before(staleTime) {
488                                 needUpdate = true
489                         }
490                         allTypes[*ec2inst.InstanceType] = true
491                 }
492         }
493         if !needUpdate {
494                 return
495         }
496         var typeFilterValues []*string
497         for instanceType := range allTypes {
498                 typeFilterValues = append(typeFilterValues, aws.String(instanceType))
499         }
500         // Get 3x update interval worth of pricing data. (Ideally the
501         // AWS API would tell us "we have shown you all of the price
502         // changes up to time T", but it doesn't, so we'll just ask
503         // for 3 intervals worth of data on each update, de-duplicate
504         // the data points, and not worry too much about occasionally
505         // missing some data points when our lookups fail twice in a
506         // row.
507         dsphi := &ec2.DescribeSpotPriceHistoryInput{
508                 StartTime: aws.Time(updateTime.Add(-3 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration())),
509                 Filters: []*ec2.Filter{
510                         &ec2.Filter{Name: aws.String("instance-type"), Values: typeFilterValues},
511                         &ec2.Filter{Name: aws.String("product-description"), Values: []*string{aws.String("Linux/UNIX")}},
512                 },
513         }
514         err := instanceSet.client.DescribeSpotPriceHistoryPages(dsphi, func(page *ec2.DescribeSpotPriceHistoryOutput, lastPage bool) bool {
515                 for _, ent := range page.SpotPriceHistory {
516                         if ent.InstanceType == nil || ent.SpotPrice == nil || ent.Timestamp == nil {
517                                 // bogus record?
518                                 continue
519                         }
520                         price, err := strconv.ParseFloat(*ent.SpotPrice, 64)
521                         if err != nil {
522                                 // bogus record?
523                                 continue
524                         }
525                         pk := priceKey{
526                                 instanceType:     *ent.InstanceType,
527                                 spot:             true,
528                                 availabilityZone: *ent.AvailabilityZone,
529                         }
530                         instanceSet.prices[pk] = append(instanceSet.prices[pk], cloud.InstancePrice{
531                                 StartTime: *ent.Timestamp,
532                                 Price:     price,
533                         })
534                         instanceSet.pricesUpdated[pk] = updateTime
535                 }
536                 return true
537         })
538         if err != nil {
539                 instanceSet.logger.Warnf("error retrieving spot instance prices: %s", err)
540         }
541
542         expiredTime := updateTime.Add(-64 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration())
543         for pk, last := range instanceSet.pricesUpdated {
544                 if last.Before(expiredTime) {
545                         delete(instanceSet.pricesUpdated, pk)
546                         delete(instanceSet.prices, pk)
547                 }
548         }
549         for pk, prices := range instanceSet.prices {
550                 instanceSet.prices[pk] = cloud.NormalizePriceHistory(prices)
551         }
552 }
553
// Stop does nothing: the body is empty.
//
// NOTE(review): presumably present to satisfy cloud.InstanceSet --
// confirm against the interface definition.
func (instanceSet *ec2InstanceSet) Stop() {
}
556
// ec2Instance pairs an EC2 instance description with the instance
// set that created or listed it. Its methods read the description
// and call the EC2 API through provider.client.
type ec2Instance struct {
	provider *ec2InstanceSet
	instance *ec2.Instance
	// availabilityZone is filled in by Instances() from
	// DescribeInstanceStatus data, and only when spot price
	// tracking is active; otherwise it is empty.
	availabilityZone string // sometimes available for spot instances
}
562
563 func (inst *ec2Instance) ID() cloud.InstanceID {
564         return cloud.InstanceID(*inst.instance.InstanceId)
565 }
566
567 func (inst *ec2Instance) String() string {
568         return *inst.instance.InstanceId
569 }
570
571 func (inst *ec2Instance) ProviderType() string {
572         return *inst.instance.InstanceType
573 }
574
575 func (inst *ec2Instance) SetTags(newTags cloud.InstanceTags) error {
576         var ec2tags []*ec2.Tag
577         for k, v := range newTags {
578                 ec2tags = append(ec2tags, &ec2.Tag{
579                         Key:   aws.String(k),
580                         Value: aws.String(v),
581                 })
582         }
583
584         _, err := inst.provider.client.CreateTags(&ec2.CreateTagsInput{
585                 Resources: []*string{inst.instance.InstanceId},
586                 Tags:      ec2tags,
587         })
588
589         return err
590 }
591
592 func (inst *ec2Instance) Tags() cloud.InstanceTags {
593         tags := make(map[string]string)
594
595         for _, t := range inst.instance.Tags {
596                 tags[*t.Key] = *t.Value
597         }
598
599         return tags
600 }
601
602 func (inst *ec2Instance) Destroy() error {
603         _, err := inst.provider.client.TerminateInstances(&ec2.TerminateInstancesInput{
604                 InstanceIds: []*string{inst.instance.InstanceId},
605         })
606         return err
607 }
608
609 func (inst *ec2Instance) Address() string {
610         if inst.instance.PrivateIpAddress != nil {
611                 return *inst.instance.PrivateIpAddress
612         }
613         return ""
614 }
615
616 func (inst *ec2Instance) RemoteUser() string {
617         return inst.provider.ec2config.AdminUsername
618 }
619
// VerifyHostKey always returns cloud.ErrNotImplemented; both
// arguments are ignored.
func (inst *ec2Instance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error {
	return cloud.ErrNotImplemented
}
623
624 // PriceHistory returns the price history for this specific instance.
625 //
626 // AWS documentation is elusive about whether the hourly cost of a
627 // given spot instance changes as the current spot price changes for
628 // the corresponding instance type and availability zone. Our
629 // implementation assumes the answer is yes, based on the following
630 // hints.
631 //
632 // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-requests.html
633 // says: "After your Spot Instance is running, if the Spot price rises
634 // above your maximum price, Amazon EC2 interrupts your Spot
635 // Instance." (This doesn't address what happens when the spot price
636 // rises *without* exceeding your maximum price.)
637 //
638 // https://docs.aws.amazon.com/whitepapers/latest/cost-optimization-leveraging-ec2-spot-instances/how-spot-instances-work.html
639 // says: "You pay the Spot price that's in effect, billed to the
640 // nearest second." (But it's not explicitly stated whether "the price
641 // in effect" changes over time for a given instance.)
642 //
643 // The same page also says, in a discussion about the effect of
644 // specifying a maximum price: "Note that you never pay more than the
645 // Spot price that is in effect when your Spot Instance is running."
646 // (The use of the phrase "is running", as opposed to "was launched",
647 // hints that pricing is dynamic.)
648 func (inst *ec2Instance) PriceHistory(instType arvados.InstanceType) []cloud.InstancePrice {
649         inst.provider.pricesLock.Lock()
650         defer inst.provider.pricesLock.Unlock()
651         // Note updateSpotPrices currently populates
652         // inst.provider.prices only for spot instances, so if
653         // spot==false here, we will return no data.
654         pk := priceKey{
655                 instanceType:     *inst.instance.InstanceType,
656                 spot:             aws.StringValue(inst.instance.InstanceLifecycle) == "spot",
657                 availabilityZone: inst.availabilityZone,
658         }
659         var prices []cloud.InstancePrice
660         for _, price := range inst.provider.prices[pk] {
661                 // ceil(added scratch space in GiB)
662                 gib := (instType.AddedScratch + 1<<30 - 1) >> 30
663                 monthly := inst.provider.ec2config.EBSPrice * float64(gib)
664                 hourly := monthly / 30 / 24
665                 price.Price += hourly
666                 prices = append(prices, price)
667         }
668         return prices
669 }
670
// rateLimitError wraps an error together with the earliest time at
// which the failed operation should be retried. wrapError returns it
// for throttled API calls.
type rateLimitError struct {
	error
	earliestRetry time.Time
}

// EarliestRetry returns the time before which the caller should not
// retry.
func (e rateLimitError) EarliestRetry() time.Time {
	return e.earliestRetry
}
679
// capacityError wraps an error that indicates a lack of capacity.
// wrapError returns it for errors recognized by isErrorCapacity.
type capacityError struct {
	error
	isInstanceTypeSpecific bool
}

// IsCapacityError always returns true.
func (e *capacityError) IsCapacityError() bool {
	return true
}

// IsInstanceTypeSpecific returns the flag the error was created with.
func (e *capacityError) IsInstanceTypeSpecific() bool {
	return e.isInstanceTypeSpecific
}
692
// isCodeQuota is the set of AWS error codes that isErrorQuota treats
// as quota/limit errors. Only key presence is tested; the values are
// not consulted.
var isCodeQuota = map[string]bool{
	"InstanceLimitExceeded":             true,
	"InsufficientAddressCapacity":       true,
	"InsufficientFreeAddressesInSubnet": true,
	"InsufficientVolumeCapacity":        true,
	"MaxSpotInstanceCountExceeded":      true,
	"VcpuLimitExceeded":                 true,
}
701
702 // isErrorQuota returns whether the error indicates we have reached
703 // some usage quota/limit -- i.e., immediately retrying with an equal
704 // or larger instance type will probably not work.
705 //
706 // Returns false if error is nil.
707 func isErrorQuota(err error) bool {
708         if aerr, ok := err.(awserr.Error); ok && aerr != nil {
709                 if _, ok := isCodeQuota[aerr.Code()]; ok {
710                         return true
711                 }
712         }
713         return false
714 }
715
716 // isErrorSubnetSpecific returns true if the problem encountered by
717 // RunInstances might be avoided by trying a different subnet.
718 func isErrorSubnetSpecific(err error) bool {
719         aerr, ok := err.(awserr.Error)
720         if !ok {
721                 return false
722         }
723         code := aerr.Code()
724         return strings.Contains(code, "Subnet") ||
725                 code == "InsufficientInstanceCapacity" ||
726                 code == "InsufficientVolumeCapacity" ||
727                 code == "Unsupported"
728 }
729
730 // isErrorCapacity returns true if the error indicates lack of
731 // capacity (either temporary or permanent) to run a specific instance
732 // type -- i.e., retrying with a different instance type might
733 // succeed.
734 func isErrorCapacity(err error) bool {
735         aerr, ok := err.(awserr.Error)
736         if !ok {
737                 return false
738         }
739         code := aerr.Code()
740         return code == "InsufficientInstanceCapacity" ||
741                 (code == "Unsupported" && strings.Contains(aerr.Message(), "requested instance type"))
742 }
743
// ec2QuotaError wraps an error recognized by isErrorQuota. wrapError
// returns it for such errors.
type ec2QuotaError struct {
	error
}

// IsQuotaError always returns true.
func (e *ec2QuotaError) IsQuotaError() bool {
	return true
}
751
752 func wrapError(err error, throttleValue *atomic.Value) error {
753         if request.IsErrorThrottle(err) {
754                 // Back off exponentially until an upstream call
755                 // either succeeds or returns a non-throttle error.
756                 d, _ := throttleValue.Load().(time.Duration)
757                 d = d*3/2 + time.Second
758                 if d < throttleDelayMin {
759                         d = throttleDelayMin
760                 } else if d > throttleDelayMax {
761                         d = throttleDelayMax
762                 }
763                 throttleValue.Store(d)
764                 return rateLimitError{error: err, earliestRetry: time.Now().Add(d)}
765         } else if isErrorQuota(err) {
766                 return &ec2QuotaError{err}
767         } else if isErrorCapacity(err) {
768                 return &capacityError{err, true}
769         } else if err != nil {
770                 throttleValue.Store(time.Duration(0))
771                 return err
772         }
773         throttleValue.Store(time.Duration(0))
774         return nil
775 }
776
// boolLabelValue converts a bool to the "0"/"1" string used as a
// metric label value. Create uses it for the "success" label of
// mInstanceStarts.
var boolLabelValue = map[bool]string{false: "0", true: "1"}