21700: Install Bundler system-wide in Rails postinst
[arvados.git] / lib / cloud / ec2 / ec2.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package ec2
6
7 import (
8         "crypto/md5"
9         "crypto/rsa"
10         "crypto/sha1"
11         "crypto/x509"
12         "encoding/base64"
13         "encoding/json"
14         "fmt"
15         "math/big"
16         "regexp"
17         "strconv"
18         "strings"
19         "sync"
20         "sync/atomic"
21         "time"
22
23         "git.arvados.org/arvados.git/lib/cloud"
24         "git.arvados.org/arvados.git/sdk/go/arvados"
25         "github.com/aws/aws-sdk-go/aws"
26         "github.com/aws/aws-sdk-go/aws/awserr"
27         "github.com/aws/aws-sdk-go/aws/credentials"
28         "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds"
29         "github.com/aws/aws-sdk-go/aws/ec2metadata"
30         "github.com/aws/aws-sdk-go/aws/request"
31         "github.com/aws/aws-sdk-go/aws/session"
32         "github.com/aws/aws-sdk-go/service/ec2"
33         "github.com/prometheus/client_golang/prometheus"
34         "github.com/sirupsen/logrus"
35         "golang.org/x/crypto/ssh"
36 )
37
38 // Driver is the ec2 implementation of the cloud.Driver interface.
39 var Driver = cloud.DriverFunc(newEC2InstanceSet)
40
// Lower and upper bounds that wrapError applies to the retry delay
// it reports after a throttled API call.
const (
	throttleDelayMin = time.Second
	throttleDelayMax = time.Minute
)
45
46 type ec2InstanceSetConfig struct {
47         AccessKeyID             string
48         SecretAccessKey         string
49         Region                  string
50         SecurityGroupIDs        arvados.StringSet
51         SubnetID                sliceOrSingleString
52         AdminUsername           string
53         EBSVolumeType           string
54         EBSPrice                float64
55         IAMInstanceProfile      string
56         SpotPriceUpdateInterval arvados.Duration
57 }
58
// sliceOrSingleString is a list of strings that can be written in
// JSON either as an array or as one bare string.
type sliceOrSingleString []string

// UnmarshalJSON decodes a JSON array of strings. It also accepts a
// single JSON string, which becomes a one-element list. Empty input,
// an empty array, and an empty string all produce a nil list. On a
// decoding error the receiver is left unchanged.
func (ss *sliceOrSingleString) UnmarshalJSON(data []byte) error {
	var decoded []string
	switch {
	case len(data) == 0:
		// Nothing to decode; result stays nil.
	case data[0] == '[':
		if err := json.Unmarshal(data, &decoded); err != nil {
			return err
		}
	default:
		var single string
		if err := json.Unmarshal(data, &single); err != nil {
			return err
		}
		if single != "" {
			decoded = []string{single}
		}
	}
	if len(decoded) == 0 {
		// Normalize "empty but non-nil" to nil.
		decoded = nil
	}
	*ss = decoded
	return nil
}
91
92 type ec2Interface interface {
93         DescribeKeyPairs(input *ec2.DescribeKeyPairsInput) (*ec2.DescribeKeyPairsOutput, error)
94         ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error)
95         RunInstances(input *ec2.RunInstancesInput) (*ec2.Reservation, error)
96         DescribeInstances(input *ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error)
97         DescribeInstanceStatusPages(input *ec2.DescribeInstanceStatusInput, fn func(*ec2.DescribeInstanceStatusOutput, bool) bool) error
98         DescribeSpotPriceHistoryPages(input *ec2.DescribeSpotPriceHistoryInput, fn func(*ec2.DescribeSpotPriceHistoryOutput, bool) bool) error
99         CreateTags(input *ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error)
100         TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error)
101 }
102
103 type ec2InstanceSet struct {
104         ec2config              ec2InstanceSetConfig
105         currentSubnetIDIndex   int32
106         instanceSetID          cloud.InstanceSetID
107         logger                 logrus.FieldLogger
108         client                 ec2Interface
109         keysMtx                sync.Mutex
110         keys                   map[string]string
111         throttleDelayCreate    atomic.Value
112         throttleDelayInstances atomic.Value
113
114         prices        map[priceKey][]cloud.InstancePrice
115         pricesLock    sync.Mutex
116         pricesUpdated map[priceKey]time.Time
117
118         mInstances      *prometheus.GaugeVec
119         mInstanceStarts *prometheus.CounterVec
120 }
121
122 func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger, reg *prometheus.Registry) (prv cloud.InstanceSet, err error) {
123         instanceSet := &ec2InstanceSet{
124                 instanceSetID: instanceSetID,
125                 logger:        logger,
126         }
127         err = json.Unmarshal(config, &instanceSet.ec2config)
128         if err != nil {
129                 return nil, err
130         }
131
132         sess, err := session.NewSession()
133         if err != nil {
134                 return nil, err
135         }
136         // First try any static credentials, fall back to an IAM instance profile/role
137         creds := credentials.NewChainCredentials(
138                 []credentials.Provider{
139                         &credentials.StaticProvider{Value: credentials.Value{AccessKeyID: instanceSet.ec2config.AccessKeyID, SecretAccessKey: instanceSet.ec2config.SecretAccessKey}},
140                         &ec2rolecreds.EC2RoleProvider{Client: ec2metadata.New(sess)},
141                 })
142
143         awsConfig := aws.NewConfig().WithCredentials(creds).WithRegion(instanceSet.ec2config.Region)
144         instanceSet.client = ec2.New(session.Must(session.NewSession(awsConfig)))
145         instanceSet.keys = make(map[string]string)
146         if instanceSet.ec2config.EBSVolumeType == "" {
147                 instanceSet.ec2config.EBSVolumeType = "gp2"
148         }
149
150         // Set up metrics
151         instanceSet.mInstances = prometheus.NewGaugeVec(prometheus.GaugeOpts{
152                 Namespace: "arvados",
153                 Subsystem: "dispatchcloud",
154                 Name:      "ec2_instances",
155                 Help:      "Number of instances running",
156         }, []string{"subnet_id"})
157         instanceSet.mInstanceStarts = prometheus.NewCounterVec(prometheus.CounterOpts{
158                 Namespace: "arvados",
159                 Subsystem: "dispatchcloud",
160                 Name:      "ec2_instance_starts_total",
161                 Help:      "Number of attempts to start a new instance",
162         }, []string{"subnet_id", "success"})
163         // Initialize all of the series we'll be reporting.  Otherwise
164         // the {subnet=A, success=0} series doesn't appear in metrics
165         // at all until there's a failure in subnet A.
166         for _, subnet := range instanceSet.ec2config.SubnetID {
167                 instanceSet.mInstanceStarts.WithLabelValues(subnet, "0").Add(0)
168                 instanceSet.mInstanceStarts.WithLabelValues(subnet, "1").Add(0)
169         }
170         if len(instanceSet.ec2config.SubnetID) == 0 {
171                 instanceSet.mInstanceStarts.WithLabelValues("", "0").Add(0)
172                 instanceSet.mInstanceStarts.WithLabelValues("", "1").Add(0)
173         }
174         if reg != nil {
175                 reg.MustRegister(instanceSet.mInstances)
176                 reg.MustRegister(instanceSet.mInstanceStarts)
177         }
178
179         return instanceSet, nil
180 }
181
182 func awsKeyFingerprint(pk ssh.PublicKey) (md5fp string, sha1fp string, err error) {
183         // AWS key fingerprints don't use the usual key fingerprint
184         // you get from ssh-keygen or ssh.FingerprintLegacyMD5()
185         // (you can get that from md5.Sum(pk.Marshal())
186         //
187         // AWS uses the md5 or sha1 of the PKIX DER encoding of the
188         // public key, so calculate those fingerprints here.
189         var rsaPub struct {
190                 Name string
191                 E    *big.Int
192                 N    *big.Int
193         }
194         if err := ssh.Unmarshal(pk.Marshal(), &rsaPub); err != nil {
195                 return "", "", fmt.Errorf("agent: Unmarshal failed to parse public key: %v", err)
196         }
197         rsaPk := rsa.PublicKey{
198                 E: int(rsaPub.E.Int64()),
199                 N: rsaPub.N,
200         }
201         pkix, _ := x509.MarshalPKIXPublicKey(&rsaPk)
202         md5pkix := md5.Sum([]byte(pkix))
203         sha1pkix := sha1.Sum([]byte(pkix))
204         md5fp = ""
205         sha1fp = ""
206         for i := 0; i < len(md5pkix); i++ {
207                 md5fp += fmt.Sprintf(":%02x", md5pkix[i])
208         }
209         for i := 0; i < len(sha1pkix); i++ {
210                 sha1fp += fmt.Sprintf(":%02x", sha1pkix[i])
211         }
212         return md5fp[1:], sha1fp[1:], nil
213 }
214
215 func (instanceSet *ec2InstanceSet) Create(
216         instanceType arvados.InstanceType,
217         imageID cloud.ImageID,
218         newTags cloud.InstanceTags,
219         initCommand cloud.InitCommand,
220         publicKey ssh.PublicKey) (cloud.Instance, error) {
221
222         ec2tags := []*ec2.Tag{}
223         for k, v := range newTags {
224                 ec2tags = append(ec2tags, &ec2.Tag{
225                         Key:   aws.String(k),
226                         Value: aws.String(v),
227                 })
228         }
229
230         var groups []string
231         for sg := range instanceSet.ec2config.SecurityGroupIDs {
232                 groups = append(groups, sg)
233         }
234
235         rii := ec2.RunInstancesInput{
236                 ImageId:      aws.String(string(imageID)),
237                 InstanceType: &instanceType.ProviderType,
238                 MaxCount:     aws.Int64(1),
239                 MinCount:     aws.Int64(1),
240
241                 NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
242                         {
243                                 AssociatePublicIpAddress: aws.Bool(false),
244                                 DeleteOnTermination:      aws.Bool(true),
245                                 DeviceIndex:              aws.Int64(0),
246                                 Groups:                   aws.StringSlice(groups),
247                         }},
248                 DisableApiTermination:             aws.Bool(false),
249                 InstanceInitiatedShutdownBehavior: aws.String("terminate"),
250                 TagSpecifications: []*ec2.TagSpecification{
251                         {
252                                 ResourceType: aws.String("instance"),
253                                 Tags:         ec2tags,
254                         }},
255                 MetadataOptions: &ec2.InstanceMetadataOptionsRequest{
256                         // Require IMDSv2, as described at
257                         // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-IMDS-new-instances.html
258                         HttpEndpoint: aws.String(ec2.InstanceMetadataEndpointStateEnabled),
259                         HttpTokens:   aws.String(ec2.HttpTokensStateRequired),
260                 },
261                 UserData: aws.String(base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n"))),
262         }
263
264         if publicKey != nil {
265                 keyname, err := instanceSet.getKeyName(publicKey)
266                 if err != nil {
267                         return nil, err
268                 }
269                 rii.KeyName = &keyname
270         }
271
272         if instanceType.AddedScratch > 0 {
273                 rii.BlockDeviceMappings = []*ec2.BlockDeviceMapping{{
274                         DeviceName: aws.String("/dev/xvdt"),
275                         Ebs: &ec2.EbsBlockDevice{
276                                 DeleteOnTermination: aws.Bool(true),
277                                 VolumeSize:          aws.Int64((int64(instanceType.AddedScratch) + (1<<30 - 1)) >> 30),
278                                 VolumeType:          &instanceSet.ec2config.EBSVolumeType,
279                         }}}
280         }
281
282         if instanceType.Preemptible {
283                 rii.InstanceMarketOptions = &ec2.InstanceMarketOptionsRequest{
284                         MarketType: aws.String("spot"),
285                         SpotOptions: &ec2.SpotMarketOptions{
286                                 InstanceInterruptionBehavior: aws.String("terminate"),
287                                 MaxPrice:                     aws.String(fmt.Sprintf("%v", instanceType.Price)),
288                         }}
289         }
290
291         if instanceSet.ec2config.IAMInstanceProfile != "" {
292                 rii.IamInstanceProfile = &ec2.IamInstanceProfileSpecification{
293                         Name: aws.String(instanceSet.ec2config.IAMInstanceProfile),
294                 }
295         }
296
297         var rsv *ec2.Reservation
298         var errToReturn error
299         subnets := instanceSet.ec2config.SubnetID
300         currentSubnetIDIndex := int(atomic.LoadInt32(&instanceSet.currentSubnetIDIndex))
301         for tryOffset := 0; ; tryOffset++ {
302                 tryIndex := 0
303                 trySubnet := ""
304                 if len(subnets) > 0 {
305                         tryIndex = (currentSubnetIDIndex + tryOffset) % len(subnets)
306                         trySubnet = subnets[tryIndex]
307                         rii.NetworkInterfaces[0].SubnetId = aws.String(trySubnet)
308                 }
309                 var err error
310                 rsv, err = instanceSet.client.RunInstances(&rii)
311                 instanceSet.mInstanceStarts.WithLabelValues(trySubnet, boolLabelValue[err == nil]).Add(1)
312                 if !isErrorCapacity(errToReturn) || isErrorCapacity(err) {
313                         // We want to return the last capacity error,
314                         // if any; otherwise the last non-capacity
315                         // error.
316                         errToReturn = err
317                 }
318                 if isErrorSubnetSpecific(err) &&
319                         tryOffset < len(subnets)-1 {
320                         instanceSet.logger.WithError(err).WithField("SubnetID", subnets[tryIndex]).
321                                 Warn("RunInstances failed, trying next subnet")
322                         continue
323                 }
324                 // Succeeded, or exhausted all subnets, or got a
325                 // non-subnet-related error.
326                 //
327                 // We intentionally update currentSubnetIDIndex even
328                 // in the non-retryable-failure case here to avoid a
329                 // situation where successive calls to Create() keep
330                 // returning errors for the same subnet (perhaps
331                 // "subnet full") and never reveal the errors for the
332                 // other configured subnets (perhaps "subnet ID
333                 // invalid").
334                 atomic.StoreInt32(&instanceSet.currentSubnetIDIndex, int32(tryIndex))
335                 break
336         }
337         if rsv == nil || len(rsv.Instances) == 0 {
338                 return nil, wrapError(errToReturn, &instanceSet.throttleDelayCreate)
339         }
340         return &ec2Instance{
341                 provider: instanceSet,
342                 instance: rsv.Instances[0],
343         }, nil
344 }
345
346 func (instanceSet *ec2InstanceSet) getKeyName(publicKey ssh.PublicKey) (string, error) {
347         instanceSet.keysMtx.Lock()
348         defer instanceSet.keysMtx.Unlock()
349         md5keyFingerprint, sha1keyFingerprint, err := awsKeyFingerprint(publicKey)
350         if err != nil {
351                 return "", fmt.Errorf("Could not make key fingerprint: %v", err)
352         }
353         if keyname, ok := instanceSet.keys[md5keyFingerprint]; ok {
354                 return keyname, nil
355         }
356         keyout, err := instanceSet.client.DescribeKeyPairs(&ec2.DescribeKeyPairsInput{
357                 Filters: []*ec2.Filter{{
358                         Name:   aws.String("fingerprint"),
359                         Values: []*string{&md5keyFingerprint, &sha1keyFingerprint},
360                 }},
361         })
362         if err != nil {
363                 return "", fmt.Errorf("Could not search for keypair: %v", err)
364         }
365         if len(keyout.KeyPairs) > 0 {
366                 return *(keyout.KeyPairs[0].KeyName), nil
367         }
368         keyname := "arvados-dispatch-keypair-" + md5keyFingerprint
369         _, err = instanceSet.client.ImportKeyPair(&ec2.ImportKeyPairInput{
370                 KeyName:           &keyname,
371                 PublicKeyMaterial: ssh.MarshalAuthorizedKey(publicKey),
372         })
373         if err != nil {
374                 return "", fmt.Errorf("Could not import keypair: %v", err)
375         }
376         instanceSet.keys[md5keyFingerprint] = keyname
377         return keyname, nil
378 }
379
380 func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances []cloud.Instance, err error) {
381         var filters []*ec2.Filter
382         for k, v := range tags {
383                 filters = append(filters, &ec2.Filter{
384                         Name:   aws.String("tag:" + k),
385                         Values: []*string{aws.String(v)},
386                 })
387         }
388         needAZs := false
389         dii := &ec2.DescribeInstancesInput{Filters: filters}
390         for {
391                 dio, err := instanceSet.client.DescribeInstances(dii)
392                 err = wrapError(err, &instanceSet.throttleDelayInstances)
393                 if err != nil {
394                         return nil, err
395                 }
396
397                 for _, rsv := range dio.Reservations {
398                         for _, inst := range rsv.Instances {
399                                 if *inst.State.Name != "shutting-down" && *inst.State.Name != "terminated" {
400                                         instances = append(instances, &ec2Instance{
401                                                 provider: instanceSet,
402                                                 instance: inst,
403                                         })
404                                         if aws.StringValue(inst.InstanceLifecycle) == "spot" {
405                                                 needAZs = true
406                                         }
407                                 }
408                         }
409                 }
410                 if dio.NextToken == nil {
411                         break
412                 }
413                 dii.NextToken = dio.NextToken
414         }
415         if needAZs && instanceSet.ec2config.SpotPriceUpdateInterval > 0 {
416                 az := map[string]string{}
417                 err := instanceSet.client.DescribeInstanceStatusPages(&ec2.DescribeInstanceStatusInput{
418                         IncludeAllInstances: aws.Bool(true),
419                 }, func(page *ec2.DescribeInstanceStatusOutput, lastPage bool) bool {
420                         for _, ent := range page.InstanceStatuses {
421                                 az[*ent.InstanceId] = *ent.AvailabilityZone
422                         }
423                         return true
424                 })
425                 if err != nil {
426                         instanceSet.logger.Warnf("error getting instance statuses: %s", err)
427                 }
428                 for _, inst := range instances {
429                         inst := inst.(*ec2Instance)
430                         inst.availabilityZone = az[*inst.instance.InstanceId]
431                 }
432                 instanceSet.updateSpotPrices(instances)
433         }
434
435         // Count instances in each subnet, and report in metrics.
436         subnetInstances := map[string]int{"": 0}
437         for _, subnet := range instanceSet.ec2config.SubnetID {
438                 subnetInstances[subnet] = 0
439         }
440         for _, inst := range instances {
441                 subnet := inst.(*ec2Instance).instance.SubnetId
442                 if subnet != nil {
443                         subnetInstances[*subnet]++
444                 } else {
445                         subnetInstances[""]++
446                 }
447         }
448         for subnet, count := range subnetInstances {
449                 instanceSet.mInstances.WithLabelValues(subnet).Set(float64(count))
450         }
451
452         return instances, err
453 }
454
// priceKey identifies one price series in ec2InstanceSet.prices and
// ec2InstanceSet.pricesUpdated.
type priceKey struct {
	instanceType     string // EC2 instance type name
	spot             bool   // true for spot-market pricing
	availabilityZone string // availability zone the prices apply to
}
460
461 // Refresh recent spot instance pricing data for the given instances,
462 // unless we already have recent pricing data for all relevant types.
463 func (instanceSet *ec2InstanceSet) updateSpotPrices(instances []cloud.Instance) {
464         if len(instances) == 0 {
465                 return
466         }
467
468         instanceSet.pricesLock.Lock()
469         defer instanceSet.pricesLock.Unlock()
470         if instanceSet.prices == nil {
471                 instanceSet.prices = map[priceKey][]cloud.InstancePrice{}
472                 instanceSet.pricesUpdated = map[priceKey]time.Time{}
473         }
474
475         updateTime := time.Now()
476         staleTime := updateTime.Add(-instanceSet.ec2config.SpotPriceUpdateInterval.Duration())
477         needUpdate := false
478         allTypes := map[string]bool{}
479
480         for _, inst := range instances {
481                 ec2inst := inst.(*ec2Instance).instance
482                 if aws.StringValue(ec2inst.InstanceLifecycle) == "spot" {
483                         pk := priceKey{
484                                 instanceType:     *ec2inst.InstanceType,
485                                 spot:             true,
486                                 availabilityZone: inst.(*ec2Instance).availabilityZone,
487                         }
488                         if instanceSet.pricesUpdated[pk].Before(staleTime) {
489                                 needUpdate = true
490                         }
491                         allTypes[*ec2inst.InstanceType] = true
492                 }
493         }
494         if !needUpdate {
495                 return
496         }
497         var typeFilterValues []*string
498         for instanceType := range allTypes {
499                 typeFilterValues = append(typeFilterValues, aws.String(instanceType))
500         }
501         // Get 3x update interval worth of pricing data. (Ideally the
502         // AWS API would tell us "we have shown you all of the price
503         // changes up to time T", but it doesn't, so we'll just ask
504         // for 3 intervals worth of data on each update, de-duplicate
505         // the data points, and not worry too much about occasionally
506         // missing some data points when our lookups fail twice in a
507         // row.
508         dsphi := &ec2.DescribeSpotPriceHistoryInput{
509                 StartTime: aws.Time(updateTime.Add(-3 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration())),
510                 Filters: []*ec2.Filter{
511                         &ec2.Filter{Name: aws.String("instance-type"), Values: typeFilterValues},
512                         &ec2.Filter{Name: aws.String("product-description"), Values: []*string{aws.String("Linux/UNIX")}},
513                 },
514         }
515         err := instanceSet.client.DescribeSpotPriceHistoryPages(dsphi, func(page *ec2.DescribeSpotPriceHistoryOutput, lastPage bool) bool {
516                 for _, ent := range page.SpotPriceHistory {
517                         if ent.InstanceType == nil || ent.SpotPrice == nil || ent.Timestamp == nil {
518                                 // bogus record?
519                                 continue
520                         }
521                         price, err := strconv.ParseFloat(*ent.SpotPrice, 64)
522                         if err != nil {
523                                 // bogus record?
524                                 continue
525                         }
526                         pk := priceKey{
527                                 instanceType:     *ent.InstanceType,
528                                 spot:             true,
529                                 availabilityZone: *ent.AvailabilityZone,
530                         }
531                         instanceSet.prices[pk] = append(instanceSet.prices[pk], cloud.InstancePrice{
532                                 StartTime: *ent.Timestamp,
533                                 Price:     price,
534                         })
535                         instanceSet.pricesUpdated[pk] = updateTime
536                 }
537                 return true
538         })
539         if err != nil {
540                 instanceSet.logger.Warnf("error retrieving spot instance prices: %s", err)
541         }
542
543         expiredTime := updateTime.Add(-64 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration())
544         for pk, last := range instanceSet.pricesUpdated {
545                 if last.Before(expiredTime) {
546                         delete(instanceSet.pricesUpdated, pk)
547                         delete(instanceSet.prices, pk)
548                 }
549         }
550         for pk, prices := range instanceSet.prices {
551                 instanceSet.prices[pk] = cloud.NormalizePriceHistory(prices)
552         }
553 }
554
555 func (instanceSet *ec2InstanceSet) Stop() {
556 }
557
558 type ec2Instance struct {
559         provider         *ec2InstanceSet
560         instance         *ec2.Instance
561         availabilityZone string // sometimes available for spot instances
562 }
563
564 func (inst *ec2Instance) ID() cloud.InstanceID {
565         return cloud.InstanceID(*inst.instance.InstanceId)
566 }
567
568 func (inst *ec2Instance) String() string {
569         return *inst.instance.InstanceId
570 }
571
572 func (inst *ec2Instance) ProviderType() string {
573         return *inst.instance.InstanceType
574 }
575
576 func (inst *ec2Instance) SetTags(newTags cloud.InstanceTags) error {
577         var ec2tags []*ec2.Tag
578         for k, v := range newTags {
579                 ec2tags = append(ec2tags, &ec2.Tag{
580                         Key:   aws.String(k),
581                         Value: aws.String(v),
582                 })
583         }
584
585         _, err := inst.provider.client.CreateTags(&ec2.CreateTagsInput{
586                 Resources: []*string{inst.instance.InstanceId},
587                 Tags:      ec2tags,
588         })
589
590         return err
591 }
592
593 func (inst *ec2Instance) Tags() cloud.InstanceTags {
594         tags := make(map[string]string)
595
596         for _, t := range inst.instance.Tags {
597                 tags[*t.Key] = *t.Value
598         }
599
600         return tags
601 }
602
603 func (inst *ec2Instance) Destroy() error {
604         _, err := inst.provider.client.TerminateInstances(&ec2.TerminateInstancesInput{
605                 InstanceIds: []*string{inst.instance.InstanceId},
606         })
607         return err
608 }
609
610 func (inst *ec2Instance) Address() string {
611         if inst.instance.PrivateIpAddress != nil {
612                 return *inst.instance.PrivateIpAddress
613         }
614         return ""
615 }
616
617 func (inst *ec2Instance) RemoteUser() string {
618         return inst.provider.ec2config.AdminUsername
619 }
620
621 func (inst *ec2Instance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error {
622         return cloud.ErrNotImplemented
623 }
624
625 // PriceHistory returns the price history for this specific instance.
626 //
627 // AWS documentation is elusive about whether the hourly cost of a
628 // given spot instance changes as the current spot price changes for
629 // the corresponding instance type and availability zone. Our
630 // implementation assumes the answer is yes, based on the following
631 // hints.
632 //
633 // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-requests.html
634 // says: "After your Spot Instance is running, if the Spot price rises
635 // above your maximum price, Amazon EC2 interrupts your Spot
636 // Instance." (This doesn't address what happens when the spot price
637 // rises *without* exceeding your maximum price.)
638 //
639 // https://docs.aws.amazon.com/whitepapers/latest/cost-optimization-leveraging-ec2-spot-instances/how-spot-instances-work.html
640 // says: "You pay the Spot price that's in effect, billed to the
641 // nearest second." (But it's not explicitly stated whether "the price
642 // in effect" changes over time for a given instance.)
643 //
644 // The same page also says, in a discussion about the effect of
645 // specifying a maximum price: "Note that you never pay more than the
646 // Spot price that is in effect when your Spot Instance is running."
647 // (The use of the phrase "is running", as opposed to "was launched",
648 // hints that pricing is dynamic.)
649 func (inst *ec2Instance) PriceHistory(instType arvados.InstanceType) []cloud.InstancePrice {
650         inst.provider.pricesLock.Lock()
651         defer inst.provider.pricesLock.Unlock()
652         // Note updateSpotPrices currently populates
653         // inst.provider.prices only for spot instances, so if
654         // spot==false here, we will return no data.
655         pk := priceKey{
656                 instanceType:     *inst.instance.InstanceType,
657                 spot:             aws.StringValue(inst.instance.InstanceLifecycle) == "spot",
658                 availabilityZone: inst.availabilityZone,
659         }
660         var prices []cloud.InstancePrice
661         for _, price := range inst.provider.prices[pk] {
662                 // ceil(added scratch space in GiB)
663                 gib := (instType.AddedScratch + 1<<30 - 1) >> 30
664                 monthly := inst.provider.ec2config.EBSPrice * float64(gib)
665                 hourly := monthly / 30 / 24
666                 price.Price += hourly
667                 prices = append(prices, price)
668         }
669         return prices
670 }
671
// rateLimitError wraps an error from a throttled API call together
// with the earliest time a retry should be attempted.
type rateLimitError struct {
	error
	earliestRetry time.Time
}

// EarliestRetry returns the time recorded when the error was
// wrapped.
func (rle rateLimitError) EarliestRetry() time.Time {
	when := rle.earliestRetry
	return when
}
680
// capacityError wraps an error that wrapError classified as a lack
// of capacity, with a flag saying whether the shortage applies to
// the requested instance type specifically.
type capacityError struct {
	error
	isInstanceTypeSpecific bool
}

// IsCapacityError always reports true.
func (er *capacityError) IsCapacityError() bool { return true }

// IsInstanceTypeSpecific reports the flag stored when the error was
// wrapped.
func (er *capacityError) IsInstanceTypeSpecific() bool {
	specific := er.isInstanceTypeSpecific
	return specific
}
693
// isCodeQuota is the set of AWS error codes that isErrorQuota treats
// as "usage quota or limit reached".
var isCodeQuota = map[string]bool{
	// Instance and vCPU count limits.
	"InstanceLimitExceeded":        true,
	"MaxSpotInstanceCountExceeded": true,
	"VcpuLimitExceeded":            true,
	// Address and volume capacity.
	"InsufficientAddressCapacity":       true,
	"InsufficientFreeAddressesInSubnet": true,
	"InsufficientVolumeCapacity":        true,
}
702
703 // isErrorQuota returns whether the error indicates we have reached
704 // some usage quota/limit -- i.e., immediately retrying with an equal
705 // or larger instance type will probably not work.
706 //
707 // Returns false if error is nil.
708 func isErrorQuota(err error) bool {
709         if aerr, ok := err.(awserr.Error); ok && aerr != nil {
710                 if _, ok := isCodeQuota[aerr.Code()]; ok {
711                         return true
712                 }
713         }
714         return false
715 }
716
// reSubnetSpecificInvalidParameterMessage matches error messages that
// mention " subnet " or a shortage of free IPv4/IPv6 addresses.
// isErrorSubnetSpecific applies it to "InvalidParameter" errors.
var reSubnetSpecificInvalidParameterMessage = regexp.MustCompile(
	`(?ms).*( subnet |sufficient free [Ii]pv[46] addresses).*`,
)
718
719 // isErrorSubnetSpecific returns true if the problem encountered by
720 // RunInstances might be avoided by trying a different subnet.
721 func isErrorSubnetSpecific(err error) bool {
722         aerr, ok := err.(awserr.Error)
723         if !ok {
724                 return false
725         }
726         code := aerr.Code()
727         return strings.Contains(code, "Subnet") ||
728                 code == "InsufficientInstanceCapacity" ||
729                 code == "InsufficientVolumeCapacity" ||
730                 code == "Unsupported" ||
731                 // See TestIsErrorSubnetSpecific for examples of why
732                 // we look for substrings in code/message instead of
733                 // only using specific codes here.
734                 (strings.Contains(code, "InvalidParameter") &&
735                         reSubnetSpecificInvalidParameterMessage.MatchString(aerr.Message()))
736 }
737
738 // isErrorCapacity returns true if the error indicates lack of
739 // capacity (either temporary or permanent) to run a specific instance
740 // type -- i.e., retrying with a different instance type might
741 // succeed.
742 func isErrorCapacity(err error) bool {
743         aerr, ok := err.(awserr.Error)
744         if !ok {
745                 return false
746         }
747         code := aerr.Code()
748         return code == "InsufficientInstanceCapacity" ||
749                 (code == "Unsupported" && strings.Contains(aerr.Message(), "requested instance type"))
750 }
751
// ec2QuotaError wraps an error that wrapError classified as a quota
// error.
type ec2QuotaError struct {
	error
}

// IsQuotaError always reports true.
func (er *ec2QuotaError) IsQuotaError() bool { return true }
759
760 func wrapError(err error, throttleValue *atomic.Value) error {
761         if request.IsErrorThrottle(err) {
762                 // Back off exponentially until an upstream call
763                 // either succeeds or returns a non-throttle error.
764                 d, _ := throttleValue.Load().(time.Duration)
765                 d = d*3/2 + time.Second
766                 if d < throttleDelayMin {
767                         d = throttleDelayMin
768                 } else if d > throttleDelayMax {
769                         d = throttleDelayMax
770                 }
771                 throttleValue.Store(d)
772                 return rateLimitError{error: err, earliestRetry: time.Now().Add(d)}
773         } else if isErrorQuota(err) {
774                 return &ec2QuotaError{err}
775         } else if isErrorCapacity(err) {
776                 return &capacityError{err, true}
777         } else if err != nil {
778                 throttleValue.Store(time.Duration(0))
779                 return err
780         }
781         throttleValue.Store(time.Duration(0))
782         return nil
783 }
784
// boolLabelValue maps a boolean to the "0"/"1" string used as the
// "success" metric label value in Create.
var boolLabelValue = map[bool]string{
	true:  "1",
	false: "0",
}