Merge branch '20594-scaling-nginx-settings'. Closes #20594
[arvados.git] / lib / cloud / ec2 / ec2.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package ec2
6
7 import (
8         "crypto/md5"
9         "crypto/rsa"
10         "crypto/sha1"
11         "crypto/x509"
12         "encoding/base64"
13         "encoding/json"
14         "fmt"
15         "math/big"
16         "strconv"
17         "sync"
18         "sync/atomic"
19         "time"
20
21         "git.arvados.org/arvados.git/lib/cloud"
22         "git.arvados.org/arvados.git/sdk/go/arvados"
23         "github.com/aws/aws-sdk-go/aws"
24         "github.com/aws/aws-sdk-go/aws/awserr"
25         "github.com/aws/aws-sdk-go/aws/credentials"
26         "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds"
27         "github.com/aws/aws-sdk-go/aws/ec2metadata"
28         "github.com/aws/aws-sdk-go/aws/request"
29         "github.com/aws/aws-sdk-go/aws/session"
30         "github.com/aws/aws-sdk-go/service/ec2"
31         "github.com/sirupsen/logrus"
32         "golang.org/x/crypto/ssh"
33 )
34
// Driver is the ec2 implementation of the cloud.Driver interface.
// It creates instance sets by calling newEC2InstanceSet.
var Driver = cloud.DriverFunc(newEC2InstanceSet)

// Lower and upper bounds that wrapError applies to the backoff delay
// it computes after a throttled API call.
const (
        throttleDelayMin = time.Second
        throttleDelayMax = time.Minute
)
42
// ec2InstanceSetConfig holds the driver settings. newEC2InstanceSet
// fills it in by unmarshalling the JSON configuration it is given.
type ec2InstanceSetConfig struct {
        // Static credentials. They are tried first; the EC2 instance
        // role provider is used as a fallback.
        AccessKeyID             string
        SecretAccessKey         string
        // Region is passed to the AWS client configuration.
        Region                  string
        // SecurityGroupIDs are attached to the network interface of
        // each new instance.
        SecurityGroupIDs        arvados.StringSet
        // SubnetID is the subnet of each new instance's network
        // interface.
        SubnetID                string
        // AdminUsername is the value reported by RemoteUser().
        AdminUsername           string
        // EBSVolumeType is the volume type used for added scratch
        // space. newEC2InstanceSet replaces an empty value with "gp2".
        EBSVolumeType           string
        // EBSPrice is multiplied by the scratch size in GiB and then
        // divided by 30*24 in PriceHistory, i.e., it is treated as a
        // price per GiB per 30-day month.
        EBSPrice                float64
        // IAMInstanceProfile, if not empty, is the name of the
        // instance profile requested for new instances.
        IAMInstanceProfile      string
        // SpotPriceUpdateInterval is the age after which cached spot
        // prices are considered stale. Spot price tracking is skipped
        // when it is not greater than zero.
        SpotPriceUpdateInterval arvados.Duration
}
55
// ec2Interface is the set of EC2 client methods this driver calls.
// newEC2InstanceSet stores the client returned by ec2.New in a field
// of this type.
// NOTE(review): presumably this also lets tests substitute a stub
// client -- confirm against the test suite.
type ec2Interface interface {
        DescribeKeyPairs(input *ec2.DescribeKeyPairsInput) (*ec2.DescribeKeyPairsOutput, error)
        ImportKeyPair(input *ec2.ImportKeyPairInput) (*ec2.ImportKeyPairOutput, error)
        RunInstances(input *ec2.RunInstancesInput) (*ec2.Reservation, error)
        DescribeInstances(input *ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error)
        DescribeInstanceStatusPages(input *ec2.DescribeInstanceStatusInput, fn func(*ec2.DescribeInstanceStatusOutput, bool) bool) error
        DescribeSpotPriceHistoryPages(input *ec2.DescribeSpotPriceHistoryInput, fn func(*ec2.DescribeSpotPriceHistoryOutput, bool) bool) error
        CreateTags(input *ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error)
        TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error)
}
66
// ec2InstanceSet is the EC2 implementation of cloud.InstanceSet
// returned by newEC2InstanceSet.
type ec2InstanceSet struct {
        ec2config              ec2InstanceSetConfig
        instanceSetID          cloud.InstanceSetID
        logger                 logrus.FieldLogger
        client                 ec2Interface
        // keysMtx is held by getKeyName while it reads and updates
        // keys.
        keysMtx                sync.Mutex
        // keys maps the AWS-style MD5 fingerprint of a public key to
        // the name of the corresponding EC2 key pair.
        keys                   map[string]string
        // Current backoff delays (time.Duration values) maintained by
        // wrapError for RunInstances and DescribeInstances calls,
        // respectively.
        throttleDelayCreate    atomic.Value
        throttleDelayInstances atomic.Value

        // prices holds the spot price history collected by
        // updateSpotPrices, and pricesUpdated records when each key
        // was last refreshed. Both maps are created lazily by
        // updateSpotPrices and accessed with pricesLock held.
        prices        map[priceKey][]cloud.InstancePrice
        pricesLock    sync.Mutex
        pricesUpdated map[priceKey]time.Time
}
81
82 func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) {
83         instanceSet := &ec2InstanceSet{
84                 instanceSetID: instanceSetID,
85                 logger:        logger,
86         }
87         err = json.Unmarshal(config, &instanceSet.ec2config)
88         if err != nil {
89                 return nil, err
90         }
91
92         sess, err := session.NewSession()
93         if err != nil {
94                 return nil, err
95         }
96         // First try any static credentials, fall back to an IAM instance profile/role
97         creds := credentials.NewChainCredentials(
98                 []credentials.Provider{
99                         &credentials.StaticProvider{Value: credentials.Value{AccessKeyID: instanceSet.ec2config.AccessKeyID, SecretAccessKey: instanceSet.ec2config.SecretAccessKey}},
100                         &ec2rolecreds.EC2RoleProvider{Client: ec2metadata.New(sess)},
101                 })
102
103         awsConfig := aws.NewConfig().WithCredentials(creds).WithRegion(instanceSet.ec2config.Region)
104         instanceSet.client = ec2.New(session.Must(session.NewSession(awsConfig)))
105         instanceSet.keys = make(map[string]string)
106         if instanceSet.ec2config.EBSVolumeType == "" {
107                 instanceSet.ec2config.EBSVolumeType = "gp2"
108         }
109         return instanceSet, nil
110 }
111
112 func awsKeyFingerprint(pk ssh.PublicKey) (md5fp string, sha1fp string, err error) {
113         // AWS key fingerprints don't use the usual key fingerprint
114         // you get from ssh-keygen or ssh.FingerprintLegacyMD5()
115         // (you can get that from md5.Sum(pk.Marshal())
116         //
117         // AWS uses the md5 or sha1 of the PKIX DER encoding of the
118         // public key, so calculate those fingerprints here.
119         var rsaPub struct {
120                 Name string
121                 E    *big.Int
122                 N    *big.Int
123         }
124         if err := ssh.Unmarshal(pk.Marshal(), &rsaPub); err != nil {
125                 return "", "", fmt.Errorf("agent: Unmarshal failed to parse public key: %v", err)
126         }
127         rsaPk := rsa.PublicKey{
128                 E: int(rsaPub.E.Int64()),
129                 N: rsaPub.N,
130         }
131         pkix, _ := x509.MarshalPKIXPublicKey(&rsaPk)
132         md5pkix := md5.Sum([]byte(pkix))
133         sha1pkix := sha1.Sum([]byte(pkix))
134         md5fp = ""
135         sha1fp = ""
136         for i := 0; i < len(md5pkix); i++ {
137                 md5fp += fmt.Sprintf(":%02x", md5pkix[i])
138         }
139         for i := 0; i < len(sha1pkix); i++ {
140                 sha1fp += fmt.Sprintf(":%02x", sha1pkix[i])
141         }
142         return md5fp[1:], sha1fp[1:], nil
143 }
144
145 func (instanceSet *ec2InstanceSet) Create(
146         instanceType arvados.InstanceType,
147         imageID cloud.ImageID,
148         newTags cloud.InstanceTags,
149         initCommand cloud.InitCommand,
150         publicKey ssh.PublicKey) (cloud.Instance, error) {
151
152         ec2tags := []*ec2.Tag{}
153         for k, v := range newTags {
154                 ec2tags = append(ec2tags, &ec2.Tag{
155                         Key:   aws.String(k),
156                         Value: aws.String(v),
157                 })
158         }
159
160         var groups []string
161         for sg := range instanceSet.ec2config.SecurityGroupIDs {
162                 groups = append(groups, sg)
163         }
164
165         rii := ec2.RunInstancesInput{
166                 ImageId:      aws.String(string(imageID)),
167                 InstanceType: &instanceType.ProviderType,
168                 MaxCount:     aws.Int64(1),
169                 MinCount:     aws.Int64(1),
170
171                 NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
172                         {
173                                 AssociatePublicIpAddress: aws.Bool(false),
174                                 DeleteOnTermination:      aws.Bool(true),
175                                 DeviceIndex:              aws.Int64(0),
176                                 Groups:                   aws.StringSlice(groups),
177                                 SubnetId:                 &instanceSet.ec2config.SubnetID,
178                         }},
179                 DisableApiTermination:             aws.Bool(false),
180                 InstanceInitiatedShutdownBehavior: aws.String("terminate"),
181                 TagSpecifications: []*ec2.TagSpecification{
182                         {
183                                 ResourceType: aws.String("instance"),
184                                 Tags:         ec2tags,
185                         }},
186                 UserData: aws.String(base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n"))),
187         }
188
189         if publicKey != nil {
190                 keyname, err := instanceSet.getKeyName(publicKey)
191                 if err != nil {
192                         return nil, err
193                 }
194                 rii.KeyName = &keyname
195         }
196
197         if instanceType.AddedScratch > 0 {
198                 rii.BlockDeviceMappings = []*ec2.BlockDeviceMapping{{
199                         DeviceName: aws.String("/dev/xvdt"),
200                         Ebs: &ec2.EbsBlockDevice{
201                                 DeleteOnTermination: aws.Bool(true),
202                                 VolumeSize:          aws.Int64((int64(instanceType.AddedScratch) + (1<<30 - 1)) >> 30),
203                                 VolumeType:          &instanceSet.ec2config.EBSVolumeType,
204                         }}}
205         }
206
207         if instanceType.Preemptible {
208                 rii.InstanceMarketOptions = &ec2.InstanceMarketOptionsRequest{
209                         MarketType: aws.String("spot"),
210                         SpotOptions: &ec2.SpotMarketOptions{
211                                 InstanceInterruptionBehavior: aws.String("terminate"),
212                                 MaxPrice:                     aws.String(fmt.Sprintf("%v", instanceType.Price)),
213                         }}
214         }
215
216         if instanceSet.ec2config.IAMInstanceProfile != "" {
217                 rii.IamInstanceProfile = &ec2.IamInstanceProfileSpecification{
218                         Name: aws.String(instanceSet.ec2config.IAMInstanceProfile),
219                 }
220         }
221
222         rsv, err := instanceSet.client.RunInstances(&rii)
223         err = wrapError(err, &instanceSet.throttleDelayCreate)
224         if err != nil {
225                 return nil, err
226         }
227         return &ec2Instance{
228                 provider: instanceSet,
229                 instance: rsv.Instances[0],
230         }, nil
231 }
232
233 func (instanceSet *ec2InstanceSet) getKeyName(publicKey ssh.PublicKey) (string, error) {
234         instanceSet.keysMtx.Lock()
235         defer instanceSet.keysMtx.Unlock()
236         md5keyFingerprint, sha1keyFingerprint, err := awsKeyFingerprint(publicKey)
237         if err != nil {
238                 return "", fmt.Errorf("Could not make key fingerprint: %v", err)
239         }
240         if keyname, ok := instanceSet.keys[md5keyFingerprint]; ok {
241                 return keyname, nil
242         }
243         keyout, err := instanceSet.client.DescribeKeyPairs(&ec2.DescribeKeyPairsInput{
244                 Filters: []*ec2.Filter{{
245                         Name:   aws.String("fingerprint"),
246                         Values: []*string{&md5keyFingerprint, &sha1keyFingerprint},
247                 }},
248         })
249         if err != nil {
250                 return "", fmt.Errorf("Could not search for keypair: %v", err)
251         }
252         if len(keyout.KeyPairs) > 0 {
253                 return *(keyout.KeyPairs[0].KeyName), nil
254         }
255         keyname := "arvados-dispatch-keypair-" + md5keyFingerprint
256         _, err = instanceSet.client.ImportKeyPair(&ec2.ImportKeyPairInput{
257                 KeyName:           &keyname,
258                 PublicKeyMaterial: ssh.MarshalAuthorizedKey(publicKey),
259         })
260         if err != nil {
261                 return "", fmt.Errorf("Could not import keypair: %v", err)
262         }
263         instanceSet.keys[md5keyFingerprint] = keyname
264         return keyname, nil
265 }
266
267 func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances []cloud.Instance, err error) {
268         var filters []*ec2.Filter
269         for k, v := range tags {
270                 filters = append(filters, &ec2.Filter{
271                         Name:   aws.String("tag:" + k),
272                         Values: []*string{aws.String(v)},
273                 })
274         }
275         needAZs := false
276         dii := &ec2.DescribeInstancesInput{Filters: filters}
277         for {
278                 dio, err := instanceSet.client.DescribeInstances(dii)
279                 err = wrapError(err, &instanceSet.throttleDelayInstances)
280                 if err != nil {
281                         return nil, err
282                 }
283
284                 for _, rsv := range dio.Reservations {
285                         for _, inst := range rsv.Instances {
286                                 if *inst.State.Name != "shutting-down" && *inst.State.Name != "terminated" {
287                                         instances = append(instances, &ec2Instance{
288                                                 provider: instanceSet,
289                                                 instance: inst,
290                                         })
291                                         if aws.StringValue(inst.InstanceLifecycle) == "spot" {
292                                                 needAZs = true
293                                         }
294                                 }
295                         }
296                 }
297                 if dio.NextToken == nil {
298                         break
299                 }
300                 dii.NextToken = dio.NextToken
301         }
302         if needAZs && instanceSet.ec2config.SpotPriceUpdateInterval > 0 {
303                 az := map[string]string{}
304                 err := instanceSet.client.DescribeInstanceStatusPages(&ec2.DescribeInstanceStatusInput{
305                         IncludeAllInstances: aws.Bool(true),
306                 }, func(page *ec2.DescribeInstanceStatusOutput, lastPage bool) bool {
307                         for _, ent := range page.InstanceStatuses {
308                                 az[*ent.InstanceId] = *ent.AvailabilityZone
309                         }
310                         return true
311                 })
312                 if err != nil {
313                         instanceSet.logger.Warnf("error getting instance statuses: %s", err)
314                 }
315                 for _, inst := range instances {
316                         inst := inst.(*ec2Instance)
317                         inst.availabilityZone = az[*inst.instance.InstanceId]
318                 }
319                 instanceSet.updateSpotPrices(instances)
320         }
321         return instances, err
322 }
323
// priceKey is the key of the price history maps (prices and
// pricesUpdated) in ec2InstanceSet.
type priceKey struct {
        instanceType     string
        // spot is true for spot instance pricing. The visible code
        // only stores entries with spot == true.
        spot             bool
        availabilityZone string
}
329
330 // Refresh recent spot instance pricing data for the given instances,
331 // unless we already have recent pricing data for all relevant types.
332 func (instanceSet *ec2InstanceSet) updateSpotPrices(instances []cloud.Instance) {
333         if len(instances) == 0 {
334                 return
335         }
336
337         instanceSet.pricesLock.Lock()
338         defer instanceSet.pricesLock.Unlock()
339         if instanceSet.prices == nil {
340                 instanceSet.prices = map[priceKey][]cloud.InstancePrice{}
341                 instanceSet.pricesUpdated = map[priceKey]time.Time{}
342         }
343
344         updateTime := time.Now()
345         staleTime := updateTime.Add(-instanceSet.ec2config.SpotPriceUpdateInterval.Duration())
346         needUpdate := false
347         allTypes := map[string]bool{}
348
349         for _, inst := range instances {
350                 ec2inst := inst.(*ec2Instance).instance
351                 if aws.StringValue(ec2inst.InstanceLifecycle) == "spot" {
352                         pk := priceKey{
353                                 instanceType:     *ec2inst.InstanceType,
354                                 spot:             true,
355                                 availabilityZone: inst.(*ec2Instance).availabilityZone,
356                         }
357                         if instanceSet.pricesUpdated[pk].Before(staleTime) {
358                                 needUpdate = true
359                         }
360                         allTypes[*ec2inst.InstanceType] = true
361                 }
362         }
363         if !needUpdate {
364                 return
365         }
366         var typeFilterValues []*string
367         for instanceType := range allTypes {
368                 typeFilterValues = append(typeFilterValues, aws.String(instanceType))
369         }
370         // Get 3x update interval worth of pricing data. (Ideally the
371         // AWS API would tell us "we have shown you all of the price
372         // changes up to time T", but it doesn't, so we'll just ask
373         // for 3 intervals worth of data on each update, de-duplicate
374         // the data points, and not worry too much about occasionally
375         // missing some data points when our lookups fail twice in a
376         // row.
377         dsphi := &ec2.DescribeSpotPriceHistoryInput{
378                 StartTime: aws.Time(updateTime.Add(-3 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration())),
379                 Filters: []*ec2.Filter{
380                         &ec2.Filter{Name: aws.String("instance-type"), Values: typeFilterValues},
381                         &ec2.Filter{Name: aws.String("product-description"), Values: []*string{aws.String("Linux/UNIX")}},
382                 },
383         }
384         err := instanceSet.client.DescribeSpotPriceHistoryPages(dsphi, func(page *ec2.DescribeSpotPriceHistoryOutput, lastPage bool) bool {
385                 for _, ent := range page.SpotPriceHistory {
386                         if ent.InstanceType == nil || ent.SpotPrice == nil || ent.Timestamp == nil {
387                                 // bogus record?
388                                 continue
389                         }
390                         price, err := strconv.ParseFloat(*ent.SpotPrice, 64)
391                         if err != nil {
392                                 // bogus record?
393                                 continue
394                         }
395                         pk := priceKey{
396                                 instanceType:     *ent.InstanceType,
397                                 spot:             true,
398                                 availabilityZone: *ent.AvailabilityZone,
399                         }
400                         instanceSet.prices[pk] = append(instanceSet.prices[pk], cloud.InstancePrice{
401                                 StartTime: *ent.Timestamp,
402                                 Price:     price,
403                         })
404                         instanceSet.pricesUpdated[pk] = updateTime
405                 }
406                 return true
407         })
408         if err != nil {
409                 instanceSet.logger.Warnf("error retrieving spot instance prices: %s", err)
410         }
411
412         expiredTime := updateTime.Add(-64 * instanceSet.ec2config.SpotPriceUpdateInterval.Duration())
413         for pk, last := range instanceSet.pricesUpdated {
414                 if last.Before(expiredTime) {
415                         delete(instanceSet.pricesUpdated, pk)
416                         delete(instanceSet.prices, pk)
417                 }
418         }
419         for pk, prices := range instanceSet.prices {
420                 instanceSet.prices[pk] = cloud.NormalizePriceHistory(prices)
421         }
422 }
423
// Stop does nothing: the body is empty.
// NOTE(review): presumably present to satisfy the cloud.InstanceSet
// interface -- confirm against lib/cloud.
func (instanceSet *ec2InstanceSet) Stop() {
}
426
// ec2Instance is the cloud.Instance implementation returned by
// ec2InstanceSet's Create and Instances methods.
type ec2Instance struct {
        // provider is the instance set that created or listed this
        // instance; its client and configuration are used by the
        // methods below.
        provider         *ec2InstanceSet
        // instance is the instance description returned by the EC2
        // API.
        instance         *ec2.Instance
        availabilityZone string // sometimes available for spot instances
}
432
433 func (inst *ec2Instance) ID() cloud.InstanceID {
434         return cloud.InstanceID(*inst.instance.InstanceId)
435 }
436
437 func (inst *ec2Instance) String() string {
438         return *inst.instance.InstanceId
439 }
440
441 func (inst *ec2Instance) ProviderType() string {
442         return *inst.instance.InstanceType
443 }
444
445 func (inst *ec2Instance) SetTags(newTags cloud.InstanceTags) error {
446         var ec2tags []*ec2.Tag
447         for k, v := range newTags {
448                 ec2tags = append(ec2tags, &ec2.Tag{
449                         Key:   aws.String(k),
450                         Value: aws.String(v),
451                 })
452         }
453
454         _, err := inst.provider.client.CreateTags(&ec2.CreateTagsInput{
455                 Resources: []*string{inst.instance.InstanceId},
456                 Tags:      ec2tags,
457         })
458
459         return err
460 }
461
462 func (inst *ec2Instance) Tags() cloud.InstanceTags {
463         tags := make(map[string]string)
464
465         for _, t := range inst.instance.Tags {
466                 tags[*t.Key] = *t.Value
467         }
468
469         return tags
470 }
471
472 func (inst *ec2Instance) Destroy() error {
473         _, err := inst.provider.client.TerminateInstances(&ec2.TerminateInstancesInput{
474                 InstanceIds: []*string{inst.instance.InstanceId},
475         })
476         return err
477 }
478
479 func (inst *ec2Instance) Address() string {
480         if inst.instance.PrivateIpAddress != nil {
481                 return *inst.instance.PrivateIpAddress
482         }
483         return ""
484 }
485
486 func (inst *ec2Instance) RemoteUser() string {
487         return inst.provider.ec2config.AdminUsername
488 }
489
490 func (inst *ec2Instance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error {
491         return cloud.ErrNotImplemented
492 }
493
494 // PriceHistory returns the price history for this specific instance.
495 //
496 // AWS documentation is elusive about whether the hourly cost of a
497 // given spot instance changes as the current spot price changes for
498 // the corresponding instance type and availability zone. Our
499 // implementation assumes the answer is yes, based on the following
500 // hints.
501 //
502 // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-requests.html
503 // says: "After your Spot Instance is running, if the Spot price rises
504 // above your maximum price, Amazon EC2 interrupts your Spot
505 // Instance." (This doesn't address what happens when the spot price
506 // rises *without* exceeding your maximum price.)
507 //
508 // https://docs.aws.amazon.com/whitepapers/latest/cost-optimization-leveraging-ec2-spot-instances/how-spot-instances-work.html
509 // says: "You pay the Spot price that's in effect, billed to the
510 // nearest second." (But it's not explicitly stated whether "the price
511 // in effect" changes over time for a given instance.)
512 //
513 // The same page also says, in a discussion about the effect of
514 // specifying a maximum price: "Note that you never pay more than the
515 // Spot price that is in effect when your Spot Instance is running."
516 // (The use of the phrase "is running", as opposed to "was launched",
517 // hints that pricing is dynamic.)
518 func (inst *ec2Instance) PriceHistory(instType arvados.InstanceType) []cloud.InstancePrice {
519         inst.provider.pricesLock.Lock()
520         defer inst.provider.pricesLock.Unlock()
521         // Note updateSpotPrices currently populates
522         // inst.provider.prices only for spot instances, so if
523         // spot==false here, we will return no data.
524         pk := priceKey{
525                 instanceType:     *inst.instance.InstanceType,
526                 spot:             aws.StringValue(inst.instance.InstanceLifecycle) == "spot",
527                 availabilityZone: inst.availabilityZone,
528         }
529         var prices []cloud.InstancePrice
530         for _, price := range inst.provider.prices[pk] {
531                 // ceil(added scratch space in GiB)
532                 gib := (instType.AddedScratch + 1<<30 - 1) >> 30
533                 monthly := inst.provider.ec2config.EBSPrice * float64(gib)
534                 hourly := monthly / 30 / 24
535                 price.Price += hourly
536                 prices = append(prices, price)
537         }
538         return prices
539 }
540
// rateLimitError wraps an error with the earliest time at which the
// failed operation should be retried. wrapError returns it for
// throttled API calls.
type rateLimitError struct {
	error
	earliestRetry time.Time
}

// EarliestRetry returns the retry time stored in the error.
func (e rateLimitError) EarliestRetry() time.Time {
	retryAt := e.earliestRetry
	return retryAt
}
549
// isCodeCapacity is the set of AWS error codes that isErrorCapacity
// recognizes. wrapError reports errors with these codes as quota
// errors.
var isCodeCapacity = map[string]bool{
        "InsufficientFreeAddressesInSubnet": true,
        "InsufficientInstanceCapacity":      true,
        "InsufficientVolumeCapacity":        true,
        "MaxSpotInstanceCountExceeded":      true,
        "VcpuLimitExceeded":                 true,
}
557
558 // isErrorCapacity returns whether the error is to be throttled based on its code.
559 // Returns false if error is nil.
560 func isErrorCapacity(err error) bool {
561         if aerr, ok := err.(awserr.Error); ok && aerr != nil {
562                 if _, ok := isCodeCapacity[aerr.Code()]; ok {
563                         return true
564                 }
565         }
566         return false
567 }
568
// ec2QuotaError wraps an error that wrapError has classified as a
// capacity/quota problem.
type ec2QuotaError struct {
	error
}

// IsQuotaError always returns true.
func (e *ec2QuotaError) IsQuotaError() bool {
	const isQuota = true
	return isQuota
}
576
577 func wrapError(err error, throttleValue *atomic.Value) error {
578         if request.IsErrorThrottle(err) {
579                 // Back off exponentially until an upstream call
580                 // either succeeds or returns a non-throttle error.
581                 d, _ := throttleValue.Load().(time.Duration)
582                 d = d*3/2 + time.Second
583                 if d < throttleDelayMin {
584                         d = throttleDelayMin
585                 } else if d > throttleDelayMax {
586                         d = throttleDelayMax
587                 }
588                 throttleValue.Store(d)
589                 return rateLimitError{error: err, earliestRetry: time.Now().Add(d)}
590         } else if isErrorCapacity(err) {
591                 return &ec2QuotaError{err}
592         } else if err != nil {
593                 throttleValue.Store(time.Duration(0))
594                 return err
595         }
596         throttleValue.Store(time.Duration(0))
597         return nil
598 }