17778: Merge branch 'master' into 17778-doc-update
[arvados.git] / lib / cloud / ec2 / ec2.go
index c5565d424559f0bba2841dd46df62d3af883cc19..269a7d8def59a1e38603633691d657aef29d8e81 100644 (file)
@@ -13,30 +13,37 @@ import (
        "encoding/json"
        "fmt"
        "math/big"
-       "strings"
        "sync"
+       "sync/atomic"
+       "time"
 
-       "git.curoverse.com/arvados.git/lib/cloud"
-       "git.curoverse.com/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/lib/cloud"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "github.com/aws/aws-sdk-go/aws"
+       "github.com/aws/aws-sdk-go/aws/awserr"
        "github.com/aws/aws-sdk-go/aws/credentials"
+       "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds"
+       "github.com/aws/aws-sdk-go/aws/ec2metadata"
+       "github.com/aws/aws-sdk-go/aws/request"
        "github.com/aws/aws-sdk-go/aws/session"
        "github.com/aws/aws-sdk-go/service/ec2"
        "github.com/sirupsen/logrus"
        "golang.org/x/crypto/ssh"
 )
 
-const arvadosDispatchID = "arvados-dispatch-id"
-const tagPrefix = "arvados-dispatch-tag-"
-
 // Driver is the ec2 implementation of the cloud.Driver interface.
 var Driver = cloud.DriverFunc(newEC2InstanceSet)
 
+// Bounds for the retry delay that wrapError computes after a throttling
+// error: the delay is clamped to [throttleDelayMin, throttleDelayMax].
+const (
+       throttleDelayMin = time.Second
+       throttleDelayMax = time.Minute
+)
+
 type ec2InstanceSetConfig struct {
        AccessKeyID      string
        SecretAccessKey  string
        Region           string
-       SecurityGroupIDs []string
+       SecurityGroupIDs arvados.StringSet
        SubnetID         string
        AdminUsername    string
        EBSVolumeType    string
@@ -52,29 +59,38 @@ type ec2Interface interface {
 }
 
+// ec2InstanceSet is the state built by newEC2InstanceSet: the parsed
+// configuration, the EC2 client, a cache of key pair names, and the
+// throttle delays tracked by wrapError.
 type ec2InstanceSet struct {
-       ec2config    ec2InstanceSetConfig
-       dispatcherID cloud.InstanceSetID
-       logger       logrus.FieldLogger
-       client       ec2Interface
-       keysMtx      sync.Mutex
-       keys         map[string]string
+       ec2config              ec2InstanceSetConfig
+       instanceSetID          cloud.InstanceSetID
+       logger                 logrus.FieldLogger
+       client                 ec2Interface
+       // NOTE(review): keysMtx presumably guards keys; only the Unlock
+       // call in Create is visible here -- confirm.
+       keysMtx                sync.Mutex
+       // keys is indexed by MD5 key fingerprint and yields a key pair
+       // name (see the lookup in Create).
+       keys                   map[string]string
+       // throttleDelayCreate is passed to wrapError after RunInstances;
+       // wrapError stores a time.Duration in it.
+       throttleDelayCreate    atomic.Value
+       // throttleDelayInstances is passed to wrapError after
+       // DescribeInstances; wrapError stores a time.Duration in it.
+       throttleDelayInstances atomic.Value
 }
 
-func newEC2InstanceSet(config json.RawMessage, dispatcherID cloud.InstanceSetID, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) {
+func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) {
        instanceSet := &ec2InstanceSet{
-               dispatcherID: dispatcherID,
-               logger:       logger,
+               instanceSetID: instanceSetID,
+               logger:        logger,
        }
        err = json.Unmarshal(config, &instanceSet.ec2config)
        if err != nil {
                return nil, err
        }
-       awsConfig := aws.NewConfig().
-               WithCredentials(credentials.NewStaticCredentials(
-                       instanceSet.ec2config.AccessKeyID,
-                       instanceSet.ec2config.SecretAccessKey,
-                       "")).
-               WithRegion(instanceSet.ec2config.Region)
+
+       sess, err := session.NewSession()
+       if err != nil {
+               return nil, err
+       }
+       // First try any static credentials, fall back to an IAM instance profile/role
+       creds := credentials.NewChainCredentials(
+               []credentials.Provider{
+                       &credentials.StaticProvider{Value: credentials.Value{AccessKeyID: instanceSet.ec2config.AccessKeyID, SecretAccessKey: instanceSet.ec2config.SecretAccessKey}},
+                       &ec2rolecreds.EC2RoleProvider{Client: ec2metadata.New(sess)},
+               })
+
+       awsConfig := aws.NewConfig().WithCredentials(creds).WithRegion(instanceSet.ec2config.Region)
        instanceSet.client = ec2.New(session.Must(session.NewSession(awsConfig)))
        instanceSet.keys = make(map[string]string)
        if instanceSet.ec2config.EBSVolumeType == "" {
@@ -107,10 +123,10 @@ func awsKeyFingerprint(pk ssh.PublicKey) (md5fp string, sha1fp string, err error
        sha1pkix := sha1.Sum([]byte(pkix))
        md5fp = ""
        sha1fp = ""
-       for i := 0; i < len(md5pkix); i += 1 {
+       for i := 0; i < len(md5pkix); i++ {
                md5fp += fmt.Sprintf(":%02x", md5pkix[i])
        }
-       for i := 0; i < len(sha1pkix); i += 1 {
+       for i := 0; i < len(sha1pkix); i++ {
                sha1fp += fmt.Sprintf(":%02x", sha1pkix[i])
        }
        return md5fp[1:], sha1fp[1:], nil
@@ -132,7 +148,7 @@ func (instanceSet *ec2InstanceSet) Create(
        var ok bool
        if keyname, ok = instanceSet.keys[md5keyFingerprint]; !ok {
                keyout, err := instanceSet.client.DescribeKeyPairs(&ec2.DescribeKeyPairsInput{
-                       Filters: []*ec2.Filter{&ec2.Filter{
+                       Filters: []*ec2.Filter{{
                                Name:   aws.String("fingerprint"),
                                Values: []*string{&md5keyFingerprint, &sha1keyFingerprint},
                        }},
@@ -157,23 +173,19 @@ func (instanceSet *ec2InstanceSet) Create(
        }
        instanceSet.keysMtx.Unlock()
 
-       ec2tags := []*ec2.Tag{
-               &ec2.Tag{
-                       Key:   aws.String(arvadosDispatchID),
-                       Value: aws.String(string(instanceSet.dispatcherID)),
-               },
-               &ec2.Tag{
-                       Key:   aws.String("arvados-class"),
-                       Value: aws.String("dynamic-compute"),
-               },
-       }
+       ec2tags := []*ec2.Tag{}
        for k, v := range newTags {
                ec2tags = append(ec2tags, &ec2.Tag{
-                       Key:   aws.String(tagPrefix + k),
+                       Key:   aws.String(k),
                        Value: aws.String(v),
                })
        }
 
+       var groups []string
+       for sg := range instanceSet.ec2config.SecurityGroupIDs {
+               groups = append(groups, sg)
+       }
+
        rii := ec2.RunInstancesInput{
                ImageId:      aws.String(string(imageID)),
                InstanceType: &instanceType.ProviderType,
@@ -182,25 +194,25 @@ func (instanceSet *ec2InstanceSet) Create(
                KeyName:      &keyname,
 
                NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
-                       &ec2.InstanceNetworkInterfaceSpecification{
+                       {
                                AssociatePublicIpAddress: aws.Bool(false),
                                DeleteOnTermination:      aws.Bool(true),
                                DeviceIndex:              aws.Int64(0),
-                               Groups:                   aws.StringSlice(instanceSet.ec2config.SecurityGroupIDs),
+                               Groups:                   aws.StringSlice(groups),
                                SubnetId:                 &instanceSet.ec2config.SubnetID,
                        }},
                DisableApiTermination:             aws.Bool(false),
                InstanceInitiatedShutdownBehavior: aws.String("terminate"),
-               UserData: aws.String(base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n"))),
                TagSpecifications: []*ec2.TagSpecification{
-                       &ec2.TagSpecification{
+                       {
                                ResourceType: aws.String("instance"),
                                Tags:         ec2tags,
                        }},
+               UserData: aws.String(base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n"))),
        }
 
        if instanceType.AddedScratch > 0 {
-               rii.BlockDeviceMappings = []*ec2.BlockDeviceMapping{&ec2.BlockDeviceMapping{
+               rii.BlockDeviceMappings = []*ec2.BlockDeviceMapping{{
                        DeviceName: aws.String("/dev/xvdt"),
                        Ebs: &ec2.EbsBlockDevice{
                                DeleteOnTermination: aws.Bool(true),
@@ -219,7 +231,7 @@ func (instanceSet *ec2InstanceSet) Create(
        }
 
        rsv, err := instanceSet.client.RunInstances(&rii)
-
+       err = wrapError(err, &instanceSet.throttleDelayCreate)
        if err != nil {
                return nil, err
        }
@@ -230,15 +242,18 @@ func (instanceSet *ec2InstanceSet) Create(
        }, nil
 }
 
-func (instanceSet *ec2InstanceSet) Instances(cloud.InstanceTags) (instances []cloud.Instance, err error) {
-       dii := &ec2.DescribeInstancesInput{
-               Filters: []*ec2.Filter{&ec2.Filter{
-                       Name:   aws.String("tag:" + arvadosDispatchID),
-                       Values: []*string{aws.String(string(instanceSet.dispatcherID))},
-               }}}
-
+func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances []cloud.Instance, err error) {
+       var filters []*ec2.Filter
+       for k, v := range tags {
+               filters = append(filters, &ec2.Filter{
+                       Name:   aws.String("tag:" + k),
+                       Values: []*string{aws.String(v)},
+               })
+       }
+       dii := &ec2.DescribeInstancesInput{Filters: filters}
        for {
                dio, err := instanceSet.client.DescribeInstances(dii)
+               err = wrapError(err, &instanceSet.throttleDelayInstances)
                if err != nil {
                        return nil, err
                }
@@ -257,7 +272,7 @@ func (instanceSet *ec2InstanceSet) Instances(cloud.InstanceTags) (instances []cl
        }
 }
 
-func (az *ec2InstanceSet) Stop() {
+// Stop does nothing; the method body is empty.
+func (instanceSet *ec2InstanceSet) Stop() {
 }
 
 type ec2Instance struct {
@@ -278,15 +293,10 @@ func (inst *ec2Instance) ProviderType() string {
 }
 
 func (inst *ec2Instance) SetTags(newTags cloud.InstanceTags) error {
-       ec2tags := []*ec2.Tag{
-               &ec2.Tag{
-                       Key:   aws.String(arvadosDispatchID),
-                       Value: aws.String(string(inst.provider.dispatcherID)),
-               },
-       }
+       var ec2tags []*ec2.Tag
        for k, v := range newTags {
                ec2tags = append(ec2tags, &ec2.Tag{
-                       Key:   aws.String(tagPrefix + k),
+                       Key:   aws.String(k),
                        Value: aws.String(v),
                })
        }
@@ -303,9 +313,7 @@ func (inst *ec2Instance) Tags() cloud.InstanceTags {
        tags := make(map[string]string)
 
        for _, t := range inst.instance.Tags {
-               if strings.HasPrefix(*t.Key, tagPrefix) {
-                       tags[(*t.Key)[len(tagPrefix):]] = *t.Value
-               }
+               tags[*t.Key] = *t.Value
        }
 
        return tags
@@ -321,9 +329,8 @@ func (inst *ec2Instance) Destroy() error {
+// Address returns the instance's PrivateIpAddress, or "" when that
+// field is nil.
 func (inst *ec2Instance) Address() string {
        if inst.instance.PrivateIpAddress != nil {
                return *inst.instance.PrivateIpAddress
-       } else {
-               return ""
        }
+       return ""
 }
 
 func (inst *ec2Instance) RemoteUser() string {
@@ -333,3 +340,60 @@ func (inst *ec2Instance) RemoteUser() string {
+// VerifyHostKey is not implemented for EC2 instances: it ignores both
+// arguments and always returns cloud.ErrNotImplemented.
 func (inst *ec2Instance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error {
        return cloud.ErrNotImplemented
 }
+
+// rateLimitError pairs an underlying error with the earliest retry time
+// that was recorded when the error was created.
+type rateLimitError struct {
+       error
+       earliestRetry time.Time
+}
+
+// EarliestRetry returns the retry time stored in the error.
+func (e rateLimitError) EarliestRetry() time.Time {
+       return e.earliestRetry
+}
+
+// isCodeCapacity lists the AWS error codes that isErrorCapacity treats
+// as capacity errors.
+var isCodeCapacity = map[string]bool{
+       "InsufficientInstanceCapacity": true,
+       "VcpuLimitExceeded":            true,
+       "MaxSpotInstanceCountExceeded": true,
+}
+
+// isErrorCapacity reports whether err is an awserr.Error whose code is
+// marked true in isCodeCapacity. It returns false when err is nil or is
+// not an awserr.Error.
+func isErrorCapacity(err error) bool {
+       aerr, ok := err.(awserr.Error)
+       if !ok || aerr == nil {
+               return false
+       }
+       // Use the stored value rather than mere key presence, so that an
+       // entry explicitly set to false is not treated as a capacity error.
+       return isCodeCapacity[aerr.Code()]
+}
+
+// ec2QuotaError wraps an error so that it additionally exposes the
+// IsQuotaError method.
+type ec2QuotaError struct {
+       error
+}
+
+// IsQuotaError always reports true.
+func (*ec2QuotaError) IsQuotaError() bool {
+       return true
+}
+
+// wrapError converts the error from an EC2 API call into the form
+// returned to callers, and updates the backoff delay in throttleValue.
+//
+//   - Throttling error (per request.IsErrorThrottle): the stored delay
+//     becomes previous*3/2 + 1s, clamped to [throttleDelayMin,
+//     throttleDelayMax], and a rateLimitError whose earliest retry is
+//     now + delay is returned.
+//   - Capacity error (per isErrorCapacity): err is returned wrapped in
+//     *ec2QuotaError; the stored delay is left unchanged.
+//   - Anything else, including nil: the stored delay is reset to zero
+//     and err is returned as-is.
+func wrapError(err error, throttleValue *atomic.Value) error {
+       switch {
+       case request.IsErrorThrottle(err):
+               // Back off exponentially until an upstream call
+               // either succeeds or returns a non-throttle error.
+               // A missing or non-Duration stored value reads as zero.
+               prev, _ := throttleValue.Load().(time.Duration)
+               delay := prev*3/2 + time.Second
+               if delay < throttleDelayMin {
+                       delay = throttleDelayMin
+               } else if delay > throttleDelayMax {
+                       delay = throttleDelayMax
+               }
+               throttleValue.Store(delay)
+               return rateLimitError{error: err, earliestRetry: time.Now().Add(delay)}
+       case isErrorCapacity(err):
+               return &ec2QuotaError{err}
+       default:
+               // Success or an unrelated error: clear the backoff and
+               // hand err back unchanged (nil stays nil).
+               throttleValue.Store(time.Duration(0))
+               return err
+       }
+}