18772: arvados-dispatch-cloud: add IamInstanceProfile field to the ec2
[arvados.git] / lib / cloud / ec2 / ec2.go
index 1e0de74024f52851ebe4eb08c0414617d0bdc7db..2cbe4cf290c1b0e7a9dcaaeb1b701242c0da1cd3 100644 (file)
@@ -14,13 +14,17 @@ import (
        "fmt"
        "math/big"
        "sync"
+       "sync/atomic"
+       "time"
 
        "git.arvados.org/arvados.git/lib/cloud"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "github.com/aws/aws-sdk-go/aws"
+       "github.com/aws/aws-sdk-go/aws/awserr"
        "github.com/aws/aws-sdk-go/aws/credentials"
        "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds"
        "github.com/aws/aws-sdk-go/aws/ec2metadata"
+       "github.com/aws/aws-sdk-go/aws/request"
        "github.com/aws/aws-sdk-go/aws/session"
        "github.com/aws/aws-sdk-go/service/ec2"
        "github.com/sirupsen/logrus"
@@ -30,14 +34,20 @@ import (
 // Driver is the ec2 implementation of the cloud.Driver interface.
 var Driver = cloud.DriverFunc(newEC2InstanceSet)
 
+// Lower and upper bounds applied by wrapError to the retry delay it
+// computes after an EC2 API call returns a throttle error.
+const (
+       throttleDelayMin = time.Second
+       throttleDelayMax = time.Minute
+)
+
 type ec2InstanceSetConfig struct {
-       AccessKeyID      string
-       SecretAccessKey  string
-       Region           string
-       SecurityGroupIDs arvados.StringSet
-       SubnetID         string
-       AdminUsername    string
-       EBSVolumeType    string
+       AccessKeyID        string
+       SecretAccessKey    string
+       Region             string
+       SecurityGroupIDs   arvados.StringSet
+       SubnetID           string
+       AdminUsername      string
+       EBSVolumeType      string
+       // IamInstanceProfile, when non-empty, is used by Create as
+       // the Name of the IAM instance profile specification in the
+       // RunInstances request. When empty, no profile is requested.
+       IamInstanceProfile string
 }
 
 type ec2Interface interface {
@@ -50,12 +60,14 @@ type ec2Interface interface {
 }
 
 type ec2InstanceSet struct {
-       ec2config     ec2InstanceSetConfig
-       instanceSetID cloud.InstanceSetID
-       logger        logrus.FieldLogger
-       client        ec2Interface
-       keysMtx       sync.Mutex
-       keys          map[string]string
+       ec2config              ec2InstanceSetConfig
+       instanceSetID          cloud.InstanceSetID
+       logger                 logrus.FieldLogger
+       client                 ec2Interface
+       keysMtx                sync.Mutex
+       keys                   map[string]string
+       // Current throttle backoff delays, stored as time.Duration
+       // values and maintained by wrapError: throttleDelayCreate
+       // tracks RunInstances calls, throttleDelayInstances tracks
+       // DescribeInstances calls.
+       throttleDelayCreate    atomic.Value
+       throttleDelayInstances atomic.Value
 }
 
 func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) {
@@ -219,8 +231,14 @@ func (instanceSet *ec2InstanceSet) Create(
                        }}
        }
 
-       rsv, err := instanceSet.client.RunInstances(&rii)
+       if instanceSet.ec2config.IamInstanceProfile != "" {
+               rii.IamInstanceProfile = &ec2.IamInstanceProfileSpecification{
+                       Name: aws.String(instanceSet.ec2config.IamInstanceProfile),
+               }
+       }
 
+       rsv, err := instanceSet.client.RunInstances(&rii)
+       err = wrapError(err, &instanceSet.throttleDelayCreate)
        if err != nil {
                return nil, err
        }
@@ -242,6 +260,7 @@ func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances
        dii := &ec2.DescribeInstancesInput{Filters: filters}
        for {
                dio, err := instanceSet.client.DescribeInstances(dii)
+               err = wrapError(err, &instanceSet.throttleDelayInstances)
                if err != nil {
                        return nil, err
                }
@@ -328,3 +347,60 @@ func (inst *ec2Instance) RemoteUser() string {
 func (inst *ec2Instance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error {
        // Both arguments are ignored: this method unconditionally
        // reports that host key verification is not implemented.
        return cloud.ErrNotImplemented
 }
+
+// rateLimitError wraps an error together with a point in time
+// reported by EarliestRetry. wrapError returns it for throttle
+// errors, with earliestRetry set to the current time plus the
+// current backoff delay.
+type rateLimitError struct {
+       error
+       earliestRetry time.Time
+}
+
+func (err rateLimitError) EarliestRetry() time.Time {
+       return err.earliestRetry
+}
+
+// isCodeCapacity lists the AWS error codes that isErrorCapacity
+// recognizes. Membership is what matters: isErrorCapacity checks only
+// whether a code is present as a key.
+var isCodeCapacity = map[string]bool{
+       "InsufficientInstanceCapacity": true,
+       "VcpuLimitExceeded":            true,
+       "MaxSpotInstanceCountExceeded": true,
+}
+
+// isErrorCapacity returns whether the error is to be throttled based on its code.
+// Returns false if error is nil.
+func isErrorCapacity(err error) bool {
+       if aerr, ok := err.(awserr.Error); ok && aerr != nil {
+               if _, ok := isCodeCapacity[aerr.Code()]; ok {
+                       return true
+               }
+       }
+       return false
+}
+
+// ec2QuotaError wraps an error that isErrorCapacity recognized by its
+// AWS error code. wrapError returns it as a *ec2QuotaError.
+type ec2QuotaError struct {
+       error
+}
+
+func (er *ec2QuotaError) IsQuotaError() bool {
+       return true
+}
+
+func wrapError(err error, throttleValue *atomic.Value) error {
+       if request.IsErrorThrottle(err) {
+               // Back off exponentially until an upstream call
+               // either succeeds or returns a non-throttle error.
+               d, _ := throttleValue.Load().(time.Duration)
+               d = d*3/2 + time.Second
+               if d < throttleDelayMin {
+                       d = throttleDelayMin
+               } else if d > throttleDelayMax {
+                       d = throttleDelayMax
+               }
+               throttleValue.Store(d)
+               return rateLimitError{error: err, earliestRetry: time.Now().Add(d)}
+       } else if isErrorCapacity(err) {
+               return &ec2QuotaError{err}
+       } else if err != nil {
+               throttleValue.Store(time.Duration(0))
+               return err
+       }
+       throttleValue.Store(time.Duration(0))
+       return nil
+}