X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/32ad82494652c10e4dfdf7c61782ab6a7684aba0..c89213f5a5e303050caaebe4f8fdf2980fc65605:/lib/cloud/ec2/ec2.go diff --git a/lib/cloud/ec2/ec2.go b/lib/cloud/ec2/ec2.go index b20dbfcc98..269a7d8def 100644 --- a/lib/cloud/ec2/ec2.go +++ b/lib/cloud/ec2/ec2.go @@ -14,11 +14,17 @@ import ( "fmt" "math/big" "sync" + "sync/atomic" + "time" "git.arvados.org/arvados.git/lib/cloud" "git.arvados.org/arvados.git/sdk/go/arvados" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds" + "github.com/aws/aws-sdk-go/aws/ec2metadata" + "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/ec2" "github.com/sirupsen/logrus" @@ -28,6 +34,11 @@ import ( // Driver is the ec2 implementation of the cloud.Driver interface. var Driver = cloud.DriverFunc(newEC2InstanceSet) +const ( + throttleDelayMin = time.Second + throttleDelayMax = time.Minute +) + type ec2InstanceSetConfig struct { AccessKeyID string SecretAccessKey string @@ -48,12 +59,14 @@ type ec2Interface interface { } type ec2InstanceSet struct { - ec2config ec2InstanceSetConfig - instanceSetID cloud.InstanceSetID - logger logrus.FieldLogger - client ec2Interface - keysMtx sync.Mutex - keys map[string]string + ec2config ec2InstanceSetConfig + instanceSetID cloud.InstanceSetID + logger logrus.FieldLogger + client ec2Interface + keysMtx sync.Mutex + keys map[string]string + throttleDelayCreate atomic.Value + throttleDelayInstances atomic.Value } func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) { @@ -65,12 +78,19 @@ func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID if err != nil { return nil, err } - awsConfig := aws.NewConfig(). - WithCredentials(credentials.NewStaticCredentials( - instanceSet.ec2config.AccessKeyID, - instanceSet.ec2config.SecretAccessKey, - "")). - WithRegion(instanceSet.ec2config.Region) + + sess, err := session.NewSession() + if err != nil { + return nil, err + } + // First try any static credentials, fall back to an IAM instance profile/role + creds := credentials.NewChainCredentials( + []credentials.Provider{ + &credentials.StaticProvider{Value: credentials.Value{AccessKeyID: instanceSet.ec2config.AccessKeyID, SecretAccessKey: instanceSet.ec2config.SecretAccessKey}}, + &ec2rolecreds.EC2RoleProvider{Client: ec2metadata.New(sess)}, + }) + + awsConfig := aws.NewConfig().WithCredentials(creds).WithRegion(instanceSet.ec2config.Region) instanceSet.client = ec2.New(session.Must(session.NewSession(awsConfig))) instanceSet.keys = make(map[string]string) if instanceSet.ec2config.EBSVolumeType == "" { @@ -211,7 +231,7 @@ func (instanceSet *ec2InstanceSet) Create( } rsv, err := instanceSet.client.RunInstances(&rii) - + err = wrapError(err, &instanceSet.throttleDelayCreate) if err != nil { return nil, err } @@ -233,6 +253,7 @@ func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances dii := &ec2.DescribeInstancesInput{Filters: filters} for { dio, err := instanceSet.client.DescribeInstances(dii) + err = wrapError(err, &instanceSet.throttleDelayInstances) if err != nil { return nil, err } @@ -319,3 +340,60 @@ func (inst *ec2Instance) RemoteUser() string { func (inst *ec2Instance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error { return cloud.ErrNotImplemented } + +type rateLimitError struct { + error + earliestRetry time.Time +} + +func (err rateLimitError) EarliestRetry() time.Time { + return err.earliestRetry +} + +var isCodeCapacity = map[string]bool{ + "InsufficientInstanceCapacity": true, + "VcpuLimitExceeded": true, + "MaxSpotInstanceCountExceeded": true, +} + +// isErrorCapacity returns whether the error is to be throttled based on its code. +// Returns false if error is nil. +func isErrorCapacity(err error) bool { + if aerr, ok := err.(awserr.Error); ok && aerr != nil { + if _, ok := isCodeCapacity[aerr.Code()]; ok { + return true + } + } + return false +} + +type ec2QuotaError struct { + error +} + +func (er *ec2QuotaError) IsQuotaError() bool { + return true +} + +func wrapError(err error, throttleValue *atomic.Value) error { + if request.IsErrorThrottle(err) { + // Back off exponentially until an upstream call + // either succeeds or returns a non-throttle error. + d, _ := throttleValue.Load().(time.Duration) + d = d*3/2 + time.Second + if d < throttleDelayMin { + d = throttleDelayMin + } else if d > throttleDelayMax { + d = throttleDelayMax + } + throttleValue.Store(d) + return rateLimitError{error: err, earliestRetry: time.Now().Add(d)} + } else if isErrorCapacity(err) { + return &ec2QuotaError{err} + } else if err != nil { + throttleValue.Store(time.Duration(0)) + return err + } + throttleValue.Store(time.Duration(0)) + return nil +}