Merge branch '21603-ec2-subnet-error'
[arvados.git] / lib / cloud / ec2 / ec2.go
index 5e4df05f46f01c9c2305d968771282c3f34e2013..6251f18df0fc485f876e710731875a9e9061e14d 100644 (file)
@@ -13,6 +13,7 @@ import (
        "encoding/json"
        "fmt"
        "math/big"
+       "regexp"
        "strconv"
        "strings"
        "sync"
@@ -251,6 +252,12 @@ func (instanceSet *ec2InstanceSet) Create(
                                ResourceType: aws.String("instance"),
                                Tags:         ec2tags,
                        }},
+               MetadataOptions: &ec2.InstanceMetadataOptionsRequest{
+                       // Require IMDSv2, as described at
+                       // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-IMDS-new-instances.html
+                       HttpEndpoint: aws.String(ec2.InstanceMetadataEndpointStateEnabled),
+                       HttpTokens:   aws.String(ec2.HttpTokensStateRequired),
+               },
                UserData: aws.String(base64.StdEncoding.EncodeToString([]byte("#!/bin/sh\n" + initCommand + "\n"))),
        }
 
@@ -288,7 +295,7 @@ func (instanceSet *ec2InstanceSet) Create(
        }
 
        var rsv *ec2.Reservation
-       var err error
+       var errToReturn error
        subnets := instanceSet.ec2config.SubnetID
        currentSubnetIDIndex := int(atomic.LoadInt32(&instanceSet.currentSubnetIDIndex))
        for tryOffset := 0; ; tryOffset++ {
@@ -299,8 +306,15 @@ func (instanceSet *ec2InstanceSet) Create(
                        trySubnet = subnets[tryIndex]
                        rii.NetworkInterfaces[0].SubnetId = aws.String(trySubnet)
                }
+               var err error
                rsv, err = instanceSet.client.RunInstances(&rii)
                instanceSet.mInstanceStarts.WithLabelValues(trySubnet, boolLabelValue[err == nil]).Add(1)
+               if !isErrorCapacity(errToReturn) || isErrorCapacity(err) {
+                       // We want to return the last capacity error,
+                       // if any; otherwise the last non-capacity
+                       // error.
+                       errToReturn = err
+               }
                if isErrorSubnetSpecific(err) &&
                        tryOffset < len(subnets)-1 {
                        instanceSet.logger.WithError(err).WithField("SubnetID", subnets[tryIndex]).
@@ -320,9 +334,8 @@ func (instanceSet *ec2InstanceSet) Create(
                atomic.StoreInt32(&instanceSet.currentSubnetIDIndex, int32(tryIndex))
                break
        }
-       err = wrapError(err, &instanceSet.throttleDelayCreate)
-       if err != nil {
-               return nil, err
+       if rsv == nil || len(rsv.Instances) == 0 {
+               return nil, wrapError(errToReturn, &instanceSet.throttleDelayCreate)
        }
        return &ec2Instance{
                provider: instanceSet,
@@ -665,27 +678,44 @@ func (err rateLimitError) EarliestRetry() time.Time {
        return err.earliestRetry
 }
 
-var isCodeCapacity = map[string]bool{
+type capacityError struct {
+       error
+       isInstanceTypeSpecific bool
+}
+
+func (er *capacityError) IsCapacityError() bool {
+       return true
+}
+
+func (er *capacityError) IsInstanceTypeSpecific() bool {
+       return er.isInstanceTypeSpecific
+}
+
+var isCodeQuota = map[string]bool{
        "InstanceLimitExceeded":             true,
        "InsufficientAddressCapacity":       true,
        "InsufficientFreeAddressesInSubnet": true,
-       "InsufficientInstanceCapacity":      true,
        "InsufficientVolumeCapacity":        true,
        "MaxSpotInstanceCountExceeded":      true,
        "VcpuLimitExceeded":                 true,
 }
 
-// isErrorCapacity returns whether the error is to be throttled based on its code.
+// isErrorQuota returns whether the error indicates we have reached
+// some usage quota/limit -- i.e., immediately retrying with an equal
+// or larger instance type will probably not work.
+//
 // Returns false if error is nil.
-func isErrorCapacity(err error) bool {
+func isErrorQuota(err error) bool {
        if aerr, ok := err.(awserr.Error); ok && aerr != nil {
-               if _, ok := isCodeCapacity[aerr.Code()]; ok {
+               if _, ok := isCodeQuota[aerr.Code()]; ok {
                        return true
                }
        }
        return false
 }
 
+var reSubnetSpecificInvalidParameterMessage = regexp.MustCompile(`(?ms).*( subnet |sufficient free [Ii]pv[46] addresses).*`)
+
 // isErrorSubnetSpecific returns true if the problem encountered by
 // RunInstances might be avoided by trying a different subnet.
 func isErrorSubnetSpecific(err error) bool {
@@ -696,7 +726,27 @@ func isErrorSubnetSpecific(err error) bool {
        code := aerr.Code()
        return strings.Contains(code, "Subnet") ||
                code == "InsufficientInstanceCapacity" ||
-               code == "InsufficientVolumeCapacity"
+               code == "InsufficientVolumeCapacity" ||
+               code == "Unsupported" ||
+               // See TestIsErrorSubnetSpecific for examples of why
+               // we look for substrings in code/message instead of
+               // only using specific codes here.
+               (strings.Contains(code, "InvalidParameter") &&
+                       reSubnetSpecificInvalidParameterMessage.MatchString(aerr.Message()))
+}
+
+// isErrorCapacity returns true if the error indicates lack of
+// capacity (either temporary or permanent) to run a specific instance
+// type -- i.e., retrying with a different instance type might
+// succeed. Returns false if err is nil or is not an awserr.Error.
+func isErrorCapacity(err error) bool {
+       aerr, ok := err.(awserr.Error)
+       if !ok {
+               return false
+       }
+       code := aerr.Code()
+       return code == "InsufficientInstanceCapacity" ||
+               (code == "Unsupported" && strings.Contains(aerr.Message(), "requested instance type"))
 }
 
 type ec2QuotaError struct {
@@ -720,8 +770,10 @@ func wrapError(err error, throttleValue *atomic.Value) error {
                }
                throttleValue.Store(d)
                return rateLimitError{error: err, earliestRetry: time.Now().Add(d)}
-       } else if isErrorCapacity(err) {
+       } else if isErrorQuota(err) {
                return &ec2QuotaError{err}
+       } else if isErrorCapacity(err) {
+               return &capacityError{err, true}
        } else if err != nil {
                throttleValue.Store(time.Duration(0))
                return err