20978: Return the last capacity error if failing all subnets.

author Tom Clegg <tom@curii.com>

Thu, 2 Nov 2023 20:45:01 +0000 (16:45 -0400)

committer Tom Clegg <tom@curii.com>

Thu, 2 Nov 2023 20:45:01 +0000 (16:45 -0400)
author Tom Clegg <tom@curii.com>
Thu, 2 Nov 2023 20:45:01 +0000 (16:45 -0400)
committer Tom Clegg <tom@curii.com>
Thu, 2 Nov 2023 20:45:01 +0000 (16:45 -0400)
diff --git a/lib/cloud/ec2/ec2.go b/lib/cloud/ec2/ec2.go

index 55f9a1e3a36bbc9ab507db96cca99be58cad7b2e..0d181be0e9f87212f5f833b35e8e9220476d4815 100644 (file)
--- a/lib/cloud/ec2/ec2.go
+++ b/lib/cloud/ec2/ec2.go
@@ -288,7 +288,7 @@ func (instanceSet *ec2InstanceSet) Create(
         }
  
         var rsv *ec2.Reservation
-       var err error
+       var errToReturn error
         subnets := instanceSet.ec2config.SubnetID
         currentSubnetIDIndex := int(atomic.LoadInt32(&instanceSet.currentSubnetIDIndex))
         for tryOffset := 0; ; tryOffset++ {
@@ -299,8 +299,15 @@ func (instanceSet *ec2InstanceSet) Create(
                         trySubnet = subnets[tryIndex]
                         rii.NetworkInterfaces[0].SubnetId = aws.String(trySubnet)
                 }
+               var err error
                 rsv, err = instanceSet.client.RunInstances(&rii)
                 instanceSet.mInstanceStarts.WithLabelValues(trySubnet, boolLabelValue[err == nil]).Add(1)
+               if !isErrorCapacity(errToReturn) || isErrorCapacity(err) {
+                       // We want to return the last capacity error,
+                       // if any; otherwise the last non-capacity
+                       // error.
+                       errToReturn = err
+               }
                 if isErrorSubnetSpecific(err) &&
                         tryOffset < len(subnets)-1 {
                         instanceSet.logger.WithError(err).WithField("SubnetID", subnets[tryIndex]).
@@ -320,9 +327,8 @@ func (instanceSet *ec2InstanceSet) Create(
                 atomic.StoreInt32(&instanceSet.currentSubnetIDIndex, int32(tryIndex))
                 break
         }
-       err = wrapError(err, &instanceSet.throttleDelayCreate)
-       if err != nil {
-               return nil, err
+       if rsv == nil {
+               return nil, wrapError(errToReturn, &instanceSet.throttleDelayCreate)
         }
         return &ec2Instance{
                 provider: instanceSet,
@@ -715,6 +721,20 @@ func isErrorSubnetSpecific(err error) bool {
                 code == "Unsupported"
  }
  
+// isErrorCapacity returns true if the error indicates lack of
+// capacity (either temporary or permanent) to run a specific instance
+// type -- i.e., retrying with a different instance type might
+// succeed.
+func isErrorCapacity(err error) bool {
+       aerr, ok := err.(awserr.Error)
+       if !ok {
+               return false
+       }
+       code := aerr.Code()
+       return code == "InsufficientInstanceCapacity" ||
+               (code == "Unsupported" && strings.Contains(aerr.Message(), "requested instance type"))
+}
+
  type ec2QuotaError struct {
         error
  }
@@ -738,8 +758,7 @@ func wrapError(err error, throttleValue *atomic.Value) error {
                 return rateLimitError{error: err, earliestRetry: time.Now().Add(d)}
         } else if isErrorQuota(err) {
                 return &ec2QuotaError{err}
-       } else if aerr, ok := err.(awserr.Error); ok && (aerr.Code() == "InsufficientInstanceCapacity" ||
-               (aerr.Code() == "Unsupported" && strings.Contains(aerr.Message(), "requested instance type"))) {
+       } else if isErrorCapacity(err) {
                 return &capacityError{err, true}
         } else if err != nil {
                 throttleValue.Store(time.Duration(0))
diff --git a/lib/cloud/ec2/ec2_test.go b/lib/cloud/ec2/ec2_test.go

index 6ce5aa3cf9aa6899610c937d9dad1ae70e9cb695..4b830058963b93cfc508ee1795e65d22d3d70af9 100644 (file)
--- a/lib/cloud/ec2/ec2_test.go
+++ b/lib/cloud/ec2/ec2_test.go
@@ -399,6 +399,37 @@ func (*EC2InstanceSetSuite) TestCreateAllSubnetsFailing(c *check.C) {
                 `.*`)
  }
  
+func (*EC2InstanceSetSuite) TestCreateOneSubnetFailingCapacity(c *check.C) {
+       if *live != "" {
+               c.Skip("not applicable in live mode")
+               return
+       }
+       ap, img, cluster, reg := GetInstanceSet(c, `{"SubnetID":["subnet-full","subnet-broken"]}`)
+       ap.client.(*ec2stub).subnetErrorOnRunInstances = map[string]error{
+               "subnet-full": &ec2stubError{
+                       code:    "InsufficientFreeAddressesInSubnet",
+                       message: "subnet is full",
+               },
+               "subnet-broken": &ec2stubError{
+                       code:    "InsufficientInstanceCapacity",
+                       message: "insufficient capacity",
+               },
+       }
+       for i := 0; i < 3; i++ {
+               _, err := ap.Create(cluster.InstanceTypes["tiny"], img, nil, "", nil)
+               c.Check(err, check.NotNil)
+               c.Check(err, check.ErrorMatches, `.*InsufficientInstanceCapacity.*`)
+       }
+       c.Check(ap.client.(*ec2stub).runInstancesCalls, check.HasLen, 6)
+       metrics := arvadostest.GatherMetricsAsString(reg)
+       c.Check(metrics, check.Matches, `(?ms).*`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="0"} 3\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-broken",success="1"} 0\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="0"} 3\n`+
+               `arvados_dispatchcloud_ec2_instance_starts_total{subnet_id="subnet-full",success="1"} 0\n`+
+               `.*`)
+}
+
  func (*EC2InstanceSetSuite) TestTagInstances(c *check.C) {
         ap, _, _, _ := GetInstanceSet(c, "{}")
         l, err := ap.Instances(nil)
author	Tom Clegg <tom@curii.com>
	Thu, 2 Nov 2023 20:45:01 +0000 (16:45 -0400)
committer	Tom Clegg <tom@curii.com>
	Thu, 2 Nov 2023 20:45:01 +0000 (16:45 -0400)
lib/cloud/ec2/ec2.go		patch \| blob \| history
lib/cloud/ec2/ec2_test.go		patch \| blob \| history