20457: Exercise quota handling in dispatcher chaos test.
author: Tom Clegg <tom@curii.com>
Wed, 9 Aug 2023 21:06:40 +0000 (17:06 -0400)
committer: Tom Clegg <tom@curii.com>
Wed, 9 Aug 2023 21:06:40 +0000 (17:06 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/scheduler/run_queue_test.go
lib/dispatchcloud/test/stub_driver.go
lib/dispatchcloud/worker/pool.go
lib/dispatchcloud/worker/worker.go

index 4583a596eebfe48a08fd862e6d840d8df401c047..ea2611959c355bd60e8bd8cc0baceade386f4572 100644 (file)
@@ -52,6 +52,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
                ErrorRateCreate:           0.1,
                ErrorRateDestroy:          0.1,
                MinTimeBetweenCreateCalls: time.Millisecond,
+               QuotaMaxInstances:         10,
        }
 
        // We need the postgresql connection info from the integration
index f407ac848f70b34e8b5ef2ef8804efa4ce5dcf36..4359ae03babb91404d9917728c6b27f6dcf9efc5 100644 (file)
@@ -29,12 +29,6 @@ var (
        }()
 )
 
-type stubQuotaError struct {
-       error
-}
-
-func (stubQuotaError) IsQuotaError() bool { return true }
-
 type stubPool struct {
        notify    <-chan struct{}
        unalloc   map[arvados.InstanceType]int // idle+booting+unknown
index 5ca83d263c1c481bd71c968299744e2cf9b2486d..826e5c1af3bf05ae28f1b4b4a278190472910812 100644 (file)
@@ -54,6 +54,8 @@ type StubDriver struct {
        MinTimeBetweenCreateCalls    time.Duration
        MinTimeBetweenInstancesCalls time.Duration
 
+       QuotaMaxInstances int
+
        // If true, Create and Destroy calls block until Release() is
        // called.
        HoldCloudOps bool
@@ -124,6 +126,9 @@ func (sis *StubInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID,
        if math_rand.Float64() < sis.driver.ErrorRateCreate {
                return nil, fmt.Errorf("StubInstanceSet: rand < ErrorRateCreate %f", sis.driver.ErrorRateCreate)
        }
+       if max := sis.driver.QuotaMaxInstances; max > 0 && len(sis.servers) >= max {
+               return nil, QuotaError{fmt.Errorf("StubInstanceSet: reached QuotaMaxInstances %d", max)}
+       }
        sis.allowCreateCall = time.Now().Add(sis.driver.MinTimeBetweenCreateCalls)
        ak := sis.driver.AuthorizedKeys
        if authKey != nil {
@@ -489,3 +494,9 @@ func copyTags(src cloud.InstanceTags) cloud.InstanceTags {
 func (si stubInstance) PriceHistory(arvados.InstanceType) []cloud.InstancePrice {
        return nil
 }
+
+type QuotaError struct {
+       error
+}
+
+func (QuotaError) IsQuotaError() bool { return true }
index f79bad98fc16397e85f1469ef8d8013214aad8f9..15b0dbcde57d4d3af93233b11bcc663973015a58 100644 (file)
@@ -1053,6 +1053,10 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
        }
 
        if wp.atQuotaUntilFewerInstances > len(wp.workers)+len(wp.creating) {
+               // After syncing, there are fewer instances (including
+               // pending creates) than there were last time we saw a
+               // quota error.  This might mean it's now possible to
+               // create new instances.  Reset our "at quota" state.
                wp.atQuotaUntilFewerInstances = 0
        }
 
index 8b4be1a3c77aa8f01ebe7dad4a3c266da3c36c81..7d94146cf725b402f35dee43f50192d4684ddfd1 100644 (file)
@@ -639,10 +639,12 @@ func (wkr *worker) Close() {
        for uuid, rr := range wkr.running {
                wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process abandoned")
                rr.Close()
+               delete(wkr.running, uuid)
        }
        for uuid, rr := range wkr.starting {
                wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process abandoned")
                rr.Close()
+               delete(wkr.starting, uuid)
        }
 }