20457: Don't lock-cycle next-in-line ctr while waiting for quota.
author Tom Clegg <tom@curii.com>
Fri, 11 Aug 2023 14:17:30 +0000 (10:17 -0400)
committer Tom Clegg <tom@curii.com>
Fri, 11 Aug 2023 14:17:47 +0000 (10:17 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

lib/dispatchcloud/scheduler/run_queue.go

index d3d6b7e7b4050d01a2a07bca722c8df611417502..6a717bf44463b7688457260b934af091d8bfb4d7 100644 (file)
@@ -253,7 +253,28 @@ tryrun:
                // we're at quota (but if they have already been
                // scheduled and they're loading docker images etc.,
                // let them run).
-               for _, ctr := range append(overmaxsuper, overquota...) {
+               var unlock []container.QueueEnt
+               unlock = append(unlock, overmaxsuper...)
+               if totalInstances > 0 && len(overquota) > 1 {
+                       // We don't unlock the next-in-line container
+                       // when at quota.  This avoids a situation
+                       // where our "at quota" state expires, we lock
+                       // the next container and try to create an
+                       // instance, the cloud provider still returns
+                       // a quota error, we unlock the container, and
+                       // we repeat this until the container reaches
+                       // its limit of lock/unlock cycles.
+                       unlock = append(unlock, overquota[1:]...)
+               } else {
+                       // However, if totalInstances is 0 and we're
+                       // still getting quota errors, then the
+                       // next-in-line container is evidently not
+                       // possible to run, so we should let it
+                       // exhaust its lock/unlock cycles and
+                       // eventually cancel, to avoid starvation.
+                       unlock = append(unlock, overquota...)
+               }
+               for _, ctr := range unlock {
                        ctr := ctr.Container
                        _, toolate := running[ctr.UUID]
                        if ctr.State == arvados.ContainerStateLocked && !toolate {