20511: Don't shut down excess instances just because the MaxSupervisors limit was reached.
author Tom Clegg <tom@curii.com>
Thu, 25 May 2023 21:10:48 +0000 (17:10 -0400)
committer Tom Clegg <tom@curii.com>
Thu, 25 May 2023 21:10:48 +0000 (17:10 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

lib/dispatchcloud/scheduler/run_queue.go
lib/dispatchcloud/scheduler/run_queue_test.go

index db6e97b5961e2018965ecec27ccbef818a73ba06..adf89778981311c0d44819773e53f22ea53a13da 100644 (file)
@@ -204,6 +204,8 @@ tryrun:
                                }
                        }
                }
+       }
+       if len(overquota) > 0 {
                // Shut down idle workers that didn't get any
                // containers mapped onto them before we hit quota.
                for it, n := range unalloc {
index 2deff69e9322673c8eaf7c6f7f4720bf0f63b758..8192d47211317f6751b36ab4d7715285d3e9ed21 100644 (file)
@@ -374,6 +374,62 @@ func (*SchedulerSuite) TestIdleIn503QuietPeriod(c *check.C) {
        c.Check(queue.StateChanges(), check.HasLen, 0)
 }
 
+// TestUnlockExcessSupervisors checks that when more supervisor
+// containers are Locked than allowed (e.g., config changed since they
+// started) while suitably sized instances are booting, the excess
+// containers are unlocked but the instances are left to keep booting.
+func (*SchedulerSuite) TestUnlockExcessSupervisors(c *check.C) {
+       ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
+       queue := test.Queue{
+               ChooseType: chooseType,
+       }
+       for i := 1; i <= 6; i++ { // six Locked supervisor containers, priority decreasing with i
+               queue.Containers = append(queue.Containers, arvados.Container{
+                       UUID:     test.ContainerUUID(i),
+                       Priority: int64(1000 - i),
+                       State:    arvados.ContainerStateLocked,
+                       RuntimeConstraints: arvados.RuntimeConstraints{
+                               VCPUs: 2,
+                               RAM:   2 << 30,
+                       },
+                       SchedulingParameters: arvados.SchedulingParameters{
+                               Supervisor: true,
+                       },
+               })
+       }
+       queue.Update()
+       pool := stubPool{
+               quota: 16,
+               unalloc: map[arvados.InstanceType]int{
+                       test.InstanceType(2): 2,
+               },
+               idle: map[arvados.InstanceType]int{
+                       test.InstanceType(2): 1,
+               },
+               running: map[string]time.Time{ // stub pool lists containers 1-4 as running; 5 and 6 are not
+                       test.ContainerUUID(1): {},
+                       test.ContainerUUID(2): {},
+                       test.ContainerUUID(3): {},
+                       test.ContainerUUID(4): {},
+               },
+               creates:   []arvados.InstanceType{},
+               starts:    []string{},
+               canCreate: 0,
+       }
+       sch := New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 4) // NOTE(review): trailing 4 is presumably the max-supervisors limit; confirm against New's signature
+       sch.sync()
+       sch.runQueue()
+       sch.sync()
+
+       c.Check(pool.starts, check.DeepEquals, []string{}) // no containers should be started
+       c.Check(pool.shutdowns, check.Equals, 0) // no instances should be shut down
+       c.Check(pool.creates, check.HasLen, 0) // no instances should be created
+       c.Check(queue.StateChanges(), check.DeepEquals, []test.QueueStateChange{ // only containers 5 and 6 should return to Queued
+               {UUID: test.ContainerUUID(5), From: "Locked", To: "Queued"},
+               {UUID: test.ContainerUUID(6), From: "Locked", To: "Queued"},
+       })
+}
+
 // Don't flap lock/unlock when equal-priority containers compete for
 // limited workers.
 //