From 11fb3d91a28bf51803f170a53709d709941de788 Mon Sep 17 00:00:00 2001
From: Tom Clegg
Date: Thu, 25 May 2023 17:10:48 -0400
Subject: [PATCH] 20511: Don't shutdown excess instances just because
 MaxSupervisors.

Arvados-DCO-1.1-Signed-off-by: Tom Clegg
---
 lib/dispatchcloud/scheduler/run_queue.go      |  2 +
 lib/dispatchcloud/scheduler/run_queue_test.go | 56 +++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index db6e97b596..adf8977898 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -204,6 +204,8 @@ tryrun:
 				}
 			}
 		}
+	}
+	if len(overquota) > 0 {
 		// Shut down idle workers that didn't get any
 		// containers mapped onto them before we hit quota.
 		for it, n := range unalloc {
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index 2deff69e93..8192d47211 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -374,6 +374,62 @@ func (*SchedulerSuite) TestIdleIn503QuietPeriod(c *check.C) {
 	c.Check(queue.StateChanges(), check.HasLen, 0)
 }
 
+// If we somehow have more supervisor containers in Locked state than
+// we should (e.g., config changed since they started), and some
+// appropriate-sized instances booting up, unlock the excess
+// supervisor containers, but let the instances keep booting.
+func (*SchedulerSuite) TestUnlockExcessSupervisors(c *check.C) {
+	ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
+	queue := test.Queue{
+		ChooseType: chooseType,
+	}
+	for i := 1; i <= 6; i++ {
+		queue.Containers = append(queue.Containers, arvados.Container{
+			UUID:     test.ContainerUUID(i),
+			Priority: int64(1000 - i),
+			State:    arvados.ContainerStateLocked,
+			RuntimeConstraints: arvados.RuntimeConstraints{
+				VCPUs: 2,
+				RAM:   2 << 30,
+			},
+			SchedulingParameters: arvados.SchedulingParameters{
+				Supervisor: true,
+			},
+		})
+	}
+	queue.Update()
+	pool := stubPool{
+		quota: 16,
+		unalloc: map[arvados.InstanceType]int{
+			test.InstanceType(2): 2,
+		},
+		idle: map[arvados.InstanceType]int{
+			test.InstanceType(2): 1,
+		},
+		running: map[string]time.Time{
+			test.ContainerUUID(1): {},
+			test.ContainerUUID(2): {},
+			test.ContainerUUID(3): {},
+			test.ContainerUUID(4): {},
+		},
+		creates:   []arvados.InstanceType{},
+		starts:    []string{},
+		canCreate: 0,
+	}
+	sch := New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 4)
+	sch.sync()
+	sch.runQueue()
+	sch.sync()
+
+	c.Check(pool.starts, check.DeepEquals, []string{})
+	c.Check(pool.shutdowns, check.Equals, 0)
+	c.Check(pool.creates, check.HasLen, 0)
+	c.Check(queue.StateChanges(), check.DeepEquals, []test.QueueStateChange{
+		{UUID: test.ContainerUUID(5), From: "Locked", To: "Queued"},
+		{UUID: test.ContainerUUID(6), From: "Locked", To: "Queued"},
+	})
+}
+
 // Don't flap lock/unlock when equal-priority containers compete for
 // limited workers.
 //
-- 
2.30.2