X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/7aaf9f22aa646077b4b7fd961d6b731185b88137..e5394906b154b630699c0edd4add36eca34611b3:/lib/dispatchcloud/scheduler/run_queue_test.go diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go index 5b5fa960a1..4359ae03ba 100644 --- a/lib/dispatchcloud/scheduler/run_queue_test.go +++ b/lib/dispatchcloud/scheduler/run_queue_test.go @@ -29,12 +29,6 @@ var ( }() ) -type stubQuotaError struct { - error -} - -func (stubQuotaError) IsQuotaError() bool { return true } - type stubPool struct { notify <-chan struct{} unalloc map[arvados.InstanceType]int // idle+booting+unknown @@ -52,7 +46,14 @@ type stubPool struct { func (p *stubPool) AtQuota() bool { p.Lock() defer p.Unlock() - return len(p.unalloc)+len(p.running)+len(p.unknown) >= p.quota + n := len(p.running) + for _, nn := range p.unalloc { + n += nn + } + for _, nn := range p.unknown { + n += nn + } + return n >= p.quota } func (p *stubPool) Subscribe() <-chan struct{} { return p.notify } func (p *stubPool) Unsubscribe(<-chan struct{}) {} @@ -188,7 +189,7 @@ func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) { running: map[string]time.Time{}, canCreate: 0, } - New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond).runQueue() + New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 0, 0, 0).runQueue() c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(1), test.InstanceType(1), test.InstanceType(1)}) c.Check(pool.starts, check.DeepEquals, []string{test.ContainerUUID(4)}) c.Check(pool.running, check.HasLen, 1) @@ -201,12 +202,8 @@ func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) { // call Create(). func (*SchedulerSuite) TestShutdownAtQuota(c *check.C) { ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c)) - for quota := 1; quota < 3; quota++ { + for quota := 1; quota <= 3; quota++ { c.Logf("quota=%d", quota) - shouldCreate := []arvados.InstanceType{} - for i := 1; i < quota; i++ { - shouldCreate = append(shouldCreate, test.InstanceType(3)) - } queue := test.Queue{ ChooseType: chooseType, Containers: []arvados.Container{ @@ -244,23 +241,243 @@ func (*SchedulerSuite) TestShutdownAtQuota(c *check.C) { starts: []string{}, canCreate: 0, } - sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond) - sch.runQueue() + sch := New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 0, 0, 0) sch.sync() sch.runQueue() sch.sync() - c.Check(pool.creates, check.DeepEquals, shouldCreate) - if len(shouldCreate) == 0 { - c.Check(pool.starts, check.DeepEquals, []string{}) - } else { + switch quota { + case 1, 2: + // Can't create a type3 node for ctr3, so we + // shutdown an unallocated node (type2), and + // unlock both containers. + c.Check(pool.starts, check.HasLen, 0) + c.Check(pool.shutdowns, check.Equals, 1) + c.Check(pool.creates, check.HasLen, 0) + c.Check(queue.StateChanges(), check.DeepEquals, []test.QueueStateChange{ + {UUID: test.ContainerUUID(3), From: "Locked", To: "Queued"}, + {UUID: test.ContainerUUID(2), From: "Locked", To: "Queued"}, + }) + case 3: + // Creating a type3 instance works, so we + // start ctr2 on a type2 instance, and leave + // ctr3 locked while we wait for the new + // instance to come up. c.Check(pool.starts, check.DeepEquals, []string{test.ContainerUUID(2)}) + c.Check(pool.shutdowns, check.Equals, 0) + c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(3)}) + c.Check(queue.StateChanges(), check.HasLen, 0) + default: + panic("test not written for quota>3") } - c.Check(pool.shutdowns, check.Equals, 3-quota) - c.Check(queue.StateChanges(), check.DeepEquals, []test.QueueStateChange{ - {UUID: "zzzzz-dz642-000000000000003", From: "Locked", To: "Queued"}, - {UUID: "zzzzz-dz642-000000000000002", From: "Locked", To: "Queued"}, + } +} + +// Don't unlock containers or shutdown unalloc (booting/idle) nodes +// just because some 503 errors caused us to reduce maxConcurrency +// below the current load level. +// +// We expect to raise maxConcurrency soon when we stop seeing 503s. If +// that doesn't happen soon, the idle timeout will take care of the +// excess nodes. +func (*SchedulerSuite) TestIdleIn503QuietPeriod(c *check.C) { + ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c)) + queue := test.Queue{ + ChooseType: chooseType, + Containers: []arvados.Container{ + // scheduled on an instance (but not Running yet) + { + UUID: test.ContainerUUID(1), + Priority: 1000, + State: arvados.ContainerStateLocked, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 2, + RAM: 2 << 30, + }, + }, + // not yet scheduled + { + UUID: test.ContainerUUID(2), + Priority: 1000, + State: arvados.ContainerStateLocked, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 2, + RAM: 2 << 30, + }, + }, + // scheduled on an instance (but not Running yet) + { + UUID: test.ContainerUUID(3), + Priority: 1000, + State: arvados.ContainerStateLocked, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 3, + RAM: 3 << 30, + }, + }, + // not yet scheduled + { + UUID: test.ContainerUUID(4), + Priority: 1000, + State: arvados.ContainerStateLocked, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 3, + RAM: 3 << 30, + }, + }, + // not yet locked + { + UUID: test.ContainerUUID(5), + Priority: 1000, + State: arvados.ContainerStateQueued, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 3, + RAM: 3 << 30, + }, + }, + }, + } + queue.Update() + pool := stubPool{ + quota: 16, + unalloc: map[arvados.InstanceType]int{ + test.InstanceType(2): 2, + test.InstanceType(3): 2, + }, + idle: map[arvados.InstanceType]int{ + test.InstanceType(2): 1, + test.InstanceType(3): 1, + }, + running: map[string]time.Time{ + test.ContainerUUID(1): {}, + test.ContainerUUID(3): {}, + }, + creates: []arvados.InstanceType{}, + starts: []string{}, + canCreate: 0, + } + sch := New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 0, 0, 0) + sch.last503time = time.Now() + sch.maxConcurrency = 3 + sch.sync() + sch.runQueue() + sch.sync() + + c.Check(pool.starts, check.DeepEquals, []string{test.ContainerUUID(2)}) + c.Check(pool.shutdowns, check.Equals, 0) + c.Check(pool.creates, check.HasLen, 0) + c.Check(queue.StateChanges(), check.HasLen, 0) +} + +// If we somehow have more supervisor containers in Locked state than +// we should (e.g., config changed since they started), and some +// appropriate-sized instances booting up, unlock the excess +// supervisor containers, but let the instances keep booting. +func (*SchedulerSuite) TestUnlockExcessSupervisors(c *check.C) { + ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c)) + queue := test.Queue{ + ChooseType: chooseType, + } + for i := 1; i <= 6; i++ { + queue.Containers = append(queue.Containers, arvados.Container{ + UUID: test.ContainerUUID(i), + Priority: int64(1000 - i), + State: arvados.ContainerStateLocked, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 2, + RAM: 2 << 30, + }, + SchedulingParameters: arvados.SchedulingParameters{ + Supervisor: true, + }, }) } + queue.Update() + pool := stubPool{ + quota: 16, + unalloc: map[arvados.InstanceType]int{ + test.InstanceType(2): 2, + }, + idle: map[arvados.InstanceType]int{ + test.InstanceType(2): 1, + }, + running: map[string]time.Time{ + test.ContainerUUID(1): {}, + test.ContainerUUID(2): {}, + test.ContainerUUID(3): {}, + test.ContainerUUID(4): {}, + }, + creates: []arvados.InstanceType{}, + starts: []string{}, + canCreate: 0, + } + sch := New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 0, 8, 0.5) + sch.sync() + sch.runQueue() + sch.sync() + + c.Check(pool.starts, check.DeepEquals, []string{}) + c.Check(pool.shutdowns, check.Equals, 0) + c.Check(pool.creates, check.HasLen, 0) + c.Check(queue.StateChanges(), check.DeepEquals, []test.QueueStateChange{ + {UUID: test.ContainerUUID(5), From: "Locked", To: "Queued"}, + {UUID: test.ContainerUUID(6), From: "Locked", To: "Queued"}, + }) +} + +// Assuming we're not at quota, don't try to shutdown idle nodes +// merely because we have more queued/locked supervisor containers +// than MaxSupervisors -- it won't help. +func (*SchedulerSuite) TestExcessSupervisors(c *check.C) { + ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c)) + queue := test.Queue{ + ChooseType: chooseType, + } + for i := 1; i <= 8; i++ { + queue.Containers = append(queue.Containers, arvados.Container{ + UUID: test.ContainerUUID(i), + Priority: int64(1000 + i), + State: arvados.ContainerStateQueued, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 2, + RAM: 2 << 30, + }, + SchedulingParameters: arvados.SchedulingParameters{ + Supervisor: true, + }, + }) + } + for i := 2; i < 4; i++ { + queue.Containers[i].State = arvados.ContainerStateLocked + } + for i := 4; i < 6; i++ { + queue.Containers[i].State = arvados.ContainerStateRunning + } + queue.Update() + pool := stubPool{ + quota: 16, + unalloc: map[arvados.InstanceType]int{ + test.InstanceType(2): 2, + }, + idle: map[arvados.InstanceType]int{ + test.InstanceType(2): 1, + }, + running: map[string]time.Time{ + test.ContainerUUID(5): {}, + test.ContainerUUID(6): {}, + }, + creates: []arvados.InstanceType{}, + starts: []string{}, + canCreate: 0, + } + sch := New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 0, 8, 0.5) + sch.sync() + sch.runQueue() + sch.sync() + + c.Check(pool.starts, check.HasLen, 2) + c.Check(pool.shutdowns, check.Equals, 0) + c.Check(pool.creates, check.HasLen, 0) + c.Check(queue.StateChanges(), check.HasLen, 0) } // Don't flap lock/unlock when equal-priority containers compete for @@ -293,24 +510,24 @@ func (*SchedulerSuite) TestEqualPriorityContainers(c *check.C) { pool := stubPool{ quota: 2, unalloc: map[arvados.InstanceType]int{ - test.InstanceType(3): 1, + test.InstanceType(3): 2, }, idle: map[arvados.InstanceType]int{ - test.InstanceType(3): 1, + test.InstanceType(3): 2, }, running: map[string]time.Time{}, creates: []arvados.InstanceType{}, starts: []string{}, - canCreate: 1, + canCreate: 0, } - sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond) + sch := New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 0, 0, 0) for i := 0; i < 30; i++ { sch.runQueue() sch.sync() time.Sleep(time.Millisecond) } c.Check(pool.shutdowns, check.Equals, 0) - c.Check(pool.starts, check.HasLen, 1) + c.Check(pool.starts, check.HasLen, 2) unlocked := map[string]int{} for _, chg := range queue.StateChanges() { if chg.To == arvados.ContainerStateQueued { @@ -405,7 +622,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) { }, } queue.Update() - New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond).runQueue() + New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 0, 0, 0).runQueue() c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(2), test.InstanceType(1)}) c.Check(pool.starts, check.DeepEquals, []string{uuids[6], uuids[5], uuids[3], uuids[2]}) running := map[string]bool{} @@ -449,7 +666,7 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) { }, } queue.Update() - sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond) + sch := New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 0, 0, 0) c.Check(pool.running, check.HasLen, 1) sch.sync() for deadline := time.Now().Add(time.Second); len(pool.Running()) > 0 && time.Now().Before(deadline); time.Sleep(time.Millisecond) { @@ -482,7 +699,7 @@ func (*SchedulerSuite) TestContainersMetrics(c *check.C) { pool := stubPool{ unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1}, } - sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond) + sch := New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 0, 0, 0) sch.runQueue() sch.updateMetrics() @@ -494,7 +711,7 @@ func (*SchedulerSuite) TestContainersMetrics(c *check.C) { // 'over quota' metric will be 1 because no workers are available and canCreate defaults // to zero. pool = stubPool{} - sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond) + sch = New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 0, 0, 0) sch.runQueue() sch.updateMetrics() @@ -527,9 +744,81 @@ func (*SchedulerSuite) TestContainersMetrics(c *check.C) { unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1}, running: map[string]time.Time{}, } - sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond) + sch = New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 0, 0, 0) sch.runQueue() sch.updateMetrics() c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0) } + +// Assign priority=4, 3 and 1 containers to idle nodes. Ignore the supervisor at priority 2. +func (*SchedulerSuite) TestSkipSupervisors(c *check.C) { + ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c)) + queue := test.Queue{ + ChooseType: chooseType, + Containers: []arvados.Container{ + { + UUID: test.ContainerUUID(1), + Priority: 1, + State: arvados.ContainerStateLocked, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 1, + RAM: 1 << 30, + }, + }, + { + UUID: test.ContainerUUID(2), + Priority: 2, + State: arvados.ContainerStateLocked, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 1, + RAM: 1 << 30, + }, + SchedulingParameters: arvados.SchedulingParameters{ + Supervisor: true, + }, + }, + { + UUID: test.ContainerUUID(3), + Priority: 3, + State: arvados.ContainerStateLocked, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 1, + RAM: 1 << 30, + }, + SchedulingParameters: arvados.SchedulingParameters{ + Supervisor: true, + }, + }, + { + UUID: test.ContainerUUID(4), + Priority: 4, + State: arvados.ContainerStateLocked, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 1, + RAM: 1 << 30, + }, + SchedulingParameters: arvados.SchedulingParameters{ + Supervisor: true, + }, + }, + }, + } + queue.Update() + pool := stubPool{ + quota: 1000, + unalloc: map[arvados.InstanceType]int{ + test.InstanceType(1): 4, + test.InstanceType(2): 4, + }, + idle: map[arvados.InstanceType]int{ + test.InstanceType(1): 4, + test.InstanceType(2): 4, + }, + running: map[string]time.Time{}, + canCreate: 0, + } + New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond, 0, 10, 0.2).runQueue() + c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType(nil)) + c.Check(pool.starts, check.DeepEquals, []string{test.ContainerUUID(4), test.ContainerUUID(3), test.ContainerUUID(1)}) +}