X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/505c8fa50631201e289cc55230d46fdf52fa2055..bfb9e29c250bcfb34a6b1813ca46953503ca05e6:/lib/dispatchcloud/scheduler/run_queue_test.go diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go index 992edddfba..8553993375 100644 --- a/lib/dispatchcloud/scheduler/run_queue_test.go +++ b/lib/dispatchcloud/scheduler/run_queue_test.go @@ -13,6 +13,9 @@ import ( "git.arvados.org/arvados.git/lib/dispatchcloud/worker" "git.arvados.org/arvados.git/sdk/go/arvados" "git.arvados.org/arvados.git/sdk/go/ctxlog" + + "github.com/prometheus/client_golang/prometheus/testutil" + check "gopkg.in/check.v1" ) @@ -38,7 +41,7 @@ type stubPool struct { idle map[arvados.InstanceType]int unknown map[arvados.InstanceType]int running map[string]time.Time - atQuota bool + quota int canCreate int creates []arvados.InstanceType starts []string @@ -46,7 +49,18 @@ type stubPool struct { sync.Mutex } -func (p *stubPool) AtQuota() bool { return p.atQuota } +func (p *stubPool) AtQuota() bool { + p.Lock() + defer p.Unlock() + n := len(p.running) + for _, nn := range p.unalloc { + n += nn + } + for _, nn := range p.unknown { + n += nn + } + return n >= p.quota +} func (p *stubPool) Subscribe() <-chan struct{} { return p.notify } func (p *stubPool) Unsubscribe(<-chan struct{}) {} func (p *stubPool) Running() map[string]time.Time { @@ -122,11 +136,8 @@ var _ = check.Suite(&SchedulerSuite{}) type SchedulerSuite struct{} -// Assign priority=4 container to idle node. Create a new instance for -// the priority=3 container. Don't try to start any priority<3 -// containers because priority=3 container didn't start -// immediately. Don't try to create any other nodes after the failed -// create. +// Assign priority=4 container to idle node. Create new instances for +// the priority=3, 2, 1 containers. func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) { ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c)) queue := test.Queue{ @@ -172,6 +183,7 @@ func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) { } queue.Update() pool := stubPool{ + quota: 1000, unalloc: map[arvados.InstanceType]int{ test.InstanceType(1): 1, test.InstanceType(2): 2, @@ -183,8 +195,8 @@ func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) { running: map[string]time.Time{}, canCreate: 0, } - New(ctx, &queue, &pool, time.Millisecond, time.Millisecond).runQueue() - c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(1)}) + New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond).runQueue() + c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(1), test.InstanceType(1), test.InstanceType(1)}) c.Check(pool.starts, check.DeepEquals, []string{test.ContainerUUID(4)}) c.Check(pool.running, check.HasLen, 1) for uuid := range pool.running { @@ -192,16 +204,12 @@ func (*SchedulerSuite) TestUseIdleWorkers(c *check.C) { } } -// If Create() fails, shutdown some nodes, and don't call Create() -// again. Don't call Create() at all if AtQuota() is true. +// If pool.AtQuota() is true, shutdown some unalloc nodes, and don't +// call Create(). func (*SchedulerSuite) TestShutdownAtQuota(c *check.C) { ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c)) - for quota := 0; quota < 2; quota++ { + for quota := 1; quota <= 3; quota++ { c.Logf("quota=%d", quota) - shouldCreate := []arvados.InstanceType{} - for i := 0; i < quota; i++ { - shouldCreate = append(shouldCreate, test.InstanceType(3)) - } queue := test.Queue{ ChooseType: chooseType, Containers: []arvados.Container{ @@ -227,7 +235,7 @@ func (*SchedulerSuite) TestShutdownAtQuota(c *check.C) { } queue.Update() pool := stubPool{ - atQuota: quota == 0, + quota: quota, unalloc: map[arvados.InstanceType]int{ test.InstanceType(2): 2, }, @@ -239,10 +247,93 @@ func (*SchedulerSuite) TestShutdownAtQuota(c *check.C) { starts: []string{}, canCreate: 0, } - New(ctx, &queue, &pool, time.Millisecond, time.Millisecond).runQueue() - c.Check(pool.creates, check.DeepEquals, shouldCreate) - c.Check(pool.starts, check.DeepEquals, []string{}) - c.Check(pool.shutdowns, check.Not(check.Equals), 0) + sch := New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond) + sch.sync() + sch.runQueue() + sch.sync() + switch quota { + case 1, 2: + // Can't create a type3 node for ctr3, so we + // shutdown an unallocated node (type2), and + // unlock both containers. + c.Check(pool.starts, check.HasLen, 0) + c.Check(pool.shutdowns, check.Equals, 1) + c.Check(pool.creates, check.HasLen, 0) + c.Check(queue.StateChanges(), check.DeepEquals, []test.QueueStateChange{ + {UUID: test.ContainerUUID(3), From: "Locked", To: "Queued"}, + {UUID: test.ContainerUUID(2), From: "Locked", To: "Queued"}, + }) + case 3: + // Creating a type3 instance works, so we + // start ctr2 on a type2 instance, and leave + // ctr3 locked while we wait for the new + // instance to come up. + c.Check(pool.starts, check.DeepEquals, []string{test.ContainerUUID(2)}) + c.Check(pool.shutdowns, check.Equals, 0) + c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(3)}) + c.Check(queue.StateChanges(), check.HasLen, 0) + default: + panic("test not written for quota>3") + } + } +} + +// Don't flap lock/unlock when equal-priority containers compete for +// limited workers. +// +// (Unless we use FirstSeenAt as a secondary sort key, each runQueue() +// tends to choose a different one of the equal-priority containers as +// the "first" one that should be locked, and unlock the one it chose +// last time. This generates logging noise, and fails containers by +// reaching MaxDispatchAttempts quickly.) +func (*SchedulerSuite) TestEqualPriorityContainers(c *check.C) { + logger := ctxlog.TestLogger(c) + ctx := ctxlog.Context(context.Background(), logger) + queue := test.Queue{ + ChooseType: chooseType, + Logger: logger, + } + for i := 0; i < 8; i++ { + queue.Containers = append(queue.Containers, arvados.Container{ + UUID: test.ContainerUUID(i), + Priority: 333, + State: arvados.ContainerStateQueued, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 3, + RAM: 3 << 30, + }, + }) + } + queue.Update() + pool := stubPool{ + quota: 2, + unalloc: map[arvados.InstanceType]int{ + test.InstanceType(3): 2, + }, + idle: map[arvados.InstanceType]int{ + test.InstanceType(3): 2, + }, + running: map[string]time.Time{}, + creates: []arvados.InstanceType{}, + starts: []string{}, + canCreate: 0, + } + sch := New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond) + for i := 0; i < 30; i++ { + sch.runQueue() + sch.sync() + time.Sleep(time.Millisecond) + } + c.Check(pool.shutdowns, check.Equals, 0) + c.Check(pool.starts, check.HasLen, 2) + unlocked := map[string]int{} + for _, chg := range queue.StateChanges() { + if chg.To == arvados.ContainerStateQueued { + unlocked[chg.UUID]++ + } + } + for uuid, count := range unlocked { + c.Check(count, check.Equals, 1, check.Commentf("%s", uuid)) } } @@ -251,6 +342,7 @@ func (*SchedulerSuite) TestShutdownAtQuota(c *check.C) { func (*SchedulerSuite) TestStartWhileCreating(c *check.C) { ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c)) pool := stubPool{ + quota: 1000, unalloc: map[arvados.InstanceType]int{ test.InstanceType(1): 2, test.InstanceType(2): 2, @@ -328,7 +420,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) { }, } queue.Update() - New(ctx, &queue, &pool, time.Millisecond, time.Millisecond).runQueue() + New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond).runQueue() c.Check(pool.creates, check.DeepEquals, []arvados.InstanceType{test.InstanceType(2), test.InstanceType(1)}) c.Check(pool.starts, check.DeepEquals, []string{uuids[6], uuids[5], uuids[3], uuids[2]}) running := map[string]bool{} @@ -345,6 +437,7 @@ func (*SchedulerSuite) TestStartWhileCreating(c *check.C) { func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) { ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c)) pool := stubPool{ + quota: 1000, unalloc: map[arvados.InstanceType]int{ test.InstanceType(2): 0, }, @@ -352,7 +445,7 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) { test.InstanceType(2): 0, }, running: map[string]time.Time{ - test.ContainerUUID(2): time.Time{}, + test.ContainerUUID(2): {}, }, } queue := test.Queue{ @@ -371,10 +464,87 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) { }, } queue.Update() - sch := New(ctx, &queue, &pool, time.Millisecond, time.Millisecond) + sch := New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond) c.Check(pool.running, check.HasLen, 1) sch.sync() for deadline := time.Now().Add(time.Second); len(pool.Running()) > 0 && time.Now().Before(deadline); time.Sleep(time.Millisecond) { } c.Check(pool.Running(), check.HasLen, 0) } + +func (*SchedulerSuite) TestContainersMetrics(c *check.C) { + ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c)) + queue := test.Queue{ + ChooseType: chooseType, + Containers: []arvados.Container{ + { + UUID: test.ContainerUUID(1), + Priority: 1, + State: arvados.ContainerStateLocked, + CreatedAt: time.Now().Add(-10 * time.Second), + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 1, + RAM: 1 << 30, + }, + }, + }, + } + queue.Update() + + // Create a pool with one unallocated (idle/booting/unknown) worker, + // and `idle` and `unknown` not set (empty). Iow this worker is in the booting + // state, and the container will be allocated but not started yet. + pool := stubPool{ + unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1}, + } + sch := New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond) + sch.runQueue() + sch.updateMetrics() + + c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1) + c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0) + c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10) + + // Create a pool without workers. The queued container will not be started, and the + // 'over quota' metric will be 1 because no workers are available and canCreate defaults + // to zero. + pool = stubPool{} + sch = New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond) + sch.runQueue() + sch.updateMetrics() + + c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0) + c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1) + c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10) + + // Reset the queue, and create a pool with an idle worker. The queued + // container will be started immediately and mLongestWaitTimeSinceQueue + // should be zero. + queue = test.Queue{ + ChooseType: chooseType, + Containers: []arvados.Container{ + { + UUID: test.ContainerUUID(1), + Priority: 1, + State: arvados.ContainerStateLocked, + CreatedAt: time.Now().Add(-10 * time.Second), + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 1, + RAM: 1 << 30, + }, + }, + }, + } + queue.Update() + + pool = stubPool{ + idle: map[arvados.InstanceType]int{test.InstanceType(1): 1}, + unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1}, + running: map[string]time.Time{}, + } + sch = New(ctx, arvados.NewClientFromEnv(), &queue, &pool, nil, time.Millisecond, time.Millisecond) + sch.runQueue() + sch.updateMetrics() + + c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0) +}