From a5dbdd1dbcdfcb835e7c1fe741e4c00927177404 Mon Sep 17 00:00:00 2001 From: Ward Vandewege Date: Mon, 14 Sep 2020 15:14:32 -0400 Subject: [PATCH] 16636: add 'containers_longest_wait_time_seconds' metric: a gauge that indicates the current longest wait time between seeing a container in the queue or requeueing, and starting its crunch-run process on a worker. Arvados-DCO-1.1-Signed-off-by: Ward Vandewege --- lib/dispatchcloud/scheduler/run_queue.go | 8 ++++ lib/dispatchcloud/scheduler/run_queue_test.go | 41 +++++++++++++++++-- lib/dispatchcloud/scheduler/scheduler.go | 8 ++++ 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go index b9d653a821..9bbb064fee 100644 --- a/lib/dispatchcloud/scheduler/run_queue.go +++ b/lib/dispatchcloud/scheduler/run_queue.go @@ -34,6 +34,7 @@ func (sch *Scheduler) runQueue() { dontstart := map[arvados.InstanceType]bool{} var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota var containerAllocatedWorkerBootingCount int + var longestWaitTimeCandidate, previousLongestWaitTimeCandidate float64 tryrun: for i, ctr := range sorted { @@ -45,6 +46,11 @@ tryrun: if _, running := running[ctr.UUID]; running || ctr.Priority < 1 { continue } + previousLongestWaitTimeCandidate = longestWaitTimeCandidate + since := time.Since(ctr.CreatedAt).Seconds() + if since > longestWaitTimeCandidate { + longestWaitTimeCandidate = since + } switch ctr.State { case arvados.ContainerStateQueued: if unalloc[it] < 1 && sch.pool.AtQuota() { @@ -92,6 +98,7 @@ tryrun: logger.Info("not restarting yet: crunch-run process from previous attempt has not exited") } else if sch.pool.StartContainer(it, ctr) { // Success. + longestWaitTimeCandidate = previousLongestWaitTimeCandidate } else { containerAllocatedWorkerBootingCount += 1 dontstart[it] = true @@ -101,6 +108,7 @@ tryrun: sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount)) sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota))) + sch.mLongestWaitTimeSinceQueue.Set(longestWaitTimeCandidate) if len(overquota) > 0 { // Unlock any containers that are unmappable while diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go index 6d70364989..e7963ca7d4 100644 --- a/lib/dispatchcloud/scheduler/run_queue_test.go +++ b/lib/dispatchcloud/scheduler/run_queue_test.go @@ -391,15 +391,16 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) { c.Check(pool.Running(), check.HasLen, 0) } -func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) { +func (*SchedulerSuite) TestContainersMetrics(c *check.C) { ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c)) queue := test.Queue{ ChooseType: chooseType, Containers: []arvados.Container{ { - UUID: test.ContainerUUID(1), - Priority: 1, - State: arvados.ContainerStateLocked, + UUID: test.ContainerUUID(1), + Priority: 1, + State: arvados.ContainerStateLocked, + CreatedAt: time.Now().Add(-10 * time.Second), RuntimeConstraints: arvados.RuntimeConstraints{ VCPUs: 1, RAM: 1 << 30, @@ -420,6 +421,7 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) { c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1) c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0) + c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10) // Create a pool without workers. The queued container will not be started, and the // 'over quota' metric will be 1 because no workers are available and canCreate defaults @@ -430,4 +432,35 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) { c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0) c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1) + c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10) + + // Reset the queue, and create a pool with an idle worker. The queued + // container will be started immediately and mLongestWaitTimeSinceQueue + // should be zero. + queue = test.Queue{ + ChooseType: chooseType, + Containers: []arvados.Container{ + { + UUID: test.ContainerUUID(1), + Priority: 1, + State: arvados.ContainerStateLocked, + CreatedAt: time.Now().Add(-10 * time.Second), + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 1, + RAM: 1 << 30, + }, + }, + }, + } + queue.Update() + + pool = stubPool{ + idle: map[arvados.InstanceType]int{test.InstanceType(1): 1}, + unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1}, + running: map[string]time.Time{}, + } + sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond) + sch.runQueue() + + c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0) } diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go index 6fd47e9194..c169245735 100644 --- a/lib/dispatchcloud/scheduler/scheduler.go +++ b/lib/dispatchcloud/scheduler/scheduler.go @@ -46,6 +46,7 @@ type Scheduler struct { mContainersAllocatedNotStarted prometheus.Gauge mContainersNotAllocatedOverQuota prometheus.Gauge + mLongestWaitTimeSinceQueue prometheus.Gauge } // New returns a new unstarted Scheduler. @@ -87,6 +88,13 @@ func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) { Help: "Number of containers not allocated to a worker because the system has hit a quota.", }) reg.MustRegister(sch.mContainersNotAllocatedOverQuota) + sch.mLongestWaitTimeSinceQueue = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "arvados", + Subsystem: "dispatchcloud", + Name: "containers_longest_wait_time_seconds", + Help: "Current longest wait time of any container since queuing, and before the start of crunch-run.", + }) + reg.MustRegister(sch.mLongestWaitTimeSinceQueue) } // Start starts the scheduler. -- 2.30.2