From a5dbdd1dbcdfcb835e7c1fe741e4c00927177404 Mon Sep 17 00:00:00 2001
From: Ward Vandewege <ward@curii.com>
Date: Mon, 14 Sep 2020 15:14:32 -0400
Subject: [PATCH] 16636: add 'containers_longest_wait_time_seconds' metric: a
 gauge that        indicates the current longest wait time between seeing a   
     container in the queue or requeueing, and starting its crunch-run       
 process on a worker.

Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>
---
 lib/dispatchcloud/scheduler/run_queue.go      |  8 ++++
 lib/dispatchcloud/scheduler/run_queue_test.go | 41 +++++++++++++++++--
 lib/dispatchcloud/scheduler/scheduler.go      |  8 ++++
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go
index b9d653a821..9bbb064fee 100644
--- a/lib/dispatchcloud/scheduler/run_queue.go
+++ b/lib/dispatchcloud/scheduler/run_queue.go
@@ -34,6 +34,7 @@ func (sch *Scheduler) runQueue() {
 	dontstart := map[arvados.InstanceType]bool{}
 	var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
 	var containerAllocatedWorkerBootingCount int
+	var longestWaitTimeCandidate, previousLongestWaitTimeCandidate float64
 
 tryrun:
 	for i, ctr := range sorted {
@@ -45,6 +46,11 @@ tryrun:
 		if _, running := running[ctr.UUID]; running || ctr.Priority < 1 {
 			continue
 		}
+		previousLongestWaitTimeCandidate = longestWaitTimeCandidate
+		since := time.Since(ctr.CreatedAt).Seconds()
+		if since > longestWaitTimeCandidate {
+			longestWaitTimeCandidate = since
+		}
 		switch ctr.State {
 		case arvados.ContainerStateQueued:
 			if unalloc[it] < 1 && sch.pool.AtQuota() {
@@ -92,6 +98,7 @@ tryrun:
 				logger.Info("not restarting yet: crunch-run process from previous attempt has not exited")
 			} else if sch.pool.StartContainer(it, ctr) {
 				// Success.
+				longestWaitTimeCandidate = previousLongestWaitTimeCandidate
 			} else {
 				containerAllocatedWorkerBootingCount += 1
 				dontstart[it] = true
@@ -101,6 +108,7 @@ tryrun:
 
 	sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
 	sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota)))
+	sch.mLongestWaitTimeSinceQueue.Set(longestWaitTimeCandidate)
 
 	if len(overquota) > 0 {
 		// Unlock any containers that are unmappable while
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index 6d70364989..e7963ca7d4 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -391,15 +391,16 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) {
 	c.Check(pool.Running(), check.HasLen, 0)
 }
 
-func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
+func (*SchedulerSuite) TestContainersMetrics(c *check.C) {
 	ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
 	queue := test.Queue{
 		ChooseType: chooseType,
 		Containers: []arvados.Container{
 			{
-				UUID:     test.ContainerUUID(1),
-				Priority: 1,
-				State:    arvados.ContainerStateLocked,
+				UUID:      test.ContainerUUID(1),
+				Priority:  1,
+				State:     arvados.ContainerStateLocked,
+				CreatedAt: time.Now().Add(-10 * time.Second),
 				RuntimeConstraints: arvados.RuntimeConstraints{
 					VCPUs: 1,
 					RAM:   1 << 30,
@@ -420,6 +421,7 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
 
 	c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1)
 	c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0)
+	c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
 
 	// Create a pool without workers. The queued container will not be started, and the
 	// 'over quota' metric will be 1 because no workers are available and canCreate defaults
@@ -430,4 +432,35 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
 
 	c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0)
 	c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1)
+	c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
+
+	// Reset the queue, and create a pool with an idle worker. The queued
+	// container will be started immediately and mLongestWaitTimeSinceQueue
+	// should be zero.
+	queue = test.Queue{
+		ChooseType: chooseType,
+		Containers: []arvados.Container{
+			{
+				UUID:      test.ContainerUUID(1),
+				Priority:  1,
+				State:     arvados.ContainerStateLocked,
+				CreatedAt: time.Now().Add(-10 * time.Second),
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 1,
+					RAM:   1 << 30,
+				},
+			},
+		},
+	}
+	queue.Update()
+
+	pool = stubPool{
+		idle:    map[arvados.InstanceType]int{test.InstanceType(1): 1},
+		unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+		running: map[string]time.Time{},
+	}
+	sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+	sch.runQueue()
+
+	c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0)
 }
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index 6fd47e9194..c169245735 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -46,6 +46,7 @@ type Scheduler struct {
 
 	mContainersAllocatedNotStarted   prometheus.Gauge
 	mContainersNotAllocatedOverQuota prometheus.Gauge
+	mLongestWaitTimeSinceQueue       prometheus.Gauge
 }
 
 // New returns a new unstarted Scheduler.
@@ -87,6 +88,13 @@ func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
 		Help:      "Number of containers not allocated to a worker because the system has hit a quota.",
 	})
 	reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
+	sch.mLongestWaitTimeSinceQueue = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "containers_longest_wait_time_seconds",
+		Help:      "Current longest wait time of any container since queuing, and before the start of crunch-run.",
+	})
+	reg.MustRegister(sch.mLongestWaitTimeSinceQueue)
 }
 
 // Start starts the scheduler.
-- 
2.30.2