16636: add 'containers_longest_wait_time_seconds' metric: a gauge that
authorWard Vandewege <ward@curii.com>
Mon, 14 Sep 2020 19:14:32 +0000 (15:14 -0400)
committerWard Vandewege <ward@curii.com>
Mon, 14 Sep 2020 19:14:32 +0000 (15:14 -0400)
       indicates the current longest wait time between seeing a
       container in the queue or requeueing, and starting its crunch-run
       process on a worker.

Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>

lib/dispatchcloud/scheduler/run_queue.go
lib/dispatchcloud/scheduler/run_queue_test.go
lib/dispatchcloud/scheduler/scheduler.go

index b9d653a821e4b6650d2666e368414df43843e4b8..9bbb064fee6a06bcc769b49fe8b7ae2b417b1af3 100644 (file)
@@ -34,6 +34,7 @@ func (sch *Scheduler) runQueue() {
        dontstart := map[arvados.InstanceType]bool{}
        var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
        var containerAllocatedWorkerBootingCount int
+       var longestWaitTimeCandidate, previousLongestWaitTimeCandidate float64
 
 tryrun:
        for i, ctr := range sorted {
@@ -45,6 +46,11 @@ tryrun:
                if _, running := running[ctr.UUID]; running || ctr.Priority < 1 {
                        continue
                }
+               previousLongestWaitTimeCandidate = longestWaitTimeCandidate
+               since := time.Since(ctr.CreatedAt).Seconds()
+               if since > longestWaitTimeCandidate {
+                       longestWaitTimeCandidate = since
+               }
                switch ctr.State {
                case arvados.ContainerStateQueued:
                        if unalloc[it] < 1 && sch.pool.AtQuota() {
@@ -92,6 +98,7 @@ tryrun:
                                logger.Info("not restarting yet: crunch-run process from previous attempt has not exited")
                        } else if sch.pool.StartContainer(it, ctr) {
                                // Success.
+                               longestWaitTimeCandidate = previousLongestWaitTimeCandidate
                        } else {
                                containerAllocatedWorkerBootingCount += 1
                                dontstart[it] = true
@@ -101,6 +108,7 @@ tryrun:
 
        sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
        sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota)))
+       sch.mLongestWaitTimeSinceQueue.Set(longestWaitTimeCandidate)
 
        if len(overquota) > 0 {
                // Unlock any containers that are unmappable while
index 6d703649893356bdb91429954128d1e5afd6a9c5..e7963ca7d4bd99f839fb17a7731a5fe5fefc2c89 100644 (file)
@@ -391,15 +391,16 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) {
        c.Check(pool.Running(), check.HasLen, 0)
 }
 
-func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
+func (*SchedulerSuite) TestContainersMetrics(c *check.C) {
        ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
        queue := test.Queue{
                ChooseType: chooseType,
                Containers: []arvados.Container{
                        {
-                               UUID:     test.ContainerUUID(1),
-                               Priority: 1,
-                               State:    arvados.ContainerStateLocked,
+                               UUID:      test.ContainerUUID(1),
+                               Priority:  1,
+                               State:     arvados.ContainerStateLocked,
+                               CreatedAt: time.Now().Add(-10 * time.Second),
                                RuntimeConstraints: arvados.RuntimeConstraints{
                                        VCPUs: 1,
                                        RAM:   1 << 30,
@@ -420,6 +421,7 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
 
        c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1)
        c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0)
+       c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
 
        // Create a pool without workers. The queued container will not be started, and the
        // 'over quota' metric will be 1 because no workers are available and canCreate defaults
@@ -430,4 +432,35 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
 
        c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0)
        c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1)
+       c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
+
+       // Reset the queue, and create a pool with an idle worker. The queued
+       // container will be started immediately and mLongestWaitTimeSinceQueue
+       // should be zero.
+       queue = test.Queue{
+               ChooseType: chooseType,
+               Containers: []arvados.Container{
+                       {
+                               UUID:      test.ContainerUUID(1),
+                               Priority:  1,
+                               State:     arvados.ContainerStateLocked,
+                               CreatedAt: time.Now().Add(-10 * time.Second),
+                               RuntimeConstraints: arvados.RuntimeConstraints{
+                                       VCPUs: 1,
+                                       RAM:   1 << 30,
+                               },
+                       },
+               },
+       }
+       queue.Update()
+
+       pool = stubPool{
+               idle:    map[arvados.InstanceType]int{test.InstanceType(1): 1},
+               unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+               running: map[string]time.Time{},
+       }
+       sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+       sch.runQueue()
+
+       c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0)
 }
index 6fd47e919411d69026316d24d8643e5693d06f38..c169245735fb65b8ed7de8c73d504447905fcbae 100644 (file)
@@ -46,6 +46,7 @@ type Scheduler struct {
 
        mContainersAllocatedNotStarted   prometheus.Gauge
        mContainersNotAllocatedOverQuota prometheus.Gauge
+       mLongestWaitTimeSinceQueue       prometheus.Gauge
 }
 
 // New returns a new unstarted Scheduler.
@@ -87,6 +88,13 @@ func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
                Help:      "Number of containers not allocated to a worker because the system has hit a quota.",
        })
        reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
+       sch.mLongestWaitTimeSinceQueue = prometheus.NewGauge(prometheus.GaugeOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "containers_longest_wait_time_seconds",
+               Help:      "Current longest wait time of any container since queuing, and before the start of crunch-run.",
+       })
+       reg.MustRegister(sch.mLongestWaitTimeSinceQueue)
 }
 
 // Start starts the scheduler.