16636: Merge branch 'master' into 16636-more-metrics
author Ward Vandewege <ward@curii.com>
Tue, 15 Sep 2020 19:32:15 +0000 (15:32 -0400)
committer Ward Vandewege <ward@curii.com>
Tue, 15 Sep 2020 19:32:30 +0000 (15:32 -0400)
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>

doc/Gemfile.lock
lib/dispatchcloud/container/queue.go
lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/scheduler/run_queue_test.go
lib/dispatchcloud/scheduler/scheduler.go
lib/dispatchcloud/worker/pool.go
lib/dispatchcloud/worker/worker.go

diff --git a/doc/Gemfile.lock b/doc/Gemfile.lock
index 344a0a86b51555d60c7bb812afbdc0d5a1819349..b5e62cacd6b0b605599fc1cff58428d6d73673fc 100644
--- a/doc/Gemfile.lock
+++ b/doc/Gemfile.lock
@@ -1,28 +1,23 @@
 GEM
   remote: https://rubygems.org/
   specs:
-    RedCloth (4.2.9)
-    coderay (1.1.0)
-    colorize (0.6.0)
-    kramdown (1.3.1)
-    less (1.2.21)
-      mutter (>= 0.4.2)
-      treetop (>= 1.4.2)
-    liquid (2.6.1)
-    makerakeworkwell (1.0.3)
-      rake (>= 0.9.2, < 11)
-    mutter (0.5.3)
-    polyglot (0.3.3)
-    rake (10.1.1)
-    treetop (1.4.15)
-      polyglot
-      polyglot (>= 0.3.1)
-    zenweb (3.3.1)
+    RedCloth (4.3.2)
+    coderay (1.1.3)
+    colorize (0.8.1)
+    commonjs (0.2.7)
+    kramdown (1.17.0)
+    less (2.6.0)
+      commonjs (~> 0.2.7)
+    liquid (4.0.3)
+    makerakeworkwell (1.0.4)
+      rake (>= 0.9.2, < 15)
+    rake (13.0.1)
+    zenweb (3.10.4)
       coderay (~> 1.0)
-      kramdown (~> 1.0)
-      less (~> 1.2)
+      kramdown (~> 1.4)
+      less (~> 2.0)
       makerakeworkwell (~> 1.0)
-      rake (>= 0.9, < 11)
+      rake (>= 0.9, < 15)
 
 PLATFORMS
   ruby
@@ -32,3 +27,6 @@ DEPENDENCIES
   colorize
   liquid
   zenweb
+
+BUNDLED WITH
+   2.1.4
diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go
index 45b346383fab8641b27d16063a46bc4468fc96ce..a1ff414b73ecbd3be22276b716a64af6eaa3f225 100644
--- a/lib/dispatchcloud/container/queue.go
+++ b/lib/dispatchcloud/container/queue.go
@@ -382,7 +382,7 @@ func (cq *Queue) poll() (map[string]*arvados.Container, error) {
                        *next[upd.UUID] = upd
                }
        }
-       selectParam := []string{"uuid", "state", "priority", "runtime_constraints", "container_image", "mounts", "scheduling_parameters"}
+       selectParam := []string{"uuid", "state", "priority", "runtime_constraints", "container_image", "mounts", "scheduling_parameters", "created_at"}
        limitParam := 1000
 
        mine, err := cq.fetchAll(arvados.ResourceListParams{
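
Note: adding "created_at" to the select list here is what makes Container.CreatedAt available to the scheduler and worker pool changes further down; a field left out of the select list simply comes back as its Go zero value. A minimal sketch of the failure mode this avoids (the waitTime helper is hypothetical, not part of this change):

    package main

    import (
            "fmt"
            "time"

            "git.arvados.org/arvados.git/sdk/go/arvados"
    )

    // waitTime reports how long a container has been waiting since it was queued.
    // Without "created_at" in the select list, CreatedAt stays at the zero time,
    // so treat the zero value as "unknown" rather than reporting a bogus wait.
    func waitTime(ctr arvados.Container) time.Duration {
            if ctr.CreatedAt.IsZero() {
                    return 0
            }
            return time.Since(ctr.CreatedAt)
    }

    func main() {
            fmt.Println(waitTime(arvados.Container{CreatedAt: time.Now().Add(-10 * time.Second)}))
    }
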
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index cb5cdf1cf4e87ba8b9e741fe60775276587d5f0d..9f1eb098e01aa029ebe735f2f8a7a956bf56c236 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -223,6 +223,10 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds{quantile="0.95"} [0-9.]*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_count [0-9]*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_count [0-9]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_count [0-9]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_sum [0-9e+.]*`)
 }
 
 func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
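
For reference, the regexps above only anchor on name suffixes; in the actual metrics output the new summaries carry the full namespace/subsystem prefix set in pool.go. Roughly (sample values invented for illustration):

    arvados_dispatchcloud_instances_time_from_shutdown_request_to_disappearance_seconds_count 3
    arvados_dispatchcloud_instances_time_from_shutdown_request_to_disappearance_seconds_sum 42.7
    arvados_dispatchcloud_containers_time_from_queue_to_crunch_run_seconds_count 4
    arvados_dispatchcloud_containers_time_from_queue_to_crunch_run_seconds_sum 1.234e+02

The [0-9e+.]* pattern in the last check is there because the client library may print the sum in scientific notation.
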
diff --git a/lib/dispatchcloud/scheduler/run_queue_test.go b/lib/dispatchcloud/scheduler/run_queue_test.go
index 6d703649893356bdb91429954128d1e5afd6a9c5..fd1d0a870b7ac9f34f9d1dd39f250fed62b4a099 100644
--- a/lib/dispatchcloud/scheduler/run_queue_test.go
+++ b/lib/dispatchcloud/scheduler/run_queue_test.go
@@ -391,15 +391,16 @@ func (*SchedulerSuite) TestKillNonexistentContainer(c *check.C) {
        c.Check(pool.Running(), check.HasLen, 0)
 }
 
-func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
+func (*SchedulerSuite) TestContainersMetrics(c *check.C) {
        ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
        queue := test.Queue{
                ChooseType: chooseType,
                Containers: []arvados.Container{
                        {
-                               UUID:     test.ContainerUUID(1),
-                               Priority: 1,
-                               State:    arvados.ContainerStateLocked,
+                               UUID:      test.ContainerUUID(1),
+                               Priority:  1,
+                               State:     arvados.ContainerStateLocked,
+                               CreatedAt: time.Now().Add(-10 * time.Second),
                                RuntimeConstraints: arvados.RuntimeConstraints{
                                        VCPUs: 1,
                                        RAM:   1 << 30,
@@ -417,9 +418,11 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
        }
        sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
        sch.runQueue()
+       sch.updateMetrics()
 
        c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1)
        c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0)
+       c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
 
        // Create a pool without workers. The queued container will not be started, and the
        // 'over quota' metric will be 1 because no workers are available and canCreate defaults
@@ -427,7 +430,40 @@ func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
        pool = stubPool{}
        sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
        sch.runQueue()
+       sch.updateMetrics()
 
        c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0)
        c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1)
+       c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
+
+       // Reset the queue, and create a pool with an idle worker. The queued
+       // container will be started immediately and mLongestWaitTimeSinceQueue
+       // should be zero.
+       queue = test.Queue{
+               ChooseType: chooseType,
+               Containers: []arvados.Container{
+                       {
+                               UUID:      test.ContainerUUID(1),
+                               Priority:  1,
+                               State:     arvados.ContainerStateLocked,
+                               CreatedAt: time.Now().Add(-10 * time.Second),
+                               RuntimeConstraints: arvados.RuntimeConstraints{
+                                       VCPUs: 1,
+                                       RAM:   1 << 30,
+                               },
+                       },
+               },
+       }
+       queue.Update()
+
+       pool = stubPool{
+               idle:    map[arvados.InstanceType]int{test.InstanceType(1): 1},
+               unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+               running: map[string]time.Time{},
+       }
+       sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+       sch.runQueue()
+       sch.updateMetrics()
+
+       c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0)
 }
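
The Equals-10 assertions work because CreatedAt is set exactly 10 seconds before the checks and the gauge value is truncated to int, so anything in the 10.0 to 10.99 second range passes; the test only has to get from setting CreatedAt to calling updateMetrics() within a second. A standalone sketch of that truncation:

    package main

    import (
            "fmt"
            "time"
    )

    func main() {
            createdAt := time.Now().Add(-10 * time.Second)
            wait := time.Since(createdAt).Seconds() // e.g. 10.0003
            fmt.Println(int(wait))                  // prints 10 as long as wait < 11
    }
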
diff --git a/lib/dispatchcloud/scheduler/scheduler.go b/lib/dispatchcloud/scheduler/scheduler.go
index 6fd47e919411d69026316d24d8643e5693d06f38..c3e67dd11f70a4e00c8a74f59826efb13bf0e35c 100644
--- a/lib/dispatchcloud/scheduler/scheduler.go
+++ b/lib/dispatchcloud/scheduler/scheduler.go
@@ -11,6 +11,7 @@ import (
        "sync"
        "time"
 
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "github.com/prometheus/client_golang/prometheus"
        "github.com/sirupsen/logrus"
@@ -46,6 +47,7 @@ type Scheduler struct {
 
        mContainersAllocatedNotStarted   prometheus.Gauge
        mContainersNotAllocatedOverQuota prometheus.Gauge
+       mLongestWaitTimeSinceQueue       prometheus.Gauge
 }
 
 // New returns a new unstarted Scheduler.
@@ -87,6 +89,37 @@ func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
                Help:      "Number of containers not allocated to a worker because the system has hit a quota.",
        })
        reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
+       sch.mLongestWaitTimeSinceQueue = prometheus.NewGauge(prometheus.GaugeOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "containers_longest_wait_time_seconds",
+               Help:      "Current longest wait time of any container since queuing, and before the start of crunch-run.",
+       })
+       reg.MustRegister(sch.mLongestWaitTimeSinceQueue)
+}
+
+func (sch *Scheduler) updateMetrics() {
+       earliest := time.Time{}
+       entries, _ := sch.queue.Entries()
+       running := sch.pool.Running()
+       for _, ent := range entries {
+               if ent.Container.Priority > 0 &&
+                       (ent.Container.State == arvados.ContainerStateQueued || ent.Container.State == arvados.ContainerStateLocked) {
+                       // Exclude containers that are preparing to run the payload (i.e.
+                       // ContainerStateLocked and running on a worker, most likely loading
+                       // the payload image).
+                       if _, ok := running[ent.Container.UUID]; !ok {
+                               if ent.Container.CreatedAt.Before(earliest) || earliest.IsZero() {
+                                       earliest = ent.Container.CreatedAt
+                               }
+                       }
+               }
+       }
+       if !earliest.IsZero() {
+               sch.mLongestWaitTimeSinceQueue.Set(time.Since(earliest).Seconds())
+       } else {
+               sch.mLongestWaitTimeSinceQueue.Set(0)
+       }
 }
 
 // Start starts the scheduler.
@@ -141,6 +174,7 @@ func (sch *Scheduler) run() {
        for {
                sch.runQueue()
                sch.sync()
+               sch.updateMetrics()
                select {
                case <-sch.stop:
                        return
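
updateMetrics() now runs once per scheduler iteration, after runQueue() and sync(), so the gauge tracks the current longest wait of any container still waiting to start and falls back to 0 when nothing is waiting. On a scrape it appears under the full name built from the GaugeOpts above, something like (value invented):

    arvados_dispatchcloud_containers_longest_wait_time_seconds 132.7
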
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 953195c59d39718b6a179277ae8a323ad75f868f..c6eaeae2b618b11423512cc2be1cb5cfc454c20c 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -173,15 +173,17 @@ type Pool struct {
        runnerMD5    [md5.Size]byte
        runnerCmd    string
 
-       mContainersRunning       prometheus.Gauge
-       mInstances               *prometheus.GaugeVec
-       mInstancesPrice          *prometheus.GaugeVec
-       mVCPUs                   *prometheus.GaugeVec
-       mMemory                  *prometheus.GaugeVec
-       mBootOutcomes            *prometheus.CounterVec
-       mDisappearances          *prometheus.CounterVec
-       mTimeToSSH               prometheus.Summary
-       mTimeToReadyForContainer prometheus.Summary
+       mContainersRunning        prometheus.Gauge
+       mInstances                *prometheus.GaugeVec
+       mInstancesPrice           *prometheus.GaugeVec
+       mVCPUs                    *prometheus.GaugeVec
+       mMemory                   *prometheus.GaugeVec
+       mBootOutcomes             *prometheus.CounterVec
+       mDisappearances           *prometheus.CounterVec
+       mTimeToSSH                prometheus.Summary
+       mTimeToReadyForContainer  prometheus.Summary
+       mTimeFromShutdownToGone   prometheus.Summary
+       mTimeFromQueueToCrunchRun prometheus.Summary
 }
 
 type createCall struct {
@@ -664,6 +666,22 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
        })
        reg.MustRegister(wp.mTimeToReadyForContainer)
+       wp.mTimeFromShutdownToGone = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_time_from_shutdown_request_to_disappearance_seconds",
+               Help:       "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeFromShutdownToGone)
+       wp.mTimeFromQueueToCrunchRun = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "containers_time_from_queue_to_crunch_run_seconds",
+               Help:       "Number of seconds between the queuing of a container and the start of crunch-run.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
 }
 
 func (wp *Pool) runMetrics() {
@@ -933,6 +951,10 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
                if wp.mDisappearances != nil {
                        wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
                }
+               // wkr.destroyed.IsZero() can happen if the instance disappeared without us ever requesting shutdown
+               if wp.mTimeFromShutdownToGone != nil && !wkr.destroyed.IsZero() {
+                       wp.mTimeFromShutdownToGone.Observe(time.Since(wkr.destroyed).Seconds())
+               }
                delete(wp.workers, id)
                go wkr.Close()
                notify = true
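
In the SummaryOpts above, Objectives maps each target quantile to its allowed absolute error (0.95: 0.005 asks for a 95th-percentile estimate accurate to within ±0.005 in rank), and each summary is exported as a set of quantile series plus the _sum and _count series that the dispatcher test checks for. A self-contained sketch using the same client library, with an invented metric name:

    package main

    import (
            "fmt"

            "github.com/prometheus/client_golang/prometheus"
    )

    func main() {
            // Illustration only; "example_seconds" is not a real Arvados metric.
            s := prometheus.NewSummary(prometheus.SummaryOpts{
                    Namespace:  "arvados",
                    Subsystem:  "dispatchcloud",
                    Name:       "example_seconds",
                    Help:       "Illustration of quantile objectives.",
                    Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
            })
            reg := prometheus.NewRegistry()
            reg.MustRegister(s)
            for _, v := range []float64{10, 20, 30, 40, 50} {
                    s.Observe(v)
            }
            // Gather returns the quantile series plus the _sum and _count samples.
            mfs, _ := reg.Gather()
            for _, mf := range mfs {
                    fmt.Println(mf)
            }
    }
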
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 517a5d193e328b8f9f2ae2f1ce9d9b4db718ddf6..5b145d7c6599b75bb6e8b30f6c65e65d82186d84 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -177,6 +177,9 @@ func (wkr *worker) startContainer(ctr arvados.Container) {
        }
        go func() {
                rr.Start()
+               if wkr.wp.mTimeFromQueueToCrunchRun != nil {
+                       wkr.wp.mTimeFromQueueToCrunchRun.Observe(time.Since(ctr.CreatedAt).Seconds())
+               }
                wkr.mtx.Lock()
                defer wkr.mtx.Unlock()
                now := time.Now()
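
The Observe call here fires right after rr.Start() returns, so time_from_queue_to_crunch_run spans everything between container creation (queueing) and crunch-run being started on a worker: scheduling delay, instance creation and boot, and worker setup. A rough way to eyeball the three new series on a running dispatcher; the URL, port, and management-token auth below are assumptions about local configuration, not something established by this change:

    package main

    import (
            "fmt"
            "io"
            "net/http"
            "os"
            "strings"
    )

    func main() {
            // Hypothetical: adjust host/port for your arvados-dispatch-cloud instance,
            // and export the cluster's ManagementToken as ARVADOS_MANAGEMENT_TOKEN.
            req, err := http.NewRequest("GET", "http://localhost:9006/metrics", nil)
            if err != nil {
                    panic(err)
            }
            req.Header.Set("Authorization", "Bearer "+os.Getenv("ARVADOS_MANAGEMENT_TOKEN"))
            resp, err := http.DefaultClient.Do(req)
            if err != nil {
                    panic(err)
            }
            defer resp.Body.Close()
            body, _ := io.ReadAll(resp.Body)
            for _, line := range strings.Split(string(body), "\n") {
                    if strings.Contains(line, "time_from_queue_to_crunch_run_seconds") ||
                            strings.Contains(line, "time_from_shutdown_request_to_disappearance_seconds") ||
                            strings.Contains(line, "containers_longest_wait_time_seconds") {
                            fmt.Println(line)
                    }
            }
    }
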