16636: Merge branch 'master' into 16636-more-metrics
author Ward Vandewege <ward@curii.com>
Mon, 14 Sep 2020 17:03:18 +0000 (13:03 -0400)
committer Ward Vandewege <ward@curii.com>
Mon, 14 Sep 2020 17:03:34 +0000 (13:03 -0400)
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>

doc/Gemfile.lock
lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/worker/pool.go
lib/dispatchcloud/worker/worker.go

index 344a0a86b51555d60c7bb812afbdc0d5a1819349..b5e62cacd6b0b605599fc1cff58428d6d73673fc 100644 (file)
@@ -1,28 +1,23 @@
 GEM
   remote: https://rubygems.org/
   specs:
-    RedCloth (4.2.9)
-    coderay (1.1.0)
-    colorize (0.6.0)
-    kramdown (1.3.1)
-    less (1.2.21)
-      mutter (>= 0.4.2)
-      treetop (>= 1.4.2)
-    liquid (2.6.1)
-    makerakeworkwell (1.0.3)
-      rake (>= 0.9.2, < 11)
-    mutter (0.5.3)
-    polyglot (0.3.3)
-    rake (10.1.1)
-    treetop (1.4.15)
-      polyglot
-      polyglot (>= 0.3.1)
-    zenweb (3.3.1)
+    RedCloth (4.3.2)
+    coderay (1.1.3)
+    colorize (0.8.1)
+    commonjs (0.2.7)
+    kramdown (1.17.0)
+    less (2.6.0)
+      commonjs (~> 0.2.7)
+    liquid (4.0.3)
+    makerakeworkwell (1.0.4)
+      rake (>= 0.9.2, < 15)
+    rake (13.0.1)
+    zenweb (3.10.4)
       coderay (~> 1.0)
-      kramdown (~> 1.0)
-      less (~> 1.2)
+      kramdown (~> 1.4)
+      less (~> 2.0)
       makerakeworkwell (~> 1.0)
-      rake (>= 0.9, < 11)
+      rake (>= 0.9, < 15)
 
 PLATFORMS
   ruby
@@ -32,3 +27,6 @@ DEPENDENCIES
   colorize
   liquid
   zenweb
+
+BUNDLED WITH
+   2.1.4
index 6e1850410b28bf3394ec4e29c4416a9551ec6d91..80cb28f3508e28a810da019c913bb820ca5f3f09 100644 (file)
@@ -221,6 +221,10 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds{quantile="0.95"} [0-9.]*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_count [0-9]*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_count [0-9]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_count [0-9]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_sum [0-9e+.]*`)
 }
 
 func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
index 086887cb44176f05c9446341a68d7176dd5ed7aa..67962c9d65a034f6cd225a26afdc9c0bece6877e 100644 (file)
@@ -170,15 +170,17 @@ type Pool struct {
        runnerMD5    [md5.Size]byte
        runnerCmd    string
 
-       mContainersRunning       prometheus.Gauge
-       mInstances               *prometheus.GaugeVec
-       mInstancesPrice          *prometheus.GaugeVec
-       mVCPUs                   *prometheus.GaugeVec
-       mMemory                  *prometheus.GaugeVec
-       mBootOutcomes            *prometheus.CounterVec
-       mDisappearances          *prometheus.CounterVec
-       mTimeToSSH               prometheus.Summary
-       mTimeToReadyForContainer prometheus.Summary
+       mContainersRunning        prometheus.Gauge
+       mInstances                *prometheus.GaugeVec
+       mInstancesPrice           *prometheus.GaugeVec
+       mVCPUs                    *prometheus.GaugeVec
+       mMemory                   *prometheus.GaugeVec
+       mBootOutcomes             *prometheus.CounterVec
+       mDisappearances           *prometheus.CounterVec
+       mTimeToSSH                prometheus.Summary
+       mTimeToReadyForContainer  prometheus.Summary
+       mTimeFromShutdownToGone   prometheus.Summary
+       mTimeFromQueueToCrunchRun prometheus.Summary
 }
 
 type createCall struct {
@@ -661,6 +663,22 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
        })
        reg.MustRegister(wp.mTimeToReadyForContainer)
+       wp.mTimeFromShutdownToGone = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_time_from_shutdown_request_to_disappearance_seconds",
+               Help:       "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeFromShutdownToGone)
+       wp.mTimeFromQueueToCrunchRun = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "containers_time_from_queue_to_crunch_run_seconds",
+               Help:       "Number of seconds between the queuing of a container and the start of crunch-run.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
 }
 
 func (wp *Pool) runMetrics() {
@@ -930,6 +948,9 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
                if wp.mDisappearances != nil {
                        wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
                }
+               if wp.mTimeFromShutdownToGone != nil {
+                       wp.mTimeFromShutdownToGone.Observe(time.Now().Sub(wkr.destroyed).Seconds())
+               }
                delete(wp.workers, id)
                go wkr.Close()
                notify = true
index 9199d4bafe764d806312638328cf13fd3b422e4d..95794d0b36e5d1d552be1c751091e472b562f3b0 100644 (file)
@@ -176,6 +176,9 @@ func (wkr *worker) startContainer(ctr arvados.Container) {
        }
        go func() {
                rr.Start()
+               if wkr.wp.mTimeFromQueueToCrunchRun != nil {
+                       wkr.wp.mTimeFromQueueToCrunchRun.Observe(time.Since(ctr.CreatedAt).Seconds())
+               }
                wkr.mtx.Lock()
                defer wkr.mtx.Unlock()
                now := time.Now()