16636: add instances_time_from_shutdown_request_to_disappearance_seconds
author Ward Vandewege <ward@curii.com>
Fri, 11 Sep 2020 16:30:07 +0000 (12:30 -0400)
committer Ward Vandewege <ward@curii.com>
Fri, 11 Sep 2020 16:30:07 +0000 (12:30 -0400)
       metric

Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>

lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/worker/pool.go

index 6e1850410b28bf3394ec4e29c4416a9551ec6d91..92a42c7a21cc744bfe1c5dbb12e466f728d11e40 100644 (file)
@@ -221,6 +221,8 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds{quantile="0.95"} [0-9.]*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_count [0-9]*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_count [0-9]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
 }
 
 func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
index 086887cb44176f05c9446341a68d7176dd5ed7aa..75bc01fc0e464e6a4d22ad9afaa3a4d213541c5e 100644 (file)
@@ -179,6 +179,7 @@ type Pool struct {
        mDisappearances          *prometheus.CounterVec
        mTimeToSSH               prometheus.Summary
        mTimeToReadyForContainer prometheus.Summary
+       mTimeFromShutdownToGone  prometheus.Summary
 }
 
 type createCall struct {
@@ -661,6 +662,14 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
        })
        reg.MustRegister(wp.mTimeToReadyForContainer)
+       wp.mTimeFromShutdownToGone = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_time_from_shutdown_request_to_disappearance_seconds",
+               Help:       "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeFromShutdownToGone)
 }
 
 func (wp *Pool) runMetrics() {
@@ -930,6 +939,9 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
                if wp.mDisappearances != nil {
                        wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
                }
+               if wp.mTimeFromShutdownToGone != nil {
+                       wp.mTimeFromShutdownToGone.Observe(time.Now().Sub(wkr.destroyed).Seconds())
+               }
                delete(wp.workers, id)
                go wkr.Close()
                notify = true