16838: a-d-c: metrics: add runProbe success/failure duration metrics.
authorWard Vandewege <ward@curii.com>
Wed, 16 Sep 2020 20:53:00 +0000 (16:53 -0400)
committerWard Vandewege <ward@curii.com>
Wed, 16 Sep 2020 20:53:00 +0000 (16:53 -0400)
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>

lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/worker/pool.go
lib/dispatchcloud/worker/worker.go

index 9f1eb098e01aa029ebe735f2f8a7a956bf56c236..d5d90bf3518b75fb548e810e2ad8a7cc2c9867ba 100644 (file)
@@ -227,6 +227,10 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_count [0-9]*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_sum [0-9e+.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_count{outcome="success"} [0-9]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_sum{outcome="success"} [0-9e+.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_count{outcome="fail"} [0-9]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_sum{outcome="fail"} [0-9e+.]*`)
 }
 
 func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
index c6eaeae2b618b11423512cc2be1cb5cfc454c20c..78c51dee10c9c3496d07585ff174324c0400625f 100644 (file)
@@ -184,6 +184,7 @@ type Pool struct {
        mTimeToReadyForContainer  prometheus.Summary
        mTimeFromShutdownToGone   prometheus.Summary
        mTimeFromQueueToCrunchRun prometheus.Summary
+       mRunProbeDuration         *prometheus.SummaryVec
 }
 
 type createCall struct {
@@ -682,6 +683,17 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
        })
        reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
+       wp.mRunProbeDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_run_probe_duration_seconds",
+               Help:       "Number of seconds per runProbe call.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       }, []string{"outcome"})
+       for _, v := range []string{"success", "fail"} {
+               wp.mRunProbeDuration.WithLabelValues(v).Observe(0)
+       }
+       reg.MustRegister(wp.mRunProbeDuration)
 }
 
 func (wp *Pool) runMetrics() {
index 5b145d7c6599b75bb6e8b30f6c65e65d82186d84..9e89d7daafc01d05b770fb065f88049dea231a7e 100644 (file)
@@ -192,7 +192,7 @@ func (wkr *worker) startContainer(ctr arvados.Container) {
 }
 
 // ProbeAndUpdate conducts appropriate boot/running probes (if any)
-// for the worker's curent state. If a previous probe is still
+// for the worker's current state. If a previous probe is still
 // running, it does nothing.
 //
 // It should be called in a new goroutine.
@@ -376,6 +376,7 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
        if u := wkr.instance.RemoteUser(); u != "root" {
                cmd = "sudo " + cmd
        }
+       before := time.Now()
        stdout, stderr, err := wkr.executor.Execute(nil, cmd, nil)
        if err != nil {
                wkr.logger.WithFields(logrus.Fields{
@@ -383,8 +384,10 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
                        "stdout":  string(stdout),
                        "stderr":  string(stderr),
                }).WithError(err).Warn("probe failed")
+               wkr.wp.mRunProbeDuration.WithLabelValues("fail").Observe(time.Now().Sub(before).Seconds())
                return
        }
+       wkr.wp.mRunProbeDuration.WithLabelValues("success").Observe(time.Now().Sub(before).Seconds())
        ok = true
 
        staleRunLock := false