From: Ward Vandewege Date: Fri, 18 Sep 2020 20:36:08 +0000 (-0400) Subject: Merge branch '16838-probe-metrics' into master X-Git-Tag: 2.1.0~54 X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/dbaa58c1c806799435ca5741b3f61bc6b65b6569?hp=8142a4596bb2bf5a9966aa5e79ff587bf9e9e9a6 Merge branch '16838-probe-metrics' into master closes #16838 Arvados-DCO-1.1-Signed-off-by: Ward Vandewege --- diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go index 9f1eb098e0..d5d90bf351 100644 --- a/lib/dispatchcloud/dispatcher_test.go +++ b/lib/dispatchcloud/dispatcher_test.go @@ -227,6 +227,10 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) { c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`) c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_count [0-9]*`) c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_sum [0-9e+.]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_count{outcome="success"} [0-9]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_sum{outcome="success"} [0-9e+.]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_count{outcome="fail"} [0-9]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_sum{outcome="fail"} [0-9e+.]*`) } func (s *DispatcherSuite) TestAPIPermissions(c *check.C) { diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go index c6eaeae2b6..72d5ca56e7 100644 --- a/lib/dispatchcloud/worker/pool.go +++ b/lib/dispatchcloud/worker/pool.go @@ -184,6 +184,7 @@ type Pool struct { mTimeToReadyForContainer prometheus.Summary mTimeFromShutdownToGone prometheus.Summary mTimeFromQueueToCrunchRun prometheus.Summary + mRunProbeDuration *prometheus.SummaryVec } type createCall struct { @@ -682,6 +683,14 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) { Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001}, }) reg.MustRegister(wp.mTimeFromQueueToCrunchRun) + wp.mRunProbeDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Namespace: "arvados", + Subsystem: "dispatchcloud", + Name: "instances_run_probe_duration_seconds", + Help: "Number of seconds per runProbe call.", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001}, + }, []string{"outcome"}) + reg.MustRegister(wp.mRunProbeDuration) } func (wp *Pool) runMetrics() { diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go index 5b145d7c65..9e89d7daaf 100644 --- a/lib/dispatchcloud/worker/worker.go +++ b/lib/dispatchcloud/worker/worker.go @@ -192,7 +192,7 @@ func (wkr *worker) startContainer(ctr arvados.Container) { } // ProbeAndUpdate conducts appropriate boot/running probes (if any) -// for the worker's curent state. If a previous probe is still +// for the worker's current state. If a previous probe is still // running, it does nothing. // // It should be called in a new goroutine. @@ -376,6 +376,7 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) { if u := wkr.instance.RemoteUser(); u != "root" { cmd = "sudo " + cmd } + before := time.Now() stdout, stderr, err := wkr.executor.Execute(nil, cmd, nil) if err != nil { wkr.logger.WithFields(logrus.Fields{ @@ -383,8 +384,10 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) { "stdout": string(stdout), "stderr": string(stderr), }).WithError(err).Warn("probe failed") + wkr.wp.mRunProbeDuration.WithLabelValues("fail").Observe(time.Now().Sub(before).Seconds()) return } + wkr.wp.mRunProbeDuration.WithLabelValues("success").Observe(time.Now().Sub(before).Seconds()) ok = true staleRunLock := false diff --git a/lib/dispatchcloud/worker/worker_test.go b/lib/dispatchcloud/worker/worker_test.go index a4c2a6370f..cfb7a1bfb7 100644 --- a/lib/dispatchcloud/worker/worker_test.go +++ b/lib/dispatchcloud/worker/worker_test.go @@ -17,6 +17,7 @@ import ( "git.arvados.org/arvados.git/lib/dispatchcloud/test" "git.arvados.org/arvados.git/sdk/go/arvados" "git.arvados.org/arvados.git/sdk/go/ctxlog" + "github.com/prometheus/client_golang/prometheus" check "gopkg.in/check.v1" ) @@ -239,6 +240,7 @@ func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) { runnerData: trial.deployRunner, runnerMD5: md5.Sum(trial.deployRunner), } + wp.registerMetrics(prometheus.NewRegistry()) if trial.deployRunner != nil { svHash := md5.Sum(trial.deployRunner) wp.runnerCmd = fmt.Sprintf("/var/run/arvados/crunch-run~%x", svHash)