16636: a-d-c: add a time-to-ssh and time-to-ready-for-container metrics
authorWard Vandewege <ward@curii.com>
Thu, 3 Sep 2020 17:10:42 +0000 (13:10 -0400)
committerWard Vandewege <ward@curii.com>
Thu, 3 Sep 2020 20:26:18 +0000 (16:26 -0400)
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>

lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/worker/pool.go
lib/dispatchcloud/worker/verify.go
lib/dispatchcloud/worker/worker.go

index 42decff31d0cada0bb83ce66ba509aa5f5d13448..6e1850410b28bf3394ec4e29c4416a9551ec6d91 100644 (file)
@@ -215,6 +215,12 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
        c.Check(resp.Body.String(), check.Matches, `(?ms).*boot_outcomes{outcome="success"} [^0].*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="shutdown"} [^0].*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="unknown"} 0\n.*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds{quantile="0.95"} [0-9.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds_count [0-9]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds_sum [0-9.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds{quantile="0.95"} [0-9.]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_count [0-9]*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`)
 }
 
 func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
index 435b6e43ae4a3e150f680883aa7e124daf6e4230..6ca15de4939ce060edbb30775badbbd91e368345 100644 (file)
@@ -170,13 +170,15 @@ type Pool struct {
        runnerMD5    [md5.Size]byte
        runnerCmd    string
 
-       mContainersRunning prometheus.Gauge
-       mInstances         *prometheus.GaugeVec
-       mInstancesPrice    *prometheus.GaugeVec
-       mVCPUs             *prometheus.GaugeVec
-       mMemory            *prometheus.GaugeVec
-       mBootOutcomes      *prometheus.CounterVec
-       mDisappearances    *prometheus.CounterVec
+       mContainersRunning       prometheus.Gauge
+       mInstances               *prometheus.GaugeVec
+       mInstancesPrice          *prometheus.GaugeVec
+       mVCPUs                   *prometheus.GaugeVec
+       mMemory                  *prometheus.GaugeVec
+       mBootOutcomes            *prometheus.CounterVec
+       mDisappearances          *prometheus.CounterVec
+       mTimeToSSH               prometheus.Summary
+       mTimeToReadyForContainer prometheus.Summary
 }
 
 type createCall struct {
@@ -323,7 +325,7 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
                        wp.tagKeyPrefix + tagKeyIdleBehavior:   string(IdleBehaviorRun),
                        wp.tagKeyPrefix + tagKeyInstanceSecret: secret,
                }
-               initCmd := TagVerifier{nil, secret}.InitCommand()
+               initCmd := TagVerifier{nil, secret, nil}.InitCommand()
                inst, err := wp.instanceSet.Create(it, wp.imageID, tags, initCmd, wp.installPublicKey)
                wp.mtx.Lock()
                defer wp.mtx.Unlock()
@@ -367,6 +369,23 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior)
        return nil
 }
 
+// Successful connection to the SSH daemon, update the mTimeToSSH metric
+func (wp *Pool) reportSSHConnected(inst cloud.Instance) {
+       wkr := wp.workers[inst.ID()]
+       wkr.mtx.Lock()
+       defer wkr.mtx.Unlock()
+       if wkr.state != StateBooting || !wkr.firstSSHConnection.IsZero() {
+               // the node is not in booting state (can happen if a-d-c is restarted) OR
+               // this is not the first SSH connection
+               return
+       }
+
+       if wp.mTimeToSSH != nil {
+               wp.mTimeToSSH.Observe(time.Since(wkr.appeared).Seconds())
+       }
+       wkr.firstSSHConnection = time.Now()
+}
+
 // Add or update worker attached to the given instance.
 //
 // The second return value is true if a new worker is created.
@@ -377,7 +396,7 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior)
 // Caller must have lock.
 func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*worker, bool) {
        secret := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceSecret]
-       inst = TagVerifier{inst, secret}
+       inst = TagVerifier{Instance: inst, Secret: secret, ReportVerified: wp.reportSSHConnected}
        id := inst.ID()
        if wkr := wp.workers[id]; wkr != nil {
                wkr.executor.SetTarget(inst)
@@ -626,6 +645,22 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                wp.mDisappearances.WithLabelValues(v).Add(0)
        }
        reg.MustRegister(wp.mDisappearances)
+       wp.mTimeToSSH = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_time_to_ssh_seconds",
+               Help:       "Number of seconds between instance creation and the first successful SSH connection.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeToSSH)
+       wp.mTimeToReadyForContainer = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_time_to_ready_for_container_seconds",
+               Help:       "Number of seconds between the first successful SSH connection and ready to run a container.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeToReadyForContainer)
 }
 
 func (wp *Pool) runMetrics() {
index 597950fca699a9834795dfbf25f6957cd9fdc92b..4e6ee86c6b0d2749b06bd104bf5ba7304e490094 100644 (file)
@@ -23,7 +23,8 @@ var (
 
 type TagVerifier struct {
        cloud.Instance
-       Secret string
+       Secret         string
+       ReportVerified func(cloud.Instance)
 }
 
 func (tv TagVerifier) InitCommand() cloud.InitCommand {
@@ -31,6 +32,7 @@ func (tv TagVerifier) InitCommand() cloud.InitCommand {
 }
 
 func (tv TagVerifier) VerifyHostKey(pubKey ssh.PublicKey, client *ssh.Client) error {
+       tv.ReportVerified(tv.Instance)
        if err := tv.Instance.VerifyHostKey(pubKey, client); err != cloud.ErrNotImplemented || tv.Secret == "" {
                // If the wrapped instance indicates it has a way to
                // verify the key, return that decision.
index 5d2360f3ccc64671b7193b281a7807d7b70de23b..9199d4bafe764d806312638328cf13fd3b422e4d 100644 (file)
@@ -103,11 +103,13 @@ type worker struct {
        updated             time.Time
        busy                time.Time
        destroyed           time.Time
+       firstSSHConnection  time.Time
        lastUUID            string
        running             map[string]*remoteRunner // remember to update state idle<->running when this changes
        starting            map[string]*remoteRunner // remember to update state idle<->running when this changes
        probing             chan struct{}
        bootOutcomeReported bool
+       timeToReadyReported bool
 }
 
 func (wkr *worker) onUnkillable(uuid string) {
@@ -140,6 +142,17 @@ func (wkr *worker) reportBootOutcome(outcome BootOutcome) {
        wkr.bootOutcomeReported = true
 }
 
+// caller must have lock.
+func (wkr *worker) reportTimeBetweenFirstSSHAndReadyForContainer() {
+       if wkr.timeToReadyReported {
+               return
+       }
+       if wkr.wp.mTimeToSSH != nil {
+               wkr.wp.mTimeToReadyForContainer.Observe(time.Since(wkr.firstSSHConnection).Seconds())
+       }
+       wkr.timeToReadyReported = true
+}
+
 // caller must have lock.
 func (wkr *worker) setIdleBehavior(idleBehavior IdleBehavior) {
        wkr.logger.WithField("IdleBehavior", idleBehavior).Info("set idle behavior")
@@ -313,6 +326,9 @@ func (wkr *worker) probeAndUpdate() {
 
        // Update state if this was the first successful boot-probe.
        if booted && (wkr.state == StateUnknown || wkr.state == StateBooting) {
+               if wkr.state == StateBooting {
+                       wkr.reportTimeBetweenFirstSSHAndReadyForContainer()
+               }
                // Note: this will change again below if
                // len(wkr.starting)+len(wkr.running) > 0.
                wkr.state = StateIdle