16636: a-d-c: add a time-to-ssh and time-to-ready-for-container metrics
[arvados.git] / lib / dispatchcloud / worker / pool.go
index 435b6e43ae4a3e150f680883aa7e124daf6e4230..6ca15de4939ce060edbb30775badbbd91e368345 100644 (file)
@@ -170,13 +170,15 @@ type Pool struct {
        runnerMD5    [md5.Size]byte
        runnerCmd    string
 
-       mContainersRunning prometheus.Gauge
-       mInstances         *prometheus.GaugeVec
-       mInstancesPrice    *prometheus.GaugeVec
-       mVCPUs             *prometheus.GaugeVec
-       mMemory            *prometheus.GaugeVec
-       mBootOutcomes      *prometheus.CounterVec
-       mDisappearances    *prometheus.CounterVec
+       mContainersRunning       prometheus.Gauge
+       mInstances               *prometheus.GaugeVec
+       mInstancesPrice          *prometheus.GaugeVec
+       mVCPUs                   *prometheus.GaugeVec
+       mMemory                  *prometheus.GaugeVec
+       mBootOutcomes            *prometheus.CounterVec
+       mDisappearances          *prometheus.CounterVec
+       mTimeToSSH               prometheus.Summary
+       mTimeToReadyForContainer prometheus.Summary
 }
 
 type createCall struct {
@@ -323,7 +325,7 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
                        wp.tagKeyPrefix + tagKeyIdleBehavior:   string(IdleBehaviorRun),
                        wp.tagKeyPrefix + tagKeyInstanceSecret: secret,
                }
-               initCmd := TagVerifier{nil, secret}.InitCommand()
+               initCmd := TagVerifier{nil, secret, nil}.InitCommand()
                inst, err := wp.instanceSet.Create(it, wp.imageID, tags, initCmd, wp.installPublicKey)
                wp.mtx.Lock()
                defer wp.mtx.Unlock()
@@ -367,6 +369,23 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior)
        return nil
 }
 
+// Successful connection to the SSH daemon, update the mTimeToSSH metric
+func (wp *Pool) reportSSHConnected(inst cloud.Instance) {
+       wkr := wp.workers[inst.ID()]
+       wkr.mtx.Lock()
+       defer wkr.mtx.Unlock()
+       if wkr.state != StateBooting || !wkr.firstSSHConnection.IsZero() {
+               // the node is not in booting state (can happen if a-d-c is restarted) OR
+               // this is not the first SSH connection
+               return
+       }
+
+       if wp.mTimeToSSH != nil {
+               wp.mTimeToSSH.Observe(time.Since(wkr.appeared).Seconds())
+       }
+       wkr.firstSSHConnection = time.Now()
+}
+
 // Add or update worker attached to the given instance.
 //
 // The second return value is true if a new worker is created.
@@ -377,7 +396,7 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior)
 // Caller must have lock.
 func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*worker, bool) {
        secret := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceSecret]
-       inst = TagVerifier{inst, secret}
+       inst = TagVerifier{Instance: inst, Secret: secret, ReportVerified: wp.reportSSHConnected}
        id := inst.ID()
        if wkr := wp.workers[id]; wkr != nil {
                wkr.executor.SetTarget(inst)
@@ -626,6 +645,22 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                wp.mDisappearances.WithLabelValues(v).Add(0)
        }
        reg.MustRegister(wp.mDisappearances)
+       wp.mTimeToSSH = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_time_to_ssh_seconds",
+               Help:       "Number of seconds between instance creation and the first successful SSH connection.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeToSSH)
+       wp.mTimeToReadyForContainer = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_time_to_ready_for_container_seconds",
+               Help:       "Number of seconds between the first successful SSH connection and ready to run a container.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeToReadyForContainer)
 }
 
 func (wp *Pool) runMetrics() {