Merge branch '15888-remove-py2-from-test' into master
[arvados.git] / lib / dispatchcloud / worker / worker.go
index 6878bb0655ea1e3bc1401396f0c9cbfe4ad9bba0..9199d4bafe764d806312638328cf13fd3b422e4d 100644 (file)
@@ -43,33 +43,6 @@ var stateString = map[State]string{
        StateShutdown: "shutdown",
 }
 
-// BootOutcome is the result of a worker boot. It is used as a label in a metric.
-type BootOutcome string
-
-const (
-       BootOutcomeFailed       BootOutcome = "failure"
-       BootOutcomeSucceeded    BootOutcome = "success"
-       BootOutcomeIdleShutdown BootOutcome = "idle shutdown"
-       BootOutcomeDisappeared  BootOutcome = "disappeared"
-)
-
-var validBootOutcomes = map[BootOutcome]bool{
-       BootOutcomeFailed:       true,
-       BootOutcomeSucceeded:    true,
-       BootOutcomeIdleShutdown: true,
-       BootOutcomeDisappeared:  true,
-}
-
-func (wkr *worker) reportBootOutcome(outcome BootOutcome) {
-       if wkr.bootOutcomeReported {
-               return
-       }
-       if wkr.wp.mBootOutcomes != nil {
-               wkr.wp.mBootOutcomes.WithLabelValues(string(outcome)).Inc()
-       }
-       wkr.bootOutcomeReported = true
-}
-
 // String implements fmt.Stringer.
 func (s State) String() string {
        return stateString[s]
@@ -81,6 +54,23 @@ func (s State) MarshalText() ([]byte, error) {
        return []byte(stateString[s]), nil
 }
 
+// BootOutcome is the result of a worker boot. It is used as a label in a metric.
+type BootOutcome string
+
+const (
+       BootOutcomeFailed      BootOutcome = "failure"
+       BootOutcomeSucceeded   BootOutcome = "success"
+       BootOutcomeAborted     BootOutcome = "aborted" // passed to reportBootOutcome by shutdownIfIdle just before it shuts the worker down
+       BootOutcomeDisappeared BootOutcome = "disappeared"
+)
+
+var validBootOutcomes = map[BootOutcome]bool{ // holds each of the four outcomes above; NOTE(review): presumably consulted where outcome labels are validated -- confirm at the use site
+       BootOutcomeFailed:      true,
+       BootOutcomeSucceeded:   true,
+       BootOutcomeAborted:     true,
+       BootOutcomeDisappeared: true,
+}
+
 // IdleBehavior indicates the behavior desired when a node becomes idle.
 type IdleBehavior string
 
@@ -113,11 +103,13 @@ type worker struct {
        updated             time.Time
        busy                time.Time
        destroyed           time.Time
+       firstSSHConnection  time.Time
        lastUUID            string
        running             map[string]*remoteRunner // remember to update state idle<->running when this changes
        starting            map[string]*remoteRunner // remember to update state idle<->running when this changes
        probing             chan struct{}
        bootOutcomeReported bool
+       timeToReadyReported bool
 }
 
 func (wkr *worker) onUnkillable(uuid string) {
@@ -139,6 +131,28 @@ func (wkr *worker) onKilled(uuid string) {
        go wkr.wp.notify()
 }
 
+// reportBootOutcome counts outcome in the pool's mBootOutcomes metric (when set), at most once per worker. Caller must have lock.
+func (wkr *worker) reportBootOutcome(outcome BootOutcome) {
+       if wkr.bootOutcomeReported {
+               return
+       }
+       if outcomes := wkr.wp.mBootOutcomes; outcomes != nil {
+               outcomes.WithLabelValues(string(outcome)).Inc()
+       }
+       wkr.bootOutcomeReported = true
+}
+
+// reportTimeBetweenFirstSSHAndReadyForContainer observes, at most once per worker, the seconds elapsed since firstSSHConnection. Caller must have lock.
+func (wkr *worker) reportTimeBetweenFirstSSHAndReadyForContainer() {
+       if wkr.timeToReadyReported {
+               return
+       }
+       if wkr.wp.mTimeToReadyForContainer != nil { // guard the metric observed below, not mTimeToSSH
+               wkr.wp.mTimeToReadyForContainer.Observe(time.Since(wkr.firstSSHConnection).Seconds())
+       }
+       wkr.timeToReadyReported = true
+}
+
 // caller must have lock.
 func (wkr *worker) setIdleBehavior(idleBehavior IdleBehavior) {
        wkr.logger.WithField("IdleBehavior", idleBehavior).Info("set idle behavior")
@@ -312,6 +326,9 @@ func (wkr *worker) probeAndUpdate() {
 
        // Update state if this was the first successful boot-probe.
        if booted && (wkr.state == StateUnknown || wkr.state == StateBooting) {
+               if wkr.state == StateBooting {
+                       wkr.reportTimeBetweenFirstSSHAndReadyForContainer()
+               }
                // Note: this will change again below if
                // len(wkr.starting)+len(wkr.running) > 0.
                wkr.state = StateIdle
@@ -499,7 +516,7 @@ func (wkr *worker) shutdownIfIdle() bool {
                "IdleDuration": stats.Duration(time.Since(wkr.busy)),
                "IdleBehavior": wkr.idleBehavior,
        }).Info("shutdown worker")
-       wkr.reportBootOutcome(BootOutcomeIdleShutdown)
+       wkr.reportBootOutcome(BootOutcomeAborted)
        wkr.shutdown()
        return true
 }