X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/472fff42d6105a4457deeb1579e9d14caffc82dc..56e130608f8977d20b21c54f6ab8973d71e045a0:/lib/dispatchcloud/worker/pool.go

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 086887cb44..e092e7adab 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -5,8 +5,10 @@ package worker
 
 import (
+	"crypto/hmac"
 	"crypto/md5"
 	"crypto/rand"
+	"crypto/sha256"
 	"errors"
 	"fmt"
 	"io"
@@ -64,15 +66,16 @@ type Executor interface {
 }
 
 const (
-	defaultSyncInterval       = time.Minute
-	defaultProbeInterval      = time.Second * 10
-	defaultMaxProbesPerSecond = 10
-	defaultTimeoutIdle        = time.Minute
-	defaultTimeoutBooting     = time.Minute * 10
-	defaultTimeoutProbe       = time.Minute * 10
-	defaultTimeoutShutdown    = time.Second * 10
-	defaultTimeoutTERM        = time.Minute * 2
-	defaultTimeoutSignal      = time.Second * 5
+	defaultSyncInterval        = time.Minute
+	defaultProbeInterval       = time.Second * 10
+	defaultMaxProbesPerSecond  = 10
+	defaultTimeoutIdle         = time.Minute
+	defaultTimeoutBooting      = time.Minute * 10
+	defaultTimeoutProbe        = time.Minute * 10
+	defaultTimeoutShutdown     = time.Second * 10
+	defaultTimeoutTERM         = time.Minute * 2
+	defaultTimeoutSignal       = time.Second * 5
+	defaultTimeoutStaleRunLock = time.Second * 5
 
 	// Time after a quota error to try again anyway, even if no
 	// instances have been shutdown.
@@ -85,9 +88,8 @@ const (
 func duration(conf arvados.Duration, def time.Duration) time.Duration {
 	if conf > 0 {
 		return time.Duration(conf)
-	} else {
-		return def
 	}
+	return def
 }
 
 // NewPool creates a Pool of workers backed by instanceSet.
@@ -115,6 +117,8 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
 		timeoutShutdown:    duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
 		timeoutTERM:        duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
 		timeoutSignal:      duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
+		timeoutStaleRunLock: duration(cluster.Containers.CloudVMs.TimeoutStaleRunLock, defaultTimeoutStaleRunLock),
+		systemRootToken:    cluster.SystemRootToken,
 		installPublicKey:   installPublicKey,
 		tagKeyPrefix:       cluster.Containers.CloudVMs.TagKeyPrefix,
 		stop:               make(chan bool),
@@ -152,6 +156,8 @@ type Pool struct {
 	timeoutShutdown    time.Duration
 	timeoutTERM        time.Duration
 	timeoutSignal      time.Duration
+	timeoutStaleRunLock time.Duration
+	systemRootToken    string
 	installPublicKey   ssh.PublicKey
 	tagKeyPrefix       string
 
@@ -170,15 +176,18 @@ type Pool struct {
 	runnerMD5    [md5.Size]byte
 	runnerCmd    string
 
-	mContainersRunning       prometheus.Gauge
-	mInstances               *prometheus.GaugeVec
-	mInstancesPrice          *prometheus.GaugeVec
-	mVCPUs                   *prometheus.GaugeVec
-	mMemory                  *prometheus.GaugeVec
-	mBootOutcomes            *prometheus.CounterVec
-	mDisappearances          *prometheus.CounterVec
-	mTimeToSSH               prometheus.Summary
-	mTimeToReadyForContainer prometheus.Summary
+	mContainersRunning        prometheus.Gauge
+	mInstances                *prometheus.GaugeVec
+	mInstancesPrice           *prometheus.GaugeVec
+	mVCPUs                    *prometheus.GaugeVec
+	mMemory                   *prometheus.GaugeVec
+	mBootOutcomes             *prometheus.CounterVec
+	mDisappearances           *prometheus.CounterVec
+	mTimeToSSH                prometheus.Summary
+	mTimeToReadyForContainer  prometheus.Summary
+	mTimeFromShutdownToGone   prometheus.Summary
+	mTimeFromQueueToCrunchRun prometheus.Summary
+	mRunProbeDuration         *prometheus.SummaryVec
 }
 
 type createCall struct {
@@ -661,6 +670,30 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
 		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
 	})
 	reg.MustRegister(wp.mTimeToReadyForContainer)
+	wp.mTimeFromShutdownToGone = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "instances_time_from_shutdown_request_to_disappearance_seconds",
+		Help:       "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	})
+	reg.MustRegister(wp.mTimeFromShutdownToGone)
+	wp.mTimeFromQueueToCrunchRun = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "containers_time_from_queue_to_crunch_run_seconds",
+		Help:       "Number of seconds between the queuing of a container and the start of crunch-run.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	})
+	reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
+	wp.mRunProbeDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "instances_run_probe_duration_seconds",
+		Help:       "Number of seconds per runProbe call.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	}, []string{"outcome"})
+	reg.MustRegister(wp.mRunProbeDuration)
 }
 
 func (wp *Pool) runMetrics() {
@@ -930,6 +963,10 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
 		if wp.mDisappearances != nil {
 			wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
 		}
+		// wkr.destroyed.IsZero() can happen if instance disappeared but we weren't trying to shut it down
+		if wp.mTimeFromShutdownToGone != nil && !wkr.destroyed.IsZero() {
+			wp.mTimeFromShutdownToGone.Observe(time.Now().Sub(wkr.destroyed).Seconds())
+		}
 		delete(wp.workers, id)
 		go wkr.Close()
 		notify = true
@@ -957,6 +994,12 @@ func (wp *Pool) waitUntilLoaded() {
 	}
 }
 
+func (wp *Pool) gatewayAuthSecret(uuid string) string {
+	h := hmac.New(sha256.New, []byte(wp.systemRootToken))
+	fmt.Fprintf(h, "%s", uuid)
+	return fmt.Sprintf("%x", h.Sum(nil))
+}
+
 // Return a random string of n hexadecimal digits (n*4 random bits). n
 // must be even.
 func randomHex(n int) string {
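
A note on the refactored duration helper above: it resolves an optionally-configured arvados.Duration against a compiled-in default, with any positive configured value taking precedence. A minimal self-contained sketch of that behavior (the local Duration type stands in for arvados.Duration, and the values are made up):

package main

import (
	"fmt"
	"time"
)

// Duration stands in for arvados.Duration here: a time.Duration whose
// zero value means "not configured".
type Duration time.Duration

// duration mirrors the helper in pool.go: a positive configured value
// wins, otherwise the default applies.
func duration(conf Duration, def time.Duration) time.Duration {
	if conf > 0 {
		return time.Duration(conf)
	}
	return def
}

func main() {
	fmt.Println(duration(0, 5*time.Second))                       // 5s: unset, default applies
	fmt.Println(duration(Duration(2*time.Second), 5*time.Second)) // 2s: configured value wins
}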
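
The new mRunProbeDuration metric is a prometheus.SummaryVec keyed by a single "outcome" label, so runProbe timings can be recorded separately per result. A minimal sketch of registering and observing such a summary (the doProbe stub and the "success"/"fail" label values are illustrative assumptions, not taken from this commit):

package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

func doProbe() error { return nil } // stand-in for the real probe

func main() {
	reg := prometheus.NewRegistry()
	runProbeDuration := prometheus.NewSummaryVec(prometheus.SummaryOpts{
		Namespace:  "arvados",
		Subsystem:  "dispatchcloud",
		Name:       "instances_run_probe_duration_seconds",
		Help:       "Number of seconds per runProbe call.",
		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
	}, []string{"outcome"})
	reg.MustRegister(runProbeDuration)

	// Time one probe and record the duration under the matching label.
	t0 := time.Now()
	outcome := "success"
	if err := doProbe(); err != nil {
		outcome = "fail"
	}
	runProbeDuration.WithLabelValues(outcome).Observe(time.Since(t0).Seconds())
}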
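
The new gatewayAuthSecret method derives a per-container gateway secret as the hex-encoded HMAC-SHA256 of a UUID, keyed by the cluster's SystemRootToken, so any party holding the same token can recompute it. A self-contained sketch of the derivation plus a constant-time check of a presented secret (the verifyGatewaySecret helper and the sample values are hypothetical, not part of this commit):

package main

import (
	"crypto/hmac"
	"crypto/sha256"
	"fmt"
)

// deriveSecret mirrors Pool.gatewayAuthSecret: hex-encoded
// HMAC-SHA256 of the UUID, keyed by the system root token.
func deriveSecret(systemRootToken, uuid string) string {
	h := hmac.New(sha256.New, []byte(systemRootToken))
	fmt.Fprintf(h, "%s", uuid)
	return fmt.Sprintf("%x", h.Sum(nil))
}

// verifyGatewaySecret compares a presented secret in constant time.
func verifyGatewaySecret(systemRootToken, uuid, presented string) bool {
	return hmac.Equal([]byte(deriveSecret(systemRootToken, uuid)), []byte(presented))
}

func main() {
	// Hypothetical values, for demonstration only.
	token := "example-system-root-token"
	uuid := "zzzzz-dz642-0123456789abcde"
	secret := deriveSecret(token, uuid)
	fmt.Println(secret, verifyGatewaySecret(token, uuid, secret)) // prints the secret and "true"
}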