X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/a5fef23f2863cd0183ff596f4579110e2ddb3b3d..e44725a3792df227f189f88ffb2cd1dbf0e93489:/lib/dispatchcloud/worker/pool.go

diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 435b6e43ae..1d600e3702 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -5,12 +5,15 @@
 package worker
 
 import (
+	"crypto/hmac"
 	"crypto/md5"
 	"crypto/rand"
+	"crypto/sha256"
 	"errors"
 	"fmt"
 	"io"
 	"io/ioutil"
+	mathrand "math/rand"
 	"sort"
 	"strings"
 	"sync"
@@ -64,15 +67,16 @@ type Executor interface {
 }
 
 const (
-	defaultSyncInterval       = time.Minute
-	defaultProbeInterval      = time.Second * 10
-	defaultMaxProbesPerSecond = 10
-	defaultTimeoutIdle        = time.Minute
-	defaultTimeoutBooting     = time.Minute * 10
-	defaultTimeoutProbe       = time.Minute * 10
-	defaultTimeoutShutdown    = time.Second * 10
-	defaultTimeoutTERM        = time.Minute * 2
-	defaultTimeoutSignal      = time.Second * 5
+	defaultSyncInterval        = time.Minute
+	defaultProbeInterval       = time.Second * 10
+	defaultMaxProbesPerSecond  = 10
+	defaultTimeoutIdle         = time.Minute
+	defaultTimeoutBooting      = time.Minute * 10
+	defaultTimeoutProbe        = time.Minute * 10
+	defaultTimeoutShutdown     = time.Second * 10
+	defaultTimeoutTERM         = time.Minute * 2
+	defaultTimeoutSignal       = time.Second * 5
+	defaultTimeoutStaleRunLock = time.Second * 5
 
 	// Time after a quota error to try again anyway, even if no
 	// instances have been shutdown.
@@ -85,9 +89,8 @@ const (
 func duration(conf arvados.Duration, def time.Duration) time.Duration {
 	if conf > 0 {
 		return time.Duration(conf)
-	} else {
-		return def
 	}
+	return def // conf is zero or negative: fall back to the supplied default
 }
 
 // NewPool creates a Pool of workers backed by instanceSet.
@@ -101,12 +104,15 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
 		instanceSetID:                  instanceSetID,
 		instanceSet:                    &throttledInstanceSet{InstanceSet: instanceSet},
 		newExecutor:                    newExecutor,
+		cluster:                        cluster,
 		bootProbeCommand:               cluster.Containers.CloudVMs.BootProbeCommand,
+		instanceInitCommand:            cloud.InitCommand(cluster.Containers.CloudVMs.InstanceInitCommand),
 		runnerSource:                   cluster.Containers.CloudVMs.DeployRunnerBinary,
 		imageID:                        cloud.ImageID(cluster.Containers.CloudVMs.ImageID),
 		instanceTypes:                  cluster.InstanceTypes,
 		maxProbesPerSecond:             cluster.Containers.CloudVMs.MaxProbesPerSecond,
 		maxConcurrentInstanceCreateOps: cluster.Containers.CloudVMs.MaxConcurrentInstanceCreateOps,
+		maxInstances:                   cluster.Containers.CloudVMs.MaxInstances,
 		probeInterval:                  duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
 		syncInterval:                   duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
 		timeoutIdle:                    duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
@@ -115,8 +121,12 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
 		timeoutShutdown:                duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
 		timeoutTERM:                    duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
 		timeoutSignal:                  duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
+		timeoutStaleRunLock:            duration(cluster.Containers.CloudVMs.TimeoutStaleRunLock, defaultTimeoutStaleRunLock),
+		systemRootToken:                cluster.SystemRootToken,
 		installPublicKey:               installPublicKey,
 		tagKeyPrefix:                   cluster.Containers.CloudVMs.TagKeyPrefix,
+		runnerCmdDefault:               cluster.Containers.CrunchRunCommand,
+		runnerArgs:                     append([]string{"--runtime-engine=" + cluster.Containers.RuntimeEngine}, cluster.Containers.CrunchRunArgumentsList...),
 		stop:                           make(chan bool),
 	}
 	wp.registerMetrics(reg)
@@ -138,7 +148,9 @@ type Pool struct {
 	instanceSetID                  cloud.InstanceSetID
 	instanceSet                    *throttledInstanceSet
 	newExecutor                    func(cloud.Instance) Executor
+	cluster                        *arvados.Cluster
 	bootProbeCommand               string
+	instanceInitCommand            cloud.InitCommand
 	runnerSource                   string
 	imageID                        cloud.ImageID
 	instanceTypes                  map[string]arvados.InstanceType
@@ -146,14 +158,19 @@ type Pool struct {
 	probeInterval                  time.Duration
 	maxProbesPerSecond             int
 	maxConcurrentInstanceCreateOps int
+	maxInstances                   int
 	timeoutIdle                    time.Duration
 	timeoutBooting                 time.Duration
 	timeoutProbe                   time.Duration
 	timeoutShutdown                time.Duration
 	timeoutTERM                    time.Duration
 	timeoutSignal                  time.Duration
+	timeoutStaleRunLock            time.Duration
+	systemRootToken                string
 	installPublicKey               ssh.PublicKey
 	tagKeyPrefix                   string
+	runnerCmdDefault               string   // crunch-run command to use if not deploying a binary
+	runnerArgs                     []string // extra args passed to crunch-run
 
 	// private state
 	subscribers  map[<-chan struct{}]chan<- struct{}
@@ -170,13 +187,20 @@ type Pool struct {
 	runnerMD5    [md5.Size]byte
 	runnerCmd    string
 
-	mContainersRunning prometheus.Gauge
-	mInstances         *prometheus.GaugeVec
-	mInstancesPrice    *prometheus.GaugeVec
-	mVCPUs             *prometheus.GaugeVec
-	mMemory            *prometheus.GaugeVec
-	mBootOutcomes      *prometheus.CounterVec
-	mDisappearances    *prometheus.CounterVec
+	mContainersRunning        prometheus.Gauge
+	mInstances                *prometheus.GaugeVec
+	mInstancesPrice           *prometheus.GaugeVec
+	mVCPUs                    *prometheus.GaugeVec
+	mMemory                   *prometheus.GaugeVec
+	mBootOutcomes             *prometheus.CounterVec
+	mDisappearances           *prometheus.CounterVec
+	mTimeToSSH                prometheus.Summary
+	mTimeToReadyForContainer  prometheus.Summary
+	mTimeFromShutdownToGone   prometheus.Summary
+	mTimeFromQueueToCrunchRun prometheus.Summary
+	mRunProbeDuration         *prometheus.SummaryVec
+	mProbeAgeMax              prometheus.Gauge
+	mProbeAgeMedian           prometheus.Gauge
 }
 
 type createCall struct {
@@ -284,10 +308,10 @@ func (wp *Pool) Unallocated() map[arvados.InstanceType]int {
 // pool. The worker is added immediately; instance creation runs in
 // the background.
 //
-// Create returns false if a pre-existing error state prevents it from
-// even attempting to create a new instance. Those errors are logged
-// by the Pool, so the caller does not need to log anything in such
-// cases.
+// Create returns false if a pre-existing error or a configuration
+// setting prevents it from even attempting to create a new
+// instance. Those errors are logged by the Pool, so the caller does
+// not need to log anything in such cases.
 func (wp *Pool) Create(it arvados.InstanceType) bool {
 	logger := wp.logger.WithField("InstanceType", it.Name)
 	wp.setupOnce.Do(wp.setup)
@@ -297,7 +321,9 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
 	}
 	wp.mtx.Lock()
 	defer wp.mtx.Unlock()
-	if time.Now().Before(wp.atQuotaUntil) || wp.instanceSet.throttleCreate.Error() != nil {
+	if time.Now().Before(wp.atQuotaUntil) ||
+		wp.instanceSet.throttleCreate.Error() != nil ||
+		(wp.maxInstances > 0 && wp.maxInstances <= len(wp.workers)+len(wp.creating)) {
 		return false
 	}
 	// The maxConcurrentInstanceCreateOps knob throttles the number of node create
@@ -323,7 +349,7 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
 			wp.tagKeyPrefix + tagKeyIdleBehavior:   string(IdleBehaviorRun),
 			wp.tagKeyPrefix + tagKeyInstanceSecret: secret,
 		}
-		initCmd := TagVerifier{nil, secret}.InitCommand()
+		initCmd := TagVerifier{nil, secret, nil}.InitCommand() + "\n" + wp.instanceInitCommand
 		inst, err := wp.instanceSet.Create(it, wp.imageID, tags, initCmd, wp.installPublicKey)
 		wp.mtx.Lock()
 		defer wp.mtx.Unlock()
@@ -343,15 +369,19 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
 		}
 		wp.updateWorker(inst, it)
 	}()
+	if len(wp.creating)+len(wp.workers) == wp.maxInstances {
+		logger.Infof("now at MaxInstances limit of %d instances", wp.maxInstances)
+	}
 	return true
 }
 
 // AtQuota returns true if Create is not expected to work at the
-// moment.
+// moment (e.g., the cloud provider has reported quota errors, or the
+// pool is already at its configured MaxInstances limit).
 func (wp *Pool) AtQuota() bool {
 	wp.mtx.Lock()
 	defer wp.mtx.Unlock()
-	return time.Now().Before(wp.atQuotaUntil)
+	return time.Now().Before(wp.atQuotaUntil) || (wp.maxInstances > 0 && wp.maxInstances <= len(wp.workers)+len(wp.creating))
 }
 
 // SetIdleBehavior determines how the indicated instance will behave
@@ -367,6 +397,28 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior)
 	return nil
 }
 
+// reportSSHConnected records a booting instance's first successful SSH connection and updates the mTimeToSSH metric.
+func (wp *Pool) reportSSHConnected(inst cloud.Instance) {
+	wp.mtx.Lock()
+	defer wp.mtx.Unlock()
+	wkr, ok := wp.workers[inst.ID()]
+	if !ok {
+		// race: inst was removed from the pool
+		return
+	}
+	if wkr.state != StateBooting || !wkr.firstSSHConnection.IsZero() {
+		// the node is not in booting state (can happen if
+		// a-d-c is restarted) OR this is not the first SSH
+		// connection
+		return
+	}
+
+	wkr.firstSSHConnection = time.Now()
+	if wp.mTimeToSSH != nil {
+		wp.mTimeToSSH.Observe(wkr.firstSSHConnection.Sub(wkr.appeared).Seconds()) // seconds from wkr.appeared to the first SSH connection
+	}
+}
+
 // Add or update worker attached to the given instance.
 //
 // The second return value is true if a new worker is created.
@@ -377,7 +429,7 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior)
 // Caller must have lock.
 func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*worker, bool) {
 	secret := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceSecret]
-	inst = TagVerifier{inst, secret}
+	inst = TagVerifier{Instance: inst, Secret: secret, ReportVerified: wp.reportSSHConnected}
 	id := inst.ID()
 	if wkr := wp.workers[id]; wkr != nil {
 		wkr.executor.SetTarget(inst)
@@ -578,6 +630,20 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
 		Help:      "Number of containers reported running by cloud VMs.",
 	})
 	reg.MustRegister(wp.mContainersRunning)
+	wp.mProbeAgeMax = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "probe_age_seconds_max",
+		Help:      "Maximum number of seconds since an instance's most recent successful probe.",
+	})
+	reg.MustRegister(wp.mProbeAgeMax)
+	wp.mProbeAgeMedian = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "probe_age_seconds_median",
+		Help:      "Median number of seconds since an instance's most recent successful probe.",
+	})
+	reg.MustRegister(wp.mProbeAgeMedian)
 	wp.mInstances = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 		Namespace: "arvados",
 		Subsystem: "dispatchcloud",
@@ -626,6 +692,46 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
 		wp.mDisappearances.WithLabelValues(v).Add(0)
 	}
 	reg.MustRegister(wp.mDisappearances)
+	wp.mTimeToSSH = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "instances_time_to_ssh_seconds",
+		Help:       "Number of seconds between instance creation and the first successful SSH connection.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	})
+	reg.MustRegister(wp.mTimeToSSH)
+	wp.mTimeToReadyForContainer = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "instances_time_to_ready_for_container_seconds",
+		Help:       "Number of seconds between the first successful SSH connection and ready to run a container.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	})
+	reg.MustRegister(wp.mTimeToReadyForContainer)
+	wp.mTimeFromShutdownToGone = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "instances_time_from_shutdown_request_to_disappearance_seconds",
+		Help:       "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	})
+	reg.MustRegister(wp.mTimeFromShutdownToGone)
+	wp.mTimeFromQueueToCrunchRun = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "containers_time_from_queue_to_crunch_run_seconds",
+		Help:       "Number of seconds between the queuing of a container and the start of crunch-run.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	})
+	reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
+	wp.mRunProbeDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "instances_run_probe_duration_seconds",
+		Help:       "Number of seconds per runProbe call.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	}, []string{"outcome"})
+	reg.MustRegister(wp.mRunProbeDuration)
 }
 
 func (wp *Pool) runMetrics() {
@@ -650,6 +756,8 @@ func (wp *Pool) updateMetrics() {
 	cpu := map[string]int64{}
 	mem := map[string]int64{}
 	var running int64
+	now := time.Now()
+	var probed []time.Time
 	for _, wkr := range wp.workers {
 		var cat string
 		switch {
@@ -669,6 +777,7 @@ func (wp *Pool) updateMetrics() {
 		cpu[cat] += int64(wkr.instType.VCPUs)
 		mem[cat] += int64(wkr.instType.RAM)
 		running += int64(len(wkr.running) + len(wkr.starting))
+		probed = append(probed, wkr.probed)
 	}
 	for _, cat := range []string{"inuse", "hold", "booting", "unknown", "idle"} {
 		wp.mInstancesPrice.WithLabelValues(cat).Set(price[cat])
@@ -685,6 +794,15 @@ func (wp *Pool) updateMetrics() {
 		wp.mInstances.WithLabelValues(k.cat, k.instType).Set(float64(v))
 	}
 	wp.mContainersRunning.Set(float64(running))
+
+	if len(probed) == 0 {
+		wp.mProbeAgeMax.Set(0)
+		wp.mProbeAgeMedian.Set(0)
+	} else {
+		sort.Slice(probed, func(i, j int) bool { return probed[i].Before(probed[j]) })
+		wp.mProbeAgeMax.Set(now.Sub(probed[0]).Seconds())
+		wp.mProbeAgeMedian.Set(now.Sub(probed[len(probed)/2]).Seconds())
+	}
 }
 
 func (wp *Pool) runProbes() {
@@ -700,6 +818,13 @@ func (wp *Pool) runProbes() {
 
 	workers := []cloud.InstanceID{}
 	for range probeticker.C {
+		// Add some jitter. Without this, if probeInterval is
+		// a multiple of syncInterval and sync is
+		// instantaneous (as with the loopback driver), the
+		// first few probes race with sync operations and
+		// don't update the workers.
+		time.Sleep(time.Duration(mathrand.Int63n(int64(wp.probeInterval) / 23)))
+
 		workers = workers[:0]
 		wp.mtx.Lock()
 		for id, wkr := range wp.workers {
@@ -783,6 +908,9 @@ func (wp *Pool) Instances() []InstanceView {
 // KillInstance destroys a cloud VM instance. It returns an error if
 // the given instance does not exist.
 func (wp *Pool) KillInstance(id cloud.InstanceID, reason string) error {
+	wp.setupOnce.Do(wp.setup)
+	wp.mtx.Lock()
+	defer wp.mtx.Unlock()
 	wkr, ok := wp.workers[id]
 	if !ok {
 		return errors.New("instance not found")
@@ -813,7 +941,7 @@ func (wp *Pool) loadRunnerData() error {
 	if wp.runnerData != nil {
 		return nil
 	} else if wp.runnerSource == "" {
-		wp.runnerCmd = "crunch-run"
+		wp.runnerCmd = wp.runnerCmdDefault
 		wp.runnerData = []byte{}
 		return nil
 	}
@@ -826,7 +954,7 @@ func (wp *Pool) loadRunnerData() error {
 	}
 	wp.runnerData = buf
 	wp.runnerMD5 = md5.Sum(buf)
-	wp.runnerCmd = fmt.Sprintf("/var/lib/arvados/crunch-run~%x", wp.runnerMD5)
+	wp.runnerCmd = fmt.Sprintf("/tmp/arvados-crunch-run/crunch-run~%x", wp.runnerMD5)
 	return nil
 }
 
@@ -895,6 +1023,10 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
 		if wp.mDisappearances != nil {
 			wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
 		}
+		// wkr.destroyed.IsZero() can happen if instance disappeared but we weren't trying to shut it down
+		if wp.mTimeFromShutdownToGone != nil && !wkr.destroyed.IsZero() {
+			wp.mTimeFromShutdownToGone.Observe(time.Now().Sub(wkr.destroyed).Seconds())
+		}
 		delete(wp.workers, id)
 		go wkr.Close()
 		notify = true
@@ -922,6 +1054,12 @@ func (wp *Pool) waitUntilLoaded() {
 	}
 }
 
+func (wp *Pool) gatewayAuthSecret(uuid string) string {
+	h := hmac.New(sha256.New, []byte(wp.systemRootToken)) // HMAC-SHA256 keyed with the system root token; uuid is the message, result is returned hex-encoded
+	fmt.Fprint(h, uuid)
+	return fmt.Sprintf("%x", h.Sum(nil))
+}
+
 // Return a random string of n hexadecimal digits (n*4 random bits). n
 // must be even.
 func randomHex(n int) string {