X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/a5fef23f2863cd0183ff596f4579110e2ddb3b3d..e44725a3792df227f189f88ffb2cd1dbf0e93489:/lib/dispatchcloud/worker/pool.go diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go index 435b6e43ae..1d600e3702 100644 --- a/lib/dispatchcloud/worker/pool.go +++ b/lib/dispatchcloud/worker/pool.go @@ -5,12 +5,15 @@ package worker import ( + "crypto/hmac" "crypto/md5" "crypto/rand" + "crypto/sha256" "errors" "fmt" "io" "io/ioutil" + mathrand "math/rand" "sort" "strings" "sync" @@ -64,15 +67,16 @@ type Executor interface { } const ( - defaultSyncInterval = time.Minute - defaultProbeInterval = time.Second * 10 - defaultMaxProbesPerSecond = 10 - defaultTimeoutIdle = time.Minute - defaultTimeoutBooting = time.Minute * 10 - defaultTimeoutProbe = time.Minute * 10 - defaultTimeoutShutdown = time.Second * 10 - defaultTimeoutTERM = time.Minute * 2 - defaultTimeoutSignal = time.Second * 5 + defaultSyncInterval = time.Minute + defaultProbeInterval = time.Second * 10 + defaultMaxProbesPerSecond = 10 + defaultTimeoutIdle = time.Minute + defaultTimeoutBooting = time.Minute * 10 + defaultTimeoutProbe = time.Minute * 10 + defaultTimeoutShutdown = time.Second * 10 + defaultTimeoutTERM = time.Minute * 2 + defaultTimeoutSignal = time.Second * 5 + defaultTimeoutStaleRunLock = time.Second * 5 // Time after a quota error to try again anyway, even if no // instances have been shutdown. @@ -85,9 +89,8 @@ const ( func duration(conf arvados.Duration, def time.Duration) time.Duration { if conf > 0 { return time.Duration(conf) - } else { - return def } + return def } // NewPool creates a Pool of workers backed by instanceSet. @@ -101,12 +104,15 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe instanceSetID: instanceSetID, instanceSet: &throttledInstanceSet{InstanceSet: instanceSet}, newExecutor: newExecutor, + cluster: cluster, bootProbeCommand: cluster.Containers.CloudVMs.BootProbeCommand, + instanceInitCommand: cloud.InitCommand(cluster.Containers.CloudVMs.InstanceInitCommand), runnerSource: cluster.Containers.CloudVMs.DeployRunnerBinary, imageID: cloud.ImageID(cluster.Containers.CloudVMs.ImageID), instanceTypes: cluster.InstanceTypes, maxProbesPerSecond: cluster.Containers.CloudVMs.MaxProbesPerSecond, maxConcurrentInstanceCreateOps: cluster.Containers.CloudVMs.MaxConcurrentInstanceCreateOps, + maxInstances: cluster.Containers.CloudVMs.MaxInstances, probeInterval: duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval), syncInterval: duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval), timeoutIdle: duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle), @@ -115,8 +121,12 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe timeoutShutdown: duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown), timeoutTERM: duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM), timeoutSignal: duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal), + timeoutStaleRunLock: duration(cluster.Containers.CloudVMs.TimeoutStaleRunLock, defaultTimeoutStaleRunLock), + systemRootToken: cluster.SystemRootToken, installPublicKey: installPublicKey, tagKeyPrefix: cluster.Containers.CloudVMs.TagKeyPrefix, + runnerCmdDefault: cluster.Containers.CrunchRunCommand, + runnerArgs: append([]string{"--runtime-engine=" + cluster.Containers.RuntimeEngine}, cluster.Containers.CrunchRunArgumentsList...), stop: make(chan bool), } wp.registerMetrics(reg) @@ -138,7 +148,9 @@ type Pool struct { instanceSetID cloud.InstanceSetID instanceSet *throttledInstanceSet newExecutor func(cloud.Instance) Executor + cluster *arvados.Cluster bootProbeCommand string + instanceInitCommand cloud.InitCommand runnerSource string imageID cloud.ImageID instanceTypes map[string]arvados.InstanceType @@ -146,14 +158,19 @@ type Pool struct { probeInterval time.Duration maxProbesPerSecond int maxConcurrentInstanceCreateOps int + maxInstances int timeoutIdle time.Duration timeoutBooting time.Duration timeoutProbe time.Duration timeoutShutdown time.Duration timeoutTERM time.Duration timeoutSignal time.Duration + timeoutStaleRunLock time.Duration + systemRootToken string installPublicKey ssh.PublicKey tagKeyPrefix string + runnerCmdDefault string // crunch-run command to use if not deploying a binary + runnerArgs []string // extra args passed to crunch-run // private state subscribers map[<-chan struct{}]chan<- struct{} @@ -170,13 +187,20 @@ type Pool struct { runnerMD5 [md5.Size]byte runnerCmd string - mContainersRunning prometheus.Gauge - mInstances *prometheus.GaugeVec - mInstancesPrice *prometheus.GaugeVec - mVCPUs *prometheus.GaugeVec - mMemory *prometheus.GaugeVec - mBootOutcomes *prometheus.CounterVec - mDisappearances *prometheus.CounterVec + mContainersRunning prometheus.Gauge + mInstances *prometheus.GaugeVec + mInstancesPrice *prometheus.GaugeVec + mVCPUs *prometheus.GaugeVec + mMemory *prometheus.GaugeVec + mBootOutcomes *prometheus.CounterVec + mDisappearances *prometheus.CounterVec + mTimeToSSH prometheus.Summary + mTimeToReadyForContainer prometheus.Summary + mTimeFromShutdownToGone prometheus.Summary + mTimeFromQueueToCrunchRun prometheus.Summary + mRunProbeDuration *prometheus.SummaryVec + mProbeAgeMax prometheus.Gauge + mProbeAgeMedian prometheus.Gauge } type createCall struct { @@ -284,10 +308,10 @@ func (wp *Pool) Unallocated() map[arvados.InstanceType]int { // pool. The worker is added immediately; instance creation runs in // the background. // -// Create returns false if a pre-existing error state prevents it from -// even attempting to create a new instance. Those errors are logged -// by the Pool, so the caller does not need to log anything in such -// cases. +// Create returns false if a pre-existing error or a configuration +// setting prevents it from even attempting to create a new +// instance. Those errors are logged by the Pool, so the caller does +// not need to log anything in such cases. func (wp *Pool) Create(it arvados.InstanceType) bool { logger := wp.logger.WithField("InstanceType", it.Name) wp.setupOnce.Do(wp.setup) @@ -297,7 +321,9 @@ func (wp *Pool) Create(it arvados.InstanceType) bool { } wp.mtx.Lock() defer wp.mtx.Unlock() - if time.Now().Before(wp.atQuotaUntil) || wp.instanceSet.throttleCreate.Error() != nil { + if time.Now().Before(wp.atQuotaUntil) || + wp.instanceSet.throttleCreate.Error() != nil || + (wp.maxInstances > 0 && wp.maxInstances <= len(wp.workers)+len(wp.creating)) { return false } // The maxConcurrentInstanceCreateOps knob throttles the number of node create @@ -323,7 +349,7 @@ func (wp *Pool) Create(it arvados.InstanceType) bool { wp.tagKeyPrefix + tagKeyIdleBehavior: string(IdleBehaviorRun), wp.tagKeyPrefix + tagKeyInstanceSecret: secret, } - initCmd := TagVerifier{nil, secret}.InitCommand() + initCmd := TagVerifier{nil, secret, nil}.InitCommand() + "\n" + wp.instanceInitCommand inst, err := wp.instanceSet.Create(it, wp.imageID, tags, initCmd, wp.installPublicKey) wp.mtx.Lock() defer wp.mtx.Unlock() @@ -343,15 +369,19 @@ func (wp *Pool) Create(it arvados.InstanceType) bool { } wp.updateWorker(inst, it) }() + if len(wp.creating)+len(wp.workers) == wp.maxInstances { + logger.Infof("now at MaxInstances limit of %d instances", wp.maxInstances) + } return true } // AtQuota returns true if Create is not expected to work at the -// moment. +// moment (e.g., cloud provider has reported quota errors, or we are +// already at our own configured quota). func (wp *Pool) AtQuota() bool { wp.mtx.Lock() defer wp.mtx.Unlock() - return time.Now().Before(wp.atQuotaUntil) + return time.Now().Before(wp.atQuotaUntil) || (wp.maxInstances > 0 && wp.maxInstances <= len(wp.workers)+len(wp.creating)) } // SetIdleBehavior determines how the indicated instance will behave @@ -367,6 +397,28 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior) return nil } +// Successful connection to the SSH daemon, update the mTimeToSSH metric +func (wp *Pool) reportSSHConnected(inst cloud.Instance) { + wp.mtx.Lock() + defer wp.mtx.Unlock() + wkr, ok := wp.workers[inst.ID()] + if !ok { + // race: inst was removed from the pool + return + } + if wkr.state != StateBooting || !wkr.firstSSHConnection.IsZero() { + // the node is not in booting state (can happen if + // a-d-c is restarted) OR this is not the first SSH + // connection + return + } + + wkr.firstSSHConnection = time.Now() + if wp.mTimeToSSH != nil { + wp.mTimeToSSH.Observe(wkr.firstSSHConnection.Sub(wkr.appeared).Seconds()) + } +} + // Add or update worker attached to the given instance. // // The second return value is true if a new worker is created. @@ -377,7 +429,7 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior) // Caller must have lock. func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*worker, bool) { secret := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceSecret] - inst = TagVerifier{inst, secret} + inst = TagVerifier{Instance: inst, Secret: secret, ReportVerified: wp.reportSSHConnected} id := inst.ID() if wkr := wp.workers[id]; wkr != nil { wkr.executor.SetTarget(inst) @@ -578,6 +630,20 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) { Help: "Number of containers reported running by cloud VMs.", }) reg.MustRegister(wp.mContainersRunning) + wp.mProbeAgeMax = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "arvados", + Subsystem: "dispatchcloud", + Name: "probe_age_seconds_max", + Help: "Maximum number of seconds since an instance's most recent successful probe.", + }) + reg.MustRegister(wp.mProbeAgeMax) + wp.mProbeAgeMedian = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "arvados", + Subsystem: "dispatchcloud", + Name: "probe_age_seconds_median", + Help: "Median number of seconds since an instance's most recent successful probe.", + }) + reg.MustRegister(wp.mProbeAgeMedian) wp.mInstances = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "arvados", Subsystem: "dispatchcloud", @@ -626,6 +692,46 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) { wp.mDisappearances.WithLabelValues(v).Add(0) } reg.MustRegister(wp.mDisappearances) + wp.mTimeToSSH = prometheus.NewSummary(prometheus.SummaryOpts{ + Namespace: "arvados", + Subsystem: "dispatchcloud", + Name: "instances_time_to_ssh_seconds", + Help: "Number of seconds between instance creation and the first successful SSH connection.", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001}, + }) + reg.MustRegister(wp.mTimeToSSH) + wp.mTimeToReadyForContainer = prometheus.NewSummary(prometheus.SummaryOpts{ + Namespace: "arvados", + Subsystem: "dispatchcloud", + Name: "instances_time_to_ready_for_container_seconds", + Help: "Number of seconds between the first successful SSH connection and ready to run a container.", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001}, + }) + reg.MustRegister(wp.mTimeToReadyForContainer) + wp.mTimeFromShutdownToGone = prometheus.NewSummary(prometheus.SummaryOpts{ + Namespace: "arvados", + Subsystem: "dispatchcloud", + Name: "instances_time_from_shutdown_request_to_disappearance_seconds", + Help: "Number of seconds between the first shutdown attempt and the disappearance of the worker.", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001}, + }) + reg.MustRegister(wp.mTimeFromShutdownToGone) + wp.mTimeFromQueueToCrunchRun = prometheus.NewSummary(prometheus.SummaryOpts{ + Namespace: "arvados", + Subsystem: "dispatchcloud", + Name: "containers_time_from_queue_to_crunch_run_seconds", + Help: "Number of seconds between the queuing of a container and the start of crunch-run.", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001}, + }) + reg.MustRegister(wp.mTimeFromQueueToCrunchRun) + wp.mRunProbeDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Namespace: "arvados", + Subsystem: "dispatchcloud", + Name: "instances_run_probe_duration_seconds", + Help: "Number of seconds per runProbe call.", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001}, + }, []string{"outcome"}) + reg.MustRegister(wp.mRunProbeDuration) } func (wp *Pool) runMetrics() { @@ -650,6 +756,8 @@ func (wp *Pool) updateMetrics() { cpu := map[string]int64{} mem := map[string]int64{} var running int64 + now := time.Now() + var probed []time.Time for _, wkr := range wp.workers { var cat string switch { @@ -669,6 +777,7 @@ func (wp *Pool) updateMetrics() { cpu[cat] += int64(wkr.instType.VCPUs) mem[cat] += int64(wkr.instType.RAM) running += int64(len(wkr.running) + len(wkr.starting)) + probed = append(probed, wkr.probed) } for _, cat := range []string{"inuse", "hold", "booting", "unknown", "idle"} { wp.mInstancesPrice.WithLabelValues(cat).Set(price[cat]) @@ -685,6 +794,15 @@ func (wp *Pool) updateMetrics() { wp.mInstances.WithLabelValues(k.cat, k.instType).Set(float64(v)) } wp.mContainersRunning.Set(float64(running)) + + if len(probed) == 0 { + wp.mProbeAgeMax.Set(0) + wp.mProbeAgeMedian.Set(0) + } else { + sort.Slice(probed, func(i, j int) bool { return probed[i].Before(probed[j]) }) + wp.mProbeAgeMax.Set(now.Sub(probed[0]).Seconds()) + wp.mProbeAgeMedian.Set(now.Sub(probed[len(probed)/2]).Seconds()) + } } func (wp *Pool) runProbes() { @@ -700,6 +818,13 @@ func (wp *Pool) runProbes() { workers := []cloud.InstanceID{} for range probeticker.C { + // Add some jitter. Without this, if probeInterval is + // a multiple of syncInterval and sync is + // instantaneous (as with the loopback driver), the + // first few probes race with sync operations and + // don't update the workers. + time.Sleep(time.Duration(mathrand.Int63n(int64(wp.probeInterval) / 23))) + workers = workers[:0] wp.mtx.Lock() for id, wkr := range wp.workers { @@ -783,6 +908,9 @@ func (wp *Pool) Instances() []InstanceView { // KillInstance destroys a cloud VM instance. It returns an error if // the given instance does not exist. func (wp *Pool) KillInstance(id cloud.InstanceID, reason string) error { + wp.setupOnce.Do(wp.setup) + wp.mtx.Lock() + defer wp.mtx.Unlock() wkr, ok := wp.workers[id] if !ok { return errors.New("instance not found") @@ -813,7 +941,7 @@ func (wp *Pool) loadRunnerData() error { if wp.runnerData != nil { return nil } else if wp.runnerSource == "" { - wp.runnerCmd = "crunch-run" + wp.runnerCmd = wp.runnerCmdDefault wp.runnerData = []byte{} return nil } @@ -826,7 +954,7 @@ func (wp *Pool) loadRunnerData() error { } wp.runnerData = buf wp.runnerMD5 = md5.Sum(buf) - wp.runnerCmd = fmt.Sprintf("/var/lib/arvados/crunch-run~%x", wp.runnerMD5) + wp.runnerCmd = fmt.Sprintf("/tmp/arvados-crunch-run/crunch-run~%x", wp.runnerMD5) return nil } @@ -895,6 +1023,10 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) { if wp.mDisappearances != nil { wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc() } + // wkr.destroyed.IsZero() can happen if instance disappeared but we weren't trying to shut it down + if wp.mTimeFromShutdownToGone != nil && !wkr.destroyed.IsZero() { + wp.mTimeFromShutdownToGone.Observe(time.Now().Sub(wkr.destroyed).Seconds()) + } delete(wp.workers, id) go wkr.Close() notify = true @@ -922,6 +1054,12 @@ func (wp *Pool) waitUntilLoaded() { } } +func (wp *Pool) gatewayAuthSecret(uuid string) string { + h := hmac.New(sha256.New, []byte(wp.systemRootToken)) + fmt.Fprint(h, uuid) + return fmt.Sprintf("%x", h.Sum(nil)) +} + // Return a random string of n hexadecimal digits (n*4 random bits). n // must be even. func randomHex(n int) string {