17170: Add "arvados-client shell" subcommand and backend support.
[arvados.git] / lib / dispatchcloud / worker / pool.go
index b1b570bd900f3a661816a217910d35fde3bc563b..e092e7adab5bfe0e3d9b5ff354ea4ff7ff371010 100644 (file)
@@ -5,39 +5,50 @@
 package worker
 
 import (
-       "bytes"
+       "crypto/hmac"
+       "crypto/md5"
+       "crypto/rand"
+       "crypto/sha256"
+       "errors"
+       "fmt"
        "io"
+       "io/ioutil"
        "sort"
        "strings"
        "sync"
        "time"
 
-       "git.curoverse.com/arvados.git/lib/cloud"
-       "git.curoverse.com/arvados.git/sdk/go/arvados"
-       "github.com/Sirupsen/logrus"
+       "git.arvados.org/arvados.git/lib/cloud"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "github.com/prometheus/client_golang/prometheus"
+       "github.com/sirupsen/logrus"
+       "golang.org/x/crypto/ssh"
 )
 
 const (
-       tagKeyInstanceType = "InstanceType"
-       tagKeyHold         = "Hold"
+       tagKeyInstanceType   = "InstanceType"
+       tagKeyIdleBehavior   = "IdleBehavior"
+       tagKeyInstanceSecret = "InstanceSecret"
+       tagKeyInstanceSetID  = "InstanceSetID"
 )
 
 // An InstanceView shows a worker's current state and recent activity.
 type InstanceView struct {
-       Instance             string
-       Price                float64
-       ArvadosInstanceType  string
-       ProviderInstanceType string
-       LastContainerUUID    string
-       LastBusy             time.Time
-       WorkerState          string
+       Instance             cloud.InstanceID `json:"instance"`
+       Address              string           `json:"address"`
+       Price                float64          `json:"price"`
+       ArvadosInstanceType  string           `json:"arvados_instance_type"`
+       ProviderInstanceType string           `json:"provider_instance_type"`
+       LastContainerUUID    string           `json:"last_container_uuid"`
+       LastBusy             time.Time        `json:"last_busy"`
+       WorkerState          string           `json:"worker_state"`
+       IdleBehavior         IdleBehavior     `json:"idle_behavior"`
 }
 
 // An Executor executes shell commands on a remote host.
 type Executor interface {
        // Run cmd on the current target.
-       Execute(cmd string, stdin io.Reader) (stdout, stderr []byte, err error)
+       Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error)
 
        // Use the given target for subsequent operations. The new
        // target is the same host as the previous target, but it
@@ -55,40 +66,62 @@ type Executor interface {
 }
 
 const (
-       defaultSyncInterval       = time.Minute
-       defaultProbeInterval      = time.Second * 10
-       defaultMaxProbesPerSecond = 10
-       defaultTimeoutIdle        = time.Minute
-       defaultTimeoutBooting     = time.Minute * 10
-       defaultTimeoutProbe       = time.Minute * 10
+       defaultSyncInterval        = time.Minute
+       defaultProbeInterval       = time.Second * 10
+       defaultMaxProbesPerSecond  = 10
+       defaultTimeoutIdle         = time.Minute
+       defaultTimeoutBooting      = time.Minute * 10
+       defaultTimeoutProbe        = time.Minute * 10
+       defaultTimeoutShutdown     = time.Second * 10
+       defaultTimeoutTERM         = time.Minute * 2
+       defaultTimeoutSignal       = time.Second * 5
+       defaultTimeoutStaleRunLock = time.Second * 5
+
+       // Time after a quota error to try again anyway, even if no
+       // instances have been shutdown.
+       quotaErrorTTL = time.Minute
+
+       // Time between "X failed because rate limiting" messages
+       logRateLimitErrorInterval = time.Second * 10
 )
 
 func duration(conf arvados.Duration, def time.Duration) time.Duration {
        if conf > 0 {
                return time.Duration(conf)
-       } else {
-               return def
        }
+       return def
 }
 
 // NewPool creates a Pool of workers backed by instanceSet.
 //
 // New instances are configured and set up according to the given
 // cluster configuration.
-func NewPool(logger logrus.FieldLogger, reg *prometheus.Registry, instanceSet cloud.InstanceSet, newExecutor func(cloud.Instance) Executor, cluster *arvados.Cluster) *Pool {
+func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *prometheus.Registry, instanceSetID cloud.InstanceSetID, instanceSet cloud.InstanceSet, newExecutor func(cloud.Instance) Executor, installPublicKey ssh.PublicKey, cluster *arvados.Cluster) *Pool {
        wp := &Pool{
-               logger:             logger,
-               instanceSet:        instanceSet,
-               newExecutor:        newExecutor,
-               bootProbeCommand:   cluster.CloudVMs.BootProbeCommand,
-               imageID:            cloud.ImageID(cluster.CloudVMs.ImageID),
-               instanceTypes:      cluster.InstanceTypes,
-               maxProbesPerSecond: cluster.Dispatch.MaxProbesPerSecond,
-               probeInterval:      duration(cluster.Dispatch.ProbeInterval, defaultProbeInterval),
-               syncInterval:       duration(cluster.CloudVMs.SyncInterval, defaultSyncInterval),
-               timeoutIdle:        duration(cluster.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
-               timeoutBooting:     duration(cluster.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
-               timeoutProbe:       duration(cluster.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
+               logger:                         logger,
+               arvClient:                      arvClient,
+               instanceSetID:                  instanceSetID,
+               instanceSet:                    &throttledInstanceSet{InstanceSet: instanceSet},
+               newExecutor:                    newExecutor,
+               bootProbeCommand:               cluster.Containers.CloudVMs.BootProbeCommand,
+               runnerSource:                   cluster.Containers.CloudVMs.DeployRunnerBinary,
+               imageID:                        cloud.ImageID(cluster.Containers.CloudVMs.ImageID),
+               instanceTypes:                  cluster.InstanceTypes,
+               maxProbesPerSecond:             cluster.Containers.CloudVMs.MaxProbesPerSecond,
+               maxConcurrentInstanceCreateOps: cluster.Containers.CloudVMs.MaxConcurrentInstanceCreateOps,
+               probeInterval:                  duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
+               syncInterval:                   duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
+               timeoutIdle:                    duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
+               timeoutBooting:                 duration(cluster.Containers.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
+               timeoutProbe:                   duration(cluster.Containers.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
+               timeoutShutdown:                duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
+               timeoutTERM:                    duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
+               timeoutSignal:                  duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
+               timeoutStaleRunLock:            duration(cluster.Containers.CloudVMs.TimeoutStaleRunLock, defaultTimeoutStaleRunLock),
+               systemRootToken:                cluster.SystemRootToken,
+               installPublicKey:               installPublicKey,
+               tagKeyPrefix:                   cluster.Containers.CloudVMs.TagKeyPrefix,
+               stop:                           make(chan bool),
        }
        wp.registerMetrics(reg)
        go func() {
@@ -104,64 +137,87 @@ func NewPool(logger logrus.FieldLogger, reg *prometheus.Registry, instanceSet cl
 // zero Pool should not be used. Call NewPool to create a new Pool.
 type Pool struct {
        // configuration
-       logger             logrus.FieldLogger
-       instanceSet        cloud.InstanceSet
-       newExecutor        func(cloud.Instance) Executor
-       bootProbeCommand   string
-       imageID            cloud.ImageID
-       instanceTypes      map[string]arvados.InstanceType
-       syncInterval       time.Duration
-       probeInterval      time.Duration
-       maxProbesPerSecond int
-       timeoutIdle        time.Duration
-       timeoutBooting     time.Duration
-       timeoutProbe       time.Duration
+       logger                         logrus.FieldLogger
+       arvClient                      *arvados.Client
+       instanceSetID                  cloud.InstanceSetID
+       instanceSet                    *throttledInstanceSet
+       newExecutor                    func(cloud.Instance) Executor
+       bootProbeCommand               string
+       runnerSource                   string
+       imageID                        cloud.ImageID
+       instanceTypes                  map[string]arvados.InstanceType
+       syncInterval                   time.Duration
+       probeInterval                  time.Duration
+       maxProbesPerSecond             int
+       maxConcurrentInstanceCreateOps int
+       timeoutIdle                    time.Duration
+       timeoutBooting                 time.Duration
+       timeoutProbe                   time.Duration
+       timeoutShutdown                time.Duration
+       timeoutTERM                    time.Duration
+       timeoutSignal                  time.Duration
+       timeoutStaleRunLock            time.Duration
+       systemRootToken                string
+       installPublicKey               ssh.PublicKey
+       tagKeyPrefix                   string
 
        // private state
        subscribers  map[<-chan struct{}]chan<- struct{}
-       creating     map[arvados.InstanceType]int // goroutines waiting for (InstanceSet)Create to return
+       creating     map[string]createCall // unfinished (cloud.InstanceSet)Create calls (key is instance secret)
        workers      map[cloud.InstanceID]*worker
        loaded       bool                 // loaded list of instances from InstanceSet at least once
-       exited       map[string]time.Time // containers whose crunch-run proc has exited, but KillContainer has not been called
+       exited       map[string]time.Time // containers whose crunch-run proc has exited, but ForgetContainer has not been called
        atQuotaUntil time.Time
        atQuotaErr   cloud.QuotaError
        stop         chan bool
        mtx          sync.RWMutex
        setupOnce    sync.Once
+       runnerData   []byte
+       runnerMD5    [md5.Size]byte
+       runnerCmd    string
+
+       mContainersRunning        prometheus.Gauge
+       mInstances                *prometheus.GaugeVec
+       mInstancesPrice           *prometheus.GaugeVec
+       mVCPUs                    *prometheus.GaugeVec
+       mMemory                   *prometheus.GaugeVec
+       mBootOutcomes             *prometheus.CounterVec
+       mDisappearances           *prometheus.CounterVec
+       mTimeToSSH                prometheus.Summary
+       mTimeToReadyForContainer  prometheus.Summary
+       mTimeFromShutdownToGone   prometheus.Summary
+       mTimeFromQueueToCrunchRun prometheus.Summary
+       mRunProbeDuration         *prometheus.SummaryVec
+}
+
+type createCall struct {
+       time         time.Time
+       instanceType arvados.InstanceType
+}
+
+func (wp *Pool) CheckHealth() error {
+       wp.setupOnce.Do(wp.setup)
+       if err := wp.loadRunnerData(); err != nil {
+               return fmt.Errorf("error loading runner binary: %s", err)
+       }
+       return nil
+}
 
-       mInstances         prometheus.Gauge
-       mContainersRunning prometheus.Gauge
-       mVCPUs             prometheus.Gauge
-       mVCPUsInuse        prometheus.Gauge
-       mMemory            prometheus.Gauge
-       mMemoryInuse       prometheus.Gauge
-}
-
-type worker struct {
-       state    State
-       instance cloud.Instance
-       executor Executor
-       instType arvados.InstanceType
-       vcpus    int64
-       memory   int64
-       probed   time.Time
-       updated  time.Time
-       busy     time.Time
-       lastUUID string
-       running  map[string]struct{}
-       starting map[string]struct{}
-       probing  chan struct{}
-}
-
-// Subscribe returns a channel that becomes ready whenever a worker's
-// state changes.
+// Subscribe returns a buffered channel that becomes ready after any
+// change to the pool's state that could have scheduling implications:
+// a worker's state changes, a new worker appears, the cloud
+// provider's API rate limiting period ends, etc.
+//
+// Additional events that occur while the channel is already ready
+// will be dropped, so it is OK if the caller services the channel
+// slowly.
 //
 // Example:
 //
 //     ch := wp.Subscribe()
 //     defer wp.Unsubscribe(ch)
 //     for range ch {
-//             // ...try scheduling some work...
+//             tryScheduling(wp)
 //             if done {
 //                     break
 //             }
@@ -184,53 +240,121 @@ func (wp *Pool) Unsubscribe(ch <-chan struct{}) {
 }
 
 // Unallocated returns the number of unallocated (creating + booting +
-// idle + unknown) workers for each instance type.
+// idle + unknown) workers for each instance type.  Workers in
+// hold/drain mode are not included.
 func (wp *Pool) Unallocated() map[arvados.InstanceType]int {
        wp.setupOnce.Do(wp.setup)
        wp.mtx.RLock()
        defer wp.mtx.RUnlock()
-       u := map[arvados.InstanceType]int{}
-       for it, c := range wp.creating {
-               u[it] = c
+       unalloc := map[arvados.InstanceType]int{}
+       creating := map[arvados.InstanceType]int{}
+       oldestCreate := map[arvados.InstanceType]time.Time{}
+       for _, cc := range wp.creating {
+               it := cc.instanceType
+               creating[it]++
+               if t, ok := oldestCreate[it]; !ok || t.After(cc.time) {
+                       oldestCreate[it] = cc.time
+               }
        }
        for _, wkr := range wp.workers {
-               if len(wkr.running)+len(wkr.starting) == 0 && (wkr.state == StateRunning || wkr.state == StateBooting || wkr.state == StateUnknown) {
-                       u[wkr.instType]++
+               // Skip workers that are not expected to become
+               // available soon. Note len(wkr.running)>0 is not
+               // redundant here: it can be true even in
+               // StateUnknown.
+               if wkr.state == StateShutdown ||
+                       wkr.state == StateRunning ||
+                       wkr.idleBehavior != IdleBehaviorRun ||
+                       len(wkr.running) > 0 {
+                       continue
                }
+               it := wkr.instType
+               unalloc[it]++
+               if wkr.state == StateUnknown && creating[it] > 0 && wkr.appeared.After(oldestCreate[it]) {
+                       // If up to N new workers appear in
+                       // Instances() while we are waiting for N
+                       // Create() calls to complete, we assume we're
+                       // just seeing a race between Instances() and
+                       // Create() responses.
+                       //
+                       // The other common reason why nodes have
+                       // state==Unknown is that they appeared at
+                       // startup, before any Create calls. They
+                       // don't match the above timing condition, so
+                       // we never mistakenly attribute them to
+                       // pending Create calls.
+                       creating[it]--
+               }
+       }
+       for it, c := range creating {
+               unalloc[it] += c
        }
-       return u
+       return unalloc
 }
 
 // Create a new instance with the given type, and add it to the worker
 // pool. The worker is added immediately; instance creation runs in
 // the background.
-func (wp *Pool) Create(it arvados.InstanceType) error {
+//
+// Create returns false if a pre-existing error state prevents it from
+// even attempting to create a new instance. Those errors are logged
+// by the Pool, so the caller does not need to log anything in such
+// cases.
+func (wp *Pool) Create(it arvados.InstanceType) bool {
        logger := wp.logger.WithField("InstanceType", it.Name)
        wp.setupOnce.Do(wp.setup)
+       if wp.loadRunnerData() != nil {
+               // Boot probe is certain to fail.
+               return false
+       }
        wp.mtx.Lock()
        defer wp.mtx.Unlock()
-       if time.Now().Before(wp.atQuotaUntil) {
-               return wp.atQuotaErr
+       if time.Now().Before(wp.atQuotaUntil) || wp.instanceSet.throttleCreate.Error() != nil {
+               return false
        }
-       tags := cloud.InstanceTags{tagKeyInstanceType: it.Name}
-       wp.creating[it]++
+       // The maxConcurrentInstanceCreateOps knob throttles the number of node create
+       // requests in flight. It was added to work around a limitation in Azure's
+       // managed disks, which support no more than 20 concurrent node creation
+       // requests from a single disk image (cf.
+       // https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image).
+       // The code assumes that node creation, from Azure's perspective, means the
+       // period until the instance appears in the "get all instances" list.
+       if wp.maxConcurrentInstanceCreateOps > 0 && len(wp.creating) >= wp.maxConcurrentInstanceCreateOps {
+               logger.Info("reached MaxConcurrentInstanceCreateOps")
+               wp.instanceSet.throttleCreate.ErrorUntil(errors.New("reached MaxConcurrentInstanceCreateOps"), time.Now().Add(5*time.Second), wp.notify)
+               return false
+       }
+       now := time.Now()
+       secret := randomHex(instanceSecretLength)
+       wp.creating[secret] = createCall{time: now, instanceType: it}
        go func() {
                defer wp.notify()
-               inst, err := wp.instanceSet.Create(it, wp.imageID, tags, nil)
+               tags := cloud.InstanceTags{
+                       wp.tagKeyPrefix + tagKeyInstanceSetID:  string(wp.instanceSetID),
+                       wp.tagKeyPrefix + tagKeyInstanceType:   it.Name,
+                       wp.tagKeyPrefix + tagKeyIdleBehavior:   string(IdleBehaviorRun),
+                       wp.tagKeyPrefix + tagKeyInstanceSecret: secret,
+               }
+               initCmd := TagVerifier{nil, secret, nil}.InitCommand()
+               inst, err := wp.instanceSet.Create(it, wp.imageID, tags, initCmd, wp.installPublicKey)
                wp.mtx.Lock()
                defer wp.mtx.Unlock()
-               wp.creating[it]--
-               if err, ok := err.(cloud.QuotaError); ok && err.IsQuotaError() {
-                       wp.atQuotaErr = err
-                       wp.atQuotaUntil = time.Now().Add(time.Minute)
-               }
+               // delete() is deferred so the updateWorker() call
+               // below knows to use StateBooting when adding a new
+               // worker.
+               defer delete(wp.creating, secret)
                if err != nil {
+                       if err, ok := err.(cloud.QuotaError); ok && err.IsQuotaError() {
+                               wp.atQuotaErr = err
+                               wp.atQuotaUntil = time.Now().Add(quotaErrorTTL)
+                               time.AfterFunc(quotaErrorTTL, wp.notify)
+                       }
                        logger.WithError(err).Error("create failed")
+                       wp.instanceSet.throttleCreate.CheckRateLimitError(err, wp.logger, "create instance", wp.notify)
                        return
                }
-               wp.updateWorker(inst, it, StateBooting)
+               wp.updateWorker(inst, it)
        }()
-       return nil
+       return true
 }
 
 // AtQuota returns true if Create is not expected to work at the
@@ -241,43 +365,101 @@ func (wp *Pool) AtQuota() bool {
        return time.Now().Before(wp.atQuotaUntil)
 }
 
-// Add or update worker attached to the given instance. Use
-// initialState if a new worker is created. Caller must have lock.
+// SetIdleBehavior determines how the indicated instance will behave
+// when it has no containers running.
+func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior) error {
+       wp.mtx.Lock()
+       defer wp.mtx.Unlock()
+       wkr, ok := wp.workers[id]
+       if !ok {
+               return errors.New("requested instance does not exist")
+       }
+       wkr.setIdleBehavior(idleBehavior)
+       return nil
+}
+
+// Successful connection to the SSH daemon, update the mTimeToSSH metric
+func (wp *Pool) reportSSHConnected(inst cloud.Instance) {
+       wp.mtx.Lock()
+       defer wp.mtx.Unlock()
+       wkr := wp.workers[inst.ID()]
+       if wkr.state != StateBooting || !wkr.firstSSHConnection.IsZero() {
+               // the node is not in booting state (can happen if a-d-c is restarted) OR
+               // this is not the first SSH connection
+               return
+       }
+
+       wkr.firstSSHConnection = time.Now()
+       if wp.mTimeToSSH != nil {
+               wp.mTimeToSSH.Observe(wkr.firstSSHConnection.Sub(wkr.appeared).Seconds())
+       }
+}
+
+// Add or update worker attached to the given instance.
 //
-// Returns true when a new worker is created.
-func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType, initialState State) bool {
+// The second return value is true if a new worker is created.
+//
+// A newly added instance has state=StateBooting if its tags match an
+// entry in wp.creating, otherwise StateUnknown.
+//
+// Caller must have lock.
+func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*worker, bool) {
+       secret := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceSecret]
+       inst = TagVerifier{Instance: inst, Secret: secret, ReportVerified: wp.reportSSHConnected}
        id := inst.ID()
-       if wp.workers[id] != nil {
-               wp.workers[id].executor.SetTarget(inst)
-               wp.workers[id].instance = inst
-               wp.workers[id].updated = time.Now()
-               if initialState == StateBooting && wp.workers[id].state == StateUnknown {
-                       wp.workers[id].state = StateBooting
-               }
-               return false
+       if wkr := wp.workers[id]; wkr != nil {
+               wkr.executor.SetTarget(inst)
+               wkr.instance = inst
+               wkr.updated = time.Now()
+               wkr.saveTags()
+               return wkr, false
+       }
+
+       state := StateUnknown
+       if _, ok := wp.creating[secret]; ok {
+               state = StateBooting
        }
-       if initialState == StateUnknown && inst.Tags()[tagKeyHold] != "" {
-               initialState = StateHold
+
+       // If an instance has a valid IdleBehavior tag when it first
+       // appears, initialize the new worker accordingly (this is how
+       // we restore IdleBehavior that was set by a prior dispatch
+       // process); otherwise, default to "run". After this,
+       // wkr.idleBehavior is the source of truth, and will only be
+       // changed via SetIdleBehavior().
+       idleBehavior := IdleBehavior(inst.Tags()[wp.tagKeyPrefix+tagKeyIdleBehavior])
+       if !validIdleBehavior[idleBehavior] {
+               idleBehavior = IdleBehaviorRun
        }
-       wp.logger.WithFields(logrus.Fields{
+
+       logger := wp.logger.WithFields(logrus.Fields{
                "InstanceType": it.Name,
-               "Instance":     inst,
-               "State":        initialState,
+               "Instance":     inst.ID(),
+               "Address":      inst.Address(),
+       })
+       logger.WithFields(logrus.Fields{
+               "State":        state,
+               "IdleBehavior": idleBehavior,
        }).Infof("instance appeared in cloud")
        now := time.Now()
-       wp.workers[id] = &worker{
-               executor: wp.newExecutor(inst),
-               state:    initialState,
-               instance: inst,
-               instType: it,
-               probed:   now,
-               busy:     now,
-               updated:  now,
-               running:  make(map[string]struct{}),
-               starting: make(map[string]struct{}),
-               probing:  make(chan struct{}, 1),
-       }
-       return true
+       wkr := &worker{
+               mtx:          &wp.mtx,
+               wp:           wp,
+               logger:       logger,
+               executor:     wp.newExecutor(inst),
+               state:        state,
+               idleBehavior: idleBehavior,
+               instance:     inst,
+               instType:     it,
+               appeared:     now,
+               probed:       now,
+               busy:         now,
+               updated:      now,
+               running:      make(map[string]*remoteRunner),
+               starting:     make(map[string]*remoteRunner),
+               probing:      make(chan struct{}, 1),
+       }
+       wp.workers[id] = wkr
+       return wkr, true
 }
 
 // Shutdown shuts down a worker with the given type, or returns false
@@ -288,46 +470,28 @@ func (wp *Pool) Shutdown(it arvados.InstanceType) bool {
        defer wp.mtx.Unlock()
        logger := wp.logger.WithField("InstanceType", it.Name)
        logger.Info("shutdown requested")
-       for _, tryState := range []State{StateBooting, StateRunning} {
+       for _, tryState := range []State{StateBooting, StateIdle} {
                // TODO: shutdown the worker with the longest idle
-               // time (Running) or the earliest create time
-               // (Booting)
+               // time (Idle) or the earliest create time (Booting)
                for _, wkr := range wp.workers {
-                       if wkr.state != tryState || len(wkr.running)+len(wkr.starting) > 0 {
-                               continue
-                       }
-                       if wkr.instType != it {
-                               continue
+                       if wkr.idleBehavior != IdleBehaviorHold && wkr.state == tryState && wkr.instType == it {
+                               logger.WithField("Instance", wkr.instance.ID()).Info("shutting down")
+                               wkr.reportBootOutcome(BootOutcomeAborted)
+                               wkr.shutdown()
+                               return true
                        }
-                       logger = logger.WithField("Instance", wkr.instance)
-                       logger.Info("shutting down")
-                       wp.shutdown(wkr, logger)
-                       return true
                }
        }
        return false
 }
 
-// caller must have lock
-func (wp *Pool) shutdown(wkr *worker, logger logrus.FieldLogger) {
-       wkr.updated = time.Now()
-       wkr.state = StateShutdown
-       go func() {
-               err := wkr.instance.Destroy()
-               if err != nil {
-                       logger.WithError(err).WithField("Instance", wkr.instance).Warn("shutdown failed")
-                       return
-               }
-               wp.mtx.Lock()
-               wp.atQuotaUntil = time.Now()
-               wp.mtx.Unlock()
-               wp.notify()
-       }()
-}
-
-// Workers returns the current number of workers in each state.
-func (wp *Pool) Workers() map[State]int {
+// CountWorkers returns the current number of workers in each state.
+//
+// CountWorkers blocks, if necessary, until the initial instance list
+// has been loaded from the cloud provider.
+func (wp *Pool) CountWorkers() map[State]int {
        wp.setupOnce.Do(wp.setup)
+       wp.waitUntilLoaded()
        wp.mtx.Lock()
        defer wp.mtx.Unlock()
        r := map[State]int{}
@@ -338,6 +502,12 @@ func (wp *Pool) Workers() map[State]int {
 }
 
 // Running returns the container UUIDs being prepared/run on workers.
+//
+// In the returned map, the time value indicates when the Pool
+// observed that the container process had exited. A container that
+// has not yet exited has a zero time value. The caller should use
+// ForgetContainer() to garbage-collect the entries for exited
+// containers.
 func (wp *Pool) Running() map[string]time.Time {
        wp.setupOnce.Do(wp.setup)
        wp.mtx.Lock()
@@ -360,17 +530,12 @@ func (wp *Pool) Running() map[string]time.Time {
 // StartContainer starts a container on an idle worker immediately if
 // possible, otherwise returns false.
 func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) bool {
-       logger := wp.logger.WithFields(logrus.Fields{
-               "InstanceType":  it.Name,
-               "ContainerUUID": ctr.UUID,
-               "Priority":      ctr.Priority,
-       })
        wp.setupOnce.Do(wp.setup)
        wp.mtx.Lock()
        defer wp.mtx.Unlock()
        var wkr *worker
        for _, w := range wp.workers {
-               if w.instType == it && w.state == StateRunning && len(w.running)+len(w.starting) == 0 {
+               if w.instType == it && w.state == StateIdle && w.idleBehavior == IdleBehaviorRun {
                        if wkr == nil || w.busy.After(wkr.busy) {
                                wkr = w
                        }
@@ -379,33 +544,7 @@ func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) b
        if wkr == nil {
                return false
        }
-       logger = logger.WithField("Instance", wkr.instance)
-       logger.Debug("starting container")
-       wkr.starting[ctr.UUID] = struct{}{}
-       go func() {
-               stdout, stderr, err := wkr.executor.Execute("crunch-run --detach '"+ctr.UUID+"'", nil)
-               wp.mtx.Lock()
-               defer wp.mtx.Unlock()
-               now := time.Now()
-               wkr.updated = now
-               wkr.busy = now
-               delete(wkr.starting, ctr.UUID)
-               wkr.running[ctr.UUID] = struct{}{}
-               wkr.lastUUID = ctr.UUID
-               if err != nil {
-                       logger.WithField("stdout", string(stdout)).
-                               WithField("stderr", string(stderr)).
-                               WithError(err).
-                               Error("error starting crunch-run process")
-                       // Leave uuid in wkr.running, though: it's
-                       // possible the error was just a communication
-                       // failure and the process was in fact
-                       // started.  Wait for next probe to find out.
-                       return
-               }
-               logger.Info("crunch-run process started")
-               wkr.lastUUID = ctr.UUID
-       }()
+       wkr.startContainer(ctr)
        return true
 }
 
@@ -414,45 +553,45 @@ func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) b
 //
 // KillContainer returns immediately; the act of killing the container
 // takes some time, and runs in the background.
-func (wp *Pool) KillContainer(uuid string) {
+//
+// KillContainer returns false if the container has already ended.
+func (wp *Pool) KillContainer(uuid string, reason string) bool {
        wp.mtx.Lock()
        defer wp.mtx.Unlock()
-       if _, ok := wp.exited[uuid]; ok {
-               wp.logger.WithField("ContainerUUID", uuid).Debug("clearing placeholder for exited crunch-run process")
-               delete(wp.exited, uuid)
-               return
-       }
+       logger := wp.logger.WithFields(logrus.Fields{
+               "ContainerUUID": uuid,
+               "Reason":        reason,
+       })
        for _, wkr := range wp.workers {
-               if _, ok := wkr.running[uuid]; ok {
-                       go wp.kill(wkr, uuid)
-                       return
+               rr := wkr.running[uuid]
+               if rr == nil {
+                       rr = wkr.starting[uuid]
+               }
+               if rr != nil {
+                       rr.Kill(reason)
+                       return true
                }
        }
-       wp.logger.WithField("ContainerUUID", uuid).Debug("cannot kill: already disappeared")
+       logger.Debug("cannot kill: already disappeared")
+       return false
 }
 
-func (wp *Pool) kill(wkr *worker, uuid string) {
-       logger := wp.logger.WithFields(logrus.Fields{
-               "ContainerUUID": uuid,
-               "Instance":      wkr.instance,
-       })
-       logger.Debug("killing process")
-       stdout, stderr, err := wkr.executor.Execute("crunch-run --kill "+uuid, nil)
-       if err != nil {
-               logger.WithFields(logrus.Fields{
-                       "stderr": string(stderr),
-                       "stdout": string(stdout),
-                       "error":  err,
-               }).Warn("kill failed")
-               return
-       }
-       logger.Debug("killing process succeeded")
+// ForgetContainer clears the placeholder for the given exited
+// container, so it isn't returned by subsequent calls to Running().
+//
+// ForgetContainer has no effect if the container has not yet exited.
+//
+// The "container exited at time T" placeholder (which necessitates
+// ForgetContainer) exists to make it easier for the caller
+// (scheduler) to distinguish a container that exited without
+// finalizing its state from a container that exited too recently for
+// its final state to have appeared in the scheduler's queue cache.
+func (wp *Pool) ForgetContainer(uuid string) {
        wp.mtx.Lock()
        defer wp.mtx.Unlock()
-       if _, ok := wkr.running[uuid]; ok {
-               delete(wkr.running, uuid)
-               wkr.updated = time.Now()
-               go wp.notify()
+       if _, ok := wp.exited[uuid]; ok {
+               wp.logger.WithField("ContainerUUID", uuid).Debug("clearing placeholder for exited crunch-run process")
+               delete(wp.exited, uuid)
        }
 }
 
@@ -460,13 +599,6 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
        if reg == nil {
                reg = prometheus.NewRegistry()
        }
-       wp.mInstances = prometheus.NewGauge(prometheus.GaugeOpts{
-               Namespace: "arvados",
-               Subsystem: "dispatchcloud",
-               Name:      "instances_total",
-               Help:      "Number of cloud VMs including pending, booting, running, held, and shutting down.",
-       })
-       reg.MustRegister(wp.mInstances)
        wp.mContainersRunning = prometheus.NewGauge(prometheus.GaugeOpts{
                Namespace: "arvados",
                Subsystem: "dispatchcloud",
@@ -474,40 +606,100 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                Help:      "Number of containers reported running by cloud VMs.",
        })
        reg.MustRegister(wp.mContainersRunning)
-
-       wp.mVCPUs = prometheus.NewGauge(prometheus.GaugeOpts{
+       wp.mInstances = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "instances_total",
+               Help:      "Number of cloud VMs.",
+       }, []string{"category", "instance_type"})
+       reg.MustRegister(wp.mInstances)
+       wp.mInstancesPrice = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "instances_price",
+               Help:      "Price of cloud VMs.",
+       }, []string{"category"})
+       reg.MustRegister(wp.mInstancesPrice)
+       wp.mVCPUs = prometheus.NewGaugeVec(prometheus.GaugeOpts{
                Namespace: "arvados",
                Subsystem: "dispatchcloud",
                Name:      "vcpus_total",
                Help:      "Total VCPUs on all cloud VMs.",
-       })
+       }, []string{"category"})
        reg.MustRegister(wp.mVCPUs)
-       wp.mVCPUsInuse = prometheus.NewGauge(prometheus.GaugeOpts{
-               Namespace: "arvados",
-               Subsystem: "dispatchcloud",
-               Name:      "vcpus_inuse",
-               Help:      "VCPUs on cloud VMs that are running containers.",
-       })
-       reg.MustRegister(wp.mVCPUsInuse)
-       wp.mMemory = prometheus.NewGauge(prometheus.GaugeOpts{
+       wp.mMemory = prometheus.NewGaugeVec(prometheus.GaugeOpts{
                Namespace: "arvados",
                Subsystem: "dispatchcloud",
                Name:      "memory_bytes_total",
                Help:      "Total memory on all cloud VMs.",
-       })
+       }, []string{"category"})
        reg.MustRegister(wp.mMemory)
-       wp.mMemoryInuse = prometheus.NewGauge(prometheus.GaugeOpts{
+       wp.mBootOutcomes = prometheus.NewCounterVec(prometheus.CounterOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "boot_outcomes",
+               Help:      "Boot outcomes by type.",
+       }, []string{"outcome"})
+       for k := range validBootOutcomes {
+               wp.mBootOutcomes.WithLabelValues(string(k)).Add(0)
+       }
+       reg.MustRegister(wp.mBootOutcomes)
+       wp.mDisappearances = prometheus.NewCounterVec(prometheus.CounterOpts{
                Namespace: "arvados",
                Subsystem: "dispatchcloud",
-               Name:      "memory_bytes_inuse",
-               Help:      "Memory on cloud VMs that are running containers.",
+               Name:      "instances_disappeared",
+               Help:      "Number of occurrences of an instance disappearing from the cloud provider's list of instances.",
+       }, []string{"state"})
+       for _, v := range stateString {
+               wp.mDisappearances.WithLabelValues(v).Add(0)
+       }
+       reg.MustRegister(wp.mDisappearances)
+       wp.mTimeToSSH = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_time_to_ssh_seconds",
+               Help:       "Number of seconds between instance creation and the first successful SSH connection.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeToSSH)
+       wp.mTimeToReadyForContainer = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_time_to_ready_for_container_seconds",
+               Help:       "Number of seconds between the first successful SSH connection and ready to run a container.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
        })
-       reg.MustRegister(wp.mMemoryInuse)
+       reg.MustRegister(wp.mTimeToReadyForContainer)
+       wp.mTimeFromShutdownToGone = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_time_from_shutdown_request_to_disappearance_seconds",
+               Help:       "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeFromShutdownToGone)
+       wp.mTimeFromQueueToCrunchRun = prometheus.NewSummary(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "containers_time_from_queue_to_crunch_run_seconds",
+               Help:       "Number of seconds between the queuing of a container and the start of crunch-run.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       })
+       reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
+       wp.mRunProbeDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
+               Namespace:  "arvados",
+               Subsystem:  "dispatchcloud",
+               Name:       "instances_run_probe_duration_seconds",
+               Help:       "Number of seconds per runProbe call.",
+               Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+       }, []string{"outcome"})
+       reg.MustRegister(wp.mRunProbeDuration)
 }
 
 func (wp *Pool) runMetrics() {
        ch := wp.Subscribe()
        defer wp.Unsubscribe(ch)
+       wp.updateMetrics()
        for range ch {
                wp.updateMetrics()
        }
@@ -517,23 +709,50 @@ func (wp *Pool) updateMetrics() {
        wp.mtx.RLock()
        defer wp.mtx.RUnlock()
 
-       var alloc, cpu, cpuInuse, mem, memInuse int64
+       type entKey struct {
+               cat      string
+               instType string
+       }
+       instances := map[entKey]int64{}
+       price := map[string]float64{}
+       cpu := map[string]int64{}
+       mem := map[string]int64{}
+       var running int64
        for _, wkr := range wp.workers {
-               cpu += int64(wkr.instType.VCPUs)
-               mem += int64(wkr.instType.RAM)
-               if len(wkr.running)+len(wkr.starting) == 0 {
-                       continue
+               var cat string
+               switch {
+               case len(wkr.running)+len(wkr.starting) > 0:
+                       cat = "inuse"
+               case wkr.idleBehavior == IdleBehaviorHold:
+                       cat = "hold"
+               case wkr.state == StateBooting:
+                       cat = "booting"
+               case wkr.state == StateUnknown:
+                       cat = "unknown"
+               default:
+                       cat = "idle"
+               }
+               instances[entKey{cat, wkr.instType.Name}]++
+               price[cat] += wkr.instType.Price
+               cpu[cat] += int64(wkr.instType.VCPUs)
+               mem[cat] += int64(wkr.instType.RAM)
+               running += int64(len(wkr.running) + len(wkr.starting))
+       }
+       for _, cat := range []string{"inuse", "hold", "booting", "unknown", "idle"} {
+               wp.mInstancesPrice.WithLabelValues(cat).Set(price[cat])
+               wp.mVCPUs.WithLabelValues(cat).Set(float64(cpu[cat]))
+               wp.mMemory.WithLabelValues(cat).Set(float64(mem[cat]))
+               // make sure to reset gauges for non-existing category/nodetype combinations
+               for _, it := range wp.instanceTypes {
+                       if _, ok := instances[entKey{cat, it.Name}]; !ok {
+                               wp.mInstances.WithLabelValues(cat, it.Name).Set(float64(0))
+                       }
                }
-               alloc += int64(len(wkr.running) + len(wkr.starting))
-               cpuInuse += int64(wkr.instType.VCPUs)
-               memInuse += int64(wkr.instType.RAM)
        }
-       wp.mInstances.Set(float64(len(wp.workers)))
-       wp.mContainersRunning.Set(float64(alloc))
-       wp.mVCPUs.Set(float64(cpu))
-       wp.mMemory.Set(float64(mem))
-       wp.mVCPUsInuse.Set(float64(cpuInuse))
-       wp.mMemoryInuse.Set(float64(memInuse))
+       for k, v := range instances {
+               wp.mInstances.WithLabelValues(k.cat, k.instType).Set(float64(v))
+       }
+       wp.mContainersRunning.Set(float64(running))
 }
 
 func (wp *Pool) runProbes() {
@@ -552,7 +771,7 @@ func (wp *Pool) runProbes() {
                workers = workers[:0]
                wp.mtx.Lock()
                for id, wkr := range wp.workers {
-                       if wkr.state == StateShutdown || wp.shutdownIfIdle(wkr) {
+                       if wkr.state == StateShutdown || wkr.shutdownIfIdle() {
                                continue
                        }
                        workers = append(workers, id)
@@ -563,20 +782,12 @@ func (wp *Pool) runProbes() {
                        wp.mtx.Lock()
                        wkr, ok := wp.workers[id]
                        wp.mtx.Unlock()
-                       if !ok || wkr.state == StateShutdown {
-                               // Deleted/shutdown while we
-                               // were probing others
+                       if !ok {
+                               // Deleted while we were probing
+                               // others
                                continue
                        }
-                       select {
-                       case wkr.probing <- struct{}{}:
-                               go func() {
-                                       wp.probeAndUpdate(wkr)
-                                       <-wkr.probing
-                               }()
-                       default:
-                               wp.logger.WithField("Instance", wkr.instance).Debug("still waiting for last probe to finish")
-                       }
+                       go wkr.ProbeAndUpdate()
                        select {
                        case <-wp.stop:
                                return
@@ -605,45 +816,6 @@ func (wp *Pool) runSync() {
        }
 }
 
-// caller must have lock.
-func (wp *Pool) shutdownIfBroken(wkr *worker, dur time.Duration) {
-       if wkr.state == StateHold {
-               return
-       }
-       label, threshold := "", wp.timeoutProbe
-       if wkr.state == StateBooting {
-               label, threshold = "new ", wp.timeoutBooting
-       }
-       if dur < threshold {
-               return
-       }
-       wp.logger.WithFields(logrus.Fields{
-               "Instance": wkr.instance,
-               "Duration": dur,
-               "Since":    wkr.probed,
-               "State":    wkr.state,
-       }).Warnf("%sinstance unresponsive, shutting down", label)
-       wp.shutdown(wkr, wp.logger)
-}
-
-// caller must have lock.
-func (wp *Pool) shutdownIfIdle(wkr *worker) bool {
-       if len(wkr.running)+len(wkr.starting) > 0 || wkr.state != StateRunning {
-               return false
-       }
-       age := time.Since(wkr.busy)
-       if age < wp.timeoutIdle {
-               return false
-       }
-       logger := wp.logger.WithFields(logrus.Fields{
-               "Age":      age,
-               "Instance": wkr.instance,
-       })
-       logger.Info("shutdown idle worker")
-       wp.shutdown(wkr, logger)
-       return true
-}
-
 // Stop synchronizing with the InstanceSet.
 func (wp *Pool) Stop() {
        wp.setupOnce.Do(wp.setup)
@@ -658,27 +830,72 @@ func (wp *Pool) Instances() []InstanceView {
        wp.mtx.Lock()
        for _, w := range wp.workers {
                r = append(r, InstanceView{
-                       Instance:             w.instance.String(),
+                       Instance:             w.instance.ID(),
+                       Address:              w.instance.Address(),
                        Price:                w.instType.Price,
                        ArvadosInstanceType:  w.instType.Name,
                        ProviderInstanceType: w.instType.ProviderType,
                        LastContainerUUID:    w.lastUUID,
                        LastBusy:             w.busy,
                        WorkerState:          w.state.String(),
+                       IdleBehavior:         w.idleBehavior,
                })
        }
        wp.mtx.Unlock()
        sort.Slice(r, func(i, j int) bool {
-               return strings.Compare(r[i].Instance, r[j].Instance) < 0
+               return strings.Compare(string(r[i].Instance), string(r[j].Instance)) < 0
        })
        return r
 }
 
+// KillInstance destroys a cloud VM instance. It returns an error if
+// the given instance does not exist.
+func (wp *Pool) KillInstance(id cloud.InstanceID, reason string) error {
+       wkr, ok := wp.workers[id]
+       if !ok {
+               return errors.New("instance not found")
+       }
+       wkr.logger.WithField("Reason", reason).Info("shutting down")
+       wkr.reportBootOutcome(BootOutcomeAborted)
+       wkr.shutdown()
+       return nil
+}
+
 func (wp *Pool) setup() {
-       wp.creating = map[arvados.InstanceType]int{}
+       wp.creating = map[string]createCall{}
        wp.exited = map[string]time.Time{}
        wp.workers = map[cloud.InstanceID]*worker{}
        wp.subscribers = map[<-chan struct{}]chan<- struct{}{}
+       wp.loadRunnerData()
+}
+
+// Load the runner program to be deployed on worker nodes into
+// wp.runnerData, if necessary. Errors are logged.
+//
+// If auto-deploy is disabled, len(wp.runnerData) will be 0.
+//
+// Caller must not have lock.
+func (wp *Pool) loadRunnerData() error {
+       wp.mtx.Lock()
+       defer wp.mtx.Unlock()
+       if wp.runnerData != nil {
+               return nil
+       } else if wp.runnerSource == "" {
+               wp.runnerCmd = "crunch-run"
+               wp.runnerData = []byte{}
+               return nil
+       }
+       logger := wp.logger.WithField("source", wp.runnerSource)
+       logger.Debug("loading runner")
+       buf, err := ioutil.ReadFile(wp.runnerSource)
+       if err != nil {
+               logger.WithError(err).Error("failed to load runner program")
+               return err
+       }
+       wp.runnerData = buf
+       wp.runnerMD5 = md5.Sum(buf)
+       wp.runnerCmd = fmt.Sprintf("/var/lib/arvados/crunch-run~%x", wp.runnerMD5)
+       return nil
 }
 
 func (wp *Pool) notify() {
@@ -694,10 +911,14 @@ func (wp *Pool) notify() {
 
 func (wp *Pool) getInstancesAndSync() error {
        wp.setupOnce.Do(wp.setup)
+       if err := wp.instanceSet.throttleInstances.Error(); err != nil {
+               return err
+       }
        wp.logger.Debug("getting instance list")
        threshold := time.Now()
-       instances, err := wp.instanceSet.Instances(cloud.InstanceTags{})
+       instances, err := wp.instanceSet.Instances(cloud.InstanceTags{wp.tagKeyPrefix + tagKeyInstanceSetID: string(wp.instanceSetID)})
        if err != nil {
+               wp.instanceSet.throttleInstances.CheckRateLimitError(err, wp.logger, "list instances", wp.notify)
                return err
        }
        wp.sync(threshold, instances)
@@ -715,14 +936,17 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
        notify := false
 
        for _, inst := range instances {
-               itTag := inst.Tags()[tagKeyInstanceType]
+               itTag := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceType]
                it, ok := wp.instanceTypes[itTag]
                if !ok {
-                       wp.logger.WithField("Instance", inst).Errorf("unknown InstanceType tag %q --- ignoring", itTag)
+                       wp.logger.WithField("Instance", inst.ID()).Errorf("unknown InstanceType tag %q --- ignoring", itTag)
                        continue
                }
-               if wp.updateWorker(inst, it, StateUnknown) {
+               if wkr, isNew := wp.updateWorker(inst, it); isNew {
                        notify = true
+               } else if wkr.state == StateShutdown && time.Since(wkr.destroyed) > wp.timeoutShutdown {
+                       wp.logger.WithField("Instance", inst.ID()).Info("worker still listed after shutdown; retrying")
+                       wkr.shutdown()
                }
        }
 
@@ -731,16 +955,25 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
                        continue
                }
                logger := wp.logger.WithFields(logrus.Fields{
-                       "Instance":    wkr.instance,
+                       "Instance":    wkr.instance.ID(),
                        "WorkerState": wkr.state,
                })
                logger.Info("instance disappeared in cloud")
+               wkr.reportBootOutcome(BootOutcomeDisappeared)
+               if wp.mDisappearances != nil {
+                       wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
+               }
+               // wkr.destroyed.IsZero() can happen if instance disappeared but we weren't trying to shut it down
+               if wp.mTimeFromShutdownToGone != nil && !wkr.destroyed.IsZero() {
+                       wp.mTimeFromShutdownToGone.Observe(time.Now().Sub(wkr.destroyed).Seconds())
+               }
                delete(wp.workers, id)
-               go wkr.executor.Close()
+               go wkr.Close()
                notify = true
        }
 
        if !wp.loaded {
+               notify = true
                wp.loaded = true
                wp.logger.WithField("N", len(wp.workers)).Info("loaded initial instance list")
        }
@@ -750,139 +983,30 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
        }
 }
 
-// should be called in a new goroutine
-func (wp *Pool) probeAndUpdate(wkr *worker) {
-       logger := wp.logger.WithField("Instance", wkr.instance)
-       wp.mtx.Lock()
-       updated := wkr.updated
-       needProbeRunning := wkr.state == StateRunning
-       needProbeBooted := wkr.state == StateUnknown || wkr.state == StateBooting
-       wp.mtx.Unlock()
-       if !needProbeBooted && !needProbeRunning {
-               return
-       }
-
-       var (
-               ctrUUIDs []string
-               ok       bool
-               stderr   []byte
-       )
-       if needProbeBooted {
-               ok, stderr = wp.probeBooted(wkr)
-               wp.mtx.Lock()
-               if ok && (wkr.state == StateUnknown || wkr.state == StateBooting) {
-                       wkr.state = StateRunning
-                       wkr.probed = time.Now()
-                       logger.Info("instance booted")
-                       go wp.notify()
-               }
-               needProbeRunning = wkr.state == StateRunning
-               wp.mtx.Unlock()
-       }
-       if needProbeRunning {
-               ctrUUIDs, ok, stderr = wp.probeRunning(wkr)
-       }
-       logger = logger.WithField("stderr", string(stderr))
-       wp.mtx.Lock()
-       defer wp.mtx.Unlock()
-       if !ok {
-               if wkr.state == StateShutdown {
-                       // Skip the logging noise if shutdown was
-                       // initiated during probe.
-                       return
-               }
-               dur := time.Since(wkr.probed)
-               logger := logger.WithFields(logrus.Fields{
-                       "Duration": dur,
-                       "State":    wkr.state,
-               })
-               if wkr.state == StateBooting {
-                       logger.Debug("new instance not responding")
-               } else {
-                       logger.Info("instance not responding")
-               }
-               wp.shutdownIfBroken(wkr, dur)
-               return
-       }
-
-       updateTime := time.Now()
-       wkr.probed = updateTime
-
-       if updated != wkr.updated {
-               // Worker was updated after the probe began, so
-               // wkr.running might have a container UUID that was
-               // not yet running when ctrUUIDs was generated. Leave
-               // wkr.running alone and wait for the next probe to
-               // catch up on any changes.
-               return
-       }
-
-       if len(ctrUUIDs) > 0 {
-               wkr.busy = updateTime
-               wkr.lastUUID = ctrUUIDs[0]
-       } else if len(wkr.running) > 0 {
-               // Actual last-busy time was sometime between wkr.busy
-               // and now. Now is the earliest opportunity to take
-               // advantage of the non-busy state, though.
-               wkr.busy = updateTime
-       }
-       running := map[string]struct{}{}
-       changed := false
-       for _, uuid := range ctrUUIDs {
-               running[uuid] = struct{}{}
-               if _, ok := wkr.running[uuid]; !ok {
-                       changed = true
-               }
-       }
-       for uuid := range wkr.running {
-               if _, ok := running[uuid]; !ok {
-                       logger.WithField("ContainerUUID", uuid).Info("crunch-run process ended")
-                       wp.exited[uuid] = updateTime
-                       changed = true
-               }
-       }
-       if changed {
-               wkr.running = running
-               wkr.updated = updateTime
-               go wp.notify()
+func (wp *Pool) waitUntilLoaded() {
+       ch := wp.Subscribe()
+       wp.mtx.RLock()
+       defer wp.mtx.RUnlock()
+       for !wp.loaded {
+               wp.mtx.RUnlock()
+               <-ch
+               wp.mtx.RLock()
        }
 }
 
-func (wp *Pool) probeRunning(wkr *worker) (running []string, ok bool, stderr []byte) {
-       cmd := "crunch-run --list"
-       stdout, stderr, err := wkr.executor.Execute(cmd, nil)
-       if err != nil {
-               wp.logger.WithFields(logrus.Fields{
-                       "Instance": wkr.instance,
-                       "Command":  cmd,
-                       "stdout":   string(stdout),
-                       "stderr":   string(stderr),
-               }).WithError(err).Warn("probe failed")
-               return nil, false, stderr
-       }
-       stdout = bytes.TrimRight(stdout, "\n")
-       if len(stdout) == 0 {
-               return nil, true, stderr
-       }
-       return strings.Split(string(stdout), "\n"), true, stderr
+func (wp *Pool) gatewayAuthSecret(uuid string) string {
+       h := hmac.New(sha256.New, []byte(wp.systemRootToken))
+       fmt.Fprint(h, "%s", uuid)
+       return fmt.Sprintf("%x", h.Sum(nil))
 }
 
-func (wp *Pool) probeBooted(wkr *worker) (ok bool, stderr []byte) {
-       cmd := wp.bootProbeCommand
-       if cmd == "" {
-               cmd = "true"
-       }
-       stdout, stderr, err := wkr.executor.Execute(cmd, nil)
-       logger := wp.logger.WithFields(logrus.Fields{
-               "Instance": wkr.instance,
-               "Command":  cmd,
-               "stdout":   string(stdout),
-               "stderr":   string(stderr),
-       })
+// Return a random string of n hexadecimal digits (n*4 random bits). n
+// must be even.
+func randomHex(n int) string {
+       buf := make([]byte, n/2)
+       _, err := rand.Read(buf)
        if err != nil {
-               logger.WithError(err).Debug("boot probe failed")
-               return false, stderr
+               panic(err)
        }
-       logger.Info("boot probe succeeded")
-       return true, stderr
+       return fmt.Sprintf("%x", buf)
 }