Merge branch '19954-permission-dedup-doc'
[arvados.git] / lib / dispatchcloud / worker / pool.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package worker
6
7 import (
8         "crypto/hmac"
9         "crypto/md5"
10         "crypto/rand"
11         "crypto/sha256"
12         "errors"
13         "fmt"
14         "io"
15         "io/ioutil"
16         mathrand "math/rand"
17         "sort"
18         "strings"
19         "sync"
20         "time"
21
22         "git.arvados.org/arvados.git/lib/cloud"
23         "git.arvados.org/arvados.git/sdk/go/arvados"
24         "github.com/prometheus/client_golang/prometheus"
25         "github.com/sirupsen/logrus"
26         "golang.org/x/crypto/ssh"
27 )
28
29 const (
30         tagKeyInstanceType   = "InstanceType"
31         tagKeyIdleBehavior   = "IdleBehavior"
32         tagKeyInstanceSecret = "InstanceSecret"
33         tagKeyInstanceSetID  = "InstanceSetID"
34 )
35
36 // An InstanceView shows a worker's current state and recent activity.
37 type InstanceView struct {
38         Instance             cloud.InstanceID `json:"instance"`
39         Address              string           `json:"address"`
40         Price                float64          `json:"price"`
41         ArvadosInstanceType  string           `json:"arvados_instance_type"`
42         ProviderInstanceType string           `json:"provider_instance_type"`
43         LastContainerUUID    string           `json:"last_container_uuid"`
44         LastBusy             time.Time        `json:"last_busy"`
45         WorkerState          string           `json:"worker_state"`
46         IdleBehavior         IdleBehavior     `json:"idle_behavior"`
47 }
48
49 // An Executor executes shell commands on a remote host.
50 type Executor interface {
51         // Run cmd on the current target.
52         Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error)
53
54         // Use the given target for subsequent operations. The new
55         // target is the same host as the previous target, but it
56         // might return a different address and verify a different
57         // host key.
58         //
59         // SetTarget is called frequently, and in most cases the new
60         // target will behave exactly the same as the old one. An
61         // implementation should optimize accordingly.
62         //
63         // SetTarget must not block on concurrent Execute calls.
64         SetTarget(cloud.ExecutorTarget)
65
66         Close()
67 }
68
69 const (
70         defaultSyncInterval        = time.Minute
71         defaultProbeInterval       = time.Second * 10
72         defaultMaxProbesPerSecond  = 10
73         defaultTimeoutIdle         = time.Minute
74         defaultTimeoutBooting      = time.Minute * 10
75         defaultTimeoutProbe        = time.Minute * 10
76         defaultTimeoutShutdown     = time.Second * 10
77         defaultTimeoutTERM         = time.Minute * 2
78         defaultTimeoutSignal       = time.Second * 5
79         defaultTimeoutStaleRunLock = time.Second * 5
80
81         // Time after a quota error to try again anyway, even if no
82         // instances have been shutdown.
83         quotaErrorTTL = time.Minute
84
85         // Time between "X failed because rate limiting" messages
86         logRateLimitErrorInterval = time.Second * 10
87 )
88
89 func duration(conf arvados.Duration, def time.Duration) time.Duration {
90         if conf > 0 {
91                 return time.Duration(conf)
92         }
93         return def
94 }
95
96 // NewPool creates a Pool of workers backed by instanceSet.
97 //
98 // New instances are configured and set up according to the given
99 // cluster configuration.
100 func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *prometheus.Registry, instanceSetID cloud.InstanceSetID, instanceSet cloud.InstanceSet, newExecutor func(cloud.Instance) Executor, installPublicKey ssh.PublicKey, cluster *arvados.Cluster) *Pool {
101         wp := &Pool{
102                 logger:                         logger,
103                 arvClient:                      arvClient,
104                 instanceSetID:                  instanceSetID,
105                 instanceSet:                    &throttledInstanceSet{InstanceSet: instanceSet},
106                 newExecutor:                    newExecutor,
107                 cluster:                        cluster,
108                 bootProbeCommand:               cluster.Containers.CloudVMs.BootProbeCommand,
109                 runnerSource:                   cluster.Containers.CloudVMs.DeployRunnerBinary,
110                 imageID:                        cloud.ImageID(cluster.Containers.CloudVMs.ImageID),
111                 instanceTypes:                  cluster.InstanceTypes,
112                 maxProbesPerSecond:             cluster.Containers.CloudVMs.MaxProbesPerSecond,
113                 maxConcurrentInstanceCreateOps: cluster.Containers.CloudVMs.MaxConcurrentInstanceCreateOps,
114                 maxInstances:                   cluster.Containers.CloudVMs.MaxInstances,
115                 probeInterval:                  duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
116                 syncInterval:                   duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
117                 timeoutIdle:                    duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
118                 timeoutBooting:                 duration(cluster.Containers.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
119                 timeoutProbe:                   duration(cluster.Containers.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
120                 timeoutShutdown:                duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
121                 timeoutTERM:                    duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
122                 timeoutSignal:                  duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
123                 timeoutStaleRunLock:            duration(cluster.Containers.CloudVMs.TimeoutStaleRunLock, defaultTimeoutStaleRunLock),
124                 systemRootToken:                cluster.SystemRootToken,
125                 installPublicKey:               installPublicKey,
126                 tagKeyPrefix:                   cluster.Containers.CloudVMs.TagKeyPrefix,
127                 runnerCmdDefault:               cluster.Containers.CrunchRunCommand,
128                 runnerArgs:                     append([]string{"--runtime-engine=" + cluster.Containers.RuntimeEngine}, cluster.Containers.CrunchRunArgumentsList...),
129                 stop:                           make(chan bool),
130         }
131         wp.registerMetrics(reg)
132         go func() {
133                 wp.setupOnce.Do(wp.setup)
134                 go wp.runMetrics()
135                 go wp.runProbes()
136                 go wp.runSync()
137         }()
138         return wp
139 }
140
141 // Pool is a resizable worker pool backed by a cloud.InstanceSet. A
142 // zero Pool should not be used. Call NewPool to create a new Pool.
143 type Pool struct {
144         // configuration
145         logger                         logrus.FieldLogger
146         arvClient                      *arvados.Client
147         instanceSetID                  cloud.InstanceSetID
148         instanceSet                    *throttledInstanceSet
149         newExecutor                    func(cloud.Instance) Executor
150         cluster                        *arvados.Cluster
151         bootProbeCommand               string
152         runnerSource                   string
153         imageID                        cloud.ImageID
154         instanceTypes                  map[string]arvados.InstanceType
155         syncInterval                   time.Duration
156         probeInterval                  time.Duration
157         maxProbesPerSecond             int
158         maxConcurrentInstanceCreateOps int
159         maxInstances                   int
160         timeoutIdle                    time.Duration
161         timeoutBooting                 time.Duration
162         timeoutProbe                   time.Duration
163         timeoutShutdown                time.Duration
164         timeoutTERM                    time.Duration
165         timeoutSignal                  time.Duration
166         timeoutStaleRunLock            time.Duration
167         systemRootToken                string
168         installPublicKey               ssh.PublicKey
169         tagKeyPrefix                   string
170         runnerCmdDefault               string   // crunch-run command to use if not deploying a binary
171         runnerArgs                     []string // extra args passed to crunch-run
172
173         // private state
174         subscribers  map[<-chan struct{}]chan<- struct{}
175         creating     map[string]createCall // unfinished (cloud.InstanceSet)Create calls (key is instance secret)
176         workers      map[cloud.InstanceID]*worker
177         loaded       bool                 // loaded list of instances from InstanceSet at least once
178         exited       map[string]time.Time // containers whose crunch-run proc has exited, but ForgetContainer has not been called
179         atQuotaUntil time.Time
180         atQuotaErr   cloud.QuotaError
181         stop         chan bool
182         mtx          sync.RWMutex
183         setupOnce    sync.Once
184         runnerData   []byte
185         runnerMD5    [md5.Size]byte
186         runnerCmd    string
187
188         mContainersRunning        prometheus.Gauge
189         mInstances                *prometheus.GaugeVec
190         mInstancesPrice           *prometheus.GaugeVec
191         mVCPUs                    *prometheus.GaugeVec
192         mMemory                   *prometheus.GaugeVec
193         mBootOutcomes             *prometheus.CounterVec
194         mDisappearances           *prometheus.CounterVec
195         mTimeToSSH                prometheus.Summary
196         mTimeToReadyForContainer  prometheus.Summary
197         mTimeFromShutdownToGone   prometheus.Summary
198         mTimeFromQueueToCrunchRun prometheus.Summary
199         mRunProbeDuration         *prometheus.SummaryVec
200 }
201
202 type createCall struct {
203         time         time.Time
204         instanceType arvados.InstanceType
205 }
206
207 func (wp *Pool) CheckHealth() error {
208         wp.setupOnce.Do(wp.setup)
209         if err := wp.loadRunnerData(); err != nil {
210                 return fmt.Errorf("error loading runner binary: %s", err)
211         }
212         return nil
213 }
214
215 // Subscribe returns a buffered channel that becomes ready after any
216 // change to the pool's state that could have scheduling implications:
217 // a worker's state changes, a new worker appears, the cloud
218 // provider's API rate limiting period ends, etc.
219 //
220 // Additional events that occur while the channel is already ready
221 // will be dropped, so it is OK if the caller services the channel
222 // slowly.
223 //
224 // Example:
225 //
226 //      ch := wp.Subscribe()
227 //      defer wp.Unsubscribe(ch)
228 //      for range ch {
229 //              tryScheduling(wp)
230 //              if done {
231 //                      break
232 //              }
233 //      }
234 func (wp *Pool) Subscribe() <-chan struct{} {
235         wp.setupOnce.Do(wp.setup)
236         wp.mtx.Lock()
237         defer wp.mtx.Unlock()
238         ch := make(chan struct{}, 1)
239         wp.subscribers[ch] = ch
240         return ch
241 }
242
243 // Unsubscribe stops sending updates to the given channel.
244 func (wp *Pool) Unsubscribe(ch <-chan struct{}) {
245         wp.setupOnce.Do(wp.setup)
246         wp.mtx.Lock()
247         defer wp.mtx.Unlock()
248         delete(wp.subscribers, ch)
249 }
250
251 // Unallocated returns the number of unallocated (creating + booting +
252 // idle + unknown) workers for each instance type.  Workers in
253 // hold/drain mode are not included.
254 func (wp *Pool) Unallocated() map[arvados.InstanceType]int {
255         wp.setupOnce.Do(wp.setup)
256         wp.mtx.RLock()
257         defer wp.mtx.RUnlock()
258         unalloc := map[arvados.InstanceType]int{}
259         creating := map[arvados.InstanceType]int{}
260         oldestCreate := map[arvados.InstanceType]time.Time{}
261         for _, cc := range wp.creating {
262                 it := cc.instanceType
263                 creating[it]++
264                 if t, ok := oldestCreate[it]; !ok || t.After(cc.time) {
265                         oldestCreate[it] = cc.time
266                 }
267         }
268         for _, wkr := range wp.workers {
269                 // Skip workers that are not expected to become
270                 // available soon. Note len(wkr.running)>0 is not
271                 // redundant here: it can be true even in
272                 // StateUnknown.
273                 if wkr.state == StateShutdown ||
274                         wkr.state == StateRunning ||
275                         wkr.idleBehavior != IdleBehaviorRun ||
276                         len(wkr.running) > 0 {
277                         continue
278                 }
279                 it := wkr.instType
280                 unalloc[it]++
281                 if wkr.state == StateUnknown && creating[it] > 0 && wkr.appeared.After(oldestCreate[it]) {
282                         // If up to N new workers appear in
283                         // Instances() while we are waiting for N
284                         // Create() calls to complete, we assume we're
285                         // just seeing a race between Instances() and
286                         // Create() responses.
287                         //
288                         // The other common reason why nodes have
289                         // state==Unknown is that they appeared at
290                         // startup, before any Create calls. They
291                         // don't match the above timing condition, so
292                         // we never mistakenly attribute them to
293                         // pending Create calls.
294                         creating[it]--
295                 }
296         }
297         for it, c := range creating {
298                 unalloc[it] += c
299         }
300         return unalloc
301 }
302
303 // Create a new instance with the given type, and add it to the worker
304 // pool. The worker is added immediately; instance creation runs in
305 // the background.
306 //
307 // Create returns false if a pre-existing error or a configuration
308 // setting prevents it from even attempting to create a new
309 // instance. Those errors are logged by the Pool, so the caller does
310 // not need to log anything in such cases.
311 func (wp *Pool) Create(it arvados.InstanceType) bool {
312         logger := wp.logger.WithField("InstanceType", it.Name)
313         wp.setupOnce.Do(wp.setup)
314         if wp.loadRunnerData() != nil {
315                 // Boot probe is certain to fail.
316                 return false
317         }
318         wp.mtx.Lock()
319         defer wp.mtx.Unlock()
320         if time.Now().Before(wp.atQuotaUntil) ||
321                 wp.instanceSet.throttleCreate.Error() != nil ||
322                 (wp.maxInstances > 0 && wp.maxInstances <= len(wp.workers)+len(wp.creating)) {
323                 return false
324         }
325         // The maxConcurrentInstanceCreateOps knob throttles the number of node create
326         // requests in flight. It was added to work around a limitation in Azure's
327         // managed disks, which support no more than 20 concurrent node creation
328         // requests from a single disk image (cf.
329         // https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image).
330         // The code assumes that node creation, from Azure's perspective, means the
331         // period until the instance appears in the "get all instances" list.
332         if wp.maxConcurrentInstanceCreateOps > 0 && len(wp.creating) >= wp.maxConcurrentInstanceCreateOps {
333                 logger.Info("reached MaxConcurrentInstanceCreateOps")
334                 wp.instanceSet.throttleCreate.ErrorUntil(errors.New("reached MaxConcurrentInstanceCreateOps"), time.Now().Add(5*time.Second), wp.notify)
335                 return false
336         }
337         now := time.Now()
338         secret := randomHex(instanceSecretLength)
339         wp.creating[secret] = createCall{time: now, instanceType: it}
340         go func() {
341                 defer wp.notify()
342                 tags := cloud.InstanceTags{
343                         wp.tagKeyPrefix + tagKeyInstanceSetID:  string(wp.instanceSetID),
344                         wp.tagKeyPrefix + tagKeyInstanceType:   it.Name,
345                         wp.tagKeyPrefix + tagKeyIdleBehavior:   string(IdleBehaviorRun),
346                         wp.tagKeyPrefix + tagKeyInstanceSecret: secret,
347                 }
348                 initCmd := TagVerifier{nil, secret, nil}.InitCommand()
349                 inst, err := wp.instanceSet.Create(it, wp.imageID, tags, initCmd, wp.installPublicKey)
350                 wp.mtx.Lock()
351                 defer wp.mtx.Unlock()
352                 // delete() is deferred so the updateWorker() call
353                 // below knows to use StateBooting when adding a new
354                 // worker.
355                 defer delete(wp.creating, secret)
356                 if err != nil {
357                         if err, ok := err.(cloud.QuotaError); ok && err.IsQuotaError() {
358                                 wp.atQuotaErr = err
359                                 wp.atQuotaUntil = time.Now().Add(quotaErrorTTL)
360                                 time.AfterFunc(quotaErrorTTL, wp.notify)
361                         }
362                         logger.WithError(err).Error("create failed")
363                         wp.instanceSet.throttleCreate.CheckRateLimitError(err, wp.logger, "create instance", wp.notify)
364                         return
365                 }
366                 wp.updateWorker(inst, it)
367         }()
368         if len(wp.creating)+len(wp.workers) == wp.maxInstances {
369                 logger.Infof("now at MaxInstances limit of %d instances", wp.maxInstances)
370         }
371         return true
372 }
373
374 // AtQuota returns true if Create is not expected to work at the
375 // moment (e.g., cloud provider has reported quota errors, or we are
376 // already at our own configured quota).
377 func (wp *Pool) AtQuota() bool {
378         wp.mtx.Lock()
379         defer wp.mtx.Unlock()
380         return time.Now().Before(wp.atQuotaUntil) || (wp.maxInstances > 0 && wp.maxInstances <= len(wp.workers)+len(wp.creating))
381 }
382
383 // SetIdleBehavior determines how the indicated instance will behave
384 // when it has no containers running.
385 func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior) error {
386         wp.mtx.Lock()
387         defer wp.mtx.Unlock()
388         wkr, ok := wp.workers[id]
389         if !ok {
390                 return errors.New("requested instance does not exist")
391         }
392         wkr.setIdleBehavior(idleBehavior)
393         return nil
394 }
395
396 // Successful connection to the SSH daemon, update the mTimeToSSH metric
397 func (wp *Pool) reportSSHConnected(inst cloud.Instance) {
398         wp.mtx.Lock()
399         defer wp.mtx.Unlock()
400         wkr := wp.workers[inst.ID()]
401         if wkr.state != StateBooting || !wkr.firstSSHConnection.IsZero() {
402                 // the node is not in booting state (can happen if a-d-c is restarted) OR
403                 // this is not the first SSH connection
404                 return
405         }
406
407         wkr.firstSSHConnection = time.Now()
408         if wp.mTimeToSSH != nil {
409                 wp.mTimeToSSH.Observe(wkr.firstSSHConnection.Sub(wkr.appeared).Seconds())
410         }
411 }
412
413 // Add or update worker attached to the given instance.
414 //
415 // The second return value is true if a new worker is created.
416 //
417 // A newly added instance has state=StateBooting if its tags match an
418 // entry in wp.creating, otherwise StateUnknown.
419 //
420 // Caller must have lock.
421 func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*worker, bool) {
422         secret := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceSecret]
423         inst = TagVerifier{Instance: inst, Secret: secret, ReportVerified: wp.reportSSHConnected}
424         id := inst.ID()
425         if wkr := wp.workers[id]; wkr != nil {
426                 wkr.executor.SetTarget(inst)
427                 wkr.instance = inst
428                 wkr.updated = time.Now()
429                 wkr.saveTags()
430                 return wkr, false
431         }
432
433         state := StateUnknown
434         if _, ok := wp.creating[secret]; ok {
435                 state = StateBooting
436         }
437
438         // If an instance has a valid IdleBehavior tag when it first
439         // appears, initialize the new worker accordingly (this is how
440         // we restore IdleBehavior that was set by a prior dispatch
441         // process); otherwise, default to "run". After this,
442         // wkr.idleBehavior is the source of truth, and will only be
443         // changed via SetIdleBehavior().
444         idleBehavior := IdleBehavior(inst.Tags()[wp.tagKeyPrefix+tagKeyIdleBehavior])
445         if !validIdleBehavior[idleBehavior] {
446                 idleBehavior = IdleBehaviorRun
447         }
448
449         logger := wp.logger.WithFields(logrus.Fields{
450                 "InstanceType": it.Name,
451                 "Instance":     inst.ID(),
452                 "Address":      inst.Address(),
453         })
454         logger.WithFields(logrus.Fields{
455                 "State":        state,
456                 "IdleBehavior": idleBehavior,
457         }).Infof("instance appeared in cloud")
458         now := time.Now()
459         wkr := &worker{
460                 mtx:          &wp.mtx,
461                 wp:           wp,
462                 logger:       logger,
463                 executor:     wp.newExecutor(inst),
464                 state:        state,
465                 idleBehavior: idleBehavior,
466                 instance:     inst,
467                 instType:     it,
468                 appeared:     now,
469                 probed:       now,
470                 busy:         now,
471                 updated:      now,
472                 running:      make(map[string]*remoteRunner),
473                 starting:     make(map[string]*remoteRunner),
474                 probing:      make(chan struct{}, 1),
475         }
476         wp.workers[id] = wkr
477         return wkr, true
478 }
479
480 // Shutdown shuts down a worker with the given type, or returns false
481 // if all workers with the given type are busy.
482 func (wp *Pool) Shutdown(it arvados.InstanceType) bool {
483         wp.setupOnce.Do(wp.setup)
484         wp.mtx.Lock()
485         defer wp.mtx.Unlock()
486         logger := wp.logger.WithField("InstanceType", it.Name)
487         logger.Info("shutdown requested")
488         for _, tryState := range []State{StateBooting, StateIdle} {
489                 // TODO: shutdown the worker with the longest idle
490                 // time (Idle) or the earliest create time (Booting)
491                 for _, wkr := range wp.workers {
492                         if wkr.idleBehavior != IdleBehaviorHold && wkr.state == tryState && wkr.instType == it {
493                                 logger.WithField("Instance", wkr.instance.ID()).Info("shutting down")
494                                 wkr.reportBootOutcome(BootOutcomeAborted)
495                                 wkr.shutdown()
496                                 return true
497                         }
498                 }
499         }
500         return false
501 }
502
503 // CountWorkers returns the current number of workers in each state.
504 //
505 // CountWorkers blocks, if necessary, until the initial instance list
506 // has been loaded from the cloud provider.
507 func (wp *Pool) CountWorkers() map[State]int {
508         wp.setupOnce.Do(wp.setup)
509         wp.waitUntilLoaded()
510         wp.mtx.Lock()
511         defer wp.mtx.Unlock()
512         r := map[State]int{}
513         for _, w := range wp.workers {
514                 r[w.state]++
515         }
516         return r
517 }
518
519 // Running returns the container UUIDs being prepared/run on workers.
520 //
521 // In the returned map, the time value indicates when the Pool
522 // observed that the container process had exited. A container that
523 // has not yet exited has a zero time value. The caller should use
524 // ForgetContainer() to garbage-collect the entries for exited
525 // containers.
526 func (wp *Pool) Running() map[string]time.Time {
527         wp.setupOnce.Do(wp.setup)
528         wp.mtx.Lock()
529         defer wp.mtx.Unlock()
530         r := map[string]time.Time{}
531         for _, wkr := range wp.workers {
532                 for uuid := range wkr.running {
533                         r[uuid] = time.Time{}
534                 }
535                 for uuid := range wkr.starting {
536                         r[uuid] = time.Time{}
537                 }
538         }
539         for uuid, exited := range wp.exited {
540                 r[uuid] = exited
541         }
542         return r
543 }
544
545 // StartContainer starts a container on an idle worker immediately if
546 // possible, otherwise returns false.
547 func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) bool {
548         wp.setupOnce.Do(wp.setup)
549         wp.mtx.Lock()
550         defer wp.mtx.Unlock()
551         var wkr *worker
552         for _, w := range wp.workers {
553                 if w.instType == it && w.state == StateIdle && w.idleBehavior == IdleBehaviorRun {
554                         if wkr == nil || w.busy.After(wkr.busy) {
555                                 wkr = w
556                         }
557                 }
558         }
559         if wkr == nil {
560                 return false
561         }
562         wkr.startContainer(ctr)
563         return true
564 }
565
566 // KillContainer kills the crunch-run process for the given container
567 // UUID, if it's running on any worker.
568 //
569 // KillContainer returns immediately; the act of killing the container
570 // takes some time, and runs in the background.
571 //
572 // KillContainer returns false if the container has already ended.
573 func (wp *Pool) KillContainer(uuid string, reason string) bool {
574         wp.mtx.Lock()
575         defer wp.mtx.Unlock()
576         logger := wp.logger.WithFields(logrus.Fields{
577                 "ContainerUUID": uuid,
578                 "Reason":        reason,
579         })
580         for _, wkr := range wp.workers {
581                 rr := wkr.running[uuid]
582                 if rr == nil {
583                         rr = wkr.starting[uuid]
584                 }
585                 if rr != nil {
586                         rr.Kill(reason)
587                         return true
588                 }
589         }
590         logger.Debug("cannot kill: already disappeared")
591         return false
592 }
593
594 // ForgetContainer clears the placeholder for the given exited
595 // container, so it isn't returned by subsequent calls to Running().
596 //
597 // ForgetContainer has no effect if the container has not yet exited.
598 //
599 // The "container exited at time T" placeholder (which necessitates
600 // ForgetContainer) exists to make it easier for the caller
601 // (scheduler) to distinguish a container that exited without
602 // finalizing its state from a container that exited too recently for
603 // its final state to have appeared in the scheduler's queue cache.
604 func (wp *Pool) ForgetContainer(uuid string) {
605         wp.mtx.Lock()
606         defer wp.mtx.Unlock()
607         if _, ok := wp.exited[uuid]; ok {
608                 wp.logger.WithField("ContainerUUID", uuid).Debug("clearing placeholder for exited crunch-run process")
609                 delete(wp.exited, uuid)
610         }
611 }
612
613 func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
614         if reg == nil {
615                 reg = prometheus.NewRegistry()
616         }
617         wp.mContainersRunning = prometheus.NewGauge(prometheus.GaugeOpts{
618                 Namespace: "arvados",
619                 Subsystem: "dispatchcloud",
620                 Name:      "containers_running",
621                 Help:      "Number of containers reported running by cloud VMs.",
622         })
623         reg.MustRegister(wp.mContainersRunning)
624         wp.mInstances = prometheus.NewGaugeVec(prometheus.GaugeOpts{
625                 Namespace: "arvados",
626                 Subsystem: "dispatchcloud",
627                 Name:      "instances_total",
628                 Help:      "Number of cloud VMs.",
629         }, []string{"category", "instance_type"})
630         reg.MustRegister(wp.mInstances)
631         wp.mInstancesPrice = prometheus.NewGaugeVec(prometheus.GaugeOpts{
632                 Namespace: "arvados",
633                 Subsystem: "dispatchcloud",
634                 Name:      "instances_price",
635                 Help:      "Price of cloud VMs.",
636         }, []string{"category"})
637         reg.MustRegister(wp.mInstancesPrice)
638         wp.mVCPUs = prometheus.NewGaugeVec(prometheus.GaugeOpts{
639                 Namespace: "arvados",
640                 Subsystem: "dispatchcloud",
641                 Name:      "vcpus_total",
642                 Help:      "Total VCPUs on all cloud VMs.",
643         }, []string{"category"})
644         reg.MustRegister(wp.mVCPUs)
645         wp.mMemory = prometheus.NewGaugeVec(prometheus.GaugeOpts{
646                 Namespace: "arvados",
647                 Subsystem: "dispatchcloud",
648                 Name:      "memory_bytes_total",
649                 Help:      "Total memory on all cloud VMs.",
650         }, []string{"category"})
651         reg.MustRegister(wp.mMemory)
652         wp.mBootOutcomes = prometheus.NewCounterVec(prometheus.CounterOpts{
653                 Namespace: "arvados",
654                 Subsystem: "dispatchcloud",
655                 Name:      "boot_outcomes",
656                 Help:      "Boot outcomes by type.",
657         }, []string{"outcome"})
658         for k := range validBootOutcomes {
659                 wp.mBootOutcomes.WithLabelValues(string(k)).Add(0)
660         }
661         reg.MustRegister(wp.mBootOutcomes)
662         wp.mDisappearances = prometheus.NewCounterVec(prometheus.CounterOpts{
663                 Namespace: "arvados",
664                 Subsystem: "dispatchcloud",
665                 Name:      "instances_disappeared",
666                 Help:      "Number of occurrences of an instance disappearing from the cloud provider's list of instances.",
667         }, []string{"state"})
668         for _, v := range stateString {
669                 wp.mDisappearances.WithLabelValues(v).Add(0)
670         }
671         reg.MustRegister(wp.mDisappearances)
672         wp.mTimeToSSH = prometheus.NewSummary(prometheus.SummaryOpts{
673                 Namespace:  "arvados",
674                 Subsystem:  "dispatchcloud",
675                 Name:       "instances_time_to_ssh_seconds",
676                 Help:       "Number of seconds between instance creation and the first successful SSH connection.",
677                 Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
678         })
679         reg.MustRegister(wp.mTimeToSSH)
680         wp.mTimeToReadyForContainer = prometheus.NewSummary(prometheus.SummaryOpts{
681                 Namespace:  "arvados",
682                 Subsystem:  "dispatchcloud",
683                 Name:       "instances_time_to_ready_for_container_seconds",
684                 Help:       "Number of seconds between the first successful SSH connection and ready to run a container.",
685                 Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
686         })
687         reg.MustRegister(wp.mTimeToReadyForContainer)
688         wp.mTimeFromShutdownToGone = prometheus.NewSummary(prometheus.SummaryOpts{
689                 Namespace:  "arvados",
690                 Subsystem:  "dispatchcloud",
691                 Name:       "instances_time_from_shutdown_request_to_disappearance_seconds",
692                 Help:       "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
693                 Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
694         })
695         reg.MustRegister(wp.mTimeFromShutdownToGone)
696         wp.mTimeFromQueueToCrunchRun = prometheus.NewSummary(prometheus.SummaryOpts{
697                 Namespace:  "arvados",
698                 Subsystem:  "dispatchcloud",
699                 Name:       "containers_time_from_queue_to_crunch_run_seconds",
700                 Help:       "Number of seconds between the queuing of a container and the start of crunch-run.",
701                 Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
702         })
703         reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
704         wp.mRunProbeDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
705                 Namespace:  "arvados",
706                 Subsystem:  "dispatchcloud",
707                 Name:       "instances_run_probe_duration_seconds",
708                 Help:       "Number of seconds per runProbe call.",
709                 Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
710         }, []string{"outcome"})
711         reg.MustRegister(wp.mRunProbeDuration)
712 }
713
714 func (wp *Pool) runMetrics() {
715         ch := wp.Subscribe()
716         defer wp.Unsubscribe(ch)
717         wp.updateMetrics()
718         for range ch {
719                 wp.updateMetrics()
720         }
721 }
722
723 func (wp *Pool) updateMetrics() {
724         wp.mtx.RLock()
725         defer wp.mtx.RUnlock()
726
727         type entKey struct {
728                 cat      string
729                 instType string
730         }
731         instances := map[entKey]int64{}
732         price := map[string]float64{}
733         cpu := map[string]int64{}
734         mem := map[string]int64{}
735         var running int64
736         for _, wkr := range wp.workers {
737                 var cat string
738                 switch {
739                 case len(wkr.running)+len(wkr.starting) > 0:
740                         cat = "inuse"
741                 case wkr.idleBehavior == IdleBehaviorHold:
742                         cat = "hold"
743                 case wkr.state == StateBooting:
744                         cat = "booting"
745                 case wkr.state == StateUnknown:
746                         cat = "unknown"
747                 default:
748                         cat = "idle"
749                 }
750                 instances[entKey{cat, wkr.instType.Name}]++
751                 price[cat] += wkr.instType.Price
752                 cpu[cat] += int64(wkr.instType.VCPUs)
753                 mem[cat] += int64(wkr.instType.RAM)
754                 running += int64(len(wkr.running) + len(wkr.starting))
755         }
756         for _, cat := range []string{"inuse", "hold", "booting", "unknown", "idle"} {
757                 wp.mInstancesPrice.WithLabelValues(cat).Set(price[cat])
758                 wp.mVCPUs.WithLabelValues(cat).Set(float64(cpu[cat]))
759                 wp.mMemory.WithLabelValues(cat).Set(float64(mem[cat]))
760                 // make sure to reset gauges for non-existing category/nodetype combinations
761                 for _, it := range wp.instanceTypes {
762                         if _, ok := instances[entKey{cat, it.Name}]; !ok {
763                                 wp.mInstances.WithLabelValues(cat, it.Name).Set(float64(0))
764                         }
765                 }
766         }
767         for k, v := range instances {
768                 wp.mInstances.WithLabelValues(k.cat, k.instType).Set(float64(v))
769         }
770         wp.mContainersRunning.Set(float64(running))
771 }
772
773 func (wp *Pool) runProbes() {
774         maxPPS := wp.maxProbesPerSecond
775         if maxPPS < 1 {
776                 maxPPS = defaultMaxProbesPerSecond
777         }
778         limitticker := time.NewTicker(time.Second / time.Duration(maxPPS))
779         defer limitticker.Stop()
780
781         probeticker := time.NewTicker(wp.probeInterval)
782         defer probeticker.Stop()
783
784         workers := []cloud.InstanceID{}
785         for range probeticker.C {
786                 // Add some jitter. Without this, if probeInterval is
787                 // a multiple of syncInterval and sync is
788                 // instantaneous (as with the loopback driver), the
789                 // first few probes race with sync operations and
790                 // don't update the workers.
791                 time.Sleep(time.Duration(mathrand.Int63n(int64(wp.probeInterval) / 23)))
792
793                 workers = workers[:0]
794                 wp.mtx.Lock()
795                 for id, wkr := range wp.workers {
796                         if wkr.state == StateShutdown || wkr.shutdownIfIdle() {
797                                 continue
798                         }
799                         workers = append(workers, id)
800                 }
801                 wp.mtx.Unlock()
802
803                 for _, id := range workers {
804                         wp.mtx.Lock()
805                         wkr, ok := wp.workers[id]
806                         wp.mtx.Unlock()
807                         if !ok {
808                                 // Deleted while we were probing
809                                 // others
810                                 continue
811                         }
812                         go wkr.ProbeAndUpdate()
813                         select {
814                         case <-wp.stop:
815                                 return
816                         case <-limitticker.C:
817                         }
818                 }
819         }
820 }
821
822 func (wp *Pool) runSync() {
823         // sync once immediately, then wait syncInterval, sync again,
824         // etc.
825         timer := time.NewTimer(1)
826         for {
827                 select {
828                 case <-timer.C:
829                         err := wp.getInstancesAndSync()
830                         if err != nil {
831                                 wp.logger.WithError(err).Warn("sync failed")
832                         }
833                         timer.Reset(wp.syncInterval)
834                 case <-wp.stop:
835                         wp.logger.Debug("worker.Pool stopped")
836                         return
837                 }
838         }
839 }
840
841 // Stop synchronizing with the InstanceSet.
842 func (wp *Pool) Stop() {
843         wp.setupOnce.Do(wp.setup)
844         close(wp.stop)
845 }
846
847 // Instances returns an InstanceView for each worker in the pool,
848 // summarizing its current state and recent activity.
849 func (wp *Pool) Instances() []InstanceView {
850         var r []InstanceView
851         wp.setupOnce.Do(wp.setup)
852         wp.mtx.Lock()
853         for _, w := range wp.workers {
854                 r = append(r, InstanceView{
855                         Instance:             w.instance.ID(),
856                         Address:              w.instance.Address(),
857                         Price:                w.instType.Price,
858                         ArvadosInstanceType:  w.instType.Name,
859                         ProviderInstanceType: w.instType.ProviderType,
860                         LastContainerUUID:    w.lastUUID,
861                         LastBusy:             w.busy,
862                         WorkerState:          w.state.String(),
863                         IdleBehavior:         w.idleBehavior,
864                 })
865         }
866         wp.mtx.Unlock()
867         sort.Slice(r, func(i, j int) bool {
868                 return strings.Compare(string(r[i].Instance), string(r[j].Instance)) < 0
869         })
870         return r
871 }
872
873 // KillInstance destroys a cloud VM instance. It returns an error if
874 // the given instance does not exist.
875 func (wp *Pool) KillInstance(id cloud.InstanceID, reason string) error {
876         wkr, ok := wp.workers[id]
877         if !ok {
878                 return errors.New("instance not found")
879         }
880         wkr.logger.WithField("Reason", reason).Info("shutting down")
881         wkr.reportBootOutcome(BootOutcomeAborted)
882         wkr.shutdown()
883         return nil
884 }
885
886 func (wp *Pool) setup() {
887         wp.creating = map[string]createCall{}
888         wp.exited = map[string]time.Time{}
889         wp.workers = map[cloud.InstanceID]*worker{}
890         wp.subscribers = map[<-chan struct{}]chan<- struct{}{}
891         wp.loadRunnerData()
892 }
893
894 // Load the runner program to be deployed on worker nodes into
895 // wp.runnerData, if necessary. Errors are logged.
896 //
897 // If auto-deploy is disabled, len(wp.runnerData) will be 0.
898 //
899 // Caller must not have lock.
900 func (wp *Pool) loadRunnerData() error {
901         wp.mtx.Lock()
902         defer wp.mtx.Unlock()
903         if wp.runnerData != nil {
904                 return nil
905         } else if wp.runnerSource == "" {
906                 wp.runnerCmd = wp.runnerCmdDefault
907                 wp.runnerData = []byte{}
908                 return nil
909         }
910         logger := wp.logger.WithField("source", wp.runnerSource)
911         logger.Debug("loading runner")
912         buf, err := ioutil.ReadFile(wp.runnerSource)
913         if err != nil {
914                 logger.WithError(err).Error("failed to load runner program")
915                 return err
916         }
917         wp.runnerData = buf
918         wp.runnerMD5 = md5.Sum(buf)
919         wp.runnerCmd = fmt.Sprintf("/tmp/arvados-crunch-run/crunch-run~%x", wp.runnerMD5)
920         return nil
921 }
922
923 func (wp *Pool) notify() {
924         wp.mtx.RLock()
925         defer wp.mtx.RUnlock()
926         for _, send := range wp.subscribers {
927                 select {
928                 case send <- struct{}{}:
929                 default:
930                 }
931         }
932 }
933
934 func (wp *Pool) getInstancesAndSync() error {
935         wp.setupOnce.Do(wp.setup)
936         if err := wp.instanceSet.throttleInstances.Error(); err != nil {
937                 return err
938         }
939         wp.logger.Debug("getting instance list")
940         threshold := time.Now()
941         instances, err := wp.instanceSet.Instances(cloud.InstanceTags{wp.tagKeyPrefix + tagKeyInstanceSetID: string(wp.instanceSetID)})
942         if err != nil {
943                 wp.instanceSet.throttleInstances.CheckRateLimitError(err, wp.logger, "list instances", wp.notify)
944                 return err
945         }
946         wp.sync(threshold, instances)
947         wp.logger.Debug("sync done")
948         return nil
949 }
950
951 // Add/remove/update workers based on instances, which was obtained
952 // from the instanceSet. However, don't clobber any other updates that
953 // already happened after threshold.
954 func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
955         wp.mtx.Lock()
956         defer wp.mtx.Unlock()
957         wp.logger.WithField("Instances", len(instances)).Debug("sync instances")
958         notify := false
959
960         for _, inst := range instances {
961                 itTag := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceType]
962                 it, ok := wp.instanceTypes[itTag]
963                 if !ok {
964                         wp.logger.WithField("Instance", inst.ID()).Errorf("unknown InstanceType tag %q --- ignoring", itTag)
965                         continue
966                 }
967                 if wkr, isNew := wp.updateWorker(inst, it); isNew {
968                         notify = true
969                 } else if wkr.state == StateShutdown && time.Since(wkr.destroyed) > wp.timeoutShutdown {
970                         wp.logger.WithField("Instance", inst.ID()).Info("worker still listed after shutdown; retrying")
971                         wkr.shutdown()
972                 }
973         }
974
975         for id, wkr := range wp.workers {
976                 if wkr.updated.After(threshold) {
977                         continue
978                 }
979                 logger := wp.logger.WithFields(logrus.Fields{
980                         "Instance":    wkr.instance.ID(),
981                         "WorkerState": wkr.state,
982                 })
983                 logger.Info("instance disappeared in cloud")
984                 wkr.reportBootOutcome(BootOutcomeDisappeared)
985                 if wp.mDisappearances != nil {
986                         wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
987                 }
988                 // wkr.destroyed.IsZero() can happen if instance disappeared but we weren't trying to shut it down
989                 if wp.mTimeFromShutdownToGone != nil && !wkr.destroyed.IsZero() {
990                         wp.mTimeFromShutdownToGone.Observe(time.Now().Sub(wkr.destroyed).Seconds())
991                 }
992                 delete(wp.workers, id)
993                 go wkr.Close()
994                 notify = true
995         }
996
997         if !wp.loaded {
998                 notify = true
999                 wp.loaded = true
1000                 wp.logger.WithField("N", len(wp.workers)).Info("loaded initial instance list")
1001         }
1002
1003         if notify {
1004                 go wp.notify()
1005         }
1006 }
1007
1008 func (wp *Pool) waitUntilLoaded() {
1009         ch := wp.Subscribe()
1010         wp.mtx.RLock()
1011         defer wp.mtx.RUnlock()
1012         for !wp.loaded {
1013                 wp.mtx.RUnlock()
1014                 <-ch
1015                 wp.mtx.RLock()
1016         }
1017 }
1018
1019 func (wp *Pool) gatewayAuthSecret(uuid string) string {
1020         h := hmac.New(sha256.New, []byte(wp.systemRootToken))
1021         fmt.Fprint(h, uuid)
1022         return fmt.Sprintf("%x", h.Sum(nil))
1023 }
1024
1025 // Return a random string of n hexadecimal digits (n*4 random bits). n
1026 // must be even.
1027 func randomHex(n int) string {
1028         buf := make([]byte, n/2)
1029         _, err := rand.Read(buf)
1030         if err != nil {
1031                 panic(err)
1032         }
1033         return fmt.Sprintf("%x", buf)
1034 }