More Salt installer doc refactoring.
[arvados.git] / lib / dispatchcloud / worker / worker.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package worker
6
7 import (
8         "bytes"
9         "fmt"
10         "path/filepath"
11         "strings"
12         "sync"
13         "time"
14
15         "git.arvados.org/arvados.git/lib/cloud"
16         "git.arvados.org/arvados.git/sdk/go/arvados"
17         "git.arvados.org/arvados.git/sdk/go/stats"
18         "github.com/sirupsen/logrus"
19 )
20
// maxPingFailTime is a fixed ten-minute limit. It is not referenced in
// this part of the file; presumably it bounds how long ping failures
// are tolerated -- confirm against its callers.
//
// TODO: make this configurable.
const maxPingFailTime = 10 * time.Minute
25
// State indicates whether a worker is available to do work, and (if
// not) whether/when it is expected to become ready.
type State int

// Worker states. The zero value is StateUnknown.
const (
	StateUnknown  State = iota // might be running a container already
	StateBooting               // instance is booting
	StateIdle                  // instance booted, no containers are running
	StateRunning               // instance is running one or more containers
	StateShutdown              // worker has stopped monitoring the instance
)

// stateString maps each defined State to its text representation.
var stateString = map[State]string{
	StateUnknown:  "unknown",
	StateBooting:  "booting",
	StateIdle:     "idle",
	StateRunning:  "running",
	StateShutdown: "shutdown",
}

// String implements fmt.Stringer.
//
// Defined states return their name from stateString. Any other value
// is rendered as "State(N)" rather than an empty string, so that log
// and panic messages about an unexpected state remain informative.
func (s State) String() string {
	if name, ok := stateString[s]; ok {
		return name
	}
	return fmt.Sprintf("State(%d)", int(s))
}

// MarshalText implements encoding.TextMarshaler so a JSON encoding of
// map[State]anything uses the state's string representation.
//
// A value with no entry in stateString marshals to empty text with a
// nil error.
func (s State) MarshalText() ([]byte, error) {
	return []byte(stateString[s]), nil
}
56
// BootOutcome describes how a worker's boot attempt ended. Its string
// value is used as a metric label.
type BootOutcome string

// Recognized boot outcomes.
const (
	BootOutcomeSucceeded   = BootOutcome("success")
	BootOutcomeFailed      = BootOutcome("failure")
	BootOutcomeAborted     = BootOutcome("aborted")
	BootOutcomeDisappeared = BootOutcome("disappeared")
)

// validBootOutcomes is the set of recognized BootOutcome values.
var validBootOutcomes = map[BootOutcome]bool{
	BootOutcomeSucceeded:   true,
	BootOutcomeFailed:      true,
	BootOutcomeAborted:     true,
	BootOutcomeDisappeared: true,
}
73
// IdleBehavior selects what should happen to a node once it has no
// work to do.
type IdleBehavior string

// Recognized idle behaviors.
const (
	// IdleBehaviorRun: run containers, or shutdown on idle timeout.
	IdleBehaviorRun = IdleBehavior("run")
	// IdleBehaviorHold: don't shutdown or run more containers.
	IdleBehaviorHold = IdleBehavior("hold")
	// IdleBehaviorDrain: shutdown immediately when idle.
	IdleBehaviorDrain = IdleBehavior("drain")
)

// validIdleBehavior is the set of recognized IdleBehavior values.
var validIdleBehavior = map[IdleBehavior]bool{
	IdleBehaviorRun:   true,
	IdleBehaviorHold:  true,
	IdleBehaviorDrain: true,
}
88
89 type worker struct {
90         logger   logrus.FieldLogger
91         executor Executor
92         wp       *Pool
93
94         mtx                 sync.Locker // must be wp's Locker.
95         state               State
96         idleBehavior        IdleBehavior
97         instance            cloud.Instance
98         instType            arvados.InstanceType
99         vcpus               int64
100         memory              int64
101         appeared            time.Time
102         probed              time.Time
103         updated             time.Time
104         busy                time.Time
105         destroyed           time.Time
106         firstSSHConnection  time.Time
107         lastUUID            string
108         running             map[string]*remoteRunner // remember to update state idle<->running when this changes
109         starting            map[string]*remoteRunner // remember to update state idle<->running when this changes
110         probing             chan struct{}
111         bootOutcomeReported bool
112         timeToReadyReported bool
113         staleRunLockSince   time.Time
114 }
115
116 func (wkr *worker) onUnkillable(uuid string) {
117         wkr.mtx.Lock()
118         defer wkr.mtx.Unlock()
119         logger := wkr.logger.WithField("ContainerUUID", uuid)
120         if wkr.idleBehavior == IdleBehaviorHold {
121                 logger.Warn("unkillable container, but worker has IdleBehavior=Hold")
122                 return
123         }
124         logger.Warn("unkillable container, draining worker")
125         wkr.setIdleBehavior(IdleBehaviorDrain)
126 }
127
128 func (wkr *worker) onKilled(uuid string) {
129         wkr.mtx.Lock()
130         defer wkr.mtx.Unlock()
131         wkr.closeRunner(uuid)
132         go wkr.wp.notify()
133 }
134
135 // caller must have lock.
136 func (wkr *worker) reportBootOutcome(outcome BootOutcome) {
137         if wkr.bootOutcomeReported {
138                 return
139         }
140         if wkr.wp.mBootOutcomes != nil {
141                 wkr.wp.mBootOutcomes.WithLabelValues(string(outcome)).Inc()
142         }
143         wkr.bootOutcomeReported = true
144 }
145
146 // caller must have lock.
147 func (wkr *worker) reportTimeBetweenFirstSSHAndReadyForContainer() {
148         if wkr.timeToReadyReported {
149                 return
150         }
151         if wkr.wp.mTimeToSSH != nil {
152                 wkr.wp.mTimeToReadyForContainer.Observe(time.Since(wkr.firstSSHConnection).Seconds())
153         }
154         wkr.timeToReadyReported = true
155 }
156
157 // caller must have lock.
158 func (wkr *worker) setIdleBehavior(idleBehavior IdleBehavior) {
159         wkr.logger.WithField("IdleBehavior", idleBehavior).Info("set idle behavior")
160         wkr.idleBehavior = idleBehavior
161         wkr.saveTags()
162         wkr.shutdownIfIdle()
163 }
164
165 // caller must have lock.
166 func (wkr *worker) startContainer(ctr arvados.Container) {
167         logger := wkr.logger.WithFields(logrus.Fields{
168                 "ContainerUUID": ctr.UUID,
169                 "Priority":      ctr.Priority,
170         })
171         logger.Debug("starting container")
172         rr := newRemoteRunner(ctr.UUID, wkr)
173         wkr.starting[ctr.UUID] = rr
174         if wkr.state != StateRunning {
175                 wkr.state = StateRunning
176                 go wkr.wp.notify()
177         }
178         go func() {
179                 rr.Start()
180                 if wkr.wp.mTimeFromQueueToCrunchRun != nil {
181                         wkr.wp.mTimeFromQueueToCrunchRun.Observe(time.Since(ctr.CreatedAt).Seconds())
182                 }
183                 wkr.mtx.Lock()
184                 defer wkr.mtx.Unlock()
185                 now := time.Now()
186                 wkr.updated = now
187                 wkr.busy = now
188                 delete(wkr.starting, ctr.UUID)
189                 wkr.running[ctr.UUID] = rr
190                 wkr.lastUUID = ctr.UUID
191         }()
192 }
193
194 // ProbeAndUpdate conducts appropriate boot/running probes (if any)
195 // for the worker's current state. If a previous probe is still
196 // running, it does nothing.
197 //
198 // It should be called in a new goroutine.
199 func (wkr *worker) ProbeAndUpdate() {
200         select {
201         case wkr.probing <- struct{}{}:
202                 wkr.probeAndUpdate()
203                 <-wkr.probing
204         default:
205                 wkr.logger.Debug("still waiting for last probe to finish")
206         }
207 }
208
209 // probeAndUpdate calls probeBooted and/or probeRunning if needed, and
210 // updates state accordingly.
211 //
212 // In StateUnknown: Call both probeBooted and probeRunning.
213 // In StateBooting: Call probeBooted; if successful, call probeRunning.
214 // In StateRunning: Call probeRunning.
215 // In StateIdle: Call probeRunning.
216 // In StateShutdown: Do nothing.
217 //
218 // If both probes succeed, wkr.state changes to
219 // StateIdle/StateRunning.
220 //
221 // If probeRunning succeeds, wkr.running is updated. (This means
222 // wkr.running might be non-empty even in StateUnknown, if the boot
223 // probe failed.)
224 //
225 // probeAndUpdate should be called in a new goroutine.
226 func (wkr *worker) probeAndUpdate() {
227         wkr.mtx.Lock()
228         updated := wkr.updated
229         initialState := wkr.state
230         wkr.mtx.Unlock()
231
232         var (
233                 booted   bool
234                 ctrUUIDs []string
235                 ok       bool
236                 stderr   []byte // from probeBooted
237         )
238
239         switch initialState {
240         case StateShutdown:
241                 return
242         case StateIdle, StateRunning:
243                 booted = true
244         case StateUnknown, StateBooting:
245         default:
246                 panic(fmt.Sprintf("unknown state %s", initialState))
247         }
248
249         probeStart := time.Now()
250         logger := wkr.logger.WithField("ProbeStart", probeStart)
251
252         if !booted {
253                 booted, stderr = wkr.probeBooted()
254                 if !booted {
255                         // Pretend this probe succeeded if another
256                         // concurrent attempt succeeded.
257                         wkr.mtx.Lock()
258                         booted = wkr.state == StateRunning || wkr.state == StateIdle
259                         wkr.mtx.Unlock()
260                 }
261                 if booted {
262                         logger.Info("instance booted; will try probeRunning")
263                 }
264         }
265         reportedBroken := false
266         if booted || wkr.state == StateUnknown {
267                 ctrUUIDs, reportedBroken, ok = wkr.probeRunning()
268         }
269         wkr.mtx.Lock()
270         defer wkr.mtx.Unlock()
271         if reportedBroken && wkr.idleBehavior == IdleBehaviorRun {
272                 logger.Info("probe reported broken instance")
273                 wkr.reportBootOutcome(BootOutcomeFailed)
274                 wkr.setIdleBehavior(IdleBehaviorDrain)
275         }
276         if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) {
277                 if wkr.state == StateShutdown && wkr.updated.After(updated) {
278                         // Skip the logging noise if shutdown was
279                         // initiated during probe.
280                         return
281                 }
282                 // Using the start time of the probe as the timeout
283                 // threshold ensures we always initiate at least one
284                 // probe attempt after the boot/probe timeout expires
285                 // (otherwise, a slow probe failure could cause us to
286                 // shutdown an instance even though it did in fact
287                 // boot/recover before the timeout expired).
288                 dur := probeStart.Sub(wkr.probed)
289                 if wkr.shutdownIfBroken(dur) {
290                         // stderr from failed run-probes will have
291                         // been logged already, but boot-probe
292                         // failures are normal so they are logged only
293                         // at Debug level. This is our chance to log
294                         // some evidence about why the node never
295                         // booted, even in non-debug mode.
296                         if !booted {
297                                 wkr.reportBootOutcome(BootOutcomeFailed)
298                                 logger.WithFields(logrus.Fields{
299                                         "Duration": dur,
300                                         "stderr":   string(stderr),
301                                 }).Info("boot failed")
302                         }
303                 }
304                 return
305         }
306
307         updateTime := time.Now()
308         wkr.probed = updateTime
309
310         if updated != wkr.updated {
311                 // Worker was updated after the probe began, so
312                 // wkr.running might have a container UUID that was
313                 // not yet running when ctrUUIDs was generated. Leave
314                 // wkr.running alone and wait for the next probe to
315                 // catch up on any changes.
316                 return
317         }
318
319         if len(ctrUUIDs) > 0 {
320                 wkr.busy = updateTime
321                 wkr.lastUUID = ctrUUIDs[0]
322         } else if len(wkr.running) > 0 {
323                 // Actual last-busy time was sometime between wkr.busy
324                 // and now. Now is the earliest opportunity to take
325                 // advantage of the non-busy state, though.
326                 wkr.busy = updateTime
327         }
328
329         changed := wkr.updateRunning(ctrUUIDs)
330
331         // Update state if this was the first successful boot-probe.
332         if booted && (wkr.state == StateUnknown || wkr.state == StateBooting) {
333                 if wkr.state == StateBooting {
334                         wkr.reportTimeBetweenFirstSSHAndReadyForContainer()
335                 }
336                 // Note: this will change again below if
337                 // len(wkr.starting)+len(wkr.running) > 0.
338                 wkr.state = StateIdle
339                 changed = true
340         }
341
342         // If wkr.state and wkr.running aren't changing then there's
343         // no need to log anything, notify the scheduler, move state
344         // back and forth between idle/running, etc.
345         if !changed {
346                 return
347         }
348
349         // Log whenever a run-probe reveals crunch-run processes
350         // appearing/disappearing before boot-probe succeeds.
351         if wkr.state == StateUnknown && changed {
352                 logger.WithFields(logrus.Fields{
353                         "RunningContainers": len(wkr.running),
354                         "State":             wkr.state,
355                 }).Info("crunch-run probe succeeded, but boot probe is still failing")
356         }
357
358         if wkr.state == StateIdle && len(wkr.starting)+len(wkr.running) > 0 {
359                 wkr.state = StateRunning
360         } else if wkr.state == StateRunning && len(wkr.starting)+len(wkr.running) == 0 {
361                 wkr.state = StateIdle
362         }
363         wkr.updated = updateTime
364         if booted && (initialState == StateUnknown || initialState == StateBooting) {
365                 wkr.reportBootOutcome(BootOutcomeSucceeded)
366                 logger.WithFields(logrus.Fields{
367                         "RunningContainers": len(wkr.running),
368                         "State":             wkr.state,
369                 }).Info("probes succeeded, instance is in service")
370         }
371         go wkr.wp.notify()
372 }
373
374 func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
375         cmd := wkr.wp.runnerCmd + " --list"
376         if u := wkr.instance.RemoteUser(); u != "root" {
377                 cmd = "sudo " + cmd
378         }
379         before := time.Now()
380         stdout, stderr, err := wkr.executor.Execute(nil, cmd, nil)
381         if err != nil {
382                 wkr.logger.WithFields(logrus.Fields{
383                         "Command": cmd,
384                         "stdout":  string(stdout),
385                         "stderr":  string(stderr),
386                 }).WithError(err).Warn("probe failed")
387                 wkr.wp.mRunProbeDuration.WithLabelValues("fail").Observe(time.Now().Sub(before).Seconds())
388                 return
389         }
390         wkr.wp.mRunProbeDuration.WithLabelValues("success").Observe(time.Now().Sub(before).Seconds())
391         ok = true
392
393         staleRunLock := false
394         for _, s := range strings.Split(string(stdout), "\n") {
395                 // Each line of the "crunch-run --list" output is one
396                 // of the following:
397                 //
398                 // * a container UUID, indicating that processes
399                 //   related to that container are currently running.
400                 //   Optionally followed by " stale", indicating that
401                 //   the crunch-run process itself has exited (the
402                 //   remaining process is probably arv-mount).
403                 //
404                 // * the string "broken", indicating that the instance
405                 //   appears incapable of starting containers.
406                 //
407                 // See ListProcesses() in lib/crunchrun/background.go.
408                 if s == "" {
409                         // empty string following final newline
410                 } else if s == "broken" {
411                         reportsBroken = true
412                 } else if toks := strings.Split(s, " "); len(toks) == 1 {
413                         running = append(running, s)
414                 } else if toks[1] == "stale" {
415                         wkr.logger.WithField("ContainerUUID", toks[0]).Info("probe reported stale run lock")
416                         staleRunLock = true
417                 }
418         }
419         wkr.mtx.Lock()
420         defer wkr.mtx.Unlock()
421         if !staleRunLock {
422                 wkr.staleRunLockSince = time.Time{}
423         } else if wkr.staleRunLockSince.IsZero() {
424                 wkr.staleRunLockSince = time.Now()
425         } else if dur := time.Now().Sub(wkr.staleRunLockSince); dur > wkr.wp.timeoutStaleRunLock {
426                 wkr.logger.WithField("Duration", dur).Warn("reporting broken after reporting stale run lock for too long")
427                 reportsBroken = true
428         }
429         return
430 }
431
432 func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
433         cmd := wkr.wp.bootProbeCommand
434         if cmd == "" {
435                 cmd = "true"
436         }
437         stdout, stderr, err := wkr.executor.Execute(nil, cmd, nil)
438         logger := wkr.logger.WithFields(logrus.Fields{
439                 "Command": cmd,
440                 "stdout":  string(stdout),
441                 "stderr":  string(stderr),
442         })
443         if err != nil {
444                 logger.WithError(err).Debug("boot probe failed")
445                 return false, stderr
446         }
447         logger.Info("boot probe succeeded")
448         if err = wkr.wp.loadRunnerData(); err != nil {
449                 wkr.logger.WithError(err).Warn("cannot boot worker: error loading runner binary")
450                 return false, stderr
451         } else if len(wkr.wp.runnerData) == 0 {
452                 // Assume crunch-run is already installed
453         } else if _, stderr2, err := wkr.copyRunnerData(); err != nil {
454                 wkr.logger.WithError(err).WithField("stderr", string(stderr2)).Warn("error copying runner binary")
455                 return false, stderr2
456         } else {
457                 stderr = append(stderr, stderr2...)
458         }
459         return true, stderr
460 }
461
462 func (wkr *worker) copyRunnerData() (stdout, stderr []byte, err error) {
463         hash := fmt.Sprintf("%x", wkr.wp.runnerMD5)
464         dstdir, _ := filepath.Split(wkr.wp.runnerCmd)
465         logger := wkr.logger.WithFields(logrus.Fields{
466                 "hash": hash,
467                 "path": wkr.wp.runnerCmd,
468         })
469
470         stdout, stderr, err = wkr.executor.Execute(nil, `md5sum `+wkr.wp.runnerCmd, nil)
471         if err == nil && len(stderr) == 0 && bytes.Equal(stdout, []byte(hash+"  "+wkr.wp.runnerCmd+"\n")) {
472                 logger.Info("runner binary already exists on worker, with correct hash")
473                 return
474         }
475
476         // Note touch+chmod come before writing data, to avoid the
477         // possibility of md5 being correct while file mode is
478         // incorrect.
479         cmd := `set -e; dstdir="` + dstdir + `"; dstfile="` + wkr.wp.runnerCmd + `"; mkdir -p "$dstdir"; touch "$dstfile"; chmod 0755 "$dstdir" "$dstfile"; cat >"$dstfile"`
480         if wkr.instance.RemoteUser() != "root" {
481                 cmd = `sudo sh -c '` + strings.Replace(cmd, "'", "'\\''", -1) + `'`
482         }
483         logger.WithField("cmd", cmd).Info("installing runner binary on worker")
484         stdout, stderr, err = wkr.executor.Execute(nil, cmd, bytes.NewReader(wkr.wp.runnerData))
485         return
486 }
487
488 // caller must have lock.
489 func (wkr *worker) shutdownIfBroken(dur time.Duration) bool {
490         if wkr.idleBehavior == IdleBehaviorHold {
491                 // Never shut down.
492                 return false
493         }
494         label, threshold := "", wkr.wp.timeoutProbe
495         if wkr.state == StateUnknown || wkr.state == StateBooting {
496                 label, threshold = "new ", wkr.wp.timeoutBooting
497         }
498         if dur < threshold {
499                 return false
500         }
501         wkr.logger.WithFields(logrus.Fields{
502                 "Duration": dur,
503                 "Since":    wkr.probed,
504                 "State":    wkr.state,
505         }).Warnf("%sinstance unresponsive, shutting down", label)
506         wkr.shutdown()
507         return true
508 }
509
510 // Returns true if the instance is eligible for shutdown: either it's
511 // been idle too long, or idleBehavior=Drain and nothing is running.
512 //
513 // caller must have lock.
514 func (wkr *worker) eligibleForShutdown() bool {
515         if wkr.idleBehavior == IdleBehaviorHold {
516                 return false
517         }
518         draining := wkr.idleBehavior == IdleBehaviorDrain
519         switch wkr.state {
520         case StateBooting:
521                 return draining
522         case StateIdle:
523                 return draining || time.Since(wkr.busy) >= wkr.wp.timeoutIdle
524         case StateRunning:
525                 if !draining {
526                         return false
527                 }
528                 for _, rr := range wkr.running {
529                         if !rr.givenup {
530                                 return false
531                         }
532                 }
533                 for _, rr := range wkr.starting {
534                         if !rr.givenup {
535                                 return false
536                         }
537                 }
538                 // draining, and all remaining runners are just trying
539                 // to force-kill their crunch-run procs
540                 return true
541         default:
542                 return false
543         }
544 }
545
546 // caller must have lock.
547 func (wkr *worker) shutdownIfIdle() bool {
548         if !wkr.eligibleForShutdown() {
549                 return false
550         }
551         wkr.logger.WithFields(logrus.Fields{
552                 "State":        wkr.state,
553                 "IdleDuration": stats.Duration(time.Since(wkr.busy)),
554                 "IdleBehavior": wkr.idleBehavior,
555         }).Info("shutdown worker")
556         wkr.reportBootOutcome(BootOutcomeAborted)
557         wkr.shutdown()
558         return true
559 }
560
561 // caller must have lock.
562 func (wkr *worker) shutdown() {
563         now := time.Now()
564         wkr.updated = now
565         wkr.destroyed = now
566         wkr.state = StateShutdown
567         go wkr.wp.notify()
568         go func() {
569                 err := wkr.instance.Destroy()
570                 if err != nil {
571                         wkr.logger.WithError(err).Warn("shutdown failed")
572                         return
573                 }
574         }()
575 }
576
577 // Save worker tags to cloud provider metadata, if they don't already
578 // match. Caller must have lock.
579 func (wkr *worker) saveTags() {
580         instance := wkr.instance
581         tags := instance.Tags()
582         update := cloud.InstanceTags{
583                 wkr.wp.tagKeyPrefix + tagKeyInstanceType: wkr.instType.Name,
584                 wkr.wp.tagKeyPrefix + tagKeyIdleBehavior: string(wkr.idleBehavior),
585         }
586         save := false
587         for k, v := range update {
588                 if tags[k] != v {
589                         tags[k] = v
590                         save = true
591                 }
592         }
593         if save {
594                 go func() {
595                         err := instance.SetTags(tags)
596                         if err != nil {
597                                 wkr.wp.logger.WithField("Instance", instance.ID()).WithError(err).Warnf("error updating tags")
598                         }
599                 }()
600         }
601 }
602
603 func (wkr *worker) Close() {
604         // This might take time, so do it after unlocking mtx.
605         defer wkr.executor.Close()
606
607         wkr.mtx.Lock()
608         defer wkr.mtx.Unlock()
609         for uuid, rr := range wkr.running {
610                 wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process abandoned")
611                 rr.Close()
612         }
613         for uuid, rr := range wkr.starting {
614                 wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process abandoned")
615                 rr.Close()
616         }
617 }
618
619 // Add/remove entries in wkr.running to match ctrUUIDs returned by a
620 // probe. Returns true if anything was added or removed.
621 //
622 // Caller must have lock.
623 func (wkr *worker) updateRunning(ctrUUIDs []string) (changed bool) {
624         alive := map[string]bool{}
625         for _, uuid := range ctrUUIDs {
626                 alive[uuid] = true
627                 if _, ok := wkr.running[uuid]; ok {
628                         // unchanged
629                 } else if rr, ok := wkr.starting[uuid]; ok {
630                         wkr.running[uuid] = rr
631                         delete(wkr.starting, uuid)
632                         changed = true
633                 } else {
634                         // We didn't start it -- it must have been
635                         // started by a previous dispatcher process.
636                         wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process detected")
637                         wkr.running[uuid] = newRemoteRunner(uuid, wkr)
638                         changed = true
639                 }
640         }
641         for uuid := range wkr.running {
642                 if !alive[uuid] {
643                         wkr.closeRunner(uuid)
644                         changed = true
645                 }
646         }
647         return
648 }
649
650 // caller must have lock.
651 func (wkr *worker) closeRunner(uuid string) {
652         rr := wkr.running[uuid]
653         if rr == nil {
654                 return
655         }
656         wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process ended")
657         delete(wkr.running, uuid)
658         rr.Close()
659
660         now := time.Now()
661         wkr.updated = now
662         wkr.wp.exited[uuid] = now
663         if wkr.state == StateRunning && len(wkr.running)+len(wkr.starting) == 0 {
664                 wkr.state = StateIdle
665         }
666 }