// Copyright (C) The Arvados Authors. All rights reserved.
//
// SPDX-License-Identifier: AGPL-3.0

package worker

import (
	"bytes"
	"encoding/json"
	"fmt"
	"strings"
	"sync"
	"time"

	"git.curoverse.com/arvados.git/lib/cloud"
	"git.curoverse.com/arvados.git/sdk/go/arvados"
	"git.curoverse.com/arvados.git/sdk/go/stats"
	"github.com/sirupsen/logrus"
)

const (
	// TODO: configurable
	maxPingFailTime = 10 * time.Minute
)

// State indicates whether a worker is available to do work, and (if
// not) whether/when it is expected to become ready.
type State int

const (
	StateUnknown  State = iota // might be running a container already
	StateBooting               // instance is booting
	StateIdle                  // instance booted, no containers are running
	StateRunning               // instance is running one or more containers
	StateShutdown              // worker has stopped monitoring the instance
)

var stateString = map[State]string{
	StateUnknown:  "unknown",
	StateBooting:  "booting",
	StateIdle:     "idle",
	StateRunning:  "running",
	StateShutdown: "shutdown",
}

// String implements fmt.Stringer.
func (s State) String() string {
	return stateString[s]
}

// MarshalText implements encoding.TextMarshaler so a JSON encoding of
// map[State]anything uses the state's string representation.
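// For example, json.Marshal(map[State]int{StateIdle: 3}) produces
// {"idle":3} rather than {"2":3}.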
func (s State) MarshalText() ([]byte, error) {
	return []byte(stateString[s]), nil
}

// IdleBehavior indicates the behavior desired when a node becomes idle.
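// A worker's IdleBehavior is saved in its instance tags (see
// saveTags below), so it survives across dispatcher restarts.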
type IdleBehavior string

const (
	IdleBehaviorRun   IdleBehavior = "run"   // run containers, or shut down on idle timeout
	IdleBehaviorHold  IdleBehavior = "hold"  // don't shut down or run more containers
	IdleBehaviorDrain IdleBehavior = "drain" // shut down immediately when idle
)

var validIdleBehavior = map[IdleBehavior]bool{
	IdleBehaviorRun:   true,
	IdleBehaviorHold:  true,
	IdleBehaviorDrain: true,
}

type worker struct {
	logger   logrus.FieldLogger
	executor Executor
	wp       *Pool

	mtx          sync.Locker // must be wp's Locker.
	state        State
	idleBehavior IdleBehavior
	instance     cloud.Instance
	instType     arvados.InstanceType
	vcpus        int64
	memory       int64
	appeared     time.Time
	probed       time.Time
	updated      time.Time
	busy         time.Time
	destroyed    time.Time
	lastUUID     string
	running      map[string]struct{} // remember to update state idle<->running when this changes
	starting     map[string]struct{} // remember to update state idle<->running when this changes
	probing      chan struct{}
}

// caller must have lock.
func (wkr *worker) startContainer(ctr arvados.Container) {
	logger := wkr.logger.WithFields(logrus.Fields{
		"ContainerUUID": ctr.UUID,
		"Priority":      ctr.Priority,
	})
	logger = logger.WithField("Instance", wkr.instance.ID())
	logger.Debug("starting container")
	wkr.starting[ctr.UUID] = struct{}{}
	wkr.state = StateRunning
	go func() {
		env := map[string]string{
			"ARVADOS_API_HOST":  wkr.wp.arvClient.APIHost,
			"ARVADOS_API_TOKEN": wkr.wp.arvClient.AuthToken,
		}
		if wkr.wp.arvClient.Insecure {
			env["ARVADOS_API_HOST_INSECURE"] = "1"
		}
		envJSON, err := json.Marshal(env)
		if err != nil {
			panic(err)
		}
		stdin := bytes.NewBuffer(envJSON)
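		// Passing the environment on stdin (--stdin-env) rather
		// than on the command line keeps the API token out of
		// the remote host's process table.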
		cmd := "crunch-run --detach --stdin-env '" + ctr.UUID + "'"
		if u := wkr.instance.RemoteUser(); u != "root" {
			cmd = "sudo " + cmd
		}
		stdout, stderr, err := wkr.executor.Execute(nil, cmd, stdin)
		wkr.mtx.Lock()
		defer wkr.mtx.Unlock()
		now := time.Now()
		wkr.updated = now
		wkr.busy = now
		delete(wkr.starting, ctr.UUID)
		wkr.running[ctr.UUID] = struct{}{}
		wkr.lastUUID = ctr.UUID
		if err != nil {
			logger.WithField("stdout", string(stdout)).
				WithField("stderr", string(stderr)).
				WithError(err).
				Error("error starting crunch-run process")
			// Leave the UUID in wkr.running anyway: it's
			// possible the error was just a communication
			// failure and the process was in fact
			// started. Wait for the next probe to find out.
			return
		}
		logger.Info("crunch-run process started")
	}()
}

// ProbeAndUpdate conducts appropriate boot/running probes (if any)
// for the worker's current state. If a previous probe is still
// running, it does nothing.
//
// It should be called in a new goroutine.
func (wkr *worker) ProbeAndUpdate() {
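	// wkr.probing acts as a non-blocking mutex: it must be a
	// buffered channel with capacity 1, so the send succeeds
	// only when no other probe is already in flight.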
	select {
	case wkr.probing <- struct{}{}:
		wkr.probeAndUpdate()
		<-wkr.probing
	default:
		wkr.logger.Debug("still waiting for last probe to finish")
	}
}

// probeAndUpdate calls probeBooted and/or probeRunning if needed, and
// updates state accordingly.
//
// In StateUnknown: Call both probeBooted and probeRunning.
// In StateBooting: Call probeBooted; if successful, call probeRunning.
// In StateRunning: Call probeRunning.
// In StateIdle: Call probeRunning.
// In StateShutdown: Do nothing.
//
// If both probes succeed, wkr.state changes to
// StateIdle/StateRunning.
//
// If probeRunning succeeds, wkr.running is updated. (This means
// wkr.running might be non-empty even in StateUnknown, if the boot
// probe failed.)
//
// probeAndUpdate should be called in a new goroutine.
func (wkr *worker) probeAndUpdate() {
	wkr.mtx.Lock()
	updated := wkr.updated
	initialState := wkr.state
	wkr.mtx.Unlock()

	var (
		booted   bool
		ctrUUIDs []string
		ok       bool
		stderr   []byte // from probeBooted
	)

	switch initialState {
	case StateShutdown:
		return
	case StateIdle, StateRunning:
		booted = true
	case StateUnknown, StateBooting:
	default:
		panic(fmt.Sprintf("unknown state %s", initialState))
	}

	probeStart := time.Now()
	logger := wkr.logger.WithField("ProbeStart", probeStart)

	if !booted {
		booted, stderr = wkr.probeBooted()
		if !booted {
			// Pretend this probe succeeded if another
			// concurrent attempt succeeded.
			wkr.mtx.Lock()
			booted = wkr.state == StateRunning || wkr.state == StateIdle
			wkr.mtx.Unlock()
		}
		if booted {
			logger.Info("instance booted; will try probeRunning")
		}
	}
	if booted || wkr.state == StateUnknown {
		ctrUUIDs, ok = wkr.probeRunning()
	}
	wkr.mtx.Lock()
	defer wkr.mtx.Unlock()
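	// This probe round counts as a failure if the run-probe
	// failed, or if the boot probe failed and there is no
	// evidence (from this probe or earlier ones) of containers
	// running on the instance.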
	if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) {
		if wkr.state == StateShutdown && wkr.updated.After(updated) {
			// Skip the logging noise if shutdown was
			// initiated during probe.
			return
		}
		// Using the start time of the probe as the timeout
		// threshold ensures we always initiate at least one
		// probe attempt after the boot/probe timeout expires
		// (otherwise, a slow probe failure could cause us to
		// shut down an instance even though it did in fact
		// boot/recover before the timeout expired).
		dur := probeStart.Sub(wkr.probed)
		if wkr.shutdownIfBroken(dur) {
			// stderr from failed run-probes will have
			// been logged already, but boot-probe
			// failures are normal so they are logged only
			// at Debug level. This is our chance to log
			// some evidence about why the node never
			// booted, even in non-debug mode.
			if !booted {
				logger.WithFields(logrus.Fields{
					"Duration": dur,
					"stderr":   string(stderr),
				}).Info("boot failed")
			}
		}
		return
	}

	updateTime := time.Now()
	wkr.probed = updateTime

	if updated != wkr.updated {
		// Worker was updated after the probe began, so
		// wkr.running might have a container UUID that was
		// not yet running when ctrUUIDs was generated. Leave
		// wkr.running alone and wait for the next probe to
		// catch up on any changes.
		return
	}

	if len(ctrUUIDs) > 0 {
		wkr.busy = updateTime
		wkr.lastUUID = ctrUUIDs[0]
	} else if len(wkr.running) > 0 {
		// Actual last-busy time was sometime between wkr.busy
		// and now. Now is the earliest opportunity to take
		// advantage of the non-busy state, though.
		wkr.busy = updateTime
	}
	changed := false

	// Build a new "running" map. Set changed=true if it differs
	// from the existing map (wkr.running) to ensure the scheduler
	// gets notified below.
	running := map[string]struct{}{}
	for _, uuid := range ctrUUIDs {
		running[uuid] = struct{}{}
		if _, ok := wkr.running[uuid]; !ok {
			if _, ok := wkr.starting[uuid]; !ok {
				// We didn't start it -- it must have
				// been started by a previous
				// dispatcher process.
				logger.WithField("ContainerUUID", uuid).Info("crunch-run process detected")
			}
			changed = true
		}
	}
	for uuid := range wkr.running {
		if _, ok := running[uuid]; !ok {
			logger.WithField("ContainerUUID", uuid).Info("crunch-run process ended")
			wkr.wp.notifyExited(uuid, updateTime)
			changed = true
		}
	}

	// Update state if this was the first successful boot-probe.
	if booted && (wkr.state == StateUnknown || wkr.state == StateBooting) {
		// Note: this will change again below if
		// len(wkr.starting)+len(wkr.running) > 0.
		wkr.state = StateIdle
		changed = true
	}

	// If wkr.state and wkr.running aren't changing then there's
	// no need to log anything, notify the scheduler, move state
	// back and forth between idle/running, etc.
	if !changed {
		return
	}

	// Log whenever a run-probe reveals crunch-run processes
	// appearing/disappearing before boot-probe succeeds.
	if wkr.state == StateUnknown && len(running) != len(wkr.running) {
		logger.WithFields(logrus.Fields{
			"RunningContainers": len(running),
			"State":             wkr.state,
		}).Info("crunch-run probe succeeded, but boot probe is still failing")
	}

	wkr.running = running
	if wkr.state == StateIdle && len(wkr.starting)+len(wkr.running) > 0 {
		wkr.state = StateRunning
	} else if wkr.state == StateRunning && len(wkr.starting)+len(wkr.running) == 0 {
		wkr.state = StateIdle
	}
	wkr.updated = updateTime
	if booted && (initialState == StateUnknown || initialState == StateBooting) {
		logger.WithFields(logrus.Fields{
			"RunningContainers": len(running),
			"State":             wkr.state,
		}).Info("probes succeeded, instance is in service")
	}
	go wkr.wp.notify()
}

func (wkr *worker) probeRunning() (running []string, ok bool) {
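	// crunch-run --list is expected to print one container UUID
	// per line, and nothing at all when no supervisor processes
	// are running on the instance.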
	cmd := "crunch-run --list"
	if u := wkr.instance.RemoteUser(); u != "root" {
		cmd = "sudo " + cmd
	}
	stdout, stderr, err := wkr.executor.Execute(nil, cmd, nil)
	if err != nil {
		wkr.logger.WithFields(logrus.Fields{
			"Command": cmd,
			"stdout":  string(stdout),
			"stderr":  string(stderr),
		}).WithError(err).Warn("probe failed")
		return nil, false
	}
	stdout = bytes.TrimRight(stdout, "\n")
	if len(stdout) == 0 {
		return nil, true
	}
	return strings.Split(string(stdout), "\n"), true
}

func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
	cmd := wkr.wp.bootProbeCommand
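	// An empty boot probe command defaults to "true", which
	// succeeds trivially, so an instance is considered booted as
	// soon as the executor can run any command on it.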
	if cmd == "" {
		cmd = "true"
	}
	stdout, stderr, err := wkr.executor.Execute(nil, cmd, nil)
	logger := wkr.logger.WithFields(logrus.Fields{
		"Command": cmd,
		"stdout":  string(stdout),
		"stderr":  string(stderr),
	})
	if err != nil {
		logger.WithError(err).Debug("boot probe failed")
		return false, stderr
	}
	logger.Info("boot probe succeeded")
	return true, stderr
}

// caller must have lock.
func (wkr *worker) shutdownIfBroken(dur time.Duration) bool {
	if wkr.idleBehavior == IdleBehaviorHold {
		// Never shut down.
		return false
	}
	label, threshold := "", wkr.wp.timeoutProbe
	if wkr.state == StateUnknown || wkr.state == StateBooting {
		label, threshold = "new ", wkr.wp.timeoutBooting
	}
	if dur < threshold {
		return false
	}
	wkr.logger.WithFields(logrus.Fields{
		"Duration": dur,
		"Since":    wkr.probed,
		"State":    wkr.state,
	}).Warnf("%sinstance unresponsive, shutting down", label)
	wkr.shutdown()
	return true
}

// caller must have lock.
func (wkr *worker) shutdownIfIdle() bool {
	if wkr.idleBehavior == IdleBehaviorHold {
		// Never shut down.
		return false
	}
	age := time.Since(wkr.busy)

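	// A draining worker shuts down as soon as it is idle (or
	// while still booting); otherwise, shutdown waits until the
	// idle timeout has expired.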
	old := age >= wkr.wp.timeoutIdle
	draining := wkr.idleBehavior == IdleBehaviorDrain
	shouldShutdown := ((old || draining) && wkr.state == StateIdle) ||
		(draining && wkr.state == StateBooting)
	if !shouldShutdown {
		return false
	}

	wkr.logger.WithFields(logrus.Fields{
		"State":        wkr.state,
		"IdleDuration": stats.Duration(age),
		"IdleBehavior": wkr.idleBehavior,
	}).Info("shutdown idle worker")
	wkr.shutdown()
	return true
}

// caller must have lock.
func (wkr *worker) shutdown() {
	now := time.Now()
	wkr.updated = now
	wkr.destroyed = now
	wkr.state = StateShutdown
	go wkr.wp.notify()
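	// Destroy() may block on a cloud API call, so it runs in its
	// own goroutine instead of while holding the pool lock.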
	go func() {
		err := wkr.instance.Destroy()
		if err != nil {
			wkr.logger.WithError(err).Warn("shutdown failed")
		}
	}()
}

// Save worker tags to cloud provider metadata, if they don't already
// match. Caller must have lock.
func (wkr *worker) saveTags() {
	instance := wkr.instance
	tags := instance.Tags()
	update := cloud.InstanceTags{
		tagKeyInstanceType: wkr.instType.Name,
		tagKeyIdleBehavior: string(wkr.idleBehavior),
	}
	save := false
	for k, v := range update {
		if tags[k] != v {
			tags[k] = v
			save = true
		}
	}
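	// Call SetTags only when a tag actually changed: each update
	// typically costs a cloud API request.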
	if save {
		go func() {
			err := instance.SetTags(tags)
			if err != nil {
				wkr.wp.logger.WithField("Instance", instance.ID()).WithError(err).Warn("error updating tags")
			}
		}()
	}
}