lib/dispatchcloud/scheduler/scheduler.go

   1 // Copyright (C) The Arvados Authors. All rights reserved.
   2 //
   3 // SPDX-License-Identifier: AGPL-3.0
   4
   5 // Package scheduler uses a resizable worker pool to execute
   6 // containers in priority order.
   7 package scheduler
   8
   9 import (
  10         "context"
  11         "sync"
  12         "time"
  13
  14         "git.arvados.org/arvados.git/sdk/go/arvados"
  15         "git.arvados.org/arvados.git/sdk/go/ctxlog"
  16         "github.com/prometheus/client_golang/prometheus"
  17         "github.com/sirupsen/logrus"
  18 )
  19
  20 // A Scheduler maps queued containers onto unallocated workers in
  21 // priority order, creating new workers if needed. It locks containers
  22 // that can be mapped onto existing/pending workers, and starts them
  23 // if possible.
  24 //
  25 // A Scheduler unlocks any containers that are locked but can't be
  26 // mapped. (For example, this happens when the cloud provider reaches
  27 // quota/capacity and a previously mappable container's priority is
  28 // surpassed by a newer container.)
  29 //
  30 // If it encounters errors while creating new workers, a Scheduler
  31 // shuts down idle workers, in case they are consuming quota.
  32 type Scheduler struct {
  33         logger              logrus.FieldLogger
  34         client              *arvados.Client
  35         queue               ContainerQueue
  36         pool                WorkerPool
  37         reg                 *prometheus.Registry
  38         staleLockTimeout    time.Duration
  39         queueUpdateInterval time.Duration
  40
  41         uuidOp map[string]string // operation in progress: "lock", "cancel", ...
  42         mtx    sync.Mutex
  43         wakeup *time.Timer
  44
  45         runOnce sync.Once
  46         stop    chan struct{}
  47         stopped chan struct{}
  48
  49         last503time        time.Time // last time API responded 503
  50         maxConcurrency     int       // dynamic container limit (0 = unlimited), see runQueue()
  51         supervisorFraction float64   // maximum fraction of "supervisor" containers (these are containers who's main job is to launch other containers, e.g. workflow runners)
  52         maxInstances       int       // maximum number of instances the pool will bring up (0 = unlimited)
  53
  54         mContainersAllocatedNotStarted   prometheus.Gauge
  55         mContainersNotAllocatedOverQuota prometheus.Gauge
  56         mLongestWaitTimeSinceQueue       prometheus.Gauge
  57         mLast503Time                     prometheus.Gauge
  58         mMaxContainerConcurrency         prometheus.Gauge
  59 }
  60
  61 // New returns a new unstarted Scheduler.
  62 //
  63 // Any given queue and pool should not be used by more than one
  64 // scheduler at a time.
  65 func New(ctx context.Context, client *arvados.Client, queue ContainerQueue, pool WorkerPool, reg *prometheus.Registry, staleLockTimeout, queueUpdateInterval time.Duration, minQuota, maxInstances int, supervisorFraction float64) *Scheduler {
  66         sch := &Scheduler{
  67                 logger:              ctxlog.FromContext(ctx),
  68                 client:              client,
  69                 queue:               queue,
  70                 pool:                pool,
  71                 reg:                 reg,
  72                 staleLockTimeout:    staleLockTimeout,
  73                 queueUpdateInterval: queueUpdateInterval,
  74                 wakeup:              time.NewTimer(time.Second),
  75                 stop:                make(chan struct{}),
  76                 stopped:             make(chan struct{}),
  77                 uuidOp:              map[string]string{},
  78                 supervisorFraction:  supervisorFraction,
  79                 maxInstances:        maxInstances,
  80         }
  81         if minQuota > 0 {
  82                 sch.maxConcurrency = minQuota
  83         } else {
  84                 sch.maxConcurrency = maxInstances
  85         }
  86         sch.registerMetrics(reg)
  87         return sch
  88 }
  89
  90 func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
  91         if reg == nil {
  92                 reg = prometheus.NewRegistry()
  93         }
  94         sch.mContainersAllocatedNotStarted = prometheus.NewGauge(prometheus.GaugeOpts{
  95                 Namespace: "arvados",
  96                 Subsystem: "dispatchcloud",
  97                 Name:      "containers_allocated_not_started",
  98                 Help:      "Number of containers allocated to a worker but not started yet (worker is booting).",
  99         })
 100         reg.MustRegister(sch.mContainersAllocatedNotStarted)
 101         sch.mContainersNotAllocatedOverQuota = prometheus.NewGauge(prometheus.GaugeOpts{
 102                 Namespace: "arvados",
 103                 Subsystem: "dispatchcloud",
 104                 Name:      "containers_not_allocated_over_quota",
 105                 Help:      "Number of containers not allocated to a worker because the system has hit a quota.",
 106         })
 107         reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
 108         sch.mLongestWaitTimeSinceQueue = prometheus.NewGauge(prometheus.GaugeOpts{
 109                 Namespace: "arvados",
 110                 Subsystem: "dispatchcloud",
 111                 Name:      "containers_longest_wait_time_seconds",
 112                 Help:      "Current longest wait time of any container since queuing, and before the start of crunch-run.",
 113         })
 114         reg.MustRegister(sch.mLongestWaitTimeSinceQueue)
 115         sch.mLast503Time = prometheus.NewGauge(prometheus.GaugeOpts{
 116                 Namespace: "arvados",
 117                 Subsystem: "dispatchcloud",
 118                 Name:      "last_503_time",
 119                 Help:      "Time of most recent 503 error received from API.",
 120         })
 121         reg.MustRegister(sch.mLast503Time)
 122         sch.mMaxContainerConcurrency = prometheus.NewGauge(prometheus.GaugeOpts{
 123                 Namespace: "arvados",
 124                 Subsystem: "dispatchcloud",
 125                 Name:      "max_concurrent_containers",
 126                 Help:      "Dynamically assigned limit on number of containers scheduled concurrency, set after receiving 503 errors from API.",
 127         })
 128         reg.MustRegister(sch.mMaxContainerConcurrency)
 129         reg.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 130                 Namespace: "arvados",
 131                 Subsystem: "dispatchcloud",
 132                 Name:      "at_quota",
 133                 Help:      "Flag indicating the cloud driver is reporting an at-quota condition.",
 134         }, func() float64 {
 135                 if sch.pool.AtQuota() {
 136                         return 1
 137                 } else {
 138                         return 0
 139                 }
 140         }))
 141 }
 142
 143 func (sch *Scheduler) updateMetrics() {
 144         earliest := time.Time{}
 145         entries, _ := sch.queue.Entries()
 146         running := sch.pool.Running()
 147         for _, ent := range entries {
 148                 if ent.Container.Priority > 0 &&
 149                         (ent.Container.State == arvados.ContainerStateQueued || ent.Container.State == arvados.ContainerStateLocked) {
 150                         // Exclude containers that are preparing to run the payload (i.e.
 151                         // ContainerStateLocked and running on a worker, most likely loading the
 152                         // payload image
 153                         if _, ok := running[ent.Container.UUID]; !ok {
 154                                 if ent.Container.CreatedAt.Before(earliest) || earliest.IsZero() {
 155                                         earliest = ent.Container.CreatedAt
 156                                 }
 157                         }
 158                 }
 159         }
 160         if !earliest.IsZero() {
 161                 sch.mLongestWaitTimeSinceQueue.Set(time.Since(earliest).Seconds())
 162         } else {
 163                 sch.mLongestWaitTimeSinceQueue.Set(0)
 164         }
 165 }
 166
 167 // Start starts the scheduler.
 168 func (sch *Scheduler) Start() {
 169         go sch.runOnce.Do(sch.run)
 170 }
 171
 172 // Stop stops the scheduler. No other method should be called after
 173 // Stop.
 174 func (sch *Scheduler) Stop() {
 175         close(sch.stop)
 176         <-sch.stopped
 177 }
 178
 179 func (sch *Scheduler) run() {
 180         defer close(sch.stopped)
 181
 182         // Ensure the queue is fetched once before attempting anything.
 183         for err := sch.queue.Update(); err != nil; err = sch.queue.Update() {
 184                 sch.logger.Errorf("error updating queue: %s", err)
 185                 d := sch.queueUpdateInterval / 10
 186                 if d < time.Second {
 187                         d = time.Second
 188                 }
 189                 sch.logger.Infof("waiting %s before retry", d)
 190                 time.Sleep(d)
 191         }
 192
 193         // Keep the queue up to date.
 194         go func() {
 195                 for {
 196                         starttime := time.Now()
 197                         err := sch.queue.Update()
 198                         if err != nil {
 199                                 sch.logger.Errorf("error updating queue: %s", err)
 200                         }
 201                         // If the previous update took a long time,
 202                         // that probably means the server is
 203                         // overloaded, so wait that long before doing
 204                         // another. Otherwise, wait for the configured
 205                         // poll interval.
 206                         delay := time.Since(starttime)
 207                         if delay < sch.queueUpdateInterval {
 208                                 delay = sch.queueUpdateInterval
 209                         }
 210                         time.Sleep(delay)
 211                 }
 212         }()
 213
 214         t0 := time.Now()
 215         sch.logger.Infof("FixStaleLocks starting.")
 216         sch.fixStaleLocks()
 217         sch.logger.Infof("FixStaleLocks finished (%s), starting scheduling.", time.Since(t0))
 218
 219         poolNotify := sch.pool.Subscribe()
 220         defer sch.pool.Unsubscribe(poolNotify)
 221
 222         queueNotify := sch.queue.Subscribe()
 223         defer sch.queue.Unsubscribe(queueNotify)
 224
 225         for {
 226                 sch.runQueue()
 227                 sch.sync()
 228                 sch.updateMetrics()
 229                 select {
 230                 case <-sch.stop:
 231                         return
 232                 case <-queueNotify:
 233                 case <-poolNotify:
 234                 case <-sch.wakeup.C:
 235                 }
 236         }
 237 }