19973: Add metrics for automatic container concurrency limit.
author Tom Clegg <tom@curii.com>
Fri, 10 Feb 2023 15:54:49 +0000 (10:54 -0500)
committer Tom Clegg <tom@curii.com>
Fri, 10 Feb 2023 15:54:49 +0000 (10:54 -0500)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

lib/dispatchcloud/scheduler/run_queue.go
lib/dispatchcloud/scheduler/scheduler.go

index cfd95e94549745140c619f26b9f914aa656c969d..057ff8d6e29505fa9319071d1f135a939ef0a83e 100644 (file)
@@ -62,6 +62,8 @@ func (sch *Scheduler) runQueue() {
                        sch.maxConcurrency = max
                }
        }
+       sch.mLast503Time.Set(float64(sch.last503time.Unix()))
+       sch.mMaxContainerConcurrency.Set(float64(sch.maxConcurrency))
 
        sch.logger.WithFields(logrus.Fields{
                "Containers":     len(sorted),
index 589aa3ec1140774a5446fdcf0331cb5d781734f7..4644dc4ea4db00782b38589f546f5cb22d577e88 100644 (file)
@@ -52,6 +52,8 @@ type Scheduler struct {
        mContainersAllocatedNotStarted   prometheus.Gauge
        mContainersNotAllocatedOverQuota prometheus.Gauge
        mLongestWaitTimeSinceQueue       prometheus.Gauge
+       mLast503Time                     prometheus.Gauge
+       mMaxContainerConcurrency         prometheus.Gauge
 }
 
 // New returns a new unstarted Scheduler.
@@ -101,6 +103,20 @@ func (sch *Scheduler) registerMetrics(reg *prometheus.Registry) {
                Help:      "Current longest wait time of any container since queuing, and before the start of crunch-run.",
        })
        reg.MustRegister(sch.mLongestWaitTimeSinceQueue)
+       sch.mLast503Time = prometheus.NewGauge(prometheus.GaugeOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "last_503_time",
+               Help:      "Time of most recent 503 error received from API.",
+       })
+       reg.MustRegister(sch.mLast503Time)
+       sch.mMaxContainerConcurrency = prometheus.NewGauge(prometheus.GaugeOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "max_concurrent_containers",
+               Help:      "Dynamically assigned limit on number of containers scheduled concurrently, set after receiving 503 errors from API.",
+       })
+       reg.MustRegister(sch.mMaxContainerConcurrency)
 }
 
 func (sch *Scheduler) updateMetrics() {