GEM
  remote: https://rubygems.org/
  specs:
-    RedCloth (4.2.9)
-    coderay (1.1.0)
-    colorize (0.6.0)
-    kramdown (1.3.1)
-    less (1.2.21)
-      mutter (>= 0.4.2)
-      treetop (>= 1.4.2)
-    liquid (2.6.1)
-    makerakeworkwell (1.0.3)
-      rake (>= 0.9.2, < 11)
-    mutter (0.5.3)
-    polyglot (0.3.3)
-    rake (10.1.1)
-    treetop (1.4.15)
-      polyglot
-      polyglot (>= 0.3.1)
-    zenweb (3.3.1)
+    RedCloth (4.3.2)
+    coderay (1.1.3)
+    colorize (0.8.1)
+    commonjs (0.2.7)
+    kramdown (1.17.0)
+    less (2.6.0)
+      commonjs (~> 0.2.7)
+    liquid (4.0.3)
+    makerakeworkwell (1.0.4)
+      rake (>= 0.9.2, < 15)
+    rake (13.0.1)
+    zenweb (3.10.4)
      coderay (~> 1.0)
-      kramdown (~> 1.0)
-      less (~> 1.2)
+      kramdown (~> 1.4)
+      less (~> 2.0)
      makerakeworkwell (~> 1.0)
-      rake (>= 0.9, < 11)
+      rake (>= 0.9, < 15)

PLATFORMS
  ruby

DEPENDENCIES
  colorize
  liquid
  zenweb
+
+BUNDLED WITH
+   2.1.4
			*next[upd.UUID] = upd
		}
	}
-	selectParam := []string{"uuid", "state", "priority", "runtime_constraints", "container_image", "mounts", "scheduling_parameters"}
+	selectParam := []string{"uuid", "state", "priority", "runtime_constraints", "container_image", "mounts", "scheduling_parameters", "created_at"}
	limitParam := 1000
	mine, err := cq.fetchAll(arvados.ResourceListParams{
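
The new "created_at" entry in selectParam is what gives the dispatcher's queue
access to each container's CreatedAt, which the wait-time metrics below rely
on. As a standalone sketch (not the dispatcher's own code; it assumes the
SDK's NewClientFromEnv and RequestAndDecode entry points), a narrowed select
list keeps list responses small while still carrying the fields the caller
needs:

package main

import (
	"fmt"

	"git.arvados.org/arvados.git/sdk/go/arvados"
)

func main() {
	client := arvados.NewClientFromEnv()
	limit := 1000
	var page arvados.ContainerList
	// Only the fields named in Select are populated in the response;
	// created_at is the one needed to compute queue wait times.
	err := client.RequestAndDecode(&page, "GET", "arvados/v1/containers", nil, arvados.ResourceListParams{
		Select: []string{"uuid", "state", "priority", "created_at"},
		Limit:  &limit,
	})
	if err != nil {
		panic(err)
	}
	for _, ctr := range page.Items {
		fmt.Println(ctr.UUID, ctr.CreatedAt)
	}
}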
	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds{quantile="0.95"} [0-9.]*`)
	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_count [0-9]*`)
	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_count [0-9]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_count [0-9]*`)
+	c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_sum [0-9e+.]*`)
}
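
These regexps match the Prometheus text exposition format, in which a summary
metric S is exported as per-quantile samples plus S_sum and S_count lines.
Illustrative scrape output (values invented):

arvados_dispatchcloud_containers_time_from_queue_to_crunch_run_seconds{quantile="0.95"} 32.7
arvados_dispatchcloud_containers_time_from_queue_to_crunch_run_seconds_sum 1.4052e+06
arvados_dispatchcloud_containers_time_from_queue_to_crunch_run_seconds_count 42

The _sum pattern allows [0-9e+.] because large float64 sums can be rendered in
scientific notation.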

func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
	c.Check(pool.Running(), check.HasLen, 0)
}

-func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
+func (*SchedulerSuite) TestContainersMetrics(c *check.C) {
	ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
	queue := test.Queue{
		ChooseType: chooseType,
		Containers: []arvados.Container{
			{
-				UUID:     test.ContainerUUID(1),
-				Priority: 1,
-				State:    arvados.ContainerStateLocked,
+				UUID:      test.ContainerUUID(1),
+				Priority:  1,
+				State:     arvados.ContainerStateLocked,
+				CreatedAt: time.Now().Add(-10 * time.Second),
				RuntimeConstraints: arvados.RuntimeConstraints{
					VCPUs: 1,
					RAM:   1 << 30,
	}
	sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
	sch.runQueue()
+	sch.updateMetrics()
	c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1)
	c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0)
+	c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)

	// Create a pool without workers. The queued container will not be started, and the
	// 'over quota' metric will be 1 because no workers are available and canCreate defaults
	// to zero.
	pool = stubPool{}
	sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
	sch.runQueue()
+	sch.updateMetrics()
	c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0)
	c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1)
+	c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
+
+	// Reset the queue, and create a pool with an idle worker. The queued
+	// container will be started immediately and mLongestWaitTimeSinceQueue
+	// should be zero.
+	queue = test.Queue{
+		ChooseType: chooseType,
+		Containers: []arvados.Container{
+			{
+				UUID:      test.ContainerUUID(1),
+				Priority:  1,
+				State:     arvados.ContainerStateLocked,
+				CreatedAt: time.Now().Add(-10 * time.Second),
+				RuntimeConstraints: arvados.RuntimeConstraints{
+					VCPUs: 1,
+					RAM:   1 << 30,
+				},
+			},
+		},
+	}
+	queue.Update()
+
+	pool = stubPool{
+		idle:    map[arvados.InstanceType]int{test.InstanceType(1): 1},
+		unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+		running: map[string]time.Time{},
+	}
+	sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+	sch.runQueue()
+	sch.updateMetrics()
+
+	c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0)
}
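
A note on the assertion style: testutil.ToFloat64 only works for collectors
that expose a single sample, such as gauges and counters, which is why this
test reads the scheduler's gauges directly while the summary metrics above are
asserted against the scraped /metrics body instead. A minimal self-contained
sketch of the pattern (the metric name is hypothetical):

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	g := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "example_longest_wait_time_seconds", // hypothetical name
	})
	g.Set(10.2)
	// Truncating to int, as the checks above do, keeps the assertion
	// robust against sub-second jitter between time.Now() calls.
	fmt.Println(int(testutil.ToFloat64(g))) // prints 10
}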
"sync"
"time"
+ "git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
mContainersAllocatedNotStarted prometheus.Gauge
mContainersNotAllocatedOverQuota prometheus.Gauge
+ mLongestWaitTimeSinceQueue prometheus.Gauge
}

// New returns a new unstarted Scheduler.

		Help:      "Number of containers not allocated to a worker because the system has hit a quota.",
	})
	reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
+	sch.mLongestWaitTimeSinceQueue = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "arvados",
+		Subsystem: "dispatchcloud",
+		Name:      "containers_longest_wait_time_seconds",
+		Help:      "Current longest wait time of any container, from queuing until the start of crunch-run.",
+	})
+	reg.MustRegister(sch.mLongestWaitTimeSinceQueue)
+}
+
+func (sch *Scheduler) updateMetrics() {
+	earliest := time.Time{}
+	entries, _ := sch.queue.Entries()
+	running := sch.pool.Running()
+	for _, ent := range entries {
+		if ent.Container.Priority > 0 &&
+			(ent.Container.State == arvados.ContainerStateQueued || ent.Container.State == arvados.ContainerStateLocked) {
+			// Exclude containers that are preparing to run the payload
+			// (i.e., ContainerStateLocked and running on a worker, most
+			// likely loading the payload image).
+			if _, ok := running[ent.Container.UUID]; !ok {
+				if ent.Container.CreatedAt.Before(earliest) || earliest.IsZero() {
+					earliest = ent.Container.CreatedAt
+				}
+			}
+		}
+	}
+	if !earliest.IsZero() {
+		sch.mLongestWaitTimeSinceQueue.Set(time.Since(earliest).Seconds())
+	} else {
+		sch.mLongestWaitTimeSinceQueue.Set(0)
+	}
}
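
client_golang joins Namespace, Subsystem, and Name with underscores to build
the full metric name, so the gauge registered above appears in a scrape as
follows (value invented; the HELP line comes from the Help field):

# HELP arvados_dispatchcloud_containers_longest_wait_time_seconds Current longest wait time of any container, from queuing until the start of crunch-run.
# TYPE arvados_dispatchcloud_containers_longest_wait_time_seconds gauge
arvados_dispatchcloud_containers_longest_wait_time_seconds 10.03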

// Start starts the scheduler.

	for {
		sch.runQueue()
		sch.sync()
+		sch.updateMetrics()
		select {
		case <-sch.stop:
			return

	runnerMD5 [md5.Size]byte
	runnerCmd string

-	mContainersRunning       prometheus.Gauge
-	mInstances               *prometheus.GaugeVec
-	mInstancesPrice          *prometheus.GaugeVec
-	mVCPUs                   *prometheus.GaugeVec
-	mMemory                  *prometheus.GaugeVec
-	mBootOutcomes            *prometheus.CounterVec
-	mDisappearances          *prometheus.CounterVec
-	mTimeToSSH               prometheus.Summary
-	mTimeToReadyForContainer prometheus.Summary
+	mContainersRunning        prometheus.Gauge
+	mInstances                *prometheus.GaugeVec
+	mInstancesPrice           *prometheus.GaugeVec
+	mVCPUs                    *prometheus.GaugeVec
+	mMemory                   *prometheus.GaugeVec
+	mBootOutcomes             *prometheus.CounterVec
+	mDisappearances           *prometheus.CounterVec
+	mTimeToSSH                prometheus.Summary
+	mTimeToReadyForContainer  prometheus.Summary
+	mTimeFromShutdownToGone   prometheus.Summary
+	mTimeFromQueueToCrunchRun prometheus.Summary
}

type createCall struct {

		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
	})
	reg.MustRegister(wp.mTimeToReadyForContainer)
+	wp.mTimeFromShutdownToGone = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "instances_time_from_shutdown_request_to_disappearance_seconds",
+		Help:       "Number of seconds between the first shutdown attempt and the disappearance of the worker.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	})
+	reg.MustRegister(wp.mTimeFromShutdownToGone)
+	wp.mTimeFromQueueToCrunchRun = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  "arvados",
+		Subsystem:  "dispatchcloud",
+		Name:       "containers_time_from_queue_to_crunch_run_seconds",
+		Help:       "Number of seconds between the queuing of a container and the start of crunch-run.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+	})
+	reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
}
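
In client_golang, SummaryOpts.Objectives maps each tracked quantile to its
permitted absolute error: with the map used above, the reported 0.95 quantile
is guaranteed to fall between the true 0.945 and 0.955 quantiles. A standalone
sketch (the metric name is hypothetical):

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	s := prometheus.NewSummary(prometheus.SummaryOpts{
		Name: "example_duration_seconds", // hypothetical name
		// quantile -> allowed absolute error
		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
	})
	for i := 1; i <= 100; i++ {
		s.Observe(float64(i))
	}
	// A summary is a single metric carrying _sum, _count, and the
	// configured quantiles.
	fmt.Println(testutil.CollectAndCount(s)) // prints 1
}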

func (wp *Pool) runMetrics() {

		if wp.mDisappearances != nil {
			wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
		}
+		// wkr.destroyed is zero if the instance disappeared without us ever
+		// requesting shutdown, in which case there is no shutdown-to-gone
+		// interval to report.
+		if wp.mTimeFromShutdownToGone != nil && !wkr.destroyed.IsZero() {
+			wp.mTimeFromShutdownToGone.Observe(time.Since(wkr.destroyed).Seconds())
+		}
		delete(wp.workers, id)
		go wkr.Close()
		notify = true
	}

	go func() {
		rr.Start()
+		if wkr.wp.mTimeFromQueueToCrunchRun != nil {
+			wkr.wp.mTimeFromQueueToCrunchRun.Observe(time.Since(ctr.CreatedAt).Seconds())
+		}
		wkr.mtx.Lock()
		defer wkr.mtx.Unlock()
		now := time.Now()
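
Both observation sites guard Observe with a nil check, so the pool still works
when metrics were never registered (prometheus.Summary is an interface, and an
unconfigured metric field is simply nil). A sketch of the pattern with a
hypothetical helper:

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// observeElapsed is a hypothetical helper, not part of the worker pool: it
// records the elapsed seconds only if the summary has been registered.
func observeElapsed(s prometheus.Summary, since time.Time) {
	if s != nil {
		s.Observe(time.Since(since).Seconds())
	}
}

func main() {
	var unregistered prometheus.Summary // nil: metrics disabled
	observeElapsed(unregistered, time.Now().Add(-10 * time.Second)) // no-op
	fmt.Println("ok")
}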