From 8668fd5770e7512857821007fba8babf90117ffd Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Wed, 2 Oct 2024 13:13:56 -0400 Subject: [PATCH] 22132: Revise the messages some more and update tests. Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- lib/dispatchcloud/dispatcher_test.go | 36 ++++++++++++------------ lib/dispatchcloud/scheduler/run_queue.go | 29 +++++++++++-------- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go index d651e73a67..032abb277a 100644 --- a/lib/dispatchcloud/dispatcher_test.go +++ b/lib/dispatchcloud/dispatcher_test.go @@ -434,24 +434,24 @@ func (s *DispatcherSuite) TestManagementAPI_Containers(c *check.C) { expect := ` 0 zzzzz-dz642-000000000000000 (Running) "" 1 zzzzz-dz642-000000000000001 (Running) "" - 2 zzzzz-dz642-000000000000002 (Locked) "waiting for suitable instance type to become available: queue position 1" - 3 zzzzz-dz642-000000000000003 (Locked) "waiting for suitable instance type to become available: queue position 2" - 4 zzzzz-dz642-000000000000004 (Queued) "waiting while cluster is running at capacity: queue position 3" - 5 zzzzz-dz642-000000000000005 (Queued) "waiting while cluster is running at capacity: queue position 4" - 6 zzzzz-dz642-000000000000006 (Queued) "waiting while cluster is running at capacity: queue position 5" - 7 zzzzz-dz642-000000000000007 (Queued) "waiting while cluster is running at capacity: queue position 6" - 8 zzzzz-dz642-000000000000008 (Queued) "waiting while cluster is running at capacity: queue position 7" - 9 zzzzz-dz642-000000000000009 (Queued) "waiting while cluster is running at capacity: queue position 8" - 10 zzzzz-dz642-000000000000010 (Queued) "waiting while cluster is running at capacity: queue position 9" - 11 zzzzz-dz642-000000000000011 (Queued) "waiting while cluster is running at capacity: queue position 10" - 12 zzzzz-dz642-000000000000012 (Queued) "waiting while cluster is running at capacity: queue position 11" - 13 zzzzz-dz642-000000000000013 (Queued) "waiting while cluster is running at capacity: queue position 12" - 14 zzzzz-dz642-000000000000014 (Queued) "waiting while cluster is running at capacity: queue position 13" - 15 zzzzz-dz642-000000000000015 (Queued) "waiting while cluster is running at capacity: queue position 14" - 16 zzzzz-dz642-000000000000016 (Queued) "waiting while cluster is running at capacity: queue position 15" - 17 zzzzz-dz642-000000000000017 (Queued) "waiting while cluster is running at capacity: queue position 16" - 18 zzzzz-dz642-000000000000018 (Queued) "waiting while cluster is running at capacity: queue position 17" - 19 zzzzz-dz642-000000000000019 (Queued) "waiting while cluster is running at capacity: queue position 18" + 2 zzzzz-dz642-000000000000002 (Locked) "Waiting in container queue at position 1. Cluster is at capacity for all the eligible instance types (type4, type6) and cannot start a new instance right now." + 3 zzzzz-dz642-000000000000003 (Locked) "Waiting in container queue at position 2. Cluster is at capacity for all the eligible instance types (type4, type6) and cannot start a new instance right now." + 4 zzzzz-dz642-000000000000004 (Queued) "Waiting in container queue at position 3. Cluster is at capacity and cannot start any new instances right now." + 5 zzzzz-dz642-000000000000005 (Queued) "Waiting in container queue at position 4. Cluster is at capacity and cannot start any new instances right now." + 6 zzzzz-dz642-000000000000006 (Queued) "Waiting in container queue at position 5. Cluster is at capacity and cannot start any new instances right now." + 7 zzzzz-dz642-000000000000007 (Queued) "Waiting in container queue at position 6. Cluster is at capacity and cannot start any new instances right now." + 8 zzzzz-dz642-000000000000008 (Queued) "Waiting in container queue at position 7. Cluster is at capacity and cannot start any new instances right now." + 9 zzzzz-dz642-000000000000009 (Queued) "Waiting in container queue at position 8. Cluster is at capacity and cannot start any new instances right now." + 10 zzzzz-dz642-000000000000010 (Queued) "Waiting in container queue at position 9. Cluster is at capacity and cannot start any new instances right now." + 11 zzzzz-dz642-000000000000011 (Queued) "Waiting in container queue at position 10. Cluster is at capacity and cannot start any new instances right now." + 12 zzzzz-dz642-000000000000012 (Queued) "Waiting in container queue at position 11. Cluster is at capacity and cannot start any new instances right now." + 13 zzzzz-dz642-000000000000013 (Queued) "Waiting in container queue at position 12. Cluster is at capacity and cannot start any new instances right now." + 14 zzzzz-dz642-000000000000014 (Queued) "Waiting in container queue at position 13. Cluster is at capacity and cannot start any new instances right now." + 15 zzzzz-dz642-000000000000015 (Queued) "Waiting in container queue at position 14. Cluster is at capacity and cannot start any new instances right now." + 16 zzzzz-dz642-000000000000016 (Queued) "Waiting in container queue at position 15. Cluster is at capacity and cannot start any new instances right now." + 17 zzzzz-dz642-000000000000017 (Queued) "Waiting in container queue at position 16. Cluster is at capacity and cannot start any new instances right now." + 18 zzzzz-dz642-000000000000018 (Queued) "Waiting in container queue at position 17. Cluster is at capacity and cannot start any new instances right now." + 19 zzzzz-dz642-000000000000019 (Queued) "Waiting in container queue at position 18. Cluster is at capacity and cannot start any new instances right now." ` sequence := make(map[string][]string) var summary string diff --git a/lib/dispatchcloud/scheduler/run_queue.go b/lib/dispatchcloud/scheduler/run_queue.go index 0979f9c83d..c881edcbab 100644 --- a/lib/dispatchcloud/scheduler/run_queue.go +++ b/lib/dispatchcloud/scheduler/run_queue.go @@ -7,6 +7,7 @@ package scheduler import ( "fmt" "sort" + "strings" "time" "git.arvados.org/arvados.git/lib/dispatchcloud/container" @@ -25,14 +26,14 @@ type QueueEnt struct { } const ( - schedStatusPreparingRuntimeEnvironment = "An instance has been allocated and Crunch is now preparing to run the container." - schedStatusPriorityZero = "This container will not be scheduled because its priority is 0 and state is %v." - schedStatusSupervisorLimitReached = "The cluster is at capacity, this workflow has position %v in the workflow queue." - schedStatusWaitingForPreviousAttempt = "waiting for previous attempt to exit" - schedStatusWaitingNewInstance = "waiting for new instance to be ready" - schedStatusWaitingInstanceType = "waiting for suitable instance type to become available" // ": queue position X" appended at runtime - schedStatusWaitingCloudResources = "waiting for cloud resources" - schedStatusWaitingClusterCapacity = "waiting while cluster is running at capacity" // ": queue position X" appended at runtime + schedStatusPreparingRuntimeEnvironment = "Container is allocated to an instance and preparing to run." + schedStatusPriorityZero = "This container will not be scheduled to run because its priority is 0 and state is %v." + schedStatusSupervisorLimitReached = "Waiting in workflow queue at position %v. Cluster is at capacity and cannot start any new workflows right now." + schedStatusWaitingForPreviousAttempt = "Waiting for previous container attempt to exit." + schedStatusWaitingNewInstance = "Waiting for a %v instance to boot and be ready to accept work." + schedStatusWaitingInstanceType = "Waiting in container queue at position %v. Cluster is at capacity for all the eligible instance types (%v) and cannot start a new instance right now." + schedStatusWaitingCloudResources = "Waiting in container queue at position %v. Cluster is at cloud account limits and cannot start any new instances right now." + schedStatusWaitingClusterCapacity = "Waiting in container queue at position %v. Cluster is at capacity and cannot start any new instances right now." ) // Queue returns the sorted queue from the last scheduling iteration. @@ -287,7 +288,7 @@ tryrun: sorted[i].SchedulingStatus = schedStatusPreparingRuntimeEnvironment logger.Trace("StartContainer => true") } else { - sorted[i].SchedulingStatus = schedStatusWaitingNewInstance + sorted[i].SchedulingStatus = fmt.Sprintf(schedStatusWaitingNewInstance, unallocType.Name) logger.Trace("StartContainer => false") containerAllocatedWorkerBootingCount += 1 dontstart[unallocType] = true @@ -318,7 +319,11 @@ tryrun: // runQueue(), rather than run // container B now. qpos++ - sorted[i].SchedulingStatus = schedStatusWaitingInstanceType + fmt.Sprintf(": queue position %d", qpos) + var typenames []string + for _, tp := range types { + typenames = append(typenames, tp.Name) + } + sorted[i].SchedulingStatus = fmt.Sprintf(schedStatusWaitingInstanceType, qpos, strings.Join(typenames, ", ")) logger.Trace("all eligible types at capacity") continue } @@ -333,7 +338,7 @@ tryrun: // asynchronously and does its own logging // about the eventual outcome, so we don't // need to.) - sorted[i].SchedulingStatus = schedStatusWaitingNewInstance + sorted[i].SchedulingStatus = fmt.Sprintf(schedStatusWaitingNewInstance, availableType.Name) logger.Info("creating new instance") // Don't bother trying to start the container // yet -- obviously the instance will take @@ -355,7 +360,7 @@ tryrun: for i, ent := range sorted { if ent.SchedulingStatus == "" && (ent.Container.State == arvados.ContainerStateQueued || ent.Container.State == arvados.ContainerStateLocked) { qpos++ - sorted[i].SchedulingStatus = fmt.Sprintf("%s: queue position %d", qreason, qpos) + sorted[i].SchedulingStatus = fmt.Sprintf(qreason, qpos) } } sch.lastQueue.Store(sorted) -- 2.30.2