X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/6d1ebf894a02151f751686003dc67ed4788d6c10..1bff2ab0181be31492c53351afc1c3c1e58ea05d:/services/crunch-dispatch-slurm/squeue.go diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go index 365b4e8570..742943f197 100644 --- a/services/crunch-dispatch-slurm/squeue.go +++ b/services/crunch-dispatch-slurm/squeue.go @@ -91,7 +91,15 @@ func (sqc *SqueueChecker) reniceAll() { } sort.Slice(jobs, func(i, j int) bool { - return jobs[i].wantPriority > jobs[j].wantPriority + if jobs[i].wantPriority != jobs[j].wantPriority { + return jobs[i].wantPriority > jobs[j].wantPriority + } else { + // break ties with container uuid -- + // otherwise, the ordering would change from + // one interval to the next, and we'd do many + // pointless slurm queue rearrangements. + return jobs[i].uuid > jobs[j].uuid + } }) renice := wantNice(jobs, sqc.PrioritySpread) for i, job := range jobs { @@ -149,7 +157,7 @@ func (sqc *SqueueChecker) check() { replacing.nice = n newq[uuid] = replacing - if state == "PENDING" && reason == "BadConstraints" && p == 0 && replacing.wantPriority > 0 { + if state == "PENDING" && ((reason == "BadConstraints" && p == 0) || reason == "launch failed requeued held") && replacing.wantPriority > 0 { // When using SLURM 14.x or 15.x, our queued // jobs land in this state when "scontrol // reconfigure" invalidates their feature @@ -163,7 +171,14 @@ func (sqc *SqueueChecker) check() { // reappeared, so rather than second-guessing // whether SLURM is ready, we just keep trying // this until it works. + // + // "launch failed requeued held" seems to be + // another manifestation of this problem, + // resolved the same way. + log.Printf("releasing held job %q", uuid) sqc.Slurm.Release(uuid) + } else if p < 1<<20 && replacing.wantPriority > 0 { + log.Printf("warning: job %q has low priority %d, nice %d, state %q, reason %q", uuid, p, n, state, reason) } } sqc.queue = newq