X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/4b563772c0dcbca24c75c0fe5aafab1009b0e3c7..f4aa4dbbefe8b6dd65e3a112642da288774cf951:/services/crunch-dispatch-slurm/squeue.go?ds=sidebyside diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go index ee79c6f774..742943f197 100644 --- a/services/crunch-dispatch-slurm/squeue.go +++ b/services/crunch-dispatch-slurm/squeue.go @@ -157,7 +157,7 @@ func (sqc *SqueueChecker) check() { replacing.nice = n newq[uuid] = replacing - if state == "PENDING" && reason == "BadConstraints" && p == 0 && replacing.wantPriority > 0 { + if state == "PENDING" && ((reason == "BadConstraints" && p == 0) || reason == "launch failed requeued held") && replacing.wantPriority > 0 { // When using SLURM 14.x or 15.x, our queued // jobs land in this state when "scontrol // reconfigure" invalidates their feature @@ -171,7 +171,14 @@ func (sqc *SqueueChecker) check() { // reappeared, so rather than second-guessing // whether SLURM is ready, we just keep trying // this until it works. + // + // "launch failed requeued held" seems to be + // another manifestation of this problem, + // resolved the same way. + log.Printf("releasing held job %q", uuid) sqc.Slurm.Release(uuid) + } else if p < 1<<20 && replacing.wantPriority > 0 { + log.Printf("warning: job %q has low priority %d, nice %d, state %q, reason %q", uuid, p, n, state, reason) } } sqc.queue = newq