X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/2c437234e2b72d0249d45f0ed3498bf4f1fa8f99..f4aa4dbbefe8b6dd65e3a112642da288774cf951:/services/crunch-dispatch-slurm/squeue.go

diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index 9514da822b..742943f197 100644
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -157,7 +157,7 @@ func (sqc *SqueueChecker) check() {
 		replacing.nice = n
 		newq[uuid] = replacing
 
-		if state == "PENDING" && reason == "BadConstraints" && p == 0 && replacing.wantPriority > 0 {
+		if state == "PENDING" && ((reason == "BadConstraints" && p == 0) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
 			// When using SLURM 14.x or 15.x, our queued
 			// jobs land in this state when "scontrol
 			// reconfigure" invalidates their feature
@@ -171,6 +171,10 @@ func (sqc *SqueueChecker) check() {
 			// reappeared, so rather than second-guessing
 			// whether SLURM is ready, we just keep trying
 			// this until it works.
+			//
+			// "launch failed requeued held" seems to be
+			// another manifestation of this problem,
+			// resolved the same way.
 			log.Printf("releasing held job %q", uuid)
 			sqc.Slurm.Release(uuid)
 		} else if p < 1<<20 && replacing.wantPriority > 0 {
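
The sketch below is a minimal, standalone illustration (not the dispatcher's actual code) of the release condition this diff introduces: a PENDING job that the dispatcher still wants running is released from hold when it is stuck with "BadConstraints" at priority 0, or with the "launch failed requeued held" reason. The function and variable names here are illustrative only.

```go
package main

import "fmt"

// shouldRelease mirrors the condition added in check(): release a held
// job if it is PENDING, we still want it to run (wantPriority > 0), and
// SLURM reports one of the two "stuck" reasons handled by this change.
// Names are hypothetical; the real code works on squeue output fields.
func shouldRelease(state, reason string, priority, wantPriority int64) bool {
	if state != "PENDING" || wantPriority <= 0 {
		return false
	}
	return (reason == "BadConstraints" && priority == 0) ||
		reason == "launch failed requeued held"
}

func main() {
	fmt.Println(shouldRelease("PENDING", "BadConstraints", 0, 10))              // true
	fmt.Println(shouldRelease("PENDING", "launch failed requeued held", 5, 10)) // true
	fmt.Println(shouldRelease("PENDING", "BadConstraints", 3, 10))              // false: nonzero priority
	fmt.Println(shouldRelease("RUNNING", "BadConstraints", 0, 10))              // false: not pending
}
```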