replacing.nice = n
newq[uuid] = replacing
- if state == "PENDING" && reason == "BadConstraints" && p == 0 && replacing.wantPriority > 0 {
+ if state == "PENDING" && ((reason == "BadConstraints" && p == 0) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
// When using SLURM 14.x or 15.x, our queued
// jobs land in this state when "scontrol
// reconfigure" invalidates their feature
// constraints. Releasing a job is harmless
// whether or not the features have
// reappeared, so rather than second-guessing
// whether SLURM is ready, we just keep trying
// this until it works.
+ //
+ // "launch failed requeued held" seems to be
+ // another manifestation of this problem,
+ // resolved the same way.
+ log.Printf("releasing held job %q", uuid)
sqc.Slurm.Release(uuid)
+ } else if p < 1<<20 && replacing.wantPriority > 0 {
+ log.Printf("warning: job %q has low priority %d, nice %d, state %q, reason %q", uuid, p, n, state, reason)
}
}
sqc.queue = newq