projects
/
arvados.git
/ blobdiff
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
12061: Change busywait approach to preserve assertion messages.
[arvados.git]
/
services
/
crunch-dispatch-slurm
/
squeue.go
diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go
index ee79c6f774c1ca4cb277f1c356ebca792d790f49..742943f197580e186e7fd1f7b8084a1357f3661d 100644
(file)
--- a/services/crunch-dispatch-slurm/squeue.go
+++ b/services/crunch-dispatch-slurm/squeue.go
@@ -157,7 +157,7 @@ func (sqc *SqueueChecker) check() {
replacing.nice = n
newq[uuid] = replacing
replacing.nice = n
newq[uuid] = replacing
- if state == "PENDING" && reason == "BadConstraints" && p == 0 && replacing.wantPriority > 0 {
+ if state == "PENDING" && ((reason == "BadConstraints" && p == 0) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
// When using SLURM 14.x or 15.x, our queued
// jobs land in this state when "scontrol
// reconfigure" invalidates their feature
// When using SLURM 14.x or 15.x, our queued
// jobs land in this state when "scontrol
// reconfigure" invalidates their feature
@@ -171,7 +171,14 @@ func (sqc *SqueueChecker) check() {
// reappeared, so rather than second-guessing
// whether SLURM is ready, we just keep trying
// this until it works.
// reappeared, so rather than second-guessing
// whether SLURM is ready, we just keep trying
// this until it works.
+ //
+ // "launch failed requeued held" seems to be
+ // another manifestation of this problem,
+ // resolved the same way.
+ log.Printf("releasing held job %q", uuid)
sqc.Slurm.Release(uuid)
sqc.Slurm.Release(uuid)
+ } else if p < 1<<20 && replacing.wantPriority > 0 {
+ log.Printf("warning: job %q has low priority %d, nice %d, state %q, reason %q", uuid, p, n, state, reason)
}
}
sqc.queue = newq
}
}
sqc.queue = newq