"fmt"
"git.arvados.org/arvados.git/lib/dispatchcloud/container"
+ "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
"git.arvados.org/arvados.git/sdk/go/arvados"
"github.com/sirupsen/logrus"
)
// Running containers whose crunch-run processes have exited are
// cancelled.
func (sch *Scheduler) sync() {
+ anyUnknownWorkers := sch.pool.CountWorkers()[worker.StateUnknown] > 0
running := sch.pool.Running()
qEntries, qUpdated := sch.queue.Entries()
for uuid, ent := range qEntries {
switch ent.Container.State {
case arvados.ContainerStateRunning:
if !running {
- go sch.cancel(uuid, "not running on any worker")
+ if !anyUnknownWorkers {
+ go sch.cancel(uuid, "not running on any worker")
+ }
} else if !exited.IsZero() && qUpdated.After(exited) {
go sch.cancel(uuid, "state=Running after crunch-run exited")
} else if ent.Container.Priority == 0 {
}
func (sch *Scheduler) kill(uuid string, reason string) {
+ if !sch.uuidLock(uuid, "kill") {
+ return
+ }
+ defer sch.uuidUnlock(uuid)
sch.pool.KillContainer(uuid, reason)
sch.pool.ForgetContainer(uuid)
}
func (sch *Scheduler) requeue(ent container.QueueEnt, reason string) {
uuid := ent.Container.UUID
- if !sch.uuidLock(uuid, "cancel") {
+ if !sch.uuidLock(uuid, "requeue") {
return
}
defer sch.uuidUnlock(uuid)