X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/14a6eb786a0d01e86ccae7645e880661caf6f0cb..3aaefcb3c76ff470b475d950398d01255e87712a:/lib/dispatchcloud/scheduler/sync.go?ds=sidebyside diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go index e306db00ce..fc683505f9 100644 --- a/lib/dispatchcloud/scheduler/sync.go +++ b/lib/dispatchcloud/scheduler/sync.go @@ -7,8 +7,9 @@ package scheduler import ( "fmt" - "git.curoverse.com/arvados.git/lib/dispatchcloud/container" - "git.curoverse.com/arvados.git/sdk/go/arvados" + "git.arvados.org/arvados.git/lib/dispatchcloud/container" + "git.arvados.org/arvados.git/lib/dispatchcloud/worker" + "git.arvados.org/arvados.git/sdk/go/arvados" "github.com/sirupsen/logrus" ) @@ -23,6 +24,7 @@ import ( // Running containers whose crunch-run processes have exited are // cancelled. func (sch *Scheduler) sync() { + anyUnknownWorkers := sch.pool.CountWorkers()[worker.StateUnknown] > 0 running := sch.pool.Running() qEntries, qUpdated := sch.queue.Entries() for uuid, ent := range qEntries { @@ -30,7 +32,9 @@ func (sch *Scheduler) sync() { switch ent.Container.State { case arvados.ContainerStateRunning: if !running { - go sch.cancel(uuid, "not running on any worker") + if !anyUnknownWorkers { + go sch.cancel(uuid, "not running on any worker") + } } else if !exited.IsZero() && qUpdated.After(exited) { go sch.cancel(uuid, "state=Running after crunch-run exited") } else if ent.Container.Priority == 0 { @@ -105,13 +109,17 @@ func (sch *Scheduler) cancel(uuid string, reason string) { } func (sch *Scheduler) kill(uuid string, reason string) { + if !sch.uuidLock(uuid, "kill") { + return + } + defer sch.uuidUnlock(uuid) sch.pool.KillContainer(uuid, reason) sch.pool.ForgetContainer(uuid) } func (sch *Scheduler) requeue(ent container.QueueEnt, reason string) { uuid := ent.Container.UUID - if !sch.uuidLock(uuid, "cancel") { + if !sch.uuidLock(uuid, "requeue") { return } defer sch.uuidUnlock(uuid)