From 3aa3fb78afa46e98c9be345045f4fea9fea0f08c Mon Sep 17 00:00:00 2001
From: Tom Clegg
Date: Tue, 4 Aug 2020 16:06:29 -0400
Subject: [PATCH] 16663: Don't kill orphaned containers when unprobed nodes exist.

Arvados-DCO-1.1-Signed-off-by: Tom Clegg
---
 lib/dispatchcloud/scheduler/sync.go | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go
index de69df9822..116ca76431 100644
--- a/lib/dispatchcloud/scheduler/sync.go
+++ b/lib/dispatchcloud/scheduler/sync.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 
 	"git.arvados.org/arvados.git/lib/dispatchcloud/container"
+	"git.arvados.org/arvados.git/lib/dispatchcloud/worker"
 	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"github.com/sirupsen/logrus"
 )
@@ -23,6 +24,7 @@ import (
 // Running containers whose crunch-run processes have exited are
 // cancelled.
 func (sch *Scheduler) sync() {
+	anyUnknownWorkers := sch.pool.CountWorkers()[worker.StateUnknown] > 0
 	running := sch.pool.Running()
 	qEntries, qUpdated := sch.queue.Entries()
 	for uuid, ent := range qEntries {
@@ -30,7 +32,9 @@ func (sch *Scheduler) sync() {
 		switch ent.Container.State {
 		case arvados.ContainerStateRunning:
 			if !running {
-				go sch.cancel(uuid, "not running on any worker")
+				if !anyUnknownWorkers {
+					go sch.cancel(uuid, "not running on any worker")
+				}
 			} else if !exited.IsZero() && qUpdated.After(exited) {
 				go sch.cancel(uuid, "state=Running after crunch-run exited")
 			} else if ent.Container.Priority == 0 {
-- 
2.39.5