16663: Don't kill orphaned containers when unprobed nodes exist.
author Tom Clegg <tom@tomclegg.ca>
Tue, 4 Aug 2020 20:06:29 +0000 (16:06 -0400)
committer Tom Clegg <tom@tomclegg.ca>
Tue, 4 Aug 2020 20:06:29 +0000 (16:06 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

lib/dispatchcloud/scheduler/sync.go

index de69df98227e624fc29ef8e55884e8457db29592..116ca7643117d3f4df3b6e8d4e99864a44d6dfe6 100644 (file)
@@ -8,6 +8,7 @@ import (
        "fmt"
 
        "git.arvados.org/arvados.git/lib/dispatchcloud/container"
+       "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "github.com/sirupsen/logrus"
 )
@@ -23,6 +24,7 @@ import (
 // Running containers whose crunch-run processes have exited are
 // cancelled.
 func (sch *Scheduler) sync() {
+       anyUnknownWorkers := sch.pool.CountWorkers()[worker.StateUnknown] > 0
        running := sch.pool.Running()
        qEntries, qUpdated := sch.queue.Entries()
        for uuid, ent := range qEntries {
@@ -30,7 +32,9 @@ func (sch *Scheduler) sync() {
                switch ent.Container.State {
                case arvados.ContainerStateRunning:
                        if !running {
-                               go sch.cancel(uuid, "not running on any worker")
+                               if !anyUnknownWorkers {
+                                       go sch.cancel(uuid, "not running on any worker")
+                               }
                        } else if !exited.IsZero() && qUpdated.After(exited) {
                                go sch.cancel(uuid, "state=Running after crunch-run exited")
                        } else if ent.Container.Priority == 0 {