X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/ca2d946973b6ae25dd594ddecec54e02b83bc44e..4529d84afb3549ccb4ae9005a8f64f558c2bbe5c:/lib/dispatchcloud/scheduler/sync.go

diff --git a/lib/dispatchcloud/scheduler/sync.go b/lib/dispatchcloud/scheduler/sync.go
index 23fc621dea..4d601d6ae8 100644
--- a/lib/dispatchcloud/scheduler/sync.go
+++ b/lib/dispatchcloud/scheduler/sync.go
@@ -7,11 +7,14 @@ package scheduler
 import (
 	"fmt"
 
-	"git.curoverse.com/arvados.git/lib/dispatchcloud/container"
-	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"git.arvados.org/arvados.git/lib/dispatchcloud/container"
+	"git.arvados.org/arvados.git/lib/dispatchcloud/worker"
+	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"github.com/sirupsen/logrus"
 )
 
+var reportedUnexpectedState = false
+
 // sync resolves discrepancies between the queue and the pool:
 //
 // Lingering crunch-run processes for finalized and unlocked/requeued
@@ -23,6 +26,7 @@ import (
 // Running containers whose crunch-run processes have exited are
 // cancelled.
 func (sch *Scheduler) sync() {
+	anyUnknownWorkers := sch.pool.CountWorkers()[worker.StateUnknown] > 0
 	running := sch.pool.Running()
 	qEntries, qUpdated := sch.queue.Entries()
 	for uuid, ent := range qEntries {
@@ -30,11 +34,13 @@ func (sch *Scheduler) sync() {
 		switch ent.Container.State {
 		case arvados.ContainerStateRunning:
 			if !running {
-				go sch.cancel(ent, "not running on any worker")
+				if !anyUnknownWorkers {
+					go sch.cancel(uuid, "not running on any worker")
+				}
 			} else if !exited.IsZero() && qUpdated.After(exited) {
-				go sch.cancel(ent, "state=\"Running\" after crunch-run exited")
+				go sch.cancel(uuid, "state=Running after crunch-run exited")
 			} else if ent.Container.Priority == 0 {
-				go sch.kill(ent, "priority=0")
+				go sch.kill(uuid, "priority=0")
 			}
 		case arvados.ContainerStateComplete, arvados.ContainerStateCancelled:
 			if running {
@@ -46,12 +52,12 @@ func (sch *Scheduler) sync() {
 				// of kill() will be to make the
 				// worker available for the next
 				// container.
-				go sch.kill(ent, fmt.Sprintf("state=%q", ent.Container.State))
+				go sch.kill(uuid, fmt.Sprintf("state=%s", ent.Container.State))
 			} else {
 				sch.logger.WithFields(logrus.Fields{
 					"ContainerUUID": uuid,
 					"State":         ent.Container.State,
-				}).Info("container finished")
+				}).Info("container finished -- dropping from queue")
 				sch.queue.Forget(uuid)
 			}
 		case arvados.ContainerStateQueued:
@@ -60,27 +66,41 @@ func (sch *Scheduler) sync() {
 				// a network outage and is still
 				// preparing to run a container that
 				// has already been unlocked/requeued.
-				go sch.kill(ent, fmt.Sprintf("state=%q", ent.Container.State))
+				go sch.kill(uuid, fmt.Sprintf("pool says running, but queue says state=%s", ent.Container.State))
+			} else if ent.Container.Priority == 0 {
+				sch.logger.WithFields(logrus.Fields{
+					"ContainerUUID": uuid,
+					"State":         ent.Container.State,
+					"Priority":      ent.Container.Priority,
+				}).Info("container on hold -- dropping from queue")
+				sch.queue.Forget(uuid)
 			}
 		case arvados.ContainerStateLocked:
 			if running && !exited.IsZero() && qUpdated.After(exited) {
 				go sch.requeue(ent, "crunch-run exited")
 			} else if running && exited.IsZero() && ent.Container.Priority == 0 {
-				go sch.kill(ent, "priority=0")
+				go sch.kill(uuid, "priority=0")
 			} else if !running && ent.Container.Priority == 0 {
 				go sch.requeue(ent, "priority=0")
 			}
 		default:
-			sch.logger.WithFields(logrus.Fields{
-				"ContainerUUID": uuid,
-				"State":         ent.Container.State,
-			}).Error("BUG: unexpected state")
+			if !reportedUnexpectedState {
+				sch.logger.WithFields(logrus.Fields{
+					"ContainerUUID": uuid,
+					"State":         ent.Container.State,
+				}).Error("BUG: unexpected state")
+				reportedUnexpectedState = true
+			}
+		}
+	}
+	for uuid := range running {
+		if _, known := qEntries[uuid]; !known {
+			go sch.kill(uuid, "not in queue")
 		}
 	}
 }
 
-func (sch *Scheduler) cancel(ent container.QueueEnt, reason string) {
-	uuid := ent.Container.UUID
+func (sch *Scheduler) cancel(uuid string, reason string) {
 	if !sch.uuidLock(uuid, "cancel") {
 		return
 	}
@@ -93,16 +113,22 @@ func (sch *Scheduler) cancel(ent container.QueueEnt, reason string) {
 	}
 }
 
-func (sch *Scheduler) kill(ent container.QueueEnt, reason string) {
-	uuid := ent.Container.UUID
-	logger := sch.logger.WithField("ContainerUUID", uuid)
-	logger.Debugf("killing crunch-run process because %s", reason)
-	sch.pool.KillContainer(uuid)
+func (sch *Scheduler) kill(uuid string, reason string) {
+	if !sch.uuidLock(uuid, "kill") {
+		return
+	}
+	defer sch.uuidUnlock(uuid)
+	sch.logger.WithFields(logrus.Fields{
+		"ContainerUUID": uuid,
+		"reason":        reason,
+	}).Debug("kill")
+	sch.pool.KillContainer(uuid, reason)
+	sch.pool.ForgetContainer(uuid)
 }
 
 func (sch *Scheduler) requeue(ent container.QueueEnt, reason string) {
 	uuid := ent.Container.UUID
-	if !sch.uuidLock(uuid, "cancel") {
+	if !sch.uuidLock(uuid, "requeue") {
 		return
 	}
 	defer sch.uuidUnlock(uuid)
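For readers outside the diff context, here is a minimal, self-contained Go sketch of the reconciliation pattern this patch extends: compare the queue's view of each container with the set of live crunch-run processes, and kill or cancel where the two disagree. The reconcile function, queueEntry type, container IDs, and returned action strings are hypothetical stand-ins for illustration only; they are not the real Scheduler, container.Queue, or worker.Pool APIs from the Arvados source.

// Hypothetical stand-in types -- not the Arvados scheduler's real interfaces.
package main

import "fmt"

type State string

const (
	Running   State = "Running"
	Complete  State = "Complete"
	Cancelled State = "Cancelled"
)

type queueEntry struct {
	State    State
	Priority int
}

// reconcile compares the dispatcher's queue view with the set of container
// IDs that still have a live crunch-run process, and returns the actions a
// scheduler pass would take. It mirrors two rules from the patch: finalized
// containers with a lingering process get killed, and live processes with no
// queue entry at all get killed ("not in queue").
func reconcile(queue map[string]queueEntry, running map[string]bool) []string {
	var actions []string
	for id, ent := range queue {
		switch ent.State {
		case Running:
			if !running[id] {
				actions = append(actions, "cancel "+id+": not running on any worker")
			} else if ent.Priority == 0 {
				actions = append(actions, "kill "+id+": priority=0")
			}
		case Complete, Cancelled:
			if running[id] {
				actions = append(actions, fmt.Sprintf("kill %s: state=%s", id, ent.State))
			}
		}
	}
	// The rule added at the end of sync() in the patch: anything still
	// running that the queue no longer knows about gets killed.
	for id := range running {
		if _, known := queue[id]; !known {
			actions = append(actions, "kill "+id+": not in queue")
		}
	}
	return actions
}

func main() {
	queue := map[string]queueEntry{
		"container-A": {State: Running, Priority: 1},  // healthy: running and wanted
		"container-B": {State: Complete, Priority: 1}, // finished, but its process lingers
	}
	running := map[string]bool{
		"container-A": true,
		"container-B": true,
		"container-C": true, // live process with no queue entry
	}
	for _, a := range reconcile(queue, running) {
		fmt.Println(a)
	}
}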