16723: Reject invalid updates in API stub.
authorTom Clegg <tom@tomclegg.ca>
Fri, 21 Aug 2020 02:21:14 +0000 (22:21 -0400)
committerTom Clegg <tom@tomclegg.ca>
Fri, 21 Aug 2020 13:33:38 +0000 (09:33 -0400)
When dispatcher notices crunch-run has exited and the last known state
is Locked, it requeues the container. If crunch-run changed the
container state to Running before exiting (and dispatcher hasn't
noticed yet), dispatcher relies on RailsAPI/controller to reject the
requeue attempt.

Without this, the scheduler's state=Queued call was being accepted
even after losing a race to the stub's state=Running call,
occasionally causing a container to run twice and fail the randomized
simulation test.

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/test/queue.go

index aa5f22a501331a2bdd108878987e25b133df720b..3b45e75fd063f76f4bbe5f96b37025c738e8aad2 100644 (file)
@@ -115,6 +115,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
                ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) {
                        return ChooseInstanceType(s.cluster, ctr)
                },
+               Logger: ctxlog.TestLogger(c),
        }
        for i := 0; i < 200; i++ {
                queue.Containers = append(queue.Containers, arvados.Container{
index 11d410fb1b9a931b8b65cb990aea1298babf7269..74b84122f286d912d8f8ef392e3eb860e6b1831d 100644 (file)
@@ -11,6 +11,7 @@ import (
 
        "git.arvados.org/arvados.git/lib/dispatchcloud/container"
        "git.arvados.org/arvados.git/sdk/go/arvados"
+       "github.com/sirupsen/logrus"
 )
 
 // Queue is a test stub for container.Queue. The caller specifies the
@@ -23,6 +24,8 @@ type Queue struct {
        // must not be nil.
        ChooseType func(*arvados.Container) (arvados.InstanceType, error)
 
+       Logger logrus.FieldLogger
+
        entries     map[string]container.QueueEnt
        updTime     time.Time
        subscribers map[<-chan struct{}]chan struct{}
@@ -166,13 +169,36 @@ func (q *Queue) Notify(upd arvados.Container) bool {
        defer q.mtx.Unlock()
        for i, ctr := range q.Containers {
                if ctr.UUID == upd.UUID {
-                       if ctr.State != arvados.ContainerStateComplete && ctr.State != arvados.ContainerStateCancelled {
+                       if allowContainerUpdate[ctr.State][upd.State] {
                                q.Containers[i] = upd
                                return true
+                       } else {
+                               if q.Logger != nil {
+                                       q.Logger.WithField("ContainerUUID", ctr.UUID).Infof("test.Queue rejected update from %s to %s", ctr.State, upd.State)
+                               }
+                               return false
                        }
-                       return false
                }
        }
        q.Containers = append(q.Containers, upd)
        return true
 }
+
+var allowContainerUpdate = map[arvados.ContainerState]map[arvados.ContainerState]bool{
+       arvados.ContainerStateQueued: map[arvados.ContainerState]bool{
+               arvados.ContainerStateQueued:    true,
+               arvados.ContainerStateLocked:    true,
+               arvados.ContainerStateCancelled: true,
+       },
+       arvados.ContainerStateLocked: map[arvados.ContainerState]bool{
+               arvados.ContainerStateQueued:    true,
+               arvados.ContainerStateLocked:    true,
+               arvados.ContainerStateRunning:   true,
+               arvados.ContainerStateCancelled: true,
+       },
+       arvados.ContainerStateRunning: map[arvados.ContainerState]bool{
+               arvados.ContainerStateRunning:   true,
+               arvados.ContainerStateCancelled: true,
+               arvados.ContainerStateComplete:  true,
+       },
+}