Merge branch 'master' into 16811-public-favs
[arvados.git] / lib / dispatchcloud / scheduler / sync_test.go
index 305ab9e04eb379c82288853b3df9891bc639bf5b..a3ff0636e1cd9e7eec69beacc1956c3fa3db08c9 100644 (file)
@@ -48,9 +48,71 @@ func (*SchedulerSuite) TestForgetIrrelevantContainers(c *check.C) {
        ents, _ := queue.Entries()
        c.Check(ents, check.HasLen, 1)
 
-       sch := New(ctx, &queue, &pool, time.Millisecond, time.Millisecond)
+       sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
        sch.sync()
 
        ents, _ = queue.Entries()
        c.Check(ents, check.HasLen, 0)
 }
+
+// TestCancelOrphanedContainers checks that sync() leaves a Running
+// container alone while the pool reports an instance in "unknown"
+// state, and cancels and forgets it once that instance is gone.
+func (*SchedulerSuite) TestCancelOrphanedContainers(c *check.C) {
+       ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
+       pool := stubPool{
+               unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+               unknown: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+       }
+       queue := test.Queue{
+               ChooseType: chooseType,
+               Containers: []arvados.Container{
+                       {
+                               UUID:     test.ContainerUUID(1),
+                               Priority: 0,
+                               State:    arvados.ContainerStateRunning,
+                               RuntimeConstraints: arvados.RuntimeConstraints{
+                                       VCPUs: 1,
+                                       RAM:   1 << 30,
+                               },
+                       },
+               },
+       }
+       queue.Update()
+
+       ents, _ := queue.Entries()
+       c.Check(ents, check.HasLen, 1)
+
+       sch := New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+
+       // Sync shouldn't cancel the container because it might be
+       // running on the VM with state=="unknown".
+       //
+       // (Cancel+forget is asynchronous and takes multiple sync()
+       // calls, so 10 sync-and-sleep rounds cannot prove that no
+       // cancel is in progress. The goroutines started by sync()
+       // only touch stubs and run quickly, so this works in practice;
+       // a broken implementation may still pass occasionally.)
+       for i := 0; i < 10; i++ {
+               sch.sync()
+               time.Sleep(time.Millisecond)
+       }
+       ents, _ = queue.Entries()
+       c.Check(ents, check.HasLen, 1)
+       c.Check(ents[test.ContainerUUID(1)].Container.State, check.Equals, arvados.ContainerStateRunning)
+
+       // Sync should cancel & forget the container when the
+       // "unknown" node goes away.
+       //
+       // (Cancel+forget is async and needs multiple sync() calls, so
+       // poll for up to 1s; with fast stubs it finishes much sooner.)
+       pool.unknown = nil
+       for deadline := time.Now().Add(time.Second); ; time.Sleep(time.Millisecond) {
+               sch.sync()
+               ents, _ = queue.Entries()
+               if len(ents) == 0 || time.Now().After(deadline) {
+                       break
+               }
+       }
+       c.Check(ents, check.HasLen, 0)
+}