13078: Fix jobs stuck in "held" state in old SLURM versions.
[arvados.git] / services / crunch-dispatch-slurm / squeue_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package main
6
7 import (
8         "time"
9
10         . "gopkg.in/check.v1"
11 )
12
13 var _ = Suite(&SqueueSuite{})
14
15 type SqueueSuite struct{}
16
17 func (s *SqueueSuite) TestReniceAll(c *C) {
18         uuids := []string{"zzzzz-dz642-fake0fake0fake0", "zzzzz-dz642-fake1fake1fake1", "zzzzz-dz642-fake2fake2fake2"}
19         for _, test := range []struct {
20                 spread int64
21                 squeue string
22                 want   map[string]int64
23                 expect [][]string
24         }{
25                 {
26                         spread: 1,
27                         squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n",
28                         want:   map[string]int64{uuids[0]: 1},
29                         expect: [][]string{{uuids[0], "0"}},
30                 },
31                 { // fake0 priority is too high
32                         spread: 1,
33                         squeue: uuids[0] + " 10000 4294000777 PENDING Resources\n" + uuids[1] + " 10000 4294000444 PENDING Resources\n",
34                         want:   map[string]int64{uuids[0]: 1, uuids[1]: 999},
35                         expect: [][]string{{uuids[1], "0"}, {uuids[0], "334"}},
36                 },
37                 { // specify spread
38                         spread: 100,
39                         squeue: uuids[0] + " 10000 4294000777 PENDING Resources\n" + uuids[1] + " 10000 4294000444 PENDING Resources\n",
40                         want:   map[string]int64{uuids[0]: 1, uuids[1]: 999},
41                         expect: [][]string{{uuids[1], "0"}, {uuids[0], "433"}},
42                 },
43                 { // ignore fake2 because SetPriority() not called
44                         spread: 1,
45                         squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 4294000222 PENDING Resources\n",
46                         want:   map[string]int64{uuids[0]: 999, uuids[1]: 1},
47                         expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
48                 },
49                 { // ignore fake2 because slurm priority=0
50                         spread: 1,
51                         squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 0 PENDING Resources\n",
52                         want:   map[string]int64{uuids[0]: 999, uuids[1]: 1, uuids[2]: 997},
53                         expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
54                 },
55         } {
56                 c.Logf("spread=%d squeue=%q want=%v -> expect=%v", test.spread, test.squeue, test.want, test.expect)
57                 slurm := &slurmFake{
58                         queue: test.squeue,
59                 }
60                 sqc := &SqueueChecker{
61                         Slurm:          slurm,
62                         PrioritySpread: test.spread,
63                         Period:         time.Hour,
64                 }
65                 sqc.startOnce.Do(sqc.start)
66                 sqc.check()
67                 for uuid, pri := range test.want {
68                         sqc.SetPriority(uuid, pri)
69                 }
70                 sqc.reniceAll()
71                 c.Check(slurm.didRenice, DeepEquals, test.expect)
72                 sqc.Stop()
73         }
74 }
75
76 // If the given UUID isn't in the slurm queue yet, SetPriority()
77 // should wait for it to appear on the very next poll, then give up.
78 func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {
79         uuidGood := "zzzzz-dz642-fake0fake0fake0"
80         uuidBad := "zzzzz-dz642-fake1fake1fake1"
81
82         slurm := &slurmFake{}
83         sqc := &SqueueChecker{
84                 Slurm:  slurm,
85                 Period: time.Hour,
86         }
87         sqc.startOnce.Do(sqc.start)
88         sqc.Stop()
89         sqc.check()
90
91         done := make(chan struct{})
92         go func() {
93                 sqc.SetPriority(uuidGood, 123)
94                 sqc.SetPriority(uuidBad, 345)
95                 close(done)
96         }()
97         c.Check(sqc.queue[uuidGood], IsNil)
98         c.Check(sqc.queue[uuidBad], IsNil)
99         timeout := time.NewTimer(time.Second)
100         defer timeout.Stop()
101         tick := time.NewTicker(time.Millisecond)
102         defer tick.Stop()
103         for {
104                 select {
105                 case <-tick.C:
106                         slurm.queue = uuidGood + " 0 12345 PENDING Resources\n"
107                         sqc.check()
108
109                         // Avoid immediately selecting this case again
110                         // on the next iteration if check() took
111                         // longer than one tick.
112                         select {
113                         case <-tick.C:
114                         default:
115                         }
116                 case <-timeout.C:
117                         c.Fatal("timed out")
118                 case <-done:
119                         c.Assert(sqc.queue[uuidGood], NotNil)
120                         c.Check(sqc.queue[uuidGood].wantPriority, Equals, int64(123))
121                         c.Check(sqc.queue[uuidBad], IsNil)
122                         return
123                 }
124         }
125 }