13399: If slurm refuses to renice past 10K, stop trying.
authorTom Clegg <tclegg@veritasgenetics.com>
Tue, 3 Jul 2018 21:03:11 +0000 (17:03 -0400)
committerTom Clegg <tclegg@veritasgenetics.com>
Tue, 17 Jul 2018 21:06:40 +0000 (17:06 -0400)
Fixes log spam.

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
services/crunch-dispatch-slurm/squeue.go
services/crunch-dispatch-slurm/squeue_test.go

index 23a8a0ca01124df89575c5724a4b6cb6650527fb..719ec98d27aa19d65eceb7d3db3a46f506aed2f0 100644 (file)
@@ -55,11 +55,12 @@ func (s *IntegrationSuite) TearDownTest(c *C) {
 }
 
 type slurmFake struct {
-       didBatch   [][]string
-       didCancel  []string
-       didRelease []string
-       didRenice  [][]string
-       queue      string
+       didBatch      [][]string
+       didCancel     []string
+       didRelease    []string
+       didRenice     [][]string
+       queue         string
+       rejectNice10K bool
        // If non-nil, run this func during the 2nd+ call to Cancel()
        onCancel func()
        // Error returned by Batch()
@@ -82,6 +83,9 @@ func (sf *slurmFake) Release(name string) error {
 
 func (sf *slurmFake) Renice(name string, nice int64) error {
        sf.didRenice = append(sf.didRenice, []string{name, fmt.Sprintf("%d", nice)})
+       if sf.rejectNice10K && nice > 10000 {
+               return errors.New("scontrol: error: Invalid nice value, must be between -10000 and 10000")
+       }
        return nil
 }
 
index 742943f197580e186e7fd1f7b8084a1357f3661d..4067d27d5a43b8b603ed3e8aa4db07ebad66721d 100644 (file)
@@ -14,11 +14,14 @@ import (
        "time"
 )
 
+const slurm15NiceLimit int64 = 10000
+
 type slurmJob struct {
        uuid         string
        wantPriority int64
        priority     int64 // current slurm priority (incorporates nice value)
        nice         int64 // current slurm nice value
+       hitNiceLimit bool
 }
 
 // Squeue implements asynchronous polling monitor of the SLURM queue using the
@@ -103,10 +106,18 @@ func (sqc *SqueueChecker) reniceAll() {
        })
        renice := wantNice(jobs, sqc.PrioritySpread)
        for i, job := range jobs {
-               if renice[i] == job.nice {
+               niceNew := renice[i]
+               if job.hitNiceLimit && niceNew > slurm15NiceLimit {
+                       niceNew = slurm15NiceLimit
+               }
+               if niceNew == job.nice {
                        continue
                }
-               sqc.Slurm.Renice(job.uuid, renice[i])
+               err := sqc.Slurm.Renice(job.uuid, niceNew)
+               if err != nil && niceNew > slurm15NiceLimit && strings.Contains(err.Error(), "Invalid nice value") {
+                       log.Printf("container %q clamping nice values at %d, priority order will not be correct", job.uuid, slurm15NiceLimit)
+                       job.hitNiceLimit = true
+               }
        }
 }
 
index c9329fdf95bf87028346fb727b8521dc8edfa1cd..ef036dabd781edd425b29fc28f847ae18370d700 100644 (file)
@@ -103,6 +103,50 @@ func (s *SqueueSuite) TestReniceAll(c *C) {
        }
 }
 
+// If a limited nice range prevents desired priority adjustments, give
+// up and clamp nice to 10K.
+func (s *SqueueSuite) TestReniceInvalidNiceValue(c *C) {
+       uuids := []string{"zzzzz-dz642-fake0fake0fake0", "zzzzz-dz642-fake1fake1fake1", "zzzzz-dz642-fake2fake2fake2"}
+       slurm := &slurmFake{
+               queue:         uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 0 4294555222 PENDING Resources\n",
+               rejectNice10K: true,
+       }
+       sqc := &SqueueChecker{
+               Slurm:          slurm,
+               PrioritySpread: 1,
+               Period:         time.Hour,
+       }
+       sqc.startOnce.Do(sqc.start)
+       sqc.check()
+       sqc.SetPriority(uuids[0], 2)
+       sqc.SetPriority(uuids[1], 1)
+
+       // First attempt should renice to 555001, which will fail
+       sqc.reniceAll()
+       c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}})
+
+       // Next attempt should renice to 10K, which will succeed
+       sqc.reniceAll()
+       c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+       // ...so we'll change the squeue response to reflect the
+       // updated priority+nice, and make sure sqc sees that...
+       slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294545222 PENDING Resources\n"
+       sqc.check()
+
+       // Next attempt should leave nice alone because it's already
+       // at the 10K limit
+       sqc.reniceAll()
+       c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+
+       // Back to normal if desired nice value falls below 10K
+       slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n"
+       sqc.check()
+       sqc.reniceAll()
+       c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}, {uuids[1], "9890"}})
+
+       sqc.Stop()
+}
+
 // If the given UUID isn't in the slurm queue yet, SetPriority()
 // should wait for it to appear on the very next poll, then give up.
 func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {