21254: Fix racy keep-balance test.
authorTom Clegg <tom@curii.com>
Thu, 7 Dec 2023 19:01:09 +0000 (14:01 -0500)
committerTom Clegg <tom@curii.com>
Thu, 7 Dec 2023 19:01:09 +0000 (14:01 -0500)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

services/keep-balance/balance_run_test.go
services/keep-balance/server.go

index b7b3fb61233249beb6c6df48e46c8de4ea678e22..fefd2c6c1bd440758d1d956affb8a5603140cb85 100644 (file)
@@ -15,6 +15,7 @@ import (
        "os"
        "strings"
        "sync"
+       "syscall"
        "time"
 
        "git.arvados.org/arvados.git/lib/config"
@@ -639,7 +640,7 @@ func (s *runSuite) TestRunForever(c *check.C) {
 
        ctx, cancel := context.WithCancel(context.Background())
        defer cancel()
-       s.config.Collections.BalancePeriod = arvados.Duration(time.Millisecond)
+       s.config.Collections.BalancePeriod = arvados.Duration(100 * time.Millisecond)
        srv := s.newServer(&opts)
 
        done := make(chan bool)
@@ -648,14 +649,34 @@ func (s *runSuite) TestRunForever(c *check.C) {
                close(done)
        }()
 
+       procself, err := os.FindProcess(os.Getpid())
+       c.Assert(err, check.IsNil)
+
        // Each run should send 4 pull lists + 4 trash lists. The
        // first run should also send 4 empty trash lists at
        // startup. We should complete all four runs in much less than
        // a second.
+       completedRuns := 0
        for t0 := time.Now(); time.Since(t0) < 10*time.Second; {
-               if pullReqs.Count() >= 16 && trashReqs.Count() == pullReqs.Count()+4 {
+               pulls := pullReqs.Count()
+               if pulls >= 16 && trashReqs.Count() == pulls+4 {
                        break
                }
+               if pulls > 4 {
+                       // Once the 2nd run has started automatically
+                       // (indicating that our BalancePeriod is
+                       // working) we switch to a long wait time to
+                       // effectively stop the timed runs, and
+                       // instead start sending a single SIGUSR1 at
+                       // the end of each (2nd or 3rd) run, to ensure
+                       // we get exactly 4 runs in total.
+                       srv.Cluster.Collections.BalancePeriod = arvados.Duration(time.Minute)
+                       if pulls%4 == 0 && pulls <= 12 && pulls/4 > completedRuns {
+                               completedRuns = pulls / 4
+                               c.Logf("completed run %d, sending SIGUSR1 to trigger next run", completedRuns)
+                               procself.Signal(syscall.SIGUSR1)
+                       }
+               }
                time.Sleep(time.Millisecond)
        }
        cancel()
index 480791ffa2637da8f282fb28507dbbcb046bcfbf..7a59c1e8c0edace3a8cb7637c6759a4962d44a99 100644 (file)
@@ -100,6 +100,7 @@ func (srv *Server) runForever(ctx context.Context) error {
 
        sigUSR1 := make(chan os.Signal, 1)
        signal.Notify(sigUSR1, syscall.SIGUSR1)
+       defer signal.Stop(sigUSR1)
 
        logger.Info("acquiring service lock")
        dblock.KeepBalanceService.Lock(ctx, func(context.Context) (*sqlx.DB, error) { return srv.DB, nil })
@@ -126,7 +127,6 @@ func (srv *Server) runForever(ctx context.Context) error {
 
                select {
                case <-ctx.Done():
-                       signal.Stop(sigUSR1)
                        return nil
                case <-ticker.C:
                        logger.Print("timer went off")
@@ -135,8 +135,7 @@ func (srv *Server) runForever(ctx context.Context) error {
                        // Reset the timer so we don't start the N+1st
                        // run too soon after the Nth run is triggered
                        // by SIGUSR1.
-                       ticker.Stop()
-                       ticker = time.NewTicker(time.Duration(srv.Cluster.Collections.BalancePeriod))
+                       ticker.Reset(time.Duration(srv.Cluster.Collections.BalancePeriod))
                }
                logger.Print("starting next run")
        }