From 9251549928dd5206d4a14e5f9811caa66aa64c65 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Thu, 7 Dec 2023 14:01:09 -0500 Subject: [PATCH] 21254: Fix racy keep-balance test. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- services/keep-balance/balance_run_test.go | 25 +++++++++++++++++++++-- services/keep-balance/server.go | 5 ++--- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go index b7b3fb6123..fefd2c6c1b 100644 --- a/services/keep-balance/balance_run_test.go +++ b/services/keep-balance/balance_run_test.go @@ -15,6 +15,7 @@ import ( "os" "strings" "sync" + "syscall" "time" "git.arvados.org/arvados.git/lib/config" @@ -639,7 +640,7 @@ func (s *runSuite) TestRunForever(c *check.C) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - s.config.Collections.BalancePeriod = arvados.Duration(time.Millisecond) + s.config.Collections.BalancePeriod = arvados.Duration(100 * time.Millisecond) srv := s.newServer(&opts) done := make(chan bool) @@ -648,14 +649,34 @@ func (s *runSuite) TestRunForever(c *check.C) { close(done) }() + procself, err := os.FindProcess(os.Getpid()) + c.Assert(err, check.IsNil) + // Each run should send 4 pull lists + 4 trash lists. The // first run should also send 4 empty trash lists at // startup. We should complete all four runs in much less than // a second. + completedRuns := 0 for t0 := time.Now(); time.Since(t0) < 10*time.Second; { - if pullReqs.Count() >= 16 && trashReqs.Count() == pullReqs.Count()+4 { + pulls := pullReqs.Count() + if pulls >= 16 && trashReqs.Count() == pulls+4 { break } + if pulls > 4 { + // Once the 2nd run has started automatically + // (indicating that our BalancePeriod is + // working) we switch to a long wait time to + // effectively stop the timed runs, and + // instead start sending a single SIGUSR1 at + // the end of each (2nd or 3rd) run, to ensure + // we get exactly 4 runs in total. + srv.Cluster.Collections.BalancePeriod = arvados.Duration(time.Minute) + if pulls%4 == 0 && pulls <= 12 && pulls/4 > completedRuns { + completedRuns = pulls / 4 + c.Logf("completed run %d, sending SIGUSR1 to trigger next run", completedRuns) + procself.Signal(syscall.SIGUSR1) + } + } time.Sleep(time.Millisecond) } cancel() diff --git a/services/keep-balance/server.go b/services/keep-balance/server.go index 480791ffa2..7a59c1e8c0 100644 --- a/services/keep-balance/server.go +++ b/services/keep-balance/server.go @@ -100,6 +100,7 @@ func (srv *Server) runForever(ctx context.Context) error { sigUSR1 := make(chan os.Signal, 1) signal.Notify(sigUSR1, syscall.SIGUSR1) + defer signal.Stop(sigUSR1) logger.Info("acquiring service lock") dblock.KeepBalanceService.Lock(ctx, func(context.Context) (*sqlx.DB, error) { return srv.DB, nil }) @@ -126,7 +127,6 @@ func (srv *Server) runForever(ctx context.Context) error { select { case <-ctx.Done(): - signal.Stop(sigUSR1) return nil case <-ticker.C: logger.Print("timer went off") @@ -135,8 +135,7 @@ func (srv *Server) runForever(ctx context.Context) error { // Reset the timer so we don't start the N+1st // run too soon after the Nth run is triggered // by SIGUSR1. - ticker.Stop() - ticker = time.NewTicker(time.Duration(srv.Cluster.Collections.BalancePeriod)) + ticker.Reset(time.Duration(srv.Cluster.Collections.BalancePeriod)) } logger.Print("starting next run") } -- 2.30.2