21254: Fix racy keep-balance test.
[arvados.git] / services / keep-balance / balance_run_test.go
index aeed517d094e93bf2b67a4a65a6cae0396847874..fefd2c6c1bd440758d1d956affb8a5603140cb85 100644 (file)
@@ -5,7 +5,6 @@
 package keepbalance
 
 import (
-       "bytes"
        "context"
        "encoding/json"
        "fmt"
@@ -16,6 +15,7 @@ import (
        "os"
        "strings"
        "sync"
+       "syscall"
        "time"
 
        "git.arvados.org/arvados.git/lib/config"
@@ -24,7 +24,6 @@ import (
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "github.com/jmoiron/sqlx"
        "github.com/prometheus/client_golang/prometheus"
-       "github.com/prometheus/common/expfmt"
        check "gopkg.in/check.v1"
 )
 
@@ -91,21 +90,29 @@ var stubMounts = map[string][]arvados.KeepMount{
                UUID:           "zzzzz-ivpuk-000000000000000",
                DeviceID:       "keep0-vol0",
                StorageClasses: map[string]bool{"default": true},
+               AllowWrite:     true,
+               AllowTrash:     true,
        }},
        "keep1.zzzzz.arvadosapi.com:25107": {{
                UUID:           "zzzzz-ivpuk-100000000000000",
                DeviceID:       "keep1-vol0",
                StorageClasses: map[string]bool{"default": true},
+               AllowWrite:     true,
+               AllowTrash:     true,
        }},
        "keep2.zzzzz.arvadosapi.com:25107": {{
                UUID:           "zzzzz-ivpuk-200000000000000",
                DeviceID:       "keep2-vol0",
                StorageClasses: map[string]bool{"default": true},
+               AllowWrite:     true,
+               AllowTrash:     true,
        }},
        "keep3.zzzzz.arvadosapi.com:25107": {{
                UUID:           "zzzzz-ivpuk-300000000000000",
                DeviceID:       "keep3-vol0",
                StorageClasses: map[string]bool{"default": true},
+               AllowWrite:     true,
+               AllowTrash:     true,
        }},
 }
 
@@ -390,9 +397,7 @@ func (s *runSuite) TestRefuseZeroCollections(c *check.C) {
        _, err := s.db.Exec(`delete from collections`)
        c.Assert(err, check.IsNil)
        opts := RunOptions{
-               CommitPulls: true,
-               CommitTrash: true,
-               Logger:      ctxlog.TestLogger(c),
+               Logger: ctxlog.TestLogger(c),
        }
        s.stub.serveCurrentUserAdmin()
        s.stub.serveZeroCollections()
@@ -410,8 +415,6 @@ func (s *runSuite) TestRefuseZeroCollections(c *check.C) {
 
 func (s *runSuite) TestRefuseBadIndex(c *check.C) {
        opts := RunOptions{
-               CommitPulls: true,
-               CommitTrash: true,
                ChunkPrefix: "abc",
                Logger:      ctxlog.TestLogger(c),
        }
@@ -433,9 +436,7 @@ func (s *runSuite) TestRefuseBadIndex(c *check.C) {
 
 func (s *runSuite) TestRefuseNonAdmin(c *check.C) {
        opts := RunOptions{
-               CommitPulls: true,
-               CommitTrash: true,
-               Logger:      ctxlog.TestLogger(c),
+               Logger: ctxlog.TestLogger(c),
        }
        s.stub.serveCurrentUserNotAdmin()
        s.stub.serveZeroCollections()
@@ -462,8 +463,6 @@ func (s *runSuite) TestInvalidChunkPrefix(c *check.C) {
                s.SetUpTest(c)
                c.Logf("trying invalid prefix %q", trial.prefix)
                opts := RunOptions{
-                       CommitPulls: true,
-                       CommitTrash: true,
                        ChunkPrefix: trial.prefix,
                        Logger:      ctxlog.TestLogger(c),
                }
@@ -483,9 +482,7 @@ func (s *runSuite) TestInvalidChunkPrefix(c *check.C) {
 
 func (s *runSuite) TestRefuseSameDeviceDifferentVolumes(c *check.C) {
        opts := RunOptions{
-               CommitPulls: true,
-               CommitTrash: true,
-               Logger:      ctxlog.TestLogger(c),
+               Logger: ctxlog.TestLogger(c),
        }
        s.stub.serveCurrentUserAdmin()
        s.stub.serveZeroCollections()
@@ -513,9 +510,7 @@ func (s *runSuite) TestWriteLostBlocks(c *check.C) {
        s.config.Collections.BlobMissingReport = lostf.Name()
        defer os.Remove(lostf.Name())
        opts := RunOptions{
-               CommitPulls: true,
-               CommitTrash: true,
-               Logger:      ctxlog.TestLogger(c),
+               Logger: ctxlog.TestLogger(c),
        }
        s.stub.serveCurrentUserAdmin()
        s.stub.serveFooBarFileCollections()
@@ -534,10 +529,10 @@ func (s *runSuite) TestWriteLostBlocks(c *check.C) {
 }
 
 func (s *runSuite) TestDryRun(c *check.C) {
+       s.config.Collections.BalanceTrashLimit = 0
+       s.config.Collections.BalancePullLimit = 0
        opts := RunOptions{
-               CommitPulls: false,
-               CommitTrash: false,
-               Logger:      ctxlog.TestLogger(c),
+               Logger: ctxlog.TestLogger(c),
        }
        s.stub.serveCurrentUserAdmin()
        collReqs := s.stub.serveFooBarFileCollections()
@@ -555,7 +550,10 @@ func (s *runSuite) TestDryRun(c *check.C) {
        }
        c.Check(trashReqs.Count(), check.Equals, 0)
        c.Check(pullReqs.Count(), check.Equals, 0)
-       c.Check(bal.stats.pulls, check.Not(check.Equals), 0)
+       c.Check(bal.stats.pulls, check.Equals, 0)
+       c.Check(bal.stats.pullsDeferred, check.Not(check.Equals), 0)
+       c.Check(bal.stats.trashes, check.Equals, 0)
+       c.Check(bal.stats.trashesDeferred, check.Not(check.Equals), 0)
        c.Check(bal.stats.underrep.replicas, check.Not(check.Equals), 0)
        c.Check(bal.stats.overrep.replicas, check.Not(check.Equals), 0)
 }
@@ -564,10 +562,8 @@ func (s *runSuite) TestCommit(c *check.C) {
        s.config.Collections.BlobMissingReport = c.MkDir() + "/keep-balance-lost-blocks-test-"
        s.config.ManagementToken = "xyzzy"
        opts := RunOptions{
-               CommitPulls: true,
-               CommitTrash: true,
-               Logger:      ctxlog.TestLogger(c),
-               Dumper:      ctxlog.TestLogger(c),
+               Logger: ctxlog.TestLogger(c),
+               Dumper: ctxlog.TestLogger(c),
        }
        s.stub.serveCurrentUserAdmin()
        s.stub.serveFooBarFileCollections()
@@ -591,21 +587,17 @@ func (s *runSuite) TestCommit(c *check.C) {
        c.Assert(err, check.IsNil)
        c.Check(string(lost), check.Not(check.Matches), `(?ms).*acbd18db4cc2f85cedef654fccc4a4d8.*`)
 
-       buf, err := s.getMetrics(c, srv)
-       c.Check(err, check.IsNil)
-       bufstr := buf.String()
-       c.Check(bufstr, check.Matches, `(?ms).*\narvados_keep_total_bytes 15\n.*`)
-       c.Check(bufstr, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_sum [0-9\.]+\n.*`)
-       c.Check(bufstr, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`)
-       c.Check(bufstr, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio [1-9].*`)
-       c.Check(bufstr, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio [1-9].*`)
+       metrics := arvadostest.GatherMetricsAsString(srv.Metrics.reg)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_total_bytes 15\n.*`)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_sum [0-9\.]+\n.*`)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio [1-9].*`)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio [1-9].*`)
 }
 
 func (s *runSuite) TestChunkPrefix(c *check.C) {
        s.config.Collections.BlobMissingReport = c.MkDir() + "/keep-balance-lost-blocks-test-"
        opts := RunOptions{
-               CommitPulls: true,
-               CommitTrash: true,
                ChunkPrefix: "ac", // catch "foo" but not "bar"
                Logger:      ctxlog.TestLogger(c),
                Dumper:      ctxlog.TestLogger(c),
@@ -635,10 +627,8 @@ func (s *runSuite) TestChunkPrefix(c *check.C) {
 func (s *runSuite) TestRunForever(c *check.C) {
        s.config.ManagementToken = "xyzzy"
        opts := RunOptions{
-               CommitPulls: true,
-               CommitTrash: true,
-               Logger:      ctxlog.TestLogger(c),
-               Dumper:      ctxlog.TestLogger(c),
+               Logger: ctxlog.TestLogger(c),
+               Dumper: ctxlog.TestLogger(c),
        }
        s.stub.serveCurrentUserAdmin()
        s.stub.serveFooBarFileCollections()
@@ -650,7 +640,7 @@ func (s *runSuite) TestRunForever(c *check.C) {
 
        ctx, cancel := context.WithCancel(context.Background())
        defer cancel()
-       s.config.Collections.BalancePeriod = arvados.Duration(time.Millisecond)
+       s.config.Collections.BalancePeriod = arvados.Duration(100 * time.Millisecond)
        srv := s.newServer(&opts)
 
        done := make(chan bool)
@@ -659,14 +649,34 @@ func (s *runSuite) TestRunForever(c *check.C) {
                close(done)
        }()
 
+       procself, err := os.FindProcess(os.Getpid())
+       c.Assert(err, check.IsNil)
+
        // Each run should send 4 pull lists + 4 trash lists. The
        // first run should also send 4 empty trash lists at
        // startup. We should complete all four runs in much less than
        // a second.
+       completedRuns := 0
        for t0 := time.Now(); time.Since(t0) < 10*time.Second; {
-               if pullReqs.Count() >= 16 && trashReqs.Count() == pullReqs.Count()+4 {
+               pulls := pullReqs.Count()
+               if pulls >= 16 && trashReqs.Count() == pulls+4 {
                        break
                }
+               if pulls > 4 {
+                       // Once the 2nd run has started automatically
+                       // (indicating that our BalancePeriod is
+                       // working) we switch to a long wait time to
+                       // effectively stop the timed runs, and
+                       // instead start sending a single SIGUSR1 at
+                       // the end of each (2nd or 3rd) run, to ensure
+                       // we get exactly 4 runs in total.
+                       srv.Cluster.Collections.BalancePeriod = arvados.Duration(time.Minute)
+                       if pulls%4 == 0 && pulls <= 12 && pulls/4 > completedRuns {
+                               completedRuns = pulls / 4
+                               c.Logf("completed run %d, sending SIGUSR1 to trigger next run", completedRuns)
+                               procself.Signal(syscall.SIGUSR1)
+                       }
+               }
                time.Sleep(time.Millisecond)
        }
        cancel()
@@ -674,23 +684,6 @@ func (s *runSuite) TestRunForever(c *check.C) {
        c.Check(pullReqs.Count() >= 16, check.Equals, true)
        c.Check(trashReqs.Count(), check.Equals, pullReqs.Count()+4)
 
-       buf, err := s.getMetrics(c, srv)
-       c.Check(err, check.IsNil)
-       c.Check(buf, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count `+fmt.Sprintf("%d", pullReqs.Count()/4)+`\n.*`)
-}
-
-func (s *runSuite) getMetrics(c *check.C, srv *Server) (*bytes.Buffer, error) {
-       mfs, err := srv.Metrics.reg.Gather()
-       if err != nil {
-               return nil, err
-       }
-
-       var buf bytes.Buffer
-       for _, mf := range mfs {
-               if _, err := expfmt.MetricFamilyToText(&buf, mf); err != nil {
-                       return nil, err
-               }
-       }
-
-       return &buf, nil
+       metrics := arvadostest.GatherMetricsAsString(srv.Metrics.reg)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count `+fmt.Sprintf("%d", pullReqs.Count()/4)+`\n.*`)
 }