Merge branch '18071-dblock-keep-balance-and-dispatch'
author Tom Clegg <tom@curii.com>
Mon, 31 Oct 2022 17:12:31 +0000 (13:12 -0400)
committer Tom Clegg <tom@curii.com>
Mon, 31 Oct 2022 17:12:31 +0000 (13:12 -0400)
closes #18071

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

16 files changed:
lib/controller/dblock/dblock.go
lib/controller/dblock/dblock_test.go [new file with mode: 0644]
lib/controller/federation.go
lib/controller/handler.go
lib/controller/handler_test.go
lib/controller/trash.go
lib/ctrlctx/db.go
lib/dispatchcloud/dispatcher.go
lib/dispatchcloud/dispatcher_test.go
lib/lsf/dispatch.go
services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
services/keep-balance/balance.go
services/keep-balance/balance_run_test.go
services/keep-balance/integration_test.go
services/keep-balance/main.go
services/keep-balance/server.go

index a46201bb45af793062126689d773be61f9bbe232..ad2733abfa36df82c72c4aa3c7a6c090c6496efb 100644
@@ -7,6 +7,8 @@ package dblock
 import (
        "context"
        "database/sql"
+       "fmt"
+       "net"
        "sync"
        "time"
 
@@ -15,9 +17,12 @@ import (
 )
 
 var (
-       TrashSweep        = &DBLocker{key: 10001}
-       ContainerLogSweep = &DBLocker{key: 10002}
-       retryDelay        = 5 * time.Second
+       TrashSweep         = &DBLocker{key: 10001}
+       ContainerLogSweep  = &DBLocker{key: 10002}
+       KeepBalanceService = &DBLocker{key: 10003} // keep-balance service in periodic-sweep loop
+       KeepBalanceActive  = &DBLocker{key: 10004} // keep-balance sweep in progress (either -once=true or service loop)
+       Dispatch           = &DBLocker{key: 10005} // any dispatcher running
+       retryDelay         = 5 * time.Second
 )
 
 // DBLocker uses pg_advisory_lock to maintain a cluster-wide lock for
@@ -31,8 +36,11 @@ type DBLocker struct {
 }
 
 // Lock acquires the advisory lock, waiting/reconnecting if needed.
-func (dbl *DBLocker) Lock(ctx context.Context, getdb func(context.Context) (*sqlx.DB, error)) {
-       logger := ctxlog.FromContext(ctx)
+//
+// Returns false if ctx is canceled before the lock is acquired.
+func (dbl *DBLocker) Lock(ctx context.Context, getdb func(context.Context) (*sqlx.DB, error)) bool {
+       logger := ctxlog.FromContext(ctx).WithField("ID", dbl.key)
+       var lastHeldBy string
        for ; ; time.Sleep(retryDelay) {
                dbl.mtx.Lock()
                if dbl.conn != nil {
@@ -41,55 +49,87 @@ func (dbl *DBLocker) Lock(ctx context.Context, getdb func(context.Context) (*sql
                        dbl.mtx.Unlock()
                        continue
                }
+               if ctx.Err() != nil {
+                       dbl.mtx.Unlock()
+                       return false
+               }
                db, err := getdb(ctx)
-               if err != nil {
-                       logger.WithError(err).Infof("error getting database pool")
+               if err == context.Canceled {
+                       dbl.mtx.Unlock()
+                       return false
+               } else if err != nil {
+                       logger.WithError(err).Info("error getting database pool")
                        dbl.mtx.Unlock()
                        continue
                }
                conn, err := db.Conn(ctx)
-               if err != nil {
+               if err == context.Canceled {
+                       dbl.mtx.Unlock()
+                       return false
+               } else if err != nil {
                        logger.WithError(err).Info("error getting database connection")
                        dbl.mtx.Unlock()
                        continue
                }
                var locked bool
                err = conn.QueryRowContext(ctx, `SELECT pg_try_advisory_lock($1)`, dbl.key).Scan(&locked)
-               if err != nil {
-                       logger.WithError(err).Infof("error getting pg_try_advisory_lock %d", dbl.key)
+               if err == context.Canceled {
+                       return false
+               } else if err != nil {
+                       logger.WithError(err).Info("error getting pg_try_advisory_lock")
                        conn.Close()
                        dbl.mtx.Unlock()
                        continue
                }
                if !locked {
+                       var host string
+                       var port int
+                       err = conn.QueryRowContext(ctx, `SELECT client_addr, client_port FROM pg_stat_activity WHERE pid IN
+                               (SELECT pid FROM pg_locks
+                                WHERE locktype = $1 AND objid = $2)`, "advisory", dbl.key).Scan(&host, &port)
+                       if err != nil {
+                               logger.WithError(err).Info("error getting other client info")
+                       } else {
+                               heldBy := net.JoinHostPort(host, fmt.Sprintf("%d", port))
+                               if lastHeldBy != heldBy {
+                                       logger.WithField("DBClient", heldBy).Info("waiting for other process to release lock")
+                                       lastHeldBy = heldBy
+                               }
+                       }
                        conn.Close()
                        dbl.mtx.Unlock()
                        continue
                }
-               logger.Debugf("acquired pg_advisory_lock %d", dbl.key)
+               logger.Debug("acquired pg_advisory_lock")
                dbl.ctx, dbl.getdb, dbl.conn = ctx, getdb, conn
                dbl.mtx.Unlock()
-               return
+               return true
        }
 }
 
 // Check confirms that the lock is still active (i.e., the session is
 // still alive), and re-acquires if needed. Panics if Lock is not
 // acquired first.
-func (dbl *DBLocker) Check() {
+//
+// Returns false if the context passed to Lock() is canceled before
+// the lock is confirmed or reacquired.
+func (dbl *DBLocker) Check() bool {
        dbl.mtx.Lock()
        err := dbl.conn.PingContext(dbl.ctx)
-       if err == nil {
-               ctxlog.FromContext(dbl.ctx).Debugf("pg_advisory_lock %d connection still alive", dbl.key)
+       if err == context.Canceled {
+               dbl.mtx.Unlock()
+               return false
+       } else if err == nil {
+               ctxlog.FromContext(dbl.ctx).WithField("ID", dbl.key).Debug("connection still alive")
                dbl.mtx.Unlock()
-               return
+               return true
        }
        ctxlog.FromContext(dbl.ctx).WithError(err).Info("database connection ping failed")
        dbl.conn.Close()
        dbl.conn = nil
        ctx, getdb := dbl.ctx, dbl.getdb
        dbl.mtx.Unlock()
-       dbl.Lock(ctx, getdb)
+       return dbl.Lock(ctx, getdb)
 }
 
 func (dbl *DBLocker) Unlock() {
@@ -98,9 +138,9 @@ func (dbl *DBLocker) Unlock() {
        if dbl.conn != nil {
                _, err := dbl.conn.ExecContext(context.Background(), `SELECT pg_advisory_unlock($1)`, dbl.key)
                if err != nil {
-                       ctxlog.FromContext(dbl.ctx).WithError(err).Infof("error releasing pg_advisory_lock %d", dbl.key)
+                       ctxlog.FromContext(dbl.ctx).WithError(err).WithField("ID", dbl.key).Info("error releasing pg_advisory_lock")
                } else {
-                       ctxlog.FromContext(dbl.ctx).Debugf("released pg_advisory_lock %d", dbl.key)
+                       ctxlog.FromContext(dbl.ctx).WithField("ID", dbl.key).Debug("released pg_advisory_lock")
                }
                dbl.conn.Close()
                dbl.conn = nil
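
Taken together, the DBLocker changes above make Lock and Check cancellation-aware: both now return false instead of retrying forever once the caller's context is canceled, and the current holder's client address is logged while waiting. A minimal usage sketch in Go, assuming the import paths shown in this diff; the getdb closure and its connection string are hypothetical stand-ins (real callers pass an existing pool, e.g. ctrlctx.DBConnector.GetDB):

    package main

    import (
        "context"

        "git.arvados.org/arvados.git/lib/controller/dblock"
        "github.com/jmoiron/sqlx"
        _ "github.com/lib/pq" // sqlx needs lib/pq to talk to PostgreSQL
    )

    func main() {
        ctx, cancel := context.WithCancel(context.Background())
        defer cancel()

        // Hypothetical pool source; opening a pool per call is only
        // acceptable in a sketch.
        getdb := func(context.Context) (*sqlx.DB, error) {
            return sqlx.Open("postgres", "dbname=arvados_test sslmode=disable")
        }

        if !dblock.TrashSweep.Lock(ctx, getdb) {
            return // ctx canceled before the lock was acquired
        }
        defer dblock.TrashSweep.Unlock()

        // Before each unit of exclusive work: Check re-acquires the
        // lock after a dropped session, and returns false only if ctx
        // is canceled.
        if !dblock.TrashSweep.Check() {
            return
        }
        // ... cluster-wide exclusive work here ...
    }
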
diff --git a/lib/controller/dblock/dblock_test.go b/lib/controller/dblock/dblock_test.go
new file mode 100644
index 0000000..b10b2a3
--- /dev/null
@@ -0,0 +1,91 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package dblock
+
+import (
+       "bytes"
+       "context"
+       "sync"
+       "testing"
+       "time"
+
+       "git.arvados.org/arvados.git/lib/config"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/arvadostest"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "github.com/jmoiron/sqlx"
+       "github.com/sirupsen/logrus"
+       check "gopkg.in/check.v1"
+)
+
+func Test(t *testing.T) {
+       check.TestingT(t)
+}
+
+var _ = check.Suite(&suite{})
+
+type suite struct {
+       cluster *arvados.Cluster
+       db      *sqlx.DB
+       getdb   func(context.Context) (*sqlx.DB, error)
+}
+
+var testLocker = &DBLocker{key: 999}
+
+func (s *suite) SetUpSuite(c *check.C) {
+       cfg, err := config.NewLoader(nil, ctxlog.TestLogger(c)).Load()
+       c.Assert(err, check.IsNil)
+       s.cluster, err = cfg.GetCluster("")
+       c.Assert(err, check.IsNil)
+       s.db = arvadostest.DB(c, s.cluster)
+       s.getdb = func(context.Context) (*sqlx.DB, error) { return s.db, nil }
+}
+
+func (s *suite) TestLock(c *check.C) {
+       retryDelay = 10 * time.Millisecond
+
+       var logbuf bytes.Buffer
+       logger := ctxlog.New(&logbuf, "text", "debug")
+       logger.Level = logrus.DebugLevel
+       ctx := ctxlog.Context(context.Background(), logger)
+       ctx, cancel := context.WithCancel(ctx)
+       defer cancel()
+       testLocker.Lock(ctx, s.getdb)
+       testLocker.Check()
+
+       lock2 := make(chan bool)
+       var wg sync.WaitGroup
+       defer wg.Wait()
+       wg.Add(1)
+       go func() {
+               defer wg.Done()
+               testLocker2 := &DBLocker{key: 999}
+               testLocker2.Lock(ctx, s.getdb)
+               close(lock2)
+               testLocker2.Check()
+               testLocker2.Unlock()
+       }()
+
+       // Second lock should wait for first to Unlock
+       select {
+       case <-time.After(time.Second / 10):
+               c.Check(logbuf.String(), check.Matches, `(?ms).*level=info.*DBClient="[^"]+:\d+".*ID=999.*`)
+       case <-lock2:
+               c.Log("double-lock")
+               c.Fail()
+       }
+
+       testLocker.Check()
+       testLocker.Unlock()
+
+       // Now the second lock should succeed within retryDelay
+       select {
+       case <-time.After(retryDelay * 2):
+               c.Log("timed out")
+               c.Fail()
+       case <-lock2:
+       }
+       c.Logf("%s", logbuf.String())
+}
index e7d6e29b88c1f683f981a1ee5df2b53cf7c862af..93b8315a63be588a0b3e2e1b3182337e68defeff 100644
@@ -142,7 +142,7 @@ type CurrentUser struct {
 // non-nil, true, nil -- if the token is valid
 func (h *Handler) validateAPItoken(req *http.Request, token string) (*CurrentUser, bool, error) {
        user := CurrentUser{Authorization: arvados.APIClientAuthorization{APIToken: token}}
-       db, err := h.db(req.Context())
+       db, err := h.dbConnector.GetDB(req.Context())
        if err != nil {
                ctxlog.FromContext(req.Context()).WithError(err).Debugf("validateAPItoken(%s): database error", token)
                return nil, false, err
@@ -179,7 +179,7 @@ func (h *Handler) validateAPItoken(req *http.Request, token string) (*CurrentUse
 }
 
 func (h *Handler) createAPItoken(req *http.Request, userUUID string, scopes []string) (*arvados.APIClientAuthorization, error) {
-       db, err := h.db(req.Context())
+       db, err := h.dbConnector.GetDB(req.Context())
        if err != nil {
                return nil, err
        }
index e1392bef92652bfbf8a61155a062752608809495..4c6fca7f77276c3981c591d18429d8520d3e76b7 100644
@@ -6,7 +6,6 @@ package controller
 
 import (
        "context"
-       "errors"
        "fmt"
        "net/http"
        "net/http/httptest"
@@ -21,10 +20,8 @@ import (
        "git.arvados.org/arvados.git/lib/controller/router"
        "git.arvados.org/arvados.git/lib/ctrlctx"
        "git.arvados.org/arvados.git/sdk/go/arvados"
-       "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "git.arvados.org/arvados.git/sdk/go/health"
        "git.arvados.org/arvados.git/sdk/go/httpserver"
-       "github.com/jmoiron/sqlx"
 
        // sqlx needs lib/pq to talk to PostgreSQL
        _ "github.com/lib/pq"
@@ -40,8 +37,7 @@ type Handler struct {
        proxy          *proxy
        secureClient   *http.Client
        insecureClient *http.Client
-       pgdb           *sqlx.DB
-       pgdbMtx        sync.Mutex
+       dbConnector    ctrlctx.DBConnector
 }
 
 func (h *Handler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
@@ -65,7 +61,7 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
 
 func (h *Handler) CheckHealth() error {
        h.setupOnce.Do(h.setup)
-       _, err := h.db(context.TODO())
+       _, err := h.dbConnector.GetDB(context.TODO())
        if err != nil {
                return err
        }
@@ -97,17 +93,18 @@ func (h *Handler) setup() {
        mux := http.NewServeMux()
        healthFuncs := make(map[string]health.Func)
 
-       oidcAuthorizer := localdb.OIDCAccessTokenAuthorizer(h.Cluster, h.db)
+       h.dbConnector = ctrlctx.DBConnector{PostgreSQL: h.Cluster.PostgreSQL}
+       oidcAuthorizer := localdb.OIDCAccessTokenAuthorizer(h.Cluster, h.dbConnector.GetDB)
        h.federation = federation.New(h.Cluster, &healthFuncs)
        rtr := router.New(h.federation, router.Config{
                MaxRequestSize: h.Cluster.API.MaxRequestSize,
                WrapCalls: api.ComposeWrappers(
-                       ctrlctx.WrapCallsInTransactions(h.db),
+                       ctrlctx.WrapCallsInTransactions(h.dbConnector.GetDB),
                        oidcAuthorizer.WrapCalls,
                        ctrlctx.WrapCallsWithAuth(h.Cluster)),
        })
 
-       healthRoutes := health.Routes{"ping": func() error { _, err := h.db(context.TODO()); return err }}
+       healthRoutes := health.Routes{"ping": func() error { _, err := h.dbConnector.GetDB(context.TODO()); return err }}
        for name, f := range healthFuncs {
                healthRoutes[name] = f
        }
@@ -158,31 +155,6 @@ func (h *Handler) setup() {
        go h.containerLogSweepWorker()
 }
 
-var errDBConnection = errors.New("database connection error")
-
-func (h *Handler) db(ctx context.Context) (*sqlx.DB, error) {
-       h.pgdbMtx.Lock()
-       defer h.pgdbMtx.Unlock()
-       if h.pgdb != nil {
-               return h.pgdb, nil
-       }
-
-       db, err := sqlx.Open("postgres", h.Cluster.PostgreSQL.Connection.String())
-       if err != nil {
-               ctxlog.FromContext(ctx).WithError(err).Error("postgresql connect failed")
-               return nil, errDBConnection
-       }
-       if p := h.Cluster.PostgreSQL.ConnectionPool; p > 0 {
-               db.SetMaxOpenConns(p)
-       }
-       if err := db.Ping(); err != nil {
-               ctxlog.FromContext(ctx).WithError(err).Error("postgresql connect succeeded but ping failed")
-               return nil, errDBConnection
-       }
-       h.pgdb = db
-       return db, nil
-}
-
 type middlewareFunc func(http.ResponseWriter, *http.Request, http.Handler)
 
 func prepend(next http.Handler, middleware middlewareFunc) http.Handler {
index 0ffe0255f5c59ca6ae40930bd994badf0672d05e..a240b195ea9ff583eede71720ab8c9dfdf3c3798 100644
@@ -284,6 +284,7 @@ func (s *HandlerSuite) TestLogoutGoogle(c *check.C) {
 }
 
 func (s *HandlerSuite) TestValidateV1APIToken(c *check.C) {
+       c.Assert(s.handler.CheckHealth(), check.IsNil)
        req := httptest.NewRequest("GET", "/arvados/v1/users/current", nil)
        user, ok, err := s.handler.validateAPItoken(req, arvadostest.ActiveToken)
        c.Assert(err, check.IsNil)
@@ -295,6 +296,7 @@ func (s *HandlerSuite) TestValidateV1APIToken(c *check.C) {
 }
 
 func (s *HandlerSuite) TestValidateV2APIToken(c *check.C) {
+       c.Assert(s.handler.CheckHealth(), check.IsNil)
        req := httptest.NewRequest("GET", "/arvados/v1/users/current", nil)
        user, ok, err := s.handler.validateAPItoken(req, arvadostest.ActiveTokenV2)
        c.Assert(err, check.IsNil)
@@ -337,6 +339,7 @@ func (s *HandlerSuite) TestLogTokenUUID(c *check.C) {
 }
 
 func (s *HandlerSuite) TestCreateAPIToken(c *check.C) {
+       c.Assert(s.handler.CheckHealth(), check.IsNil)
        req := httptest.NewRequest("GET", "/arvados/v1/users/current", nil)
        auth, err := s.handler.createAPItoken(req, arvadostest.ActiveUserUUID, nil)
        c.Assert(err, check.IsNil)
@@ -477,7 +480,7 @@ func (s *HandlerSuite) TestTrashSweep(c *check.C) {
        coll, err := s.handler.federation.CollectionCreate(ctx, arvados.CreateOptions{Attrs: map[string]interface{}{"name": "test trash sweep"}, EnsureUniqueName: true})
        c.Assert(err, check.IsNil)
        defer s.handler.federation.CollectionDelete(ctx, arvados.DeleteOptions{UUID: coll.UUID})
-       db, err := s.handler.db(s.ctx)
+       db, err := s.handler.dbConnector.GetDB(s.ctx)
        c.Assert(err, check.IsNil)
        _, err = db.ExecContext(s.ctx, `update collections set trash_at = $1, delete_at = $2 where uuid = $3`, time.Now().UTC().Add(time.Second/10), time.Now().UTC().Add(time.Hour), coll.UUID)
        c.Assert(err, check.IsNil)
@@ -550,7 +553,7 @@ func (s *HandlerSuite) TestLogActivity(c *check.C) {
                        c.Assert(err, check.IsNil)
                }
        }
-       db, err := s.handler.db(s.ctx)
+       db, err := s.handler.dbConnector.GetDB(s.ctx)
        c.Assert(err, check.IsNil)
        for _, userUUID := range []string{arvadostest.ActiveUserUUID, arvadostest.SpectatorUserUUID} {
                var rows int
index 9a7b0814cee7477dc7b506921aa99be0ef34ed77..99e7aec0b66c4dbed9462c498ece37e38971157b 100644
@@ -20,10 +20,16 @@ func (h *Handler) periodicWorker(workerName string, interval time.Duration, lock
                logger.Debugf("interval is %v, not running worker", interval)
                return
        }
-       locker.Lock(ctx, h.db)
+       if !locker.Lock(ctx, h.dbConnector.GetDB) {
+               // context canceled
+               return
+       }
        defer locker.Unlock()
        for time.Sleep(interval); ctx.Err() == nil; time.Sleep(interval) {
-               locker.Check()
+               if !locker.Check() {
+                       // context canceled
+                       return
+               }
                err := run(ctx)
                if err != nil {
                        logger.WithError(err).Infof("%s failed", workerName)
@@ -41,7 +47,7 @@ func (h *Handler) trashSweepWorker() {
 
 func (h *Handler) containerLogSweepWorker() {
        h.periodicWorker("container log sweep", h.Cluster.Containers.Logging.SweepInterval.Duration(), dblock.ContainerLogSweep, func(ctx context.Context) error {
-               db, err := h.db(ctx)
+               db, err := h.dbConnector.GetDB(ctx)
                if err != nil {
                        return err
                }
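
periodicWorker (above) is the controller-side consumer of the new boolean returns: on shutdown it now exits cleanly instead of retrying forever. A hedged sketch of how a further periodic job could plug into the same pattern; the worker name and placeholder query are hypothetical, and it borrows the existing TrashSweep lock key purely for illustration:

    // Hypothetical extra worker, in package controller alongside
    // trashSweepWorker and containerLogSweepWorker.
    func (h *Handler) exampleSweepWorker() {
        h.periodicWorker("example sweep", h.Cluster.Collections.TrashSweepInterval.Duration(),
            dblock.TrashSweep, func(ctx context.Context) error {
                db, err := h.dbConnector.GetDB(ctx)
                if err != nil {
                    return err
                }
                _, err = db.ExecContext(ctx, `SELECT 1`) // placeholder for real sweep SQL
                return err
            })
    }
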
index a76420860604b9a6fb9823bdc6b3775c70f85ff4..2a05096ce18b7430e7e1e487dd5d710024ac9193 100644
@@ -10,6 +10,7 @@ import (
        "sync"
 
        "git.arvados.org/arvados.git/lib/controller/api"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "github.com/jmoiron/sqlx"
 
@@ -142,3 +143,33 @@ func CurrentTx(ctx context.Context) (*sqlx.Tx, error) {
        })
        return txn.tx, txn.err
 }
+
+var errDBConnection = errors.New("database connection error")
+
+type DBConnector struct {
+       PostgreSQL arvados.PostgreSQL
+       pgdb       *sqlx.DB
+       mtx        sync.Mutex
+}
+
+func (dbc *DBConnector) GetDB(ctx context.Context) (*sqlx.DB, error) {
+       dbc.mtx.Lock()
+       defer dbc.mtx.Unlock()
+       if dbc.pgdb != nil {
+               return dbc.pgdb, nil
+       }
+       db, err := sqlx.Open("postgres", dbc.PostgreSQL.Connection.String())
+       if err != nil {
+               ctxlog.FromContext(ctx).WithError(err).Error("postgresql connect failed")
+               return nil, errDBConnection
+       }
+       if p := dbc.PostgreSQL.ConnectionPool; p > 0 {
+               db.SetMaxOpenConns(p)
+       }
+       if err := db.Ping(); err != nil {
+               ctxlog.FromContext(ctx).WithError(err).Error("postgresql connect succeeded but ping failed")
+               return nil, errDBConnection
+       }
+       dbc.pgdb = db
+       return db, nil
+}
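
DBConnector above is the former Handler.db moved into ctrlctx, so controller, the dispatchers, and keep-balance can share one lazily-opened pool. A hedged usage sketch; the config-loading lines mirror the test setup elsewhere in this commit:

    package main

    import (
        "context"
        "os"

        "git.arvados.org/arvados.git/lib/config"
        "git.arvados.org/arvados.git/lib/ctrlctx"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        _ "github.com/lib/pq"
    )

    func main() {
        logger := ctxlog.New(os.Stderr, "text", "info")
        cfg, err := config.NewLoader(nil, logger).Load()
        if err != nil {
            logger.Fatal(err)
        }
        cluster, err := cfg.GetCluster("")
        if err != nil {
            logger.Fatal(err)
        }
        dbc := ctrlctx.DBConnector{PostgreSQL: cluster.PostgreSQL}
        // First call opens and pings the pool; later calls return the
        // cached *sqlx.DB. The internal mutex makes GetDB safe for
        // concurrent use.
        db, err := dbc.GetDB(context.Background())
        if err != nil {
            logger.Fatal(err) // GetDB reports the generic errDBConnection
        }
        defer db.Close()
    }
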
index ae91a710e395295f47a34cb5645f980021e79021..3403c50c972987e7f6f21a927a6db592fac9f6fc 100644
@@ -15,6 +15,8 @@ import (
        "time"
 
        "git.arvados.org/arvados.git/lib/cloud"
+       "git.arvados.org/arvados.git/lib/controller/dblock"
+       "git.arvados.org/arvados.git/lib/ctrlctx"
        "git.arvados.org/arvados.git/lib/dispatchcloud/container"
        "git.arvados.org/arvados.git/lib/dispatchcloud/scheduler"
        "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
@@ -53,6 +55,7 @@ type dispatcher struct {
        Registry      *prometheus.Registry
        InstanceSetID cloud.InstanceSetID
 
+       dbConnector ctrlctx.DBConnector
        logger      logrus.FieldLogger
        instanceSet cloud.InstanceSet
        pool        pool
@@ -118,6 +121,7 @@ func (disp *dispatcher) setup() {
 
 func (disp *dispatcher) initialize() {
        disp.logger = ctxlog.FromContext(disp.Context)
+       disp.dbConnector = ctrlctx.DBConnector{PostgreSQL: disp.Cluster.PostgreSQL}
 
        disp.ArvClient.AuthToken = disp.AuthToken
 
@@ -143,6 +147,7 @@ func (disp *dispatcher) initialize() {
        if err != nil {
                disp.logger.Fatalf("error initializing driver: %s", err)
        }
+       dblock.Dispatch.Lock(disp.Context, disp.dbConnector.GetDB)
        disp.instanceSet = instanceSet
        disp.pool = worker.NewPool(disp.logger, disp.ArvClient, disp.Registry, disp.InstanceSetID, disp.instanceSet, disp.newExecutor, disp.sshKey.PublicKey(), disp.Cluster)
        disp.queue = container.NewQueue(disp.logger, disp.Registry, disp.typeChooser, disp.ArvClient)
@@ -175,6 +180,7 @@ func (disp *dispatcher) initialize() {
 }
 
 func (disp *dispatcher) run() {
+       defer dblock.Dispatch.Unlock()
        defer close(disp.stopped)
        defer disp.instanceSet.Stop()
        defer disp.pool.Stop()
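
The cloud dispatcher takes dblock.Dispatch in initialize() and releases it via the defer in run(); the LSF and slurm dispatchers below do the same around their run loops. Because all three share key 10005, at most one dispatcher runs against a given cluster, and a second process simply blocks at startup until the first exits. A sketch of that shared pattern in isolation; startDispatcher is a hypothetical wrapper, not part of this commit:

    import (
        "context"

        "git.arvados.org/arvados.git/lib/controller/dblock"
        "git.arvados.org/arvados.git/lib/ctrlctx"
    )

    // startDispatcher blocks in Lock until any other dispatcher holding
    // key 10005 exits (or its database session dies), then runs the
    // dispatcher body and releases the lock on return.
    func startDispatcher(ctx context.Context, dbc *ctrlctx.DBConnector, run func()) {
        dblock.Dispatch.Lock(ctx, dbc.GetDB)
        defer dblock.Dispatch.Unlock()
        run()
    }
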
index 829a053636d5dc07abaac1c649810c5416e09fb6..2d486da5fd5a9d4aafbbc0b82f06d0c20c7f91e8 100644
@@ -15,6 +15,7 @@ import (
        "sync"
        "time"
 
+       "git.arvados.org/arvados.git/lib/config"
        "git.arvados.org/arvados.git/lib/dispatchcloud/test"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/arvadostest"
@@ -49,8 +50,16 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
                MinTimeBetweenCreateCalls: time.Millisecond,
        }
 
+       // We need the postgresql connection info from the integration
+       // test config.
+       cfg, err := config.NewLoader(nil, ctxlog.FromContext(s.ctx)).Load()
+       c.Assert(err, check.IsNil)
+       testcluster, err := cfg.GetCluster("")
+       c.Assert(err, check.IsNil)
+
        s.cluster = &arvados.Cluster{
                ManagementToken: "test-management-token",
+               PostgreSQL:      testcluster.PostgreSQL,
                Containers: arvados.ContainersConfig{
                        CrunchRunCommand:       "crunch-run",
                        CrunchRunArgumentsList: []string{"--foo", "--extra='args'"},
@@ -184,12 +193,18 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
        err := s.disp.CheckHealth()
        c.Check(err, check.IsNil)
 
-       select {
-       case <-done:
-               c.Logf("containers finished (%s), waiting for instances to shutdown and queue to clear", time.Since(start))
-       case <-time.After(10 * time.Second):
-               c.Fatalf("timed out; still waiting for %d containers: %q", len(waiting), waiting)
+       for len(waiting) > 0 {
+               waswaiting := len(waiting)
+               select {
+               case <-done:
+                       // loop will end because len(waiting)==0
+               case <-time.After(3 * time.Second):
+                       if len(waiting) >= waswaiting {
+                               c.Fatalf("timed out; no progress in 3s while waiting for %d containers: %q", len(waiting), waiting)
+                       }
+               }
        }
+       c.Logf("containers finished (%s), waiting for instances to shutdown and queue to clear", time.Since(start))
 
        deadline := time.Now().Add(5 * time.Second)
        for range time.NewTicker(10 * time.Millisecond).C {
index d362f66d14b3ee12b9a4fb6b197b9a34747d944c..d1408d23cb1a4e3c2274f40d2f02b66bda29e82d 100644
@@ -18,6 +18,8 @@ import (
        "time"
 
        "git.arvados.org/arvados.git/lib/cmd"
+       "git.arvados.org/arvados.git/lib/controller/dblock"
+       "git.arvados.org/arvados.git/lib/ctrlctx"
        "git.arvados.org/arvados.git/lib/dispatchcloud"
        "git.arvados.org/arvados.git/lib/service"
        "git.arvados.org/arvados.git/sdk/go/arvados"
@@ -58,6 +60,7 @@ type dispatcher struct {
        Registry  *prometheus.Registry
 
        logger        logrus.FieldLogger
+       dbConnector   ctrlctx.DBConnector
        lsfcli        lsfcli
        lsfqueue      lsfqueue
        arvDispatcher *dispatch.Dispatcher
@@ -73,7 +76,9 @@ type dispatcher struct {
 func (disp *dispatcher) Start() {
        disp.initOnce.Do(func() {
                disp.init()
+               dblock.Dispatch.Lock(context.Background(), disp.dbConnector.GetDB)
                go func() {
+                       defer dblock.Dispatch.Unlock()
                        disp.checkLsfQueueForOrphans()
                        err := disp.arvDispatcher.Run(disp.Context)
                        if err != nil {
@@ -125,6 +130,7 @@ func (disp *dispatcher) init() {
                lsfcli: &disp.lsfcli,
        }
        disp.ArvClient.AuthToken = disp.AuthToken
+       disp.dbConnector = ctrlctx.DBConnector{PostgreSQL: disp.Cluster.PostgreSQL}
        disp.stop = make(chan struct{}, 1)
        disp.stopped = make(chan struct{})
 
index ac394e114962ddf05d2e71e94cc4bb1ff46c4780..1c0f6ad28f5ba7b6d20bcc0cadbef0fb87fec634 100644
@@ -19,6 +19,8 @@ import (
        "time"
 
        "git.arvados.org/arvados.git/lib/cmd"
+       "git.arvados.org/arvados.git/lib/controller/dblock"
+       "git.arvados.org/arvados.git/lib/ctrlctx"
        "git.arvados.org/arvados.git/lib/dispatchcloud"
        "git.arvados.org/arvados.git/lib/service"
        "git.arvados.org/arvados.git/sdk/go/arvados"
@@ -55,10 +57,11 @@ const initialNiceValue int64 = 10000
 
 type Dispatcher struct {
        *dispatch.Dispatcher
-       logger  logrus.FieldLogger
-       cluster *arvados.Cluster
-       sqCheck *SqueueChecker
-       slurm   Slurm
+       logger      logrus.FieldLogger
+       cluster     *arvados.Cluster
+       sqCheck     *SqueueChecker
+       slurm       Slurm
+       dbConnector ctrlctx.DBConnector
 
        done chan struct{}
        err  error
@@ -90,6 +93,7 @@ func (disp *Dispatcher) configure() error {
        disp.Client.APIHost = disp.cluster.Services.Controller.ExternalURL.Host
        disp.Client.AuthToken = disp.cluster.SystemRootToken
        disp.Client.Insecure = disp.cluster.TLS.Insecure
+       disp.dbConnector = ctrlctx.DBConnector{PostgreSQL: disp.cluster.PostgreSQL}
 
        if disp.Client.APIHost != "" || disp.Client.AuthToken != "" {
                // Copy real configs into env vars so [a]
@@ -137,6 +141,8 @@ func (disp *Dispatcher) setup() {
 }
 
 func (disp *Dispatcher) run() error {
+       dblock.Dispatch.Lock(context.Background(), disp.dbConnector.GetDB)
+       defer dblock.Dispatch.Unlock()
        defer disp.sqCheck.Stop()
 
        if disp.cluster != nil && len(disp.cluster.InstanceTypes) > 0 {
index 1dedb409a4a2de5c4f414959b024e291007d42b1..9f581751d938baf8d0f8fdc726bfae92e1f6877d 100644
@@ -23,7 +23,9 @@ import (
        "syscall"
        "time"
 
+       "git.arvados.org/arvados.git/lib/controller/dblock"
        "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "git.arvados.org/arvados.git/sdk/go/keepclient"
        "github.com/jmoiron/sqlx"
        "github.com/sirupsen/logrus"
@@ -67,16 +69,19 @@ type Balancer struct {
 // subsequent balance operation.
 //
 // Run should only be called once on a given Balancer object.
-//
-// Typical usage:
-//
-//   runOptions, err = (&Balancer{}).Run(config, runOptions)
-func (bal *Balancer) Run(client *arvados.Client, cluster *arvados.Cluster, runOptions RunOptions) (nextRunOptions RunOptions, err error) {
+func (bal *Balancer) Run(ctx context.Context, client *arvados.Client, cluster *arvados.Cluster, runOptions RunOptions) (nextRunOptions RunOptions, err error) {
        nextRunOptions = runOptions
 
+       ctxlog.FromContext(ctx).Info("acquiring active lock")
+       if !dblock.KeepBalanceActive.Lock(ctx, func(context.Context) (*sqlx.DB, error) { return bal.DB, nil }) {
+               // context canceled
+               return
+       }
+       defer dblock.KeepBalanceActive.Unlock()
+
        defer bal.time("sweep", "wall clock time to run one full sweep")()
 
-       ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(cluster.Collections.BalanceTimeout.Duration()))
+       ctx, cancel := context.WithDeadline(ctx, time.Now().Add(cluster.Collections.BalanceTimeout.Duration()))
        defer cancel()
 
        var lbFile *os.File
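
Two locks now cooperate in keep-balance: KeepBalanceService is held by the long-running service loop for its entire lifetime (see server.go below), while KeepBalanceActive is held only for the duration of each sweep, so a "keep-balance -once" invocation and the service loop can never balance concurrently. A hedged sketch isolating the sweep-level locking added to Run above; withSweepLock is a hypothetical helper, and it reuses the Balancer's own DB handle instead of opening a new pool:

    // Hypothetical helper in package keepbalance, mirroring the lock
    // handling that Balancer.Run now performs inline.
    func (bal *Balancer) withSweepLock(ctx context.Context, sweep func() error) error {
        getdb := func(context.Context) (*sqlx.DB, error) { return bal.DB, nil }
        if !dblock.KeepBalanceActive.Lock(ctx, getdb) {
            return ctx.Err() // canceled while waiting for another sweep
        }
        defer dblock.KeepBalanceActive.Unlock()
        return sweep()
    }
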
index 2db7bea173c17dc41f6943b4fe579cbc7d15a24f..4772da55a2d6dddc79acff891dee1034781f7582 100644
@@ -6,6 +6,7 @@ package keepbalance
 
 import (
        "bytes"
+       "context"
        "encoding/json"
        "fmt"
        "io"
@@ -372,7 +373,7 @@ func (s *runSuite) TestRefuseZeroCollections(c *check.C) {
        trashReqs := s.stub.serveKeepstoreTrash()
        pullReqs := s.stub.serveKeepstorePull()
        srv := s.newServer(&opts)
-       _, err = srv.runOnce()
+       _, err = srv.runOnce(context.Background())
        c.Check(err, check.ErrorMatches, "received zero collections")
        c.Check(trashReqs.Count(), check.Equals, 4)
        c.Check(pullReqs.Count(), check.Equals, 0)
@@ -391,7 +392,7 @@ func (s *runSuite) TestRefuseNonAdmin(c *check.C) {
        trashReqs := s.stub.serveKeepstoreTrash()
        pullReqs := s.stub.serveKeepstorePull()
        srv := s.newServer(&opts)
-       _, err := srv.runOnce()
+       _, err := srv.runOnce(context.Background())
        c.Check(err, check.ErrorMatches, "current user .* is not .* admin user")
        c.Check(trashReqs.Count(), check.Equals, 0)
        c.Check(pullReqs.Count(), check.Equals, 0)
@@ -417,7 +418,7 @@ func (s *runSuite) TestRefuseSameDeviceDifferentVolumes(c *check.C) {
        trashReqs := s.stub.serveKeepstoreTrash()
        pullReqs := s.stub.serveKeepstorePull()
        srv := s.newServer(&opts)
-       _, err := srv.runOnce()
+       _, err := srv.runOnce(context.Background())
        c.Check(err, check.ErrorMatches, "cannot continue with config errors.*")
        c.Check(trashReqs.Count(), check.Equals, 0)
        c.Check(pullReqs.Count(), check.Equals, 0)
@@ -442,7 +443,7 @@ func (s *runSuite) TestWriteLostBlocks(c *check.C) {
        s.stub.serveKeepstorePull()
        srv := s.newServer(&opts)
        c.Assert(err, check.IsNil)
-       _, err = srv.runOnce()
+       _, err = srv.runOnce(context.Background())
        c.Check(err, check.IsNil)
        lost, err := ioutil.ReadFile(lostf.Name())
        c.Assert(err, check.IsNil)
@@ -463,7 +464,7 @@ func (s *runSuite) TestDryRun(c *check.C) {
        trashReqs := s.stub.serveKeepstoreTrash()
        pullReqs := s.stub.serveKeepstorePull()
        srv := s.newServer(&opts)
-       bal, err := srv.runOnce()
+       bal, err := srv.runOnce(context.Background())
        c.Check(err, check.IsNil)
        for _, req := range collReqs.reqs {
                c.Check(req.Form.Get("include_trash"), check.Equals, "true")
@@ -493,7 +494,7 @@ func (s *runSuite) TestCommit(c *check.C) {
        trashReqs := s.stub.serveKeepstoreTrash()
        pullReqs := s.stub.serveKeepstorePull()
        srv := s.newServer(&opts)
-       bal, err := srv.runOnce()
+       bal, err := srv.runOnce(context.Background())
        c.Check(err, check.IsNil)
        c.Check(trashReqs.Count(), check.Equals, 8)
        c.Check(pullReqs.Count(), check.Equals, 4)
@@ -533,13 +534,14 @@ func (s *runSuite) TestRunForever(c *check.C) {
        trashReqs := s.stub.serveKeepstoreTrash()
        pullReqs := s.stub.serveKeepstorePull()
 
-       stop := make(chan interface{})
+       ctx, cancel := context.WithCancel(context.Background())
+       defer cancel()
        s.config.Collections.BalancePeriod = arvados.Duration(time.Millisecond)
        srv := s.newServer(&opts)
 
        done := make(chan bool)
        go func() {
-               srv.runForever(stop)
+               srv.runForever(ctx)
                close(done)
        }()
 
@@ -550,7 +552,7 @@ func (s *runSuite) TestRunForever(c *check.C) {
        for t0 := time.Now(); pullReqs.Count() < 16 && time.Since(t0) < 10*time.Second; {
                time.Sleep(time.Millisecond)
        }
-       stop <- true
+       cancel()
        <-done
        c.Check(pullReqs.Count() >= 16, check.Equals, true)
        c.Check(trashReqs.Count(), check.Equals, pullReqs.Count()+4)
index 3cfb5cdeda5039fb37f414f5cd0b095eea0e772d..42463a002a5ec73652f7f7ef6f00f8a8c4fb44a1 100644
@@ -6,6 +6,7 @@ package keepbalance
 
 import (
        "bytes"
+       "context"
        "io"
        "os"
        "strings"
@@ -97,7 +98,7 @@ func (s *integrationSuite) TestBalanceAPIFixtures(c *check.C) {
                        Logger:  logger,
                        Metrics: newMetrics(prometheus.NewRegistry()),
                }
-               nextOpts, err := bal.Run(s.client, s.config, opts)
+               nextOpts, err := bal.Run(context.Background(), s.client, s.config, opts)
                c.Check(err, check.IsNil)
                c.Check(nextOpts.SafeRendezvousState, check.Not(check.Equals), "")
                c.Check(nextOpts.CommitPulls, check.Equals, true)
index f0b0df5bd331d6a97a2cdaab0a8d968cfdbfc550..b016db22ffe67f6316f1e4f537bfa680f135ecad 100644
@@ -112,7 +112,7 @@ func (command) RunCommand(prog string, args []string, stdin io.Reader, stdout, s
                                Routes: health.Routes{"ping": srv.CheckHealth},
                        }
 
-                       go srv.run()
+                       go srv.run(ctx)
                        return srv
                }).RunCommand(prog, args, stdin, stdout, stderr)
 }
index e485f5b2061f28134306d1d897b22cb62e4190e9..fd53497f789ed4f5f1db458f99e69f8e7f10c1a7 100644
@@ -5,12 +5,14 @@
 package keepbalance
 
 import (
+       "context"
        "net/http"
        "os"
        "os/signal"
        "syscall"
        "time"
 
+       "git.arvados.org/arvados.git/lib/controller/dblock"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "github.com/jmoiron/sqlx"
        "github.com/sirupsen/logrus"
@@ -62,12 +64,12 @@ func (srv *Server) Done() <-chan struct{} {
        return nil
 }
 
-func (srv *Server) run() {
+func (srv *Server) run(ctx context.Context) {
        var err error
        if srv.RunOptions.Once {
-               _, err = srv.runOnce()
+               _, err = srv.runOnce(ctx)
        } else {
-               err = srv.runForever(nil)
+               err = srv.runForever(ctx)
        }
        if err != nil {
                srv.Logger.Error(err)
@@ -77,7 +79,7 @@ func (srv *Server) run() {
        }
 }
 
-func (srv *Server) runOnce() (*Balancer, error) {
+func (srv *Server) runOnce(ctx context.Context) (*Balancer, error) {
        bal := &Balancer{
                DB:             srv.DB,
                Logger:         srv.Logger,
@@ -86,13 +88,12 @@ func (srv *Server) runOnce() (*Balancer, error) {
                LostBlocksFile: srv.Cluster.Collections.BlobMissingReport,
        }
        var err error
-       srv.RunOptions, err = bal.Run(srv.ArvClient, srv.Cluster, srv.RunOptions)
+       srv.RunOptions, err = bal.Run(ctx, srv.ArvClient, srv.Cluster, srv.RunOptions)
        return bal, err
 }
 
-// RunForever runs forever, or (for testing purposes) until the given
-// stop channel is ready to receive.
-func (srv *Server) runForever(stop <-chan interface{}) error {
+// runForever runs forever, or until ctx is canceled.
+func (srv *Server) runForever(ctx context.Context) error {
        logger := srv.Logger
 
        ticker := time.NewTicker(time.Duration(srv.Cluster.Collections.BalancePeriod))
@@ -102,6 +103,10 @@ func (srv *Server) runForever(stop <-chan interface{}) error {
        sigUSR1 := make(chan os.Signal)
        signal.Notify(sigUSR1, syscall.SIGUSR1)
 
+       logger.Info("acquiring service lock")
+       dblock.KeepBalanceService.Lock(ctx, func(context.Context) (*sqlx.DB, error) { return srv.DB, nil })
+       defer dblock.KeepBalanceService.Unlock()
+
        logger.Printf("starting up: will scan every %v and on SIGUSR1", srv.Cluster.Collections.BalancePeriod)
 
        for {
@@ -110,7 +115,11 @@ func (srv *Server) runForever(stop <-chan interface{}) error {
                        logger.Print("=======  Consider using -commit-pulls and -commit-trash flags.")
                }
 
-               _, err := srv.runOnce()
+               if !dblock.KeepBalanceService.Check() {
+                       // context canceled
+                       return nil
+               }
+               _, err := srv.runOnce(ctx)
                if err != nil {
                        logger.Print("run failed: ", err)
                } else {
@@ -118,7 +127,7 @@ func (srv *Server) runForever(stop <-chan interface{}) error {
                }
 
                select {
-               case <-stop:
+               case <-ctx.Done():
                        signal.Stop(sigUSR1)
                        return nil
                case <-ticker.C: