From e86ee860d99036ff4ac61f6635b738f3a408e446 Mon Sep 17 00:00:00 2001 From: Eric Biagiotti Date: Tue, 1 Oct 2019 16:23:38 -0400 Subject: [PATCH] 14714: Removes metrics serving - Updates test gathering of metrics - Simplifies balancer keep service discovery - Moves keep-balance info from keepstore install docs to the new admin page - Keepbalance now exits correctly when using -once Arvados-DCO-1.1-Signed-off-by: Eric Biagiotti --- doc/admin/keep-balance.html.textile.liquid | 8 ++++ .../install-keepstore.html.textile.liquid | 10 +---- services/keep-balance/balance.go | 13 ++---- services/keep-balance/balance_run_test.go | 45 ++++++++++--------- services/keep-balance/server.go | 31 +++---------- 5 files changed, 44 insertions(+), 63 deletions(-) diff --git a/doc/admin/keep-balance.html.textile.liquid b/doc/admin/keep-balance.html.textile.liquid index 1a77cdd160..5af0a26880 100644 --- a/doc/admin/keep-balance.html.textile.liquid +++ b/doc/admin/keep-balance.html.textile.liquid @@ -14,6 +14,14 @@ This page describes how to balance keepstore servers using keep-balance. Keep-ba See "the Keep-balance install docs":{{site.baseurl}}/install/install-keep-balance.html for installation instructions. +h3. Data deletion + +The keep-balance service determines which blocks are candidates for deletion and instructs the keepstore to move those blocks to the trash. When a block is newly written, it is protected from deletion for the duration in @BlobSigningTTL@. During this time, it cannot be trashed or deleted. + +If keep-balance instructs keepstore to trash a block which is older than @BlobSigningTTL@, and @BlobTrashLifetime@ is non-zero, the block will be moved to "trash". A block which is in the trash is no longer accessible by read requests, but has not yet been permanently deleted. Blocks which are in the trash may be recovered using the "untrash" API endpoint. Blocks are permanently deleted after they have been in the trash for the duration in @BlobTrashLifetime@. + +Keep-balance is also responsible for balancing the distribution of blocks across keepstore servers by asking servers to pull blocks from other servers (as determined by their "storage class":{{site.baseurl}}/admin/storage-classes.html and "rendezvous hashing order":{{site.baseurl}}/api/storage.html). Pulling a block makes a copy. If a block is overreplicated (i.e. there are excess copies) after pulling, it will be subsequently trashed and deleted on the original server, subject to @BlobTrash@ and @BlobTrashLifetime@ settings. + h3. Scanning By default, keep-balance operates periodically, i.e. do a scan/balance operation, sleep, repeat. diff --git a/doc/install/install-keepstore.html.textile.liquid b/doc/install/install-keepstore.html.textile.liquid index a16de51627..71c1cb639e 100644 --- a/doc/install/install-keepstore.html.textile.liquid +++ b/doc/install/install-keepstore.html.textile.liquid @@ -79,15 +79,9 @@ Add or update the following sections of @/etc/arvados/config.yml@ as needed. Ref -h3. Notes on storage management +h3. Note on storage management -On its own, a keepstore server never deletes data. The "keep-balance":install-keep-balance.html service determines which blocks are candidates for deletion and instructs the keepstore to move those blocks to the trash. - -When a block is newly written, it is protected from deletion for the duration in @BlobSigningTTL@. During this time, it cannot be trashed or deleted. - -If keep-balance instructs keepstore to trash a block which is older than @BlobSigningTTL@, and @BlobTrashLifetime@ is non-zero, the block will be moved to "trash". A block which is in the trash is no longer accessible by read requests, but has not yet been permanently deleted. Blocks which are in the trash may be recovered using the "untrash" API endpoint. Blocks are permanently deleted after they have been in the trash for the duration in @BlobTrashLifetime@. - -Keep-balance is also responsible for balancing the distribution of blocks across keepstore servers by asking servers to pull blocks from other servers (as determined by their "storage class":{{site.baseurl}}/admin/storage-classes.html and "rendezvous hashing order":{{site.baseurl}}/api/storage.html). Pulling a block makes a copy. If a block is overreplicated (i.e. there are excess copies) after pulling, it will be subsequently trashed and deleted on the original server, subject to @BlobTrash@ and @BlobTrashLifetime@ settings. +On its own, a keepstore server never deletes data. Instead, the keep-balance service determines which blocks are candidates for deletion and instructs the keepstore to move those blocks to the trash. Please see the "Balancing Keep servers":{{site.baseurl}}/admin/keep-balance.html for more details. h3. Configure storage volumes diff --git a/services/keep-balance/balance.go b/services/keep-balance/balance.go index 3471d50fe3..e50b0b505a 100644 --- a/services/keep-balance/balance.go +++ b/services/keep-balance/balance.go @@ -95,8 +95,7 @@ func (bal *Balancer) Run(client *arvados.Client, cluster *arvados.Cluster, runOp bal.lostBlocks = ioutil.Discard } - diskService := []string{"disk"} - err = bal.DiscoverKeepServices(client, diskService) + err = bal.DiscoverKeepServices(client) if err != nil { return } @@ -173,15 +172,11 @@ func (bal *Balancer) SetKeepServices(srvList arvados.KeepServiceList) error { // DiscoverKeepServices sets the list of KeepServices by calling the // API to get a list of all services, and selecting the ones whose -// ServiceType is in okTypes. -func (bal *Balancer) DiscoverKeepServices(c *arvados.Client, okTypes []string) error { +// ServiceType is "disk" +func (bal *Balancer) DiscoverKeepServices(c *arvados.Client) error { bal.KeepServices = make(map[string]*KeepService) - ok := make(map[string]bool) - for _, t := range okTypes { - ok[t] = true - } return c.EachKeepService(func(srv arvados.KeepService) error { - if ok[srv.ServiceType] { + if srv.ServiceType == "disk" { bal.KeepServices[srv.UUID] = &KeepService{ KeepService: srv, ChangeSet: &ChangeSet{}, diff --git a/services/keep-balance/balance_run_test.go b/services/keep-balance/balance_run_test.go index 5e2e752484..a3abc9f96a 100644 --- a/services/keep-balance/balance_run_test.go +++ b/services/keep-balance/balance_run_test.go @@ -5,6 +5,7 @@ package main import ( + "bytes" "encoding/json" "fmt" "io" @@ -21,6 +22,7 @@ import ( "git.curoverse.com/arvados.git/sdk/go/arvadostest" "git.curoverse.com/arvados.git/sdk/go/ctxlog" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/expfmt" check "gopkg.in/check.v1" ) @@ -319,7 +321,6 @@ func (s *runSuite) newServer(options *RunOptions) *Server { Logger: options.Logger, Dumper: options.Dumper, } - srv.setup() return srv } @@ -492,12 +493,13 @@ func (s *runSuite) TestCommit(c *check.C) { c.Assert(err, check.IsNil) c.Check(string(lost), check.Equals, "") - metrics := s.getMetrics(c, srv) - c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_total_bytes 15\n.*`) - c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_sum [0-9\.]+\n.*`) - c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`) - c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio 1\.5\n.*`) - c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio 1\.5\n.*`) + buf, err := s.getMetrics(c, srv) + c.Check(err, check.IsNil) + c.Check(buf, check.Matches, `(?ms).*\narvados_keep_total_bytes 15\n.*`) + c.Check(buf, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_sum [0-9\.]+\n.*`) + c.Check(buf, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`) + c.Check(buf, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio 1\.5\n.*`) + c.Check(buf, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio 1\.5\n.*`) } func (s *runSuite) TestRunForever(c *check.C) { @@ -537,21 +539,24 @@ func (s *runSuite) TestRunForever(c *check.C) { <-done c.Check(pullReqs.Count() >= 16, check.Equals, true) c.Check(trashReqs.Count(), check.Equals, pullReqs.Count()+4) - c.Check(s.getMetrics(c, srv), check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count `+fmt.Sprintf("%d", pullReqs.Count()/4)+`\n.*`) + + buf, err := s.getMetrics(c, srv) + c.Check(err, check.IsNil) + c.Check(buf, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count `+fmt.Sprintf("%d", pullReqs.Count()/4)+`\n.*`) } -func (s *runSuite) getMetrics(c *check.C, srv *Server) string { - req := httptest.NewRequest("GET", "/metrics", nil) - resp := httptest.NewRecorder() - srv.ServeHTTP(resp, req) - c.Check(resp.Code, check.Equals, http.StatusUnauthorized) +func (s *runSuite) getMetrics(c *check.C, srv *Server) (*bytes.Buffer, error) { + mfs, err := srv.Metrics.reg.Gather() + if err != nil { + return nil, err + } - req = httptest.NewRequest("GET", "/metrics?api_token=xyzzy", nil) - resp = httptest.NewRecorder() - srv.ServeHTTP(resp, req) - c.Check(resp.Code, check.Equals, http.StatusOK) + var buf bytes.Buffer + for _, mf := range mfs { + if _, err := expfmt.MetricFamilyToText(&buf, mf); err != nil { + return nil, err + } + } - buf, err := ioutil.ReadAll(resp.Body) - c.Check(err, check.IsNil) - return string(buf) + return &buf, nil } diff --git a/services/keep-balance/server.go b/services/keep-balance/server.go index 9f192d6355..b6806d552a 100644 --- a/services/keep-balance/server.go +++ b/services/keep-balance/server.go @@ -12,9 +12,6 @@ import ( "time" "git.curoverse.com/arvados.git/sdk/go/arvados" - "git.curoverse.com/arvados.git/sdk/go/auth" - "github.com/julienschmidt/httprouter" - "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/sirupsen/logrus" ) @@ -40,43 +37,22 @@ type RunOptions struct { } type Server struct { + http.Handler + Cluster *arvados.Cluster ArvClient *arvados.Client RunOptions RunOptions Metrics *metrics - httpHandler http.Handler - Logger logrus.FieldLogger Dumper logrus.FieldLogger } -// ServeHTTP implements service.Handler. -func (srv *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) { - srv.httpHandler.ServeHTTP(w, r) -} - // CheckHealth implements service.Handler. func (srv *Server) CheckHealth() error { return nil } -func (srv *Server) setup() { - if srv.Cluster.ManagementToken == "" { - srv.httpHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "Management API authentication is not configured", http.StatusForbidden) - }) - } else { - mux := httprouter.New() - metricsH := promhttp.HandlerFor(srv.Metrics.reg, promhttp.HandlerOpts{ - ErrorLog: srv.Logger, - }) - mux.Handler("GET", "/metrics", metricsH) - mux.Handler("GET", "/metrics.json", metricsH) - srv.httpHandler = auth.RequireLiteralToken(srv.Cluster.ManagementToken, mux) - } -} - func (srv *Server) run() { var err error if srv.RunOptions.Once { @@ -86,6 +62,9 @@ func (srv *Server) run() { } if err != nil { srv.Logger.Error(err) + os.Exit(1) + } else { + os.Exit(0) } } -- 2.30.2