//
// SPDX-License-Identifier: AGPL-3.0
-package main
+package keepbalance
import (
- "encoding/json"
+ "bytes"
+ "context"
"flag"
"fmt"
- "log"
- "net/http"
- "os"
- "os/signal"
- "syscall"
- "time"
-
- "git.curoverse.com/arvados.git/sdk/go/arvados"
- "git.curoverse.com/arvados.git/sdk/go/config"
+ "io"
+ _ "net/http/pprof"
+
+ "git.arvados.org/arvados.git/lib/cmd"
+ "git.arvados.org/arvados.git/lib/config"
+ "git.arvados.org/arvados.git/lib/service"
+ "git.arvados.org/arvados.git/sdk/go/arvados"
+ "git.arvados.org/arvados.git/sdk/go/ctxlog"
+ "git.arvados.org/arvados.git/sdk/go/health"
+ "github.com/jmoiron/sqlx"
+ _ "github.com/lib/pq"
+ "github.com/prometheus/client_golang/prometheus"
)
-var version = "dev"
-
-const defaultConfigPath = "/etc/arvados/keep-balance/keep-balance.yml"
-
-// Config specifies site configuration, like API credentials and the
-// choice of which servers are to be balanced.
-//
-// Config is loaded from a JSON config file (see usage()).
-type Config struct {
- // Arvados API endpoint and credentials.
- Client arvados.Client
-
- // List of service types (e.g., "disk") to balance.
- KeepServiceTypes []string
-
- KeepServiceList arvados.KeepServiceList
+type command struct{}
- // How often to check
- RunPeriod arvados.Duration
+var Command = command{}
- // Number of collections to request in each API call
- CollectionBatchSize int
-
- // Max collections to buffer in memory (bigger values consume
- // more memory, but can reduce store-and-forward latency when
- // fetching pages)
- CollectionBuffers int
-
- // Timeout for outgoing http request/response cycle.
- RequestTimeout arvados.Duration
-}
-
-// RunOptions controls runtime behavior. The flags/options that belong
-// here are the ones that are useful for interactive use. For example,
-// "CommitTrash" is a runtime option rather than a config item because
-// it invokes a troubleshooting feature rather than expressing how
-// balancing is meant to be done at a given site.
-//
-// RunOptions fields are controlled by command line flags.
-type RunOptions struct {
- Once bool
- CommitPulls bool
- CommitTrash bool
- Logger *log.Logger
- Dumper *log.Logger
-
- // SafeRendezvousState from the most recent balance operation,
- // or "" if unknown. If this changes from one run to the next,
- // we need to watch out for races. See
- // (*Balancer)ClearTrashLists.
- SafeRendezvousState string
-}
-
-var debugf = func(string, ...interface{}) {}
-
-func main() {
- var cfg Config
- var runOptions RunOptions
-
- configPath := flag.String("config", defaultConfigPath,
- "`path` of JSON or YAML configuration file")
- serviceListPath := flag.String("config.KeepServiceList", "",
- "`path` of JSON or YAML file with list of keep services to balance, as given by \"arv keep_service list\" "+
- "(default: config[\"KeepServiceList\"], or if none given, get all available services and filter by config[\"KeepServiceTypes\"])")
- flag.BoolVar(&runOptions.Once, "once", false,
+func (command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+ var options RunOptions
+ flags := flag.NewFlagSet(prog, flag.ContinueOnError)
+ flags.BoolVar(&options.Once, "once", false,
"balance once and then exit")
- flag.BoolVar(&runOptions.CommitPulls, "commit-pulls", false,
- "send pull requests (make more replicas of blocks that are underreplicated or are not in optimal rendezvous probe order)")
- flag.BoolVar(&runOptions.CommitTrash, "commit-trash", false,
- "send trash requests (delete unreferenced old blocks, and excess replicas of overreplicated blocks)")
- dumpConfig := flag.Bool("dump-config", false, "write current configuration to stdout and exit")
- dumpFlag := flag.Bool("dump", false, "dump details for each block to stdout")
- debugFlag := flag.Bool("debug", false, "enable debug messages")
- getVersion := flag.Bool("version", false, "Print version information and exit.")
- flag.Usage = usage
- flag.Parse()
-
- // Print version information if requested
- if *getVersion {
- fmt.Printf("keep-balance %s\n", version)
- return
- }
-
- mustReadConfig(&cfg, *configPath)
- if *serviceListPath != "" {
- mustReadConfig(&cfg.KeepServiceList, *serviceListPath)
+ deprCommitPulls := flags.Bool("commit-pulls", true,
+ "send pull requests (must be true -- configure Collections.BalancePullLimit = 0 to disable.)")
+ deprCommitTrash := flags.Bool("commit-trash", true,
+ "send trash requests (must be true -- configure Collections.BalanceTrashLimit = 0 to disable.)")
+ flags.BoolVar(&options.CommitConfirmedFields, "commit-confirmed-fields", true,
+ "update collection fields (replicas_confirmed, storage_classes_confirmed, etc.)")
+ flags.StringVar(&options.ChunkPrefix, "chunk-prefix", "",
+ "operate only on blocks with the given prefix (experimental, see https://dev.arvados.org/issues/19923)")
+ // These options are implemented by service.Command, so we
+ // don't need the vars here -- we just need the flags
+ // to pass flags.Parse().
+ flags.Bool("dump", false, "dump details for each block to stdout")
+ flags.String("pprof", "", "serve Go profile data at `[addr]:port`")
+ flags.Bool("version", false, "Write version information to stdout and exit 0")
+
+ logger := ctxlog.New(stderr, "json", "info")
+ loader := config.NewLoader(&bytes.Buffer{}, logger)
+ loader.SetupFlags(flags)
+ munged := loader.MungeLegacyConfigArgs(logger, args, "-legacy-keepbalance-config")
+ if ok, code := cmd.ParseFlags(flags, prog, munged, "", stderr); !ok {
+ return code
}
- if *dumpConfig {
- log.Fatal(config.DumpAndExit(cfg))
+ if !*deprCommitPulls || !*deprCommitTrash {
+ fmt.Fprint(stderr,
+ "Usage error: the -commit-pulls or -commit-trash command line flags are no longer supported.\n",
+ "Use Collections.BalancePullLimit and Collections.BalanceTrashLimit instead.\n")
+ return cmd.EXIT_INVALIDARGUMENT
}
- to := time.Duration(cfg.RequestTimeout)
- if to == 0 {
- to = 30 * time.Minute
+ // Drop our custom args that would be rejected by the generic
+ // service.Command
+ args = nil
+ dropFlag := map[string]bool{
+ "once": true,
+ "commit-pulls": true,
+ "commit-trash": true,
+ "commit-confirmed-fields": true,
+ "dump": true,
}
- arvados.DefaultSecureClient.Timeout = to
- arvados.InsecureHTTPClient.Timeout = to
- http.DefaultClient.Timeout = to
-
- log.Printf("keep-balance %s started", version)
-
- if *debugFlag {
- debugf = log.Printf
- if j, err := json.Marshal(cfg); err != nil {
- log.Fatal(err)
- } else {
- log.Printf("config is %s", j)
+ flags.Visit(func(f *flag.Flag) {
+ if !dropFlag[f.Name] {
+ args = append(args, "-"+f.Name+"="+f.Value.String())
}
- }
- if *dumpFlag {
- runOptions.Dumper = log.New(os.Stdout, "", log.LstdFlags)
- }
- err := CheckConfig(cfg, runOptions)
- if err != nil {
- // (don't run)
- } else if runOptions.Once {
- _, err = (&Balancer{}).Run(cfg, runOptions)
- } else {
- err = RunForever(cfg, runOptions, nil)
- }
- if err != nil {
- log.Fatal(err)
- }
-}
-
-func mustReadConfig(dst interface{}, path string) {
- if err := config.LoadFile(dst, path); err != nil {
- log.Fatal(err)
- }
-}
-
-// RunForever runs forever, or (for testing purposes) until the given
-// stop channel is ready to receive.
-func RunForever(config Config, runOptions RunOptions, stop <-chan interface{}) error {
- if runOptions.Logger == nil {
- runOptions.Logger = log.New(os.Stderr, "", log.LstdFlags)
- }
- logger := runOptions.Logger
-
- ticker := time.NewTicker(time.Duration(config.RunPeriod))
-
- // The unbuffered channel here means we only hear SIGUSR1 if
- // it arrives while we're waiting in select{}.
- sigUSR1 := make(chan os.Signal)
- signal.Notify(sigUSR1, syscall.SIGUSR1)
-
- logger.Printf("starting up: will scan every %v and on SIGUSR1", config.RunPeriod)
-
- for {
- if !runOptions.CommitPulls && !runOptions.CommitTrash {
- logger.Print("WARNING: Will scan periodically, but no changes will be committed.")
- logger.Print("======= Consider using -commit-pulls and -commit-trash flags.")
- }
-
- bal := &Balancer{}
- var err error
- runOptions, err = bal.Run(config, runOptions)
- if err != nil {
- logger.Print("run failed: ", err)
- } else {
- logger.Print("run succeeded")
- }
-
- select {
- case <-stop:
- signal.Stop(sigUSR1)
- return nil
- case <-ticker.C:
- logger.Print("timer went off")
- case <-sigUSR1:
- logger.Print("received SIGUSR1, resetting timer")
- // Reset the timer so we don't start the N+1st
- // run too soon after the Nth run is triggered
- // by SIGUSR1.
- ticker.Stop()
- ticker = time.NewTicker(time.Duration(config.RunPeriod))
- }
- logger.Print("starting next run")
- }
+ })
+
+ return service.Command(arvados.ServiceNameKeepbalance,
+ func(ctx context.Context, cluster *arvados.Cluster, token string, registry *prometheus.Registry) service.Handler {
+ if !options.Once && cluster.Collections.BalancePeriod == arvados.Duration(0) {
+ return service.ErrorHandler(ctx, cluster, fmt.Errorf("cannot start service: Collections.BalancePeriod is zero (if you want to run once and then exit, use the -once flag)"))
+ }
+
+ ac, err := arvados.NewClientFromConfig(cluster)
+ ac.AuthToken = token
+ if err != nil {
+ return service.ErrorHandler(ctx, cluster, fmt.Errorf("error initializing client from cluster config: %s", err))
+ }
+
+ db, err := sqlx.Open("postgres", cluster.PostgreSQL.Connection.String())
+ if err != nil {
+ return service.ErrorHandler(ctx, cluster, fmt.Errorf("postgresql connection failed: %s", err))
+ }
+ if p := cluster.PostgreSQL.ConnectionPool; p > 0 {
+ db.SetMaxOpenConns(p)
+ }
+ err = db.Ping()
+ if err != nil {
+ return service.ErrorHandler(ctx, cluster, fmt.Errorf("postgresql connection succeeded but ping failed: %s", err))
+ }
+
+ if options.Logger == nil {
+ options.Logger = ctxlog.FromContext(ctx)
+ }
+
+ srv := &Server{
+ Cluster: cluster,
+ ArvClient: ac,
+ RunOptions: options,
+ Metrics: newMetrics(registry),
+ Logger: options.Logger,
+ Dumper: options.Dumper,
+ DB: db,
+ }
+ srv.Handler = &health.Handler{
+ Token: cluster.ManagementToken,
+ Prefix: "/_health/",
+ Routes: health.Routes{"ping": srv.CheckHealth},
+ }
+
+ go srv.run(ctx)
+ return srv
+ }).RunCommand(prog, args, stdin, stdout, stderr)
}