16048: arvados-server boot: restart everything when config changes.
authorTom Clegg <tom@tomclegg.ca>
Fri, 10 Apr 2020 19:44:14 +0000 (15:44 -0400)
committerTom Clegg <tom@tomclegg.ca>
Fri, 10 Apr 2020 19:44:14 +0000 (15:44 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

doc/examples/config/zzzzz.yml
go.mod
go.sum
lib/boot/cmd.go
lib/boot/supervisor.go
lib/config/cmd_test.go
lib/config/config.default.yml
lib/config/generated_config.go
lib/controller/integration_test.go
sdk/go/arvados/config.go

index c63550edf709ad916f13c452b136c83208527afa..d1e1336d545c435fffc58c1778e5ffd4e9027367 100644 (file)
@@ -1,3 +1,4 @@
+AutoReloadConfig: true
 Clusters:
   zzzzz:
     ManagementToken: e687950a23c3a9bceec28c6223a06c79
diff --git a/go.mod b/go.mod
index 4491b359813c00ca2d39af34f4d6587e49290699..34b7e0779072fcf060d325ddffe77f54adffdfbf 100644 (file)
--- a/go.mod
+++ b/go.mod
@@ -11,6 +11,7 @@ require (
        github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 // indirect
        github.com/arvados/cgofuse v1.2.0-arvados1
        github.com/aws/aws-sdk-go v1.25.30
+       github.com/bgentry/speakeasy v0.1.0 // indirect
        github.com/coreos/go-oidc v2.1.0+incompatible
        github.com/coreos/go-systemd v0.0.0-20180108085132-cc4f39464dc7
        github.com/dgrijalva/jwt-go v3.1.0+incompatible // indirect
@@ -21,6 +22,7 @@ require (
        github.com/docker/go-connections v0.3.0 // indirect
        github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d // indirect
        github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 // indirect
+       github.com/fsnotify/fsnotify v1.4.9
        github.com/ghodss/yaml v1.0.0
        github.com/gliderlabs/ssh v0.2.2 // indirect
        github.com/gogo/protobuf v1.1.1
diff --git a/go.sum b/go.sum
index 18cf89b0e17e6130fa2e18cb2b4b067de54d506d..03b2f77b6d9ce62ecc5c2cf6abae2f4995bb3152 100644 (file)
--- a/go.sum
+++ b/go.sum
@@ -27,6 +27,8 @@ github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24
 github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
+github.com/bgentry/speakeasy v0.1.0 h1:ByYyxL9InA1OWqxJqqp2A5pYHUrCiAL6K3J+LKSsQkY=
+github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
 github.com/cespare/xxhash/v2 v2.1.0 h1:yTUvW7Vhb89inJ+8irsUqiWjh8iT6sQPZiQzI6ReGkA=
 github.com/cespare/xxhash/v2 v2.1.0/go.mod h1:dgIUBU3pDso/gPgZ1osOZ0iQf77oPR28Tjxl5dIMyVM=
 github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
@@ -54,6 +56,8 @@ github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d h1:dVaNRYvaGV23A
 github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
 github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 h1:BHsljHzVlRcyQhjrss6TZTdY2VfCqZPbv5k3iBFa2ZQ=
 github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc=
+github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4=
+github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
 github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk=
 github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
 github.com/gliderlabs/ssh v0.2.2 h1:6zsha5zo/TWhRhwqCD3+EarCAgZ2yN28ipRnGPnwkI0=
@@ -213,6 +217,7 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191010194322-b09406accb47/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191105231009-c1f44814a5cd h1:3x5uuvBgE6oaXJjCOvpCC1IpgJogqQ+PqGGU3ZxAgII=
 golang.org/x/sys v0.0.0-20191105231009-c1f44814a5cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
index 5147e3ac33bb65ea8dc0305f986b30a69d736785..e0e2755220a1ec3bbdb8737067c54a579209f40e 100644 (file)
@@ -29,24 +29,33 @@ type supervisedTask interface {
        String() string
 }
 
+var errNeedConfigReload = errors.New("config changed, restart needed")
+
 type bootCommand struct{}
 
-func (bootCommand) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
-       super := &Supervisor{
-               Stderr: stderr,
-               logger: ctxlog.New(stderr, "json", "info"),
+func (bcmd bootCommand) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+       logger := ctxlog.New(stderr, "json", "info")
+       ctx := ctxlog.Context(context.Background(), logger)
+       for {
+               err := bcmd.run(ctx, prog, args, stdin, stdout, stderr)
+               if err == errNeedConfigReload {
+                       continue
+               } else if err != nil {
+                       logger.WithError(err).Info("exiting")
+                       return 1
+               } else {
+                       return 0
+               }
        }
+}
 
-       ctx := ctxlog.Context(context.Background(), super.logger)
+func (bcmd bootCommand) run(ctx context.Context, prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) error {
        ctx, cancel := context.WithCancel(ctx)
        defer cancel()
-
-       var err error
-       defer func() {
-               if err != nil {
-                       super.logger.WithError(err).Info("exiting")
-               }
-       }()
+       super := &Supervisor{
+               Stderr: stderr,
+               logger: ctxlog.FromContext(ctx),
+       }
 
        flags := flag.NewFlagSet(prog, flag.ContinueOnError)
        flags.SetOutput(stderr)
@@ -60,26 +69,25 @@ func (bootCommand) RunCommand(prog string, args []string, stdin io.Reader, stdou
        flags.BoolVar(&super.OwnTemporaryDatabase, "own-temporary-database", false, "bring up a postgres server and create a temporary database")
        timeout := flags.Duration("timeout", 0, "maximum time to wait for cluster to be ready")
        shutdown := flags.Bool("shutdown", false, "shut down when the cluster becomes ready")
-       err = flags.Parse(args)
+       err := flags.Parse(args)
        if err == flag.ErrHelp {
-               err = nil
-               return 0
+               return nil
        } else if err != nil {
-               return 2
+               return err
        } else if *versionFlag {
-               return cmd.Version.RunCommand(prog, args, stdin, stdout, stderr)
+               cmd.Version.RunCommand(prog, args, stdin, stdout, stderr)
+               return nil
        } else if super.ClusterType != "development" && super.ClusterType != "test" && super.ClusterType != "production" {
-               err = fmt.Errorf("cluster type must be 'development', 'test', or 'production'")
-               return 2
+               return fmt.Errorf("cluster type must be 'development', 'test', or 'production'")
        }
 
        loader.SkipAPICalls = true
        cfg, err := loader.Load()
        if err != nil {
-               return 1
+               return err
        }
 
-       super.Start(ctx, cfg)
+       super.Start(ctx, cfg, loader.Path)
        defer super.Stop()
 
        var timer *time.Timer
@@ -89,20 +97,19 @@ func (bootCommand) RunCommand(prog string, args []string, stdin io.Reader, stdou
 
        url, ok := super.WaitReady()
        if timer != nil && !timer.Stop() {
-               err = errors.New("boot timed out")
-               return 1
+               return errors.New("boot timed out")
        } else if !ok {
-               err = errors.New("boot failed")
-               return 1
-       }
-       // Write controller URL to stdout. Nothing else goes to
-       // stdout, so this provides an easy way for a calling script
-       // to discover the controller URL when everything is ready.
-       fmt.Fprintln(stdout, url)
-       if *shutdown {
-               super.Stop()
+               super.logger.Error("boot failed")
+       } else {
+               // Write controller URL to stdout. Nothing else goes
+               // to stdout, so this provides an easy way for a
+               // calling script to discover the controller URL when
+               // everything is ready.
+               fmt.Fprintln(stdout, url)
+               if *shutdown {
+                       super.Stop()
+               }
        }
        // Wait for signal/crash + orderly shutdown
-       <-super.done
-       return 0
+       return super.Wait()
 }
index 7f5d6a9baae2dd4eaa2b2e66fea9585f7be3bdc1..03ae536daecc447724a251cad4dfb3701c936cf5 100644 (file)
@@ -19,15 +19,18 @@ import (
        "os/signal"
        "os/user"
        "path/filepath"
+       "reflect"
        "strings"
        "sync"
        "syscall"
        "time"
 
+       "git.arvados.org/arvados.git/lib/config"
        "git.arvados.org/arvados.git/lib/service"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "git.arvados.org/arvados.git/sdk/go/health"
+       "github.com/fsnotify/fsnotify"
        "github.com/sirupsen/logrus"
 )
 
@@ -45,7 +48,8 @@ type Supervisor struct {
 
        ctx           context.Context
        cancel        context.CancelFunc
-       done          chan struct{}
+       done          chan struct{} // closed when child procs/services have shut down
+       err           error         // error that caused shutdown (valid when done is closed)
        healthChecker *health.Aggregator
        tasksReady    map[string]chan bool
        waitShutdown  sync.WaitGroup
@@ -55,30 +59,66 @@ type Supervisor struct {
        environ    []string // for child processes
 }
 
-func (super *Supervisor) Start(ctx context.Context, cfg *arvados.Config) {
+func (super *Supervisor) Start(ctx context.Context, cfg *arvados.Config, cfgPath string) {
        super.ctx, super.cancel = context.WithCancel(ctx)
        super.done = make(chan struct{})
 
        go func() {
+               defer close(super.done)
+
                sigch := make(chan os.Signal)
                signal.Notify(sigch, syscall.SIGINT, syscall.SIGTERM)
                defer signal.Stop(sigch)
                go func() {
                        for sig := range sigch {
                                super.logger.WithField("signal", sig).Info("caught signal")
+                               if super.err == nil {
+                                       super.err = fmt.Errorf("caught signal %s", sig)
+                               }
+                               super.cancel()
+                       }
+               }()
+
+               hupch := make(chan os.Signal)
+               signal.Notify(hupch, syscall.SIGHUP)
+               defer signal.Stop(hupch)
+               go func() {
+                       for sig := range hupch {
+                               super.logger.WithField("signal", sig).Info("caught signal")
+                               if super.err == nil {
+                                       super.err = errNeedConfigReload
+                               }
                                super.cancel()
                        }
                }()
 
+               if cfgPath != "" && cfgPath != "-" && cfg.AutoReloadConfig {
+                       go watchConfig(super.ctx, super.logger, cfgPath, copyConfig(cfg), func() {
+                               if super.err == nil {
+                                       super.err = errNeedConfigReload
+                               }
+                               super.cancel()
+                       })
+               }
+
                err := super.run(cfg)
                if err != nil {
                        super.logger.WithError(err).Warn("supervisor shut down")
+                       if super.err == nil {
+                               super.err = err
+                       }
                }
-               close(super.done)
        }()
 }
 
+func (super *Supervisor) Wait() error {
+       <-super.done
+       return super.err
+}
+
 func (super *Supervisor) run(cfg *arvados.Config) error {
+       defer super.cancel()
+
        cwd, err := os.Getwd()
        if err != nil {
                return err
@@ -706,3 +746,67 @@ func waitForConnect(ctx context.Context, addr string) error {
        }
        return ctx.Err()
 }
+
+func copyConfig(cfg *arvados.Config) *arvados.Config {
+       pr, pw := io.Pipe()
+       go func() {
+               err := json.NewEncoder(pw).Encode(cfg)
+               if err != nil {
+                       panic(err)
+               }
+               pw.Close()
+       }()
+       cfg2 := new(arvados.Config)
+       err := json.NewDecoder(pr).Decode(cfg2)
+       if err != nil {
+               panic(err)
+       }
+       return cfg2
+}
+
+func watchConfig(ctx context.Context, logger logrus.FieldLogger, cfgPath string, prevcfg *arvados.Config, fn func()) {
+       watcher, err := fsnotify.NewWatcher()
+       if err != nil {
+               logger.WithError(err).Error("fsnotify setup failed")
+               return
+       }
+       defer watcher.Close()
+
+       err = watcher.Add(cfgPath)
+       if err != nil {
+               logger.WithError(err).Error("fsnotify watcher failed")
+               return
+       }
+
+       for {
+               select {
+               case <-ctx.Done():
+                       return
+               case err, ok := <-watcher.Errors:
+                       if !ok {
+                               return
+                       }
+                       logger.WithError(err).Warn("fsnotify watcher reported error")
+               case _, ok := <-watcher.Events:
+                       if !ok {
+                               return
+                       }
+                       for len(watcher.Events) > 0 {
+                               <-watcher.Events
+                       }
+                       loader := config.NewLoader(&bytes.Buffer{}, &logrus.Logger{Out: ioutil.Discard})
+                       loader.Path = cfgPath
+                       loader.SkipAPICalls = true
+                       cfg, err := loader.Load()
+                       if err != nil {
+                               logger.WithError(err).Warn("error reloading config file after change detected; ignoring new config for now")
+                       } else if reflect.DeepEqual(cfg, prevcfg) {
+                               logger.Debug("config file changed but is still DeepEqual to the existing config")
+                       } else {
+                               logger.Debug("config changed, notifying supervisor")
+                               fn()
+                               prevcfg = cfg
+                       }
+               }
+       }
+}
index f4f2d5653b66692315e45a7aa95f366579bf438a..3c420a04eb43e28e39fa69f8e1baa3330b803d64 100644 (file)
@@ -162,7 +162,7 @@ Clusters:
        code := DumpCommand.RunCommand("arvados config-dump", []string{"-config", "-"}, bytes.NewBufferString(in), &stdout, &stderr)
        c.Check(code, check.Equals, 0)
        c.Check(stderr.String(), check.Matches, `(?ms).*deprecated or unknown config entry: Clusters.z1234.UnknownKey.*`)
-       c.Check(stdout.String(), check.Matches, `(?ms)Clusters:\n  z1234:\n.*`)
+       c.Check(stdout.String(), check.Matches, `(?ms)(.*\n)?Clusters:\n  z1234:\n.*`)
        c.Check(stdout.String(), check.Matches, `(?ms).*\n *ManagementToken: secret\n.*`)
        c.Check(stdout.String(), check.Not(check.Matches), `(?ms).*UnknownKey.*`)
 }
index a4616d70b906e1c1aba03f99add3738e76166cf3..fd59c9c4253618f03f18a64f86df0312462d27eb 100644 (file)
@@ -1219,3 +1219,8 @@ Clusters:
     # implementation. Note that it also disables some new federation
     # features and will be removed in a future release.
     ForceLegacyAPI14: false
+
+# (Experimental) Restart services automatically when config file
+# changes are detected. Only supported by `arvados-server boot` in
+# dev/test mode.
+AutoReloadConfig: false
index 0194b02f5e770687d6a34023d190d7cde4207b23..2fce813a79d80a1a32790566735f874456da7cb9 100644 (file)
@@ -1225,4 +1225,9 @@ Clusters:
     # implementation. Note that it also disables some new federation
     # features and will be removed in a future release.
     ForceLegacyAPI14: false
+
+# (Experimental) Restart services automatically when config file
+# changes are detected. Only supported by ` + "`" + `arvados-server boot` + "`" + ` in
+# dev/test mode.
+AutoReloadConfig: false
 `)
index 2adb5811ea60d325c7a04ce99f9efe23b29b0173..6472e274201a782b5447955116a30ade448e1190 100644 (file)
@@ -113,7 +113,7 @@ func (s *IntegrationSuite) SetUpSuite(c *check.C) {
                        },
                        config: *cfg,
                }
-               s.testClusters[id].super.Start(context.Background(), &s.testClusters[id].config)
+               s.testClusters[id].super.Start(context.Background(), &s.testClusters[id].config, "-")
        }
        for _, tc := range s.testClusters {
                au, ok := tc.super.WaitReady()
index 6b83fb96d49e6359e656c3e634a273b6f29c4e16..38de6b8ea40983081a14db18b270e3db31a6a895 100644 (file)
@@ -23,7 +23,8 @@ var DefaultConfigFile = func() string {
 }()
 
 type Config struct {
-       Clusters map[string]Cluster
+       Clusters         map[string]Cluster
+       AutoReloadConfig bool
 }
 
 // GetConfig returns the current system config, loading it from
@@ -66,6 +67,7 @@ type WebDAVCacheConfig struct {
        MaxPermissionEntries int
        MaxUUIDEntries       int
 }
+
 type Cluster struct {
        ClusterID       string `json:"-"`
        ManagementToken string