18700: Fix nginx temp dir permissions.
[arvados.git] / lib / boot / supervisor.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package boot
6
7 import (
8         "bytes"
9         "context"
10         "crypto/rand"
11         "encoding/json"
12         "errors"
13         "fmt"
14         "io"
15         "io/ioutil"
16         "net"
17         "net/url"
18         "os"
19         "os/exec"
20         "os/signal"
21         "os/user"
22         "path/filepath"
23         "reflect"
24         "strconv"
25         "strings"
26         "sync"
27         "syscall"
28         "time"
29
30         "git.arvados.org/arvados.git/lib/config"
31         "git.arvados.org/arvados.git/lib/service"
32         "git.arvados.org/arvados.git/sdk/go/arvados"
33         "git.arvados.org/arvados.git/sdk/go/ctxlog"
34         "git.arvados.org/arvados.git/sdk/go/health"
35         "github.com/fsnotify/fsnotify"
36         "github.com/sirupsen/logrus"
37 )
38
39 type Supervisor struct {
40         SourcePath           string // e.g., /home/username/src/arvados
41         SourceVersion        string // e.g., acbd1324...
42         ClusterType          string // e.g., production
43         ListenHost           string // e.g., localhost
44         ControllerAddr       string // e.g., 127.0.0.1:8000
45         Workbench2Source     string // e.g., /home/username/src/arvados-workbench2
46         NoWorkbench1         bool
47         NoWorkbench2         bool
48         OwnTemporaryDatabase bool
49         Stderr               io.Writer
50
51         logger  logrus.FieldLogger
52         cluster *arvados.Cluster
53
54         ctx           context.Context
55         cancel        context.CancelFunc
56         done          chan struct{} // closed when child procs/services have shut down
57         err           error         // error that caused shutdown (valid when done is closed)
58         healthChecker *health.Aggregator
59         tasksReady    map[string]chan bool
60         waitShutdown  sync.WaitGroup
61
62         bindir     string
63         tempdir    string // in production mode, this is accessible only to root
64         wwwtempdir string // in production mode, this is accessible only to www-data
65         configfile string
66         environ    []string // for child processes
67 }
68
69 func (super *Supervisor) Cluster() *arvados.Cluster { return super.cluster }
70
71 func (super *Supervisor) Start(ctx context.Context, cfg *arvados.Config, cfgPath string) {
72         super.ctx, super.cancel = context.WithCancel(ctx)
73         super.done = make(chan struct{})
74
75         go func() {
76                 defer close(super.done)
77
78                 sigch := make(chan os.Signal)
79                 signal.Notify(sigch, syscall.SIGINT, syscall.SIGTERM)
80                 defer signal.Stop(sigch)
81                 go func() {
82                         for sig := range sigch {
83                                 super.logger.WithField("signal", sig).Info("caught signal")
84                                 if super.err == nil {
85                                         super.err = fmt.Errorf("caught signal %s", sig)
86                                 }
87                                 super.cancel()
88                         }
89                 }()
90
91                 hupch := make(chan os.Signal)
92                 signal.Notify(hupch, syscall.SIGHUP)
93                 defer signal.Stop(hupch)
94                 go func() {
95                         for sig := range hupch {
96                                 super.logger.WithField("signal", sig).Info("caught signal")
97                                 if super.err == nil {
98                                         super.err = errNeedConfigReload
99                                 }
100                                 super.cancel()
101                         }
102                 }()
103
104                 if cfgPath != "" && cfgPath != "-" && cfg.AutoReloadConfig {
105                         go watchConfig(super.ctx, super.logger, cfgPath, copyConfig(cfg), func() {
106                                 if super.err == nil {
107                                         super.err = errNeedConfigReload
108                                 }
109                                 super.cancel()
110                         })
111                 }
112
113                 err := super.run(cfg)
114                 if err != nil {
115                         super.logger.WithError(err).Warn("supervisor shut down")
116                         if super.err == nil {
117                                 super.err = err
118                         }
119                 }
120         }()
121 }
122
123 func (super *Supervisor) Wait() error {
124         <-super.done
125         return super.err
126 }
127
128 func (super *Supervisor) run(cfg *arvados.Config) error {
129         defer super.cancel()
130
131         cwd, err := os.Getwd()
132         if err != nil {
133                 return err
134         }
135         if !strings.HasPrefix(super.SourcePath, "/") {
136                 super.SourcePath = filepath.Join(cwd, super.SourcePath)
137         }
138         super.SourcePath, err = filepath.EvalSymlinks(super.SourcePath)
139         if err != nil {
140                 return err
141         }
142
143         // Choose bin and temp dirs: /var/lib/arvados/... in
144         // production, transient tempdir otherwise.
145         if super.ClusterType == "production" {
146                 // These dirs have already been created by
147                 // "arvados-server install" (or by extracting a
148                 // package).
149                 super.tempdir = "/var/lib/arvados/tmp"
150                 super.wwwtempdir = "/var/lib/arvados/wwwtmp"
151                 super.bindir = "/var/lib/arvados/bin"
152         } else {
153                 super.tempdir, err = ioutil.TempDir("", "arvados-server-boot-")
154                 if err != nil {
155                         return err
156                 }
157                 defer os.RemoveAll(super.tempdir)
158                 super.wwwtempdir = super.tempdir
159                 super.bindir = filepath.Join(super.tempdir, "bin")
160                 if err := os.Mkdir(super.bindir, 0755); err != nil {
161                         return err
162                 }
163         }
164
165         // Fill in any missing config keys, and write the resulting
166         // config in the temp dir for child services to use.
167         err = super.autofillConfig(cfg)
168         if err != nil {
169                 return err
170         }
171         conffile, err := os.OpenFile(filepath.Join(super.wwwtempdir, "config.yml"), os.O_CREATE|os.O_WRONLY, 0644)
172         if err != nil {
173                 return err
174         }
175         defer conffile.Close()
176         err = json.NewEncoder(conffile).Encode(cfg)
177         if err != nil {
178                 return err
179         }
180         err = conffile.Close()
181         if err != nil {
182                 return err
183         }
184         super.configfile = conffile.Name()
185
186         super.environ = os.Environ()
187         super.cleanEnv([]string{"ARVADOS_"})
188         super.setEnv("ARVADOS_CONFIG", super.configfile)
189         super.setEnv("RAILS_ENV", super.ClusterType)
190         super.setEnv("TMPDIR", super.tempdir)
191         super.prependEnv("PATH", "/var/lib/arvados/bin:")
192         if super.ClusterType != "production" {
193                 super.prependEnv("PATH", super.tempdir+"/bin:")
194         }
195
196         super.cluster, err = cfg.GetCluster("")
197         if err != nil {
198                 return err
199         }
200         // Now that we have the config, replace the bootstrap logger
201         // with a new one according to the logging config.
202         loglevel := super.cluster.SystemLogs.LogLevel
203         if s := os.Getenv("ARVADOS_DEBUG"); s != "" && s != "0" {
204                 loglevel = "debug"
205         }
206         super.logger = ctxlog.New(super.Stderr, super.cluster.SystemLogs.Format, loglevel).WithFields(logrus.Fields{
207                 "PID": os.Getpid(),
208         })
209
210         if super.SourceVersion == "" && super.ClusterType == "production" {
211                 // don't need SourceVersion
212         } else if super.SourceVersion == "" {
213                 // Find current source tree version.
214                 var buf bytes.Buffer
215                 err = super.RunProgram(super.ctx, ".", runOptions{output: &buf}, "git", "diff", "--shortstat")
216                 if err != nil {
217                         return err
218                 }
219                 dirty := buf.Len() > 0
220                 buf.Reset()
221                 err = super.RunProgram(super.ctx, ".", runOptions{output: &buf}, "git", "log", "-n1", "--format=%H")
222                 if err != nil {
223                         return err
224                 }
225                 super.SourceVersion = strings.TrimSpace(buf.String())
226                 if dirty {
227                         super.SourceVersion += "+uncommitted"
228                 }
229         } else {
230                 return errors.New("specifying a version to run is not yet supported")
231         }
232
233         _, err = super.installGoProgram(super.ctx, "cmd/arvados-server")
234         if err != nil {
235                 return err
236         }
237         err = super.setupRubyEnv()
238         if err != nil {
239                 return err
240         }
241
242         tasks := []supervisedTask{
243                 createCertificates{},
244                 runPostgreSQL{},
245                 runNginx{},
246                 runServiceCommand{name: "controller", svc: super.cluster.Services.Controller, depends: []supervisedTask{seedDatabase{}}},
247                 runGoProgram{src: "services/arv-git-httpd", svc: super.cluster.Services.GitHTTP},
248                 runGoProgram{src: "services/health", svc: super.cluster.Services.Health},
249                 runGoProgram{src: "services/keepproxy", svc: super.cluster.Services.Keepproxy, depends: []supervisedTask{runPassenger{src: "services/api"}}},
250                 runServiceCommand{name: "keepstore", svc: super.cluster.Services.Keepstore},
251                 runGoProgram{src: "services/keep-web", svc: super.cluster.Services.WebDAV},
252                 runServiceCommand{name: "ws", svc: super.cluster.Services.Websocket, depends: []supervisedTask{seedDatabase{}}},
253                 installPassenger{src: "services/api"},
254                 runPassenger{src: "services/api", varlibdir: "railsapi", svc: super.cluster.Services.RailsAPI, depends: []supervisedTask{createCertificates{}, seedDatabase{}, installPassenger{src: "services/api"}}},
255                 seedDatabase{},
256         }
257         if !super.NoWorkbench1 {
258                 tasks = append(tasks,
259                         installPassenger{src: "apps/workbench", depends: []supervisedTask{seedDatabase{}}}, // dependency ensures workbench doesn't delay api install/startup
260                         runPassenger{src: "apps/workbench", varlibdir: "workbench1", svc: super.cluster.Services.Workbench1, depends: []supervisedTask{installPassenger{src: "apps/workbench"}}},
261                 )
262         }
263         if !super.NoWorkbench2 {
264                 tasks = append(tasks,
265                         runWorkbench2{svc: super.cluster.Services.Workbench2},
266                 )
267         }
268         if super.ClusterType != "test" {
269                 tasks = append(tasks,
270                         runServiceCommand{name: "dispatch-cloud", svc: super.cluster.Services.DispatchCloud},
271                         runGoProgram{src: "services/keep-balance", svc: super.cluster.Services.Keepbalance},
272                 )
273         }
274         super.tasksReady = map[string]chan bool{}
275         for _, task := range tasks {
276                 super.tasksReady[task.String()] = make(chan bool)
277         }
278         for _, task := range tasks {
279                 task := task
280                 fail := func(err error) {
281                         if super.ctx.Err() != nil {
282                                 return
283                         }
284                         super.cancel()
285                         super.logger.WithField("task", task.String()).WithError(err).Error("task failed")
286                 }
287                 go func() {
288                         super.logger.WithField("task", task.String()).Info("starting")
289                         err := task.Run(super.ctx, fail, super)
290                         if err != nil {
291                                 fail(err)
292                                 return
293                         }
294                         close(super.tasksReady[task.String()])
295                 }()
296         }
297         err = super.wait(super.ctx, tasks...)
298         if err != nil {
299                 return err
300         }
301         super.logger.Info("all startup tasks are complete; starting health checks")
302         super.healthChecker = &health.Aggregator{Cluster: super.cluster}
303         <-super.ctx.Done()
304         super.logger.Info("shutting down")
305         super.waitShutdown.Wait()
306         return super.ctx.Err()
307 }
308
309 func (super *Supervisor) wait(ctx context.Context, tasks ...supervisedTask) error {
310         for _, task := range tasks {
311                 ch, ok := super.tasksReady[task.String()]
312                 if !ok {
313                         return fmt.Errorf("no such task: %s", task)
314                 }
315                 super.logger.WithField("task", task.String()).Info("waiting")
316                 select {
317                 case <-ch:
318                         super.logger.WithField("task", task.String()).Info("ready")
319                 case <-ctx.Done():
320                         super.logger.WithField("task", task.String()).Info("task was never ready")
321                         return ctx.Err()
322                 }
323         }
324         return nil
325 }
326
327 func (super *Supervisor) Stop() {
328         super.cancel()
329         <-super.done
330 }
331
332 func (super *Supervisor) WaitReady() (*arvados.URL, bool) {
333         ticker := time.NewTicker(time.Second)
334         defer ticker.Stop()
335         for waiting := "all"; waiting != ""; {
336                 select {
337                 case <-ticker.C:
338                 case <-super.ctx.Done():
339                         return nil, false
340                 }
341                 if super.healthChecker == nil {
342                         // not set up yet
343                         continue
344                 }
345                 resp := super.healthChecker.ClusterHealth()
346                 // The overall health check (resp.Health=="OK") might
347                 // never pass due to missing components (like
348                 // arvados-dispatch-cloud in a test cluster), so
349                 // instead we wait for all configured components to
350                 // pass.
351                 waiting = ""
352                 for target, check := range resp.Checks {
353                         if check.Health != "OK" {
354                                 waiting += " " + target
355                         }
356                 }
357                 if waiting != "" {
358                         super.logger.WithField("targets", waiting[1:]).Info("waiting")
359                 }
360         }
361         u := super.cluster.Services.Controller.ExternalURL
362         return &u, true
363 }
364
365 func (super *Supervisor) prependEnv(key, prepend string) {
366         for i, s := range super.environ {
367                 if strings.HasPrefix(s, key+"=") {
368                         super.environ[i] = key + "=" + prepend + s[len(key)+1:]
369                         return
370                 }
371         }
372         super.environ = append(super.environ, key+"="+prepend)
373 }
374
375 func (super *Supervisor) cleanEnv(prefixes []string) {
376         var cleaned []string
377         for _, s := range super.environ {
378                 drop := false
379                 for _, p := range prefixes {
380                         if strings.HasPrefix(s, p) {
381                                 drop = true
382                                 break
383                         }
384                 }
385                 if !drop {
386                         cleaned = append(cleaned, s)
387                 }
388         }
389         super.environ = cleaned
390 }
391
392 func (super *Supervisor) setEnv(key, val string) {
393         for i, s := range super.environ {
394                 if strings.HasPrefix(s, key+"=") {
395                         super.environ[i] = key + "=" + val
396                         return
397                 }
398         }
399         super.environ = append(super.environ, key+"="+val)
400 }
401
// dedupEnv returns the entries of in, in their original order, with
// all but the first occurrence of each variable name removed.
//
// It panics if an entry has no "=" or has an empty name.
func dedupEnv(in []string) []string {
	seen := make(map[string]struct{}, len(in))
	var out []string
	for _, kv := range in {
		eq := strings.IndexByte(kv, '=')
		if eq < 1 {
			panic("invalid environment var: " + kv)
		}
		name := kv[:eq]
		if _, dup := seen[name]; dup {
			continue
		}
		seen[name] = struct{}{}
		out = append(out, kv)
	}
	return out
}
418
419 func (super *Supervisor) installGoProgram(ctx context.Context, srcpath string) (string, error) {
420         _, basename := filepath.Split(srcpath)
421         binfile := filepath.Join(super.bindir, basename)
422         if super.ClusterType == "production" {
423                 return binfile, nil
424         }
425         err := super.RunProgram(ctx, filepath.Join(super.SourcePath, srcpath), runOptions{env: []string{"GOBIN=" + super.bindir}}, "go", "install", "-ldflags", "-X git.arvados.org/arvados.git/lib/cmd.version="+super.SourceVersion+" -X main.version="+super.SourceVersion)
426         return binfile, err
427 }
428
429 func (super *Supervisor) usingRVM() bool {
430         return os.Getenv("rvm_path") != ""
431 }
432
433 func (super *Supervisor) setupRubyEnv() error {
434         if !super.usingRVM() {
435                 // (If rvm is in use, assume the caller has everything
436                 // set up as desired)
437                 super.cleanEnv([]string{
438                         "GEM_HOME=",
439                         "GEM_PATH=",
440                 })
441                 gem := "gem"
442                 if _, err := os.Stat("/var/lib/arvados/bin/gem"); err == nil || super.ClusterType == "production" {
443                         gem = "/var/lib/arvados/bin/gem"
444                 }
445                 cmd := exec.Command(gem, "env", "gempath")
446                 if super.ClusterType == "production" {
447                         cmd.Args = append([]string{"sudo", "-u", "www-data", "-E", "HOME=/var/www"}, cmd.Args...)
448                         path, err := exec.LookPath("sudo")
449                         if err != nil {
450                                 return fmt.Errorf("LookPath(\"sudo\"): %w", err)
451                         }
452                         cmd.Path = path
453                 }
454                 cmd.Stderr = super.Stderr
455                 cmd.Env = super.environ
456                 buf, err := cmd.Output() // /var/lib/arvados/.gem/ruby/2.5.0/bin:...
457                 if err != nil || len(buf) == 0 {
458                         return fmt.Errorf("gem env gempath: %w", err)
459                 }
460                 gempath := string(bytes.Split(buf, []byte{':'})[0])
461                 super.prependEnv("PATH", gempath+"/bin:")
462                 super.setEnv("GEM_HOME", gempath)
463                 super.setEnv("GEM_PATH", gempath)
464         }
465         // Passenger install doesn't work unless $HOME is ~user
466         u, err := user.Current()
467         if err != nil {
468                 return err
469         }
470         super.setEnv("HOME", u.HomeDir)
471         return nil
472 }
473
474 func (super *Supervisor) lookPath(prog string) string {
475         for _, val := range super.environ {
476                 if strings.HasPrefix(val, "PATH=") {
477                         for _, dir := range filepath.SplitList(val[5:]) {
478                                 path := filepath.Join(dir, prog)
479                                 if fi, err := os.Stat(path); err == nil && fi.Mode()&0111 != 0 {
480                                         return path
481                                 }
482                         }
483                 }
484         }
485         return prog
486 }
487
// runOptions holds the optional settings accepted by RunProgram. The
// zero value runs the child as the current user, with no stdin, no
// extra environment variables, and stdout sent to the log.
type runOptions struct {
	// output, if non-nil, receives the child's stdout.
	output io.Writer
	// env lists "KEY=value" entries to add to the child's
	// environment; they take precedence over the supervisor's
	// own entries for the same key.
	env []string
	// user, if non-empty, is the name of the user to run the
	// child as.
	user string
	// stdin, if non-nil, is attached to the child's stdin.
	stdin io.Reader
}
494
495 // RunProgram runs prog with args, using dir as working directory. If ctx is
496 // cancelled while the child is running, RunProgram terminates the child, waits
497 // for it to exit, then returns.
498 //
499 // Child's environment will have our env vars, plus any given in env.
500 //
501 // Child's stdout will be written to output if non-nil, otherwise the
502 // boot command's stderr.
503 func (super *Supervisor) RunProgram(ctx context.Context, dir string, opts runOptions, prog string, args ...string) error {
504         cmdline := fmt.Sprintf("%s", append([]string{prog}, args...))
505         super.logger.WithField("command", cmdline).WithField("dir", dir).Info("executing")
506
507         logprefix := prog
508         {
509                 innerargs := args
510                 if logprefix == "sudo" {
511                         for i := 0; i < len(args); i++ {
512                                 if args[i] == "-u" {
513                                         i++
514                                 } else if args[i] == "-E" || strings.Contains(args[i], "=") {
515                                 } else {
516                                         logprefix = args[i]
517                                         innerargs = args[i+1:]
518                                         break
519                                 }
520                         }
521                 }
522                 logprefix = strings.TrimPrefix(logprefix, "/var/lib/arvados/bin/")
523                 logprefix = strings.TrimPrefix(logprefix, super.tempdir+"/bin/")
524                 if logprefix == "bundle" && len(innerargs) > 2 && innerargs[0] == "exec" {
525                         _, dirbase := filepath.Split(dir)
526                         logprefix = innerargs[1] + "@" + dirbase
527                 } else if logprefix == "arvados-server" && len(args) > 1 {
528                         logprefix = args[0]
529                 }
530                 if !strings.HasPrefix(dir, "/") {
531                         logprefix = dir + ": " + logprefix
532                 }
533         }
534
535         cmd := exec.Command(super.lookPath(prog), args...)
536         cmd.Stdin = opts.stdin
537         stdout, err := cmd.StdoutPipe()
538         if err != nil {
539                 return err
540         }
541         stderr, err := cmd.StderrPipe()
542         if err != nil {
543                 return err
544         }
545         logwriter := &service.LogPrefixer{Writer: super.Stderr, Prefix: []byte("[" + logprefix + "] ")}
546         var copiers sync.WaitGroup
547         copiers.Add(1)
548         go func() {
549                 io.Copy(logwriter, stderr)
550                 copiers.Done()
551         }()
552         copiers.Add(1)
553         go func() {
554                 if opts.output == nil {
555                         io.Copy(logwriter, stdout)
556                 } else {
557                         io.Copy(opts.output, stdout)
558                 }
559                 copiers.Done()
560         }()
561
562         if strings.HasPrefix(dir, "/") {
563                 cmd.Dir = dir
564         } else {
565                 cmd.Dir = filepath.Join(super.SourcePath, dir)
566         }
567         env := append([]string(nil), opts.env...)
568         env = append(env, super.environ...)
569         cmd.Env = dedupEnv(env)
570
571         if opts.user != "" {
572                 // Note: We use this approach instead of "sudo"
573                 // because in certain circumstances (we are pid 1 in a
574                 // docker container, and our passenger child process
575                 // changes to pgid 1) the intermediate sudo process
576                 // notices we have the same pgid as our child and
577                 // refuses to propagate signals from us to our child,
578                 // so we can't signal/shutdown our passenger/rails
579                 // apps. "chpst" or "setuidgid" would work, but these
580                 // few lines avoid depending on runit/daemontools.
581                 u, err := user.Lookup(opts.user)
582                 if err != nil {
583                         return fmt.Errorf("user.Lookup(%q): %w", opts.user, err)
584                 }
585                 uid, _ := strconv.Atoi(u.Uid)
586                 gid, _ := strconv.Atoi(u.Gid)
587                 cmd.SysProcAttr = &syscall.SysProcAttr{
588                         Credential: &syscall.Credential{
589                                 Uid: uint32(uid),
590                                 Gid: uint32(gid),
591                         },
592                 }
593         }
594
595         exited := false
596         defer func() { exited = true }()
597         go func() {
598                 <-ctx.Done()
599                 log := ctxlog.FromContext(ctx).WithFields(logrus.Fields{"dir": dir, "cmdline": cmdline})
600                 for !exited {
601                         if cmd.Process == nil {
602                                 log.Debug("waiting for child process to start")
603                                 time.Sleep(time.Second / 2)
604                         } else {
605                                 log.WithField("PID", cmd.Process.Pid).Debug("sending SIGTERM")
606                                 cmd.Process.Signal(syscall.SIGTERM)
607                                 time.Sleep(5 * time.Second)
608                                 if !exited {
609                                         stdout.Close()
610                                         stderr.Close()
611                                         log.WithField("PID", cmd.Process.Pid).Warn("still waiting for child process to exit 5s after SIGTERM")
612                                 }
613                         }
614                 }
615         }()
616
617         err = cmd.Start()
618         if err != nil {
619                 return err
620         }
621         copiers.Wait()
622         err = cmd.Wait()
623         if ctx.Err() != nil {
624                 // Return "context canceled", instead of the "killed"
625                 // error that was probably caused by the context being
626                 // canceled.
627                 return ctx.Err()
628         } else if err != nil {
629                 return fmt.Errorf("%s: error: %v", cmdline, err)
630         }
631         return nil
632 }
633
634 func (super *Supervisor) autofillConfig(cfg *arvados.Config) error {
635         cluster, err := cfg.GetCluster("")
636         if err != nil {
637                 return err
638         }
639         usedPort := map[string]bool{}
640         nextPort := func(host string) (string, error) {
641                 for {
642                         port, err := availablePort(host)
643                         if err != nil {
644                                 port, err = availablePort(super.ListenHost)
645                         }
646                         if err != nil {
647                                 return "", err
648                         }
649                         if usedPort[port] {
650                                 continue
651                         }
652                         usedPort[port] = true
653                         return port, nil
654                 }
655         }
656         if cluster.Services.Controller.ExternalURL.Host == "" {
657                 h, p, err := net.SplitHostPort(super.ControllerAddr)
658                 if err != nil {
659                         return fmt.Errorf("SplitHostPort(ControllerAddr): %w", err)
660                 }
661                 if h == "" {
662                         h = super.ListenHost
663                 }
664                 if p == "0" {
665                         p, err = nextPort(h)
666                         if err != nil {
667                                 return err
668                         }
669                 }
670                 cluster.Services.Controller.ExternalURL = arvados.URL{Scheme: "https", Host: net.JoinHostPort(h, p), Path: "/"}
671         }
672         u := url.URL(cluster.Services.Controller.ExternalURL)
673         defaultExtHost := u.Hostname()
674         for _, svc := range []*arvados.Service{
675                 &cluster.Services.Controller,
676                 &cluster.Services.DispatchCloud,
677                 &cluster.Services.GitHTTP,
678                 &cluster.Services.Health,
679                 &cluster.Services.Keepproxy,
680                 &cluster.Services.Keepstore,
681                 &cluster.Services.RailsAPI,
682                 &cluster.Services.WebDAV,
683                 &cluster.Services.WebDAVDownload,
684                 &cluster.Services.Websocket,
685                 &cluster.Services.Workbench1,
686                 &cluster.Services.Workbench2,
687         } {
688                 if svc == &cluster.Services.DispatchCloud && super.ClusterType == "test" {
689                         continue
690                 }
691                 if svc.ExternalURL.Host == "" {
692                         port, err := nextPort(defaultExtHost)
693                         if err != nil {
694                                 return err
695                         }
696                         host := net.JoinHostPort(defaultExtHost, port)
697                         if svc == &cluster.Services.Controller ||
698                                 svc == &cluster.Services.GitHTTP ||
699                                 svc == &cluster.Services.Health ||
700                                 svc == &cluster.Services.Keepproxy ||
701                                 svc == &cluster.Services.WebDAV ||
702                                 svc == &cluster.Services.WebDAVDownload ||
703                                 svc == &cluster.Services.Workbench1 ||
704                                 svc == &cluster.Services.Workbench2 {
705                                 svc.ExternalURL = arvados.URL{Scheme: "https", Host: host, Path: "/"}
706                         } else if svc == &cluster.Services.Websocket {
707                                 svc.ExternalURL = arvados.URL{Scheme: "wss", Host: host, Path: "/websocket"}
708                         }
709                 }
710                 if super.NoWorkbench1 && svc == &cluster.Services.Workbench1 ||
711                         super.NoWorkbench2 && svc == &cluster.Services.Workbench2 {
712                         // When workbench1 is disabled, it gets an
713                         // ExternalURL (so we have a valid listening
714                         // port to write in our Nginx config) but no
715                         // InternalURLs (so health checker doesn't
716                         // complain).
717                         continue
718                 }
719                 if len(svc.InternalURLs) == 0 {
720                         port, err := nextPort(super.ListenHost)
721                         if err != nil {
722                                 return err
723                         }
724                         host := net.JoinHostPort(super.ListenHost, port)
725                         svc.InternalURLs = map[arvados.URL]arvados.ServiceInstance{
726                                 {Scheme: "http", Host: host, Path: "/"}: {},
727                         }
728                 }
729         }
730         if super.ClusterType != "production" {
731                 if cluster.SystemRootToken == "" {
732                         cluster.SystemRootToken = randomHexString(64)
733                 }
734                 if cluster.ManagementToken == "" {
735                         cluster.ManagementToken = randomHexString(64)
736                 }
737                 if cluster.Collections.BlobSigningKey == "" {
738                         cluster.Collections.BlobSigningKey = randomHexString(64)
739                 }
740                 if cluster.Users.AnonymousUserToken == "" {
741                         cluster.Users.AnonymousUserToken = randomHexString(64)
742                 }
743                 if cluster.Containers.DispatchPrivateKey == "" {
744                         buf, err := ioutil.ReadFile(filepath.Join(super.SourcePath, "lib", "dispatchcloud", "test", "sshkey_dispatch"))
745                         if err != nil {
746                                 return err
747                         }
748                         cluster.Containers.DispatchPrivateKey = string(buf)
749                 }
750                 cluster.TLS.Insecure = true
751         }
752         if super.ClusterType == "test" {
753                 // Add a second keepstore process.
754                 port, err := nextPort(super.ListenHost)
755                 if err != nil {
756                         return err
757                 }
758                 host := net.JoinHostPort(super.ListenHost, port)
759                 cluster.Services.Keepstore.InternalURLs[arvados.URL{Scheme: "http", Host: host, Path: "/"}] = arvados.ServiceInstance{}
760
761                 // Create a directory-backed volume for each keepstore
762                 // process.
763                 cluster.Volumes = map[string]arvados.Volume{}
764                 for url := range cluster.Services.Keepstore.InternalURLs {
765                         volnum := len(cluster.Volumes)
766                         datadir := fmt.Sprintf("%s/keep%d.data", super.tempdir, volnum)
767                         if _, err = os.Stat(datadir + "/."); err == nil {
768                         } else if !os.IsNotExist(err) {
769                                 return err
770                         } else if err = os.Mkdir(datadir, 0755); err != nil {
771                                 return err
772                         }
773                         cluster.Volumes[fmt.Sprintf(cluster.ClusterID+"-nyw5e-%015d", volnum)] = arvados.Volume{
774                                 Driver:           "Directory",
775                                 DriverParameters: json.RawMessage(fmt.Sprintf(`{"Root":%q}`, datadir)),
776                                 AccessViaHosts: map[arvados.URL]arvados.VolumeAccess{
777                                         url: {},
778                                 },
779                                 StorageClasses: map[string]bool{
780                                         "default": true,
781                                         "foo":     true,
782                                         "bar":     true,
783                                 },
784                         }
785                 }
786                 cluster.StorageClasses = map[string]arvados.StorageClassConfig{
787                         "default": {Default: true},
788                         "foo":     {},
789                         "bar":     {},
790                 }
791         }
792         if super.OwnTemporaryDatabase {
793                 port, err := nextPort("localhost")
794                 if err != nil {
795                         return err
796                 }
797                 cluster.PostgreSQL.Connection = arvados.PostgreSQLConnection{
798                         "client_encoding": "utf8",
799                         "host":            "localhost",
800                         "port":            port,
801                         "dbname":          "arvados_test",
802                         "user":            "arvados",
803                         "password":        "insecure_arvados_test",
804                 }
805         }
806
807         cfg.Clusters[cluster.ClusterID] = *cluster
808         return nil
809 }
810
// addrIsLocal reports whether addr (a "host:port" string) names an
// address that can be bound on this host.
//
// The check binds a throwaway listener to the host part of addr
// using an OS-assigned port (port "0"), so the result does not
// depend on whether the port in addr is currently in use.
//
// It returns (true, nil) if the listener could be created, (false,
// nil) if the OS reports that the address cannot be assigned, and
// (false, err) if addr cannot be split into host and port or if
// listening fails for any other reason.
func addrIsLocal(addr string) (bool, error) {
	host, _, err := net.SplitHostPort(addr)
	if err != nil {
		return false, err
	}
	listener, err := net.Listen("tcp", net.JoinHostPort(host, "0"))
	switch {
	case err == nil:
		listener.Close()
		return true, nil
	case errors.Is(err, syscall.EADDRNOTAVAIL),
		// Fall back to matching the message in case the errno is
		// not exposed through the error chain.
		strings.Contains(err.Error(), "cannot assign requested address"):
		return false, nil
	default:
		return false, err
	}
}
823
// randomHexString returns a string of exactly chars lowercase
// hexadecimal digits generated from crypto/rand.
//
// Odd values of chars are supported: enough random bytes are read to
// cover the requested length and the hex encoding is truncated to
// chars digits. A non-positive chars yields the empty string.
//
// It panics if the random source returns an error.
func randomHexString(chars int) string {
	if chars <= 0 {
		return ""
	}
	// Each byte encodes to two hex digits; round up for odd lengths.
	b := make([]byte, (chars+1)/2)
	if _, err := rand.Read(b); err != nil {
		panic(err)
	}
	return fmt.Sprintf("%x", b)[:chars]
}
832
833 func internalPort(svc arvados.Service) (host, port string, err error) {
834         if len(svc.InternalURLs) > 1 {
835                 return "", "", errors.New("internalPort() doesn't work with multiple InternalURLs")
836         }
837         for u := range svc.InternalURLs {
838                 u := url.URL(u)
839                 host, port = u.Hostname(), u.Port()
840                 switch {
841                 case port != "":
842                 case u.Scheme == "https", u.Scheme == "ws":
843                         port = "443"
844                 default:
845                         port = "80"
846                 }
847                 return
848         }
849         return "", "", fmt.Errorf("service has no InternalURLs")
850 }
851
852 func externalPort(svc arvados.Service) (string, error) {
853         u := url.URL(svc.ExternalURL)
854         if p := u.Port(); p != "" {
855                 return p, nil
856         } else if u.Scheme == "https" || u.Scheme == "wss" {
857                 return "443", nil
858         } else {
859                 return "80", nil
860         }
861 }
862
// availablePort asks the OS for an unused TCP port on the given host
// by listening on port "0", and returns the port number that was
// assigned, as a string.
//
// The temporary listener is closed before returning, so the port is
// not reserved for the caller.
func availablePort(host string) (string, error) {
	probe, err := net.Listen("tcp", net.JoinHostPort(host, "0"))
	if err != nil {
		return "", err
	}
	bound := probe.Addr().String()
	probe.Close()

	_, assigned, splitErr := net.SplitHostPort(bound)
	if splitErr != nil {
		return "", splitErr
	}
	return assigned, nil
}
875
// waitForConnect repeatedly tries to open a TCP connection to addr
// until one succeeds, then closes that connection and returns nil.
//
// Each attempt times out after one second, and failed attempts are
// retried after a 100ms pause. If ctx is cancelled first, the wait
// is abandoned (including during the pause between attempts) and
// ctx.Err() is returned.
func waitForConnect(ctx context.Context, addr string) error {
	dialer := net.Dialer{Timeout: time.Second}
	for ctx.Err() == nil {
		conn, err := dialer.DialContext(ctx, "tcp", addr)
		if err == nil {
			conn.Close()
			return nil
		}
		// Pause before retrying, but wake up immediately if ctx
		// is cancelled.
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(time.Second / 10):
		}
	}
	return ctx.Err()
}
891
892 func copyConfig(cfg *arvados.Config) *arvados.Config {
893         pr, pw := io.Pipe()
894         go func() {
895                 err := json.NewEncoder(pw).Encode(cfg)
896                 if err != nil {
897                         panic(err)
898                 }
899                 pw.Close()
900         }()
901         cfg2 := new(arvados.Config)
902         err := json.NewDecoder(pr).Decode(cfg2)
903         if err != nil {
904                 panic(err)
905         }
906         return cfg2
907 }
908
909 func watchConfig(ctx context.Context, logger logrus.FieldLogger, cfgPath string, prevcfg *arvados.Config, fn func()) {
910         watcher, err := fsnotify.NewWatcher()
911         if err != nil {
912                 logger.WithError(err).Error("fsnotify setup failed")
913                 return
914         }
915         defer watcher.Close()
916
917         err = watcher.Add(cfgPath)
918         if err != nil {
919                 logger.WithError(err).Error("fsnotify watcher failed")
920                 return
921         }
922
923         for {
924                 select {
925                 case <-ctx.Done():
926                         return
927                 case err, ok := <-watcher.Errors:
928                         if !ok {
929                                 return
930                         }
931                         logger.WithError(err).Warn("fsnotify watcher reported error")
932                 case _, ok := <-watcher.Events:
933                         if !ok {
934                                 return
935                         }
936                         for len(watcher.Events) > 0 {
937                                 <-watcher.Events
938                         }
939                         loader := config.NewLoader(&bytes.Buffer{}, &logrus.Logger{Out: ioutil.Discard})
940                         loader.Path = cfgPath
941                         loader.SkipAPICalls = true
942                         cfg, err := loader.Load()
943                         if err != nil {
944                                 logger.WithError(err).Warn("error reloading config file after change detected; ignoring new config for now")
945                         } else if reflect.DeepEqual(cfg, prevcfg) {
946                                 logger.Debug("config file changed but is still DeepEqual to the existing config")
947                         } else {
948                                 logger.Debug("config changed, notifying supervisor")
949                                 fn()
950                                 prevcfg = cfg
951                         }
952                 }
953         }
954 }