Merge branch '17756-dispatch-lsf' into main
author Tom Clegg <tom@curii.com>
Thu, 29 Jul 2021 13:57:48 +0000 (09:57 -0400)
committer Tom Clegg <tom@curii.com>
Thu, 29 Jul 2021 13:57:48 +0000 (09:57 -0400)
closes #17756

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

1  2 
build/run-build-packages.sh
doc/_config.yml
lib/config/export.go
services/crunch-dispatch-local/crunch-dispatch-local.go
services/crunch-dispatch-local/crunch-dispatch-local_test.go
services/crunch-dispatch-slurm/crunch-dispatch-slurm.go

diff --combined build/run-build-packages.sh
index d46c246da7c9fadb0fe96fbe7edce0f4c623c351,ba68119865b6a3dbcf37b9b44323b0d19f644bcf..7829c8c6cd61792535960a153bb20baf1b7e1622
@@@ -277,6 -277,8 +277,8 @@@ package_go_binary cmd/arvados-server ar
      "Arvados cluster controller daemon"
  package_go_binary cmd/arvados-server arvados-dispatch-cloud \
      "Arvados cluster cloud dispatch"
+ package_go_binary cmd/arvados-server arvados-dispatch-lsf \
+     "Dispatch Arvados containers to an LSF cluster"
  package_go_binary services/arv-git-httpd arvados-git-httpd \
      "Provide authenticated http access to Arvados-hosted git repositories"
  package_go_binary services/crunch-dispatch-local crunch-dispatch-local \
@@@ -401,8 -403,8 +403,8 @@@ if [[ "$?" == "0" ]] ; the
        mv /tmp/x /etc/arvados/config.yml
        perl -p -i -e 'BEGIN{undef $/;} s/WebDAV(.*?):\n( *)ExternalURL: ""/WebDAV$1:\n$2ExternalURL: "example.com"/g' /etc/arvados/config.yml
  
 -      ARVADOS_CONFIG=none RAILS_ENV=production RAILS_GROUPS=assets bundle exec rake npm:install >"$STDOUT_IF_DEBUG"
 -      ARVADOS_CONFIG=none RAILS_ENV=production RAILS_GROUPS=assets bundle exec rake assets:precompile >"$STDOUT_IF_DEBUG"
 +      ARVADOS_CONFIG=none RAILS_ENV=production RAILS_GROUPS=assets bin/rake npm:install >"$STDOUT_IF_DEBUG"
 +      ARVADOS_CONFIG=none RAILS_ENV=production RAILS_GROUPS=assets bin/rake assets:precompile >"$STDOUT_IF_DEBUG"
  
        # Remove generated configuration files so they don't go in the package.
        rm -rf /etc/arvados/
diff --combined doc/_config.yml
index 39fe22fde3e1379205a26e0e5f28ef398c3d3980,0b0fbc9300749a968d5bb1570aada605155f4eb1..b18607ebb7490622d38e119ad6a0f0383fdb35ba
@@@ -161,7 -161,6 +161,7 @@@ navbar
        - architecture/manifest-format.html.textile.liquid
      - Computation with Crunch:
        - api/execution.html.textile.liquid
 +      - architecture/dispatchcloud.html.textile.liquid
      - Other:
        - api/permission-model.html.textile.liquid
        - architecture/federation.html.textile.liquid
      - Data Management:
        - admin/collection-versioning.html.textile.liquid
        - admin/collection-managed-properties.html.textile.liquid
 +      - admin/restricting-upload-download.html.textile.liquid
        - admin/keep-balance.html.textile.liquid
        - admin/controlling-container-reuse.html.textile.liquid
        - admin/logs-table-management.html.textile.liquid
        - install/crunch2-slurm/configure-slurm.html.textile.liquid
        - install/crunch2-slurm/install-compute-node.html.textile.liquid
        - install/crunch2-slurm/install-test.html.textile.liquid
+     - Containers API (lsf):
+       - install/crunch2-lsf/install-dispatch.html.textile.liquid
      - Additional configuration:
        - install/container-shell-access.html.textile.liquid
      - External dependencies:
diff --combined lib/config/export.go
index bb939321c9ce17e220bb031f041efae53c79fa46,7adb50ec374006063f23a1cd620ea46d8446728e..da5495352a53e7450ada8dabdbb578f9ec13647c
@@@ -120,6 -120,7 +120,7 @@@ var whitelist = map[string]bool
        "Containers.JobsAPI.GitInternalDir":                   false,
        "Containers.Logging":                                  false,
        "Containers.LogReuseDecisions":                        false,
+       "Containers.LSF":                                      false,
        "Containers.MaxComputeVMs":                            false,
        "Containers.MaxDispatchAttempts":                      false,
        "Containers.MaxRetryAttempts":                         true,
        "Volumes.*.ReadOnly":                                  true,
        "Volumes.*.Replication":                               true,
        "Volumes.*.StorageClasses":                            true,
 -      "Volumes.*.StorageClasses.*":                          false,
 +      "Volumes.*.StorageClasses.*":                          true,
        "Workbench":                                           true,
        "Workbench.ActivationContactLink":                     false,
        "Workbench.APIClientConnectTimeout":                   true,
diff --combined services/crunch-dispatch-local/crunch-dispatch-local.go
index c202e683f2810e85ab6fddda40793f021b3e0eff,1486332382c3e370437542063e69719392a941ee..a3cb1341a4677e7ecdc7c03976da7483e47c1aa5
@@@ -17,7 -17,6 +17,7 @@@ import 
        "syscall"
        "time"
  
 +      "git.arvados.org/arvados.git/lib/config"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/arvadosclient"
        "git.arvados.org/arvados.git/sdk/go/dispatch"
@@@ -75,37 -74,10 +75,37 @@@ func doMain() error 
                return nil
        }
  
 +      loader := config.NewLoader(nil, logger)
 +      cfg, err := loader.Load()
 +      cluster, err := cfg.GetCluster("")
 +      if err != nil {
 +              return fmt.Errorf("config error: %s", err)
 +      }
 +
        logger.Printf("crunch-dispatch-local %s started", version)
  
        runningCmds = make(map[string]*exec.Cmd)
  
 +      var client arvados.Client
 +      client.APIHost = cluster.Services.Controller.ExternalURL.Host
 +      client.AuthToken = cluster.SystemRootToken
 +      client.Insecure = cluster.TLS.Insecure
 +
 +      if client.APIHost != "" || client.AuthToken != "" {
 +              // Copy real configs into env vars so [a]
 +              // MakeArvadosClient() uses them, and [b] they get
 +              // propagated to crunch-run via SLURM.
 +              os.Setenv("ARVADOS_API_HOST", client.APIHost)
 +              os.Setenv("ARVADOS_API_TOKEN", client.AuthToken)
 +              os.Setenv("ARVADOS_API_HOST_INSECURE", "")
 +              if client.Insecure {
 +                      os.Setenv("ARVADOS_API_HOST_INSECURE", "1")
 +              }
 +              os.Setenv("ARVADOS_EXTERNAL_CLIENT", "")
 +      } else {
 +              logger.Warnf("Client credentials missing from config, so falling back on environment variables (deprecated).")
 +      }
 +
        arv, err := arvadosclient.MakeArvadosClient()
        if err != nil {
                logger.Errorf("error making Arvados client: %v", err)
        dispatcher := dispatch.Dispatcher{
                Logger:       logger,
                Arv:          arv,
 -              RunContainer: (&LocalRun{startFunc, make(chan bool, 8), ctx}).run,
 +              RunContainer: (&LocalRun{startFunc, make(chan bool, 8), ctx, cluster}).run,
                PollPeriod:   time.Duration(*pollInterval) * time.Second,
        }
  
@@@ -156,7 -128,6 +156,7 @@@ type LocalRun struct 
        startCmd         func(container arvados.Container, cmd *exec.Cmd) error
        concurrencyLimit chan bool
        ctx              context.Context
 +      cluster          *arvados.Cluster
  }
  
  // Run a container.
  // crunch-run terminates, mark the container as Cancelled.
  func (lr *LocalRun) run(dispatcher *dispatch.Dispatcher,
        container arvados.Container,
-       status <-chan arvados.Container) {
+       status <-chan arvados.Container) error {
  
        uuid := container.UUID
  
                case lr.concurrencyLimit <- true:
                        break
                case <-lr.ctx.Done():
-                       return
+                       return lr.ctx.Err()
                }
  
                defer func() { <-lr.concurrencyLimit }()
                waitGroup.Add(1)
                defer waitGroup.Done()
  
 -              cmd := exec.Command(*crunchRunCommand, uuid)
 +              cmd := exec.Command(*crunchRunCommand, "--runtime-engine="+lr.cluster.Containers.RuntimeEngine, uuid)
                cmd.Stdin = nil
                cmd.Stderr = os.Stderr
                cmd.Stdout = os.Stderr
@@@ -270,4 -241,5 +270,5 @@@ Finish
        }
  
        dispatcher.Logger.Printf("finalized container %v", uuid)
+       return nil
  }
diff --combined services/crunch-dispatch-local/crunch-dispatch-local_test.go
index d976bf0812950488b796cb063def8c960d128849,6ec31b1737f5a21004226f4bad31bf8fc504a970..92b8d2adcd6fe22e20c66afc1d4f803521ccd545
@@@ -81,11 -81,9 +81,11 @@@ func (s *TestSuite) TestIntegration(c *
                return cmd.Start()
        }
  
-       dispatcher.RunContainer = func(d *dispatch.Dispatcher, c arvados.Container, s <-chan arvados.Container) {
-               (&LocalRun{startCmd, make(chan bool, 8), ctx, &cl}).run(d, c, s)
-               cancel()
 +      cl := arvados.Cluster{Containers: arvados.ContainersConfig{RuntimeEngine: "docker"}}
 +
 -              return (&LocalRun{startCmd, make(chan bool, 8), ctx}).run(d, c, s)
+       dispatcher.RunContainer = func(d *dispatch.Dispatcher, c arvados.Container, s <-chan arvados.Container) error {
+               defer cancel()
++              return (&LocalRun{startCmd, make(chan bool, 8), ctx, &cl}).run(d, c, s)
        }
  
        err = dispatcher.Run(ctx)
@@@ -186,11 -184,9 +186,11 @@@ func testWithServerStub(c *C, apiStubRe
                return cmd.Start()
        }
  
-       dispatcher.RunContainer = func(d *dispatch.Dispatcher, c arvados.Container, s <-chan arvados.Container) {
-               (&LocalRun{startCmd, make(chan bool, 8), ctx, &cl}).run(d, c, s)
-               cancel()
 +      cl := arvados.Cluster{Containers: arvados.ContainersConfig{RuntimeEngine: "docker"}}
 +
 -              return (&LocalRun{startCmd, make(chan bool, 8), ctx}).run(d, c, s)
+       dispatcher.RunContainer = func(d *dispatch.Dispatcher, c arvados.Container, s <-chan arvados.Container) error {
+               defer cancel()
++              return (&LocalRun{startCmd, make(chan bool, 8), ctx, &cl}).run(d, c, s)
        }
  
        re := regexp.MustCompile(`(?ms).*` + expected + `.*`)
diff --combined services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
index 2f2f013c714a0be6bf863cbf8329efae62e616b6,5129495a0656633d146272bf8d60ff9d16a59918..584db38edf7e93ac57ad8929ca31e04de907b78d
@@@ -7,7 -7,6 +7,6 @@@ package mai
  // Dispatcher service for Crunch that submits containers to the slurm queue.
  
  import (
-       "bytes"
        "context"
        "flag"
        "fmt"
@@@ -255,7 -254,6 +254,7 @@@ func (disp *Dispatcher) submit(containe
        // append() here avoids modifying crunchRunCommand's
        // underlying array, which is shared with other goroutines.
        crArgs := append([]string(nil), crunchRunCommand...)
 +      crArgs = append(crArgs, "--runtime-engine="+disp.cluster.Containers.RuntimeEngine)
        crArgs = append(crArgs, container.UUID)
        crScript := strings.NewReader(execScript(crArgs))
  
  // already in the queue).  Cancel the slurm job if the container's
  // priority changes to zero or its state indicates it's no longer
  // running.
- func (disp *Dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
+ func (disp *Dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) error {
        ctx, cancel := context.WithCancel(context.Background())
        defer cancel()
  
                log.Printf("Submitting container %s to slurm", ctr.UUID)
                cmd := []string{disp.cluster.Containers.CrunchRunCommand}
                cmd = append(cmd, disp.cluster.Containers.CrunchRunArgumentsList...)
-               if err := disp.submit(ctr, cmd); err != nil {
-                       var text string
-                       switch err := err.(type) {
-                       case dispatchcloud.ConstraintsNotSatisfiableError:
-                               var logBuf bytes.Buffer
-                               fmt.Fprintf(&logBuf, "cannot run container %s: %s\n", ctr.UUID, err)
-                               if len(err.AvailableTypes) == 0 {
-                                       fmt.Fprint(&logBuf, "No instance types are configured.\n")
-                               } else {
-                                       fmt.Fprint(&logBuf, "Available instance types:\n")
-                                       for _, t := range err.AvailableTypes {
-                                               fmt.Fprintf(&logBuf,
-                                                       "Type %q: %d VCPUs, %d RAM, %d Scratch, %f Price\n",
-                                                       t.Name, t.VCPUs, t.RAM, t.Scratch, t.Price,
-                                               )
-                                       }
-                               }
-                               text = logBuf.String()
-                               disp.UpdateState(ctr.UUID, dispatch.Cancelled)
-                       default:
-                               text = fmt.Sprintf("Error submitting container %s to slurm: %s", ctr.UUID, err)
-                       }
-                       log.Print(text)
-                       lr := arvadosclient.Dict{"log": arvadosclient.Dict{
-                               "object_uuid": ctr.UUID,
-                               "event_type":  "dispatch",
-                               "properties":  map[string]string{"text": text}}}
-                       disp.Arv.Create("logs", lr, nil)
-                       disp.Unlock(ctr.UUID)
-                       return
+               err := disp.submit(ctr, cmd)
+               if err != nil {
+                       return err
                }
        }
  
                        case dispatch.Locked:
                                disp.Unlock(ctr.UUID)
                        }
-                       return
+                       return nil
                case updated, ok := <-status:
                        if !ok {
                                log.Printf("container %s is done: cancel slurm job", ctr.UUID)