From: Tom Clegg Date: Fri, 13 May 2022 20:53:27 +0000 (-0400) Subject: Merge branch '18947-dispatch-slurm' X-Git-Tag: 2.5.0~175 X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/1f92b08e21c19dfa0c6d2260fe2fd8b24529e9e3?hp=d904fbcd2e67054784954c0af54ef8758a37f5ea Merge branch '18947-dispatch-slurm' refs #18947 Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- diff --git a/build/run-build-packages.sh b/build/run-build-packages.sh index 3771662423..3e1ed6a94d 100755 --- a/build/run-build-packages.sh +++ b/build/run-build-packages.sh @@ -248,7 +248,7 @@ package_go_binary cmd/arvados-server arvados-git-httpd "$FORMAT" "$ARCH" \ "Provide authenticated http access to Arvados-hosted git repositories" package_go_binary services/crunch-dispatch-local crunch-dispatch-local "$FORMAT" "$ARCH" \ "Dispatch Crunch containers on the local system" -package_go_binary services/crunch-dispatch-slurm crunch-dispatch-slurm "$FORMAT" "$ARCH" \ +package_go_binary cmd/arvados-server crunch-dispatch-slurm "$FORMAT" "$ARCH" \ "Dispatch Crunch containers to a SLURM cluster" package_go_binary cmd/arvados-server crunch-run "$FORMAT" "$ARCH" \ "Supervise a single Crunch container" diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.service b/cmd/arvados-server/crunch-dispatch-slurm.service similarity index 88% rename from services/crunch-dispatch-slurm/crunch-dispatch-slurm.service rename to cmd/arvados-server/crunch-dispatch-slurm.service index 86830f3a7f..51b4e58c35 100644 --- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.service +++ b/cmd/arvados-server/crunch-dispatch-slurm.service @@ -6,18 +6,19 @@ Description=Arvados Crunch Dispatcher for SLURM Documentation=https://doc.arvados.org/ After=network.target +AssertPathExists=/etc/arvados/config.yml # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section StartLimitIntervalSec=0 [Service] Type=notify +EnvironmentFile=-/etc/arvados/environment ExecStart=/usr/bin/crunch-dispatch-slurm # Set a reasonable default for the open file limit LimitNOFILE=65536 Restart=always RestartSec=1 -LimitNOFILE=1000000 # systemd<=219 (centos:7, debian:8, ubuntu:trusty) obeys StartLimitInterval in the [Service] section StartLimitInterval=0 diff --git a/doc/admin/upgrading.html.textile.liquid b/doc/admin/upgrading.html.textile.liquid index efce633e9e..3f6009a803 100644 --- a/doc/admin/upgrading.html.textile.liquid +++ b/doc/admin/upgrading.html.textile.liquid @@ -32,6 +32,10 @@ h2(#main). development main (as of 2022-04-08) "previous: Upgrading to 2.4.0":#v2_4_0 +h3. Slurm dispatcher requires configuration update + +If you use the Slurm dispatcher (@crunch-dispatch-slurm@) you must add a @Services.DispatchSLURM.InternalURLs@ section to your configuration file, as shown on the "updated install page":{{site.baseurl}}/install/crunch2-slurm/install-dispatch.html. + h3. New proxy parameters for arvados-controller We now recommend disabling nginx proxy caching for arvados-controller, to avoid truncation of large responses. diff --git a/doc/install/crunch2-slurm/install-dispatch.html.textile.liquid b/doc/install/crunch2-slurm/install-dispatch.html.textile.liquid index 52553a35e7..9b664ec9ef 100644 --- a/doc/install/crunch2-slurm/install-dispatch.html.textile.liquid +++ b/doc/install/crunch2-slurm/install-dispatch.html.textile.liquid @@ -26,9 +26,18 @@ This assumes you already have a Slurm cluster, and have set up all of your compu The Arvados Slurm dispatcher can run on any node that can submit requests to both the Arvados API server and the Slurm controller (via @sbatch@). It is not resource-intensive, so you can run it on the API server node. -h2(#update-config). Update config.yml (optional) +h2(#update-config). Update config.yml -Crunch-dispatch-slurm reads the common configuration file at @config.yml@. +Crunch-dispatch-slurm reads the common configuration file at @/etc/arvados/config.yml@. + +Add a DispatchSLURM entry to the Services section, using the hostname where @crunch-dispatch-slurm@ will run, and an available port: + + +
    Services:
+      DispatchSLURM:
+        InternalURLs:
+          "http://hostname.zzzzz.arvadosapi.com:9007": {}
+
The following configuration parameters are optional. diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml index e60880c217..893542df18 100644 --- a/lib/config/config.default.yml +++ b/lib/config/config.default.yml @@ -55,6 +55,9 @@ Clusters: DispatchLSF: InternalURLs: {SAMPLE: {}} ExternalURL: "" + DispatchSLURM: + InternalURLs: {SAMPLE: {}} + ExternalURL: "" Keepproxy: InternalURLs: {SAMPLE: {}} ExternalURL: "" diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go index 6a90c30ce4..319fa1a38f 100644 --- a/sdk/go/arvados/config.go +++ b/sdk/go/arvados/config.go @@ -348,6 +348,7 @@ type Services struct { Controller Service DispatchCloud Service DispatchLSF Service + DispatchSLURM Service GitHTTP Service GitSSH Service Health Service @@ -605,6 +606,7 @@ const ( ServiceNameController ServiceName = "arvados-controller" ServiceNameDispatchCloud ServiceName = "arvados-dispatch-cloud" ServiceNameDispatchLSF ServiceName = "arvados-dispatch-lsf" + ServiceNameDispatchSLURM ServiceName = "crunch-dispatch-slurm" ServiceNameGitHTTP ServiceName = "arvados-git-httpd" ServiceNameHealth ServiceName = "arvados-health" ServiceNameKeepbalance ServiceName = "keep-balance" @@ -624,6 +626,7 @@ func (svcs Services) Map() map[ServiceName]Service { ServiceNameController: svcs.Controller, ServiceNameDispatchCloud: svcs.DispatchCloud, ServiceNameDispatchLSF: svcs.DispatchLSF, + ServiceNameDispatchSLURM: svcs.DispatchSLURM, ServiceNameGitHTTP: svcs.GitHTTP, ServiceNameHealth: svcs.Health, ServiceNameKeepbalance: svcs.Keepbalance, diff --git a/sdk/go/health/aggregator_test.go b/sdk/go/health/aggregator_test.go index 5f60cf67f3..481054c4de 100644 --- a/sdk/go/health/aggregator_test.go +++ b/sdk/go/health/aggregator_test.go @@ -327,6 +327,7 @@ func (s *AggregatorSuite) setAllServiceURLs(listen string) { &svcs.Controller, &svcs.DispatchCloud, &svcs.DispatchLSF, + &svcs.DispatchSLURM, &svcs.GitHTTP, &svcs.Keepbalance, &svcs.Keepproxy, diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go index 84105e1fc7..c31d799752 100644 --- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go +++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go @@ -2,32 +2,48 @@ // // SPDX-License-Identifier: AGPL-3.0 -package main - // Dispatcher service for Crunch that submits containers to the slurm queue. +package dispatchslurm import ( "context" - "flag" "fmt" "log" "math" + "net/http" "os" "regexp" "strings" "time" "git.arvados.org/arvados.git/lib/cmd" - "git.arvados.org/arvados.git/lib/config" "git.arvados.org/arvados.git/lib/dispatchcloud" + "git.arvados.org/arvados.git/lib/service" "git.arvados.org/arvados.git/sdk/go/arvados" "git.arvados.org/arvados.git/sdk/go/arvadosclient" + "git.arvados.org/arvados.git/sdk/go/ctxlog" "git.arvados.org/arvados.git/sdk/go/dispatch" "github.com/coreos/go-systemd/daemon" - "github.com/ghodss/yaml" + "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" ) +var Command cmd.Handler = service.Command(arvados.ServiceNameDispatchSLURM, newHandler) + +func newHandler(ctx context.Context, cluster *arvados.Cluster, _ string, _ *prometheus.Registry) service.Handler { + logger := ctxlog.FromContext(ctx) + disp := &Dispatcher{logger: logger, cluster: cluster} + if err := disp.configure(); err != nil { + return service.ErrorHandler(ctx, cluster, err) + } + disp.setup() + go func() { + disp.err = disp.run() + close(disp.done) + }() + return disp +} + type logger interface { dispatch.Logger Fatalf(string, ...interface{}) @@ -35,10 +51,6 @@ type logger interface { const initialNiceValue int64 = 10000 -var ( - version = "dev" -) - type Dispatcher struct { *dispatch.Dispatcher logger logrus.FieldLogger @@ -46,75 +58,32 @@ type Dispatcher struct { sqCheck *SqueueChecker slurm Slurm + done chan struct{} + err error + Client arvados.Client } -func main() { - logger := logrus.StandardLogger() - if os.Getenv("DEBUG") != "" { - logger.SetLevel(logrus.DebugLevel) - } - logger.Formatter = &logrus.JSONFormatter{ - TimestampFormat: "2006-01-02T15:04:05.000000000Z07:00", - } - disp := &Dispatcher{logger: logger} - err := disp.Run(os.Args[0], os.Args[1:]) - if err != nil { - logrus.Fatalf("%s", err) - } +func (disp *Dispatcher) CheckHealth() error { + return disp.err } -func (disp *Dispatcher) Run(prog string, args []string) error { - if err := disp.configure(prog, args); err != nil { - return err - } - disp.setup() - return disp.run() +func (disp *Dispatcher) Done() <-chan struct{} { + return disp.done +} + +func (disp *Dispatcher) ServeHTTP(w http.ResponseWriter, r *http.Request) { + http.NotFound(w, r) } -// configure() loads config files. Tests skip this. -func (disp *Dispatcher) configure(prog string, args []string) error { +// configure() loads config files. Some tests skip this (see +// StubbedSuite). +func (disp *Dispatcher) configure() error { if disp.logger == nil { disp.logger = logrus.StandardLogger() } - flags := flag.NewFlagSet(prog, flag.ContinueOnError) - flags.Usage = func() { usage(flags) } - - loader := config.NewLoader(nil, disp.logger) - loader.SetupFlags(flags) - - dumpConfig := flag.Bool( - "dump-config", - false, - "write current configuration to stdout and exit") - getVersion := flags.Bool( - "version", - false, - "Print version information and exit.") - - args = loader.MungeLegacyConfigArgs(disp.logger, args, "-legacy-crunch-dispatch-slurm-config") - if ok, code := cmd.ParseFlags(flags, prog, args, "", os.Stderr); !ok { - os.Exit(code) - } - - // Print version information if requested - if *getVersion { - fmt.Printf("crunch-dispatch-slurm %s\n", version) - return nil - } - - disp.logger.Printf("crunch-dispatch-slurm %s started", version) - - cfg, err := loader.Load() - if err != nil { - return err - } - - if disp.cluster, err = cfg.GetCluster(""); err != nil { - return fmt.Errorf("config error: %s", err) - } - disp.logger = disp.logger.WithField("ClusterID", disp.cluster.ClusterID) + disp.logger.Printf("crunch-dispatch-slurm %s started", cmd.Version.String()) disp.Client.APIHost = disp.cluster.Services.Controller.ExternalURL.Host disp.Client.AuthToken = disp.cluster.SystemRootToken @@ -137,23 +106,12 @@ func (disp *Dispatcher) configure(prog string, args []string) error { } else { disp.logger.Warnf("Client credentials missing from config, so falling back on environment variables (deprecated).") } - - if *dumpConfig { - out, err := yaml.Marshal(cfg) - if err != nil { - return err - } - _, err = os.Stdout.Write(out) - if err != nil { - return err - } - } - return nil } // setup() initializes private fields after configure(). func (disp *Dispatcher) setup() { + disp.done = make(chan struct{}) arv, err := arvadosclient.MakeArvadosClient() if err != nil { disp.logger.Fatalf("Error making Arvados client: %v", err) diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go index cf83257dad..fb433e65cd 100644 --- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go +++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go @@ -2,12 +2,13 @@ // // SPDX-License-Identifier: AGPL-3.0 -package main +package dispatchslurm import ( "bytes" "context" "errors" + "flag" "fmt" "io" "io/ioutil" @@ -19,10 +20,13 @@ import ( "testing" "time" + "git.arvados.org/arvados.git/lib/cmd" + "git.arvados.org/arvados.git/lib/config" "git.arvados.org/arvados.git/lib/dispatchcloud" "git.arvados.org/arvados.git/sdk/go/arvados" "git.arvados.org/arvados.git/sdk/go/arvadosclient" "git.arvados.org/arvados.git/sdk/go/arvadostest" + "git.arvados.org/arvados.git/sdk/go/ctxlog" "git.arvados.org/arvados.git/sdk/go/dispatch" "github.com/sirupsen/logrus" . "gopkg.in/check.v1" @@ -387,6 +391,7 @@ func (s *StubbedSuite) TestSbatchPartition(c *C) { } func (s *StubbedSuite) TestLoadLegacyConfig(c *C) { + log := ctxlog.TestLogger(c) content := []byte(` Client: APIHost: example.com @@ -402,36 +407,42 @@ ReserveExtraRAM: 12345 MinRetryPeriod: 13s BatchSize: 99 `) - tmpfile, err := ioutil.TempFile("", "example") - if err != nil { - c.Error(err) - } - - defer os.Remove(tmpfile.Name()) // clean up - - if _, err := tmpfile.Write(content); err != nil { - c.Error(err) - } - if err := tmpfile.Close(); err != nil { - c.Error(err) + tmpfile := c.MkDir() + "/config.yml" + err := ioutil.WriteFile(tmpfile, content, 0777) + c.Assert(err, IsNil) - } os.Setenv("ARVADOS_KEEP_SERVICES", "") - err = s.disp.configure("crunch-dispatch-slurm", []string{"-config", tmpfile.Name()}) - c.Check(err, IsNil) - c.Check(s.disp.cluster.Services.Controller.ExternalURL, Equals, arvados.URL{Scheme: "https", Host: "example.com", Path: "/"}) - c.Check(s.disp.cluster.SystemRootToken, Equals, "abcdefg") - c.Check(s.disp.cluster.Containers.SLURM.SbatchArgumentsList, DeepEquals, []string{"--foo", "bar"}) - c.Check(s.disp.cluster.Containers.CloudVMs.PollInterval, Equals, arvados.Duration(12*time.Second)) - c.Check(s.disp.cluster.Containers.SLURM.PrioritySpread, Equals, int64(42)) - c.Check(s.disp.cluster.Containers.CrunchRunCommand, Equals, "x-crunch-run") - c.Check(s.disp.cluster.Containers.CrunchRunArgumentsList, DeepEquals, []string{"--cgroup-parent-subsystem=memory"}) - c.Check(s.disp.cluster.Containers.ReserveExtraRAM, Equals, arvados.ByteSize(12345)) - c.Check(s.disp.cluster.Containers.MinRetryPeriod, Equals, arvados.Duration(13*time.Second)) - c.Check(s.disp.cluster.API.MaxItemsPerResponse, Equals, 99) - c.Check(s.disp.cluster.Containers.SLURM.SbatchEnvironmentVariables, DeepEquals, map[string]string{ + flags := flag.NewFlagSet("", flag.ContinueOnError) + flags.SetOutput(os.Stderr) + loader := config.NewLoader(&bytes.Buffer{}, log) + loader.SetupFlags(flags) + args := loader.MungeLegacyConfigArgs(log, []string{"-config", tmpfile}, "-legacy-"+string(arvados.ServiceNameDispatchSLURM)+"-config") + ok, _ := cmd.ParseFlags(flags, "crunch-dispatch-slurm", args, "", os.Stderr) + c.Check(ok, Equals, true) + cfg, err := loader.Load() + c.Assert(err, IsNil) + cluster, err := cfg.GetCluster("") + c.Assert(err, IsNil) + + c.Check(cluster.Services.Controller.ExternalURL, Equals, arvados.URL{Scheme: "https", Host: "example.com", Path: "/"}) + c.Check(cluster.SystemRootToken, Equals, "abcdefg") + c.Check(cluster.Containers.SLURM.SbatchArgumentsList, DeepEquals, []string{"--foo", "bar"}) + c.Check(cluster.Containers.CloudVMs.PollInterval, Equals, arvados.Duration(12*time.Second)) + c.Check(cluster.Containers.SLURM.PrioritySpread, Equals, int64(42)) + c.Check(cluster.Containers.CrunchRunCommand, Equals, "x-crunch-run") + c.Check(cluster.Containers.CrunchRunArgumentsList, DeepEquals, []string{"--cgroup-parent-subsystem=memory"}) + c.Check(cluster.Containers.ReserveExtraRAM, Equals, arvados.ByteSize(12345)) + c.Check(cluster.Containers.MinRetryPeriod, Equals, arvados.Duration(13*time.Second)) + c.Check(cluster.API.MaxItemsPerResponse, Equals, 99) + c.Check(cluster.Containers.SLURM.SbatchEnvironmentVariables, DeepEquals, map[string]string{ "ARVADOS_KEEP_SERVICES": "https://example.com/keep1 https://example.com/keep2", }) + + // Ensure configure() copies SbatchEnvironmentVariables into + // the current process's environment (that's how they end up + // getting passed to sbatch). + s.disp.cluster = cluster + s.disp.configure() c.Check(os.Getenv("ARVADOS_KEEP_SERVICES"), Equals, "https://example.com/keep1 https://example.com/keep2") } diff --git a/services/crunch-dispatch-slurm/node_type.go b/services/crunch-dispatch-slurm/node_type.go index d31322f182..738426c92d 100644 --- a/services/crunch-dispatch-slurm/node_type.go +++ b/services/crunch-dispatch-slurm/node_type.go @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: AGPL-3.0 -package main +package dispatchslurm import ( "log" diff --git a/services/crunch-dispatch-slurm/priority.go b/services/crunch-dispatch-slurm/priority.go index 2312ce5952..515a98d323 100644 --- a/services/crunch-dispatch-slurm/priority.go +++ b/services/crunch-dispatch-slurm/priority.go @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: AGPL-3.0 -package main +package dispatchslurm const defaultSpread int64 = 10 diff --git a/services/crunch-dispatch-slurm/priority_test.go b/services/crunch-dispatch-slurm/priority_test.go index e80984c0fc..df1c27def7 100644 --- a/services/crunch-dispatch-slurm/priority_test.go +++ b/services/crunch-dispatch-slurm/priority_test.go @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: AGPL-3.0 -package main +package dispatchslurm import ( . "gopkg.in/check.v1" diff --git a/services/crunch-dispatch-slurm/script.go b/services/crunch-dispatch-slurm/script.go index f559104d14..fb16e593e5 100644 --- a/services/crunch-dispatch-slurm/script.go +++ b/services/crunch-dispatch-slurm/script.go @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: AGPL-3.0 -package main +package dispatchslurm import ( "strings" diff --git a/services/crunch-dispatch-slurm/script_test.go b/services/crunch-dispatch-slurm/script_test.go index a21aeeddad..00d70190dd 100644 --- a/services/crunch-dispatch-slurm/script_test.go +++ b/services/crunch-dispatch-slurm/script_test.go @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: AGPL-3.0 -package main +package dispatchslurm import ( . "gopkg.in/check.v1" diff --git a/services/crunch-dispatch-slurm/slurm.go b/services/crunch-dispatch-slurm/slurm.go index 791f294df1..e59826f763 100644 --- a/services/crunch-dispatch-slurm/slurm.go +++ b/services/crunch-dispatch-slurm/slurm.go @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: AGPL-3.0 -package main +package dispatchslurm import ( "fmt" diff --git a/services/crunch-dispatch-slurm/squeue.go b/services/crunch-dispatch-slurm/squeue.go index eae21e62b6..d4e41ed1fb 100644 --- a/services/crunch-dispatch-slurm/squeue.go +++ b/services/crunch-dispatch-slurm/squeue.go @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: AGPL-3.0 -package main +package dispatchslurm import ( "bytes" diff --git a/services/crunch-dispatch-slurm/squeue_test.go b/services/crunch-dispatch-slurm/squeue_test.go index ce74fe61cc..d41e1982b4 100644 --- a/services/crunch-dispatch-slurm/squeue_test.go +++ b/services/crunch-dispatch-slurm/squeue_test.go @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: AGPL-3.0 -package main +package dispatchslurm import ( "time" diff --git a/services/crunch-dispatch-slurm/usage.go b/services/crunch-dispatch-slurm/usage.go index 68a2305f74..785843b198 100644 --- a/services/crunch-dispatch-slurm/usage.go +++ b/services/crunch-dispatch-slurm/usage.go @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: AGPL-3.0 -package main +package dispatchslurm import ( "flag"