From 8685bdc41012f1623cc02b573e27439fdf314799 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Mon, 15 Feb 2021 11:06:35 -0500 Subject: [PATCH] 17384: Respect CrunchRunCommand and CrunchRunArgumentsList in a-d-c. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- lib/config/config.default.yml | 8 ++++++-- lib/config/generated_config.go | 8 ++++++-- lib/dispatchcloud/dispatcher_test.go | 7 +++++-- lib/dispatchcloud/test/stub_driver.go | 3 ++- lib/dispatchcloud/worker/pool.go | 6 +++++- lib/dispatchcloud/worker/pool_test.go | 5 +++-- lib/dispatchcloud/worker/runner.go | 9 ++++++++- lib/dispatchcloud/worker/worker_test.go | 2 ++ 8 files changed, 37 insertions(+), 11 deletions(-) diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml index 68e518732d..c644de3741 100644 --- a/lib/config/config.default.yml +++ b/lib/config/config.default.yml @@ -831,7 +831,11 @@ Clusters: # stale locks from a previous dispatch process. StaleLockTimeout: 1m - # The crunch-run command to manage the container on a node + # The crunch-run command used to start a container on a worker node. + # + # When dispatching to cloud VMs, this is used only if + # DeployRunnerBinary in the CloudVMs section is set to the empty + # string. CrunchRunCommand: "crunch-run" # Extra arguments to add to crunch-run invocation @@ -1052,7 +1056,7 @@ Clusters: # # Use the empty string to disable this step: nothing will be # copied, and cloud instances are assumed to have a suitable - # version of crunch-run installed. + # version of crunch-run installed; see CrunchRunCommand above. DeployRunnerBinary: "/proc/self/exe" # Tags to add on all resources (VMs, NICs, disks) created by diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go index 8ef787771e..8354102c2a 100644 --- a/lib/config/generated_config.go +++ b/lib/config/generated_config.go @@ -837,7 +837,11 @@ Clusters: # stale locks from a previous dispatch process. StaleLockTimeout: 1m - # The crunch-run command to manage the container on a node + # The crunch-run command used to start a container on a worker node. + # + # When dispatching to cloud VMs, this is used only if + # DeployRunnerBinary in the CloudVMs section is set to the empty + # string. CrunchRunCommand: "crunch-run" # Extra arguments to add to crunch-run invocation @@ -1058,7 +1062,7 @@ Clusters: # # Use the empty string to disable this step: nothing will be # copied, and cloud instances are assumed to have a suitable - # version of crunch-run installed. + # version of crunch-run installed; see CrunchRunCommand above. DeployRunnerBinary: "/proc/self/exe" # Tags to add on all resources (VMs, NICs, disks) created by diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go index d5d90bf351..8752ee0544 100644 --- a/lib/dispatchcloud/dispatcher_test.go +++ b/lib/dispatchcloud/dispatcher_test.go @@ -52,8 +52,10 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) { s.cluster = &arvados.Cluster{ ManagementToken: "test-management-token", Containers: arvados.ContainersConfig{ - DispatchPrivateKey: string(dispatchprivraw), - StaleLockTimeout: arvados.Duration(5 * time.Millisecond), + CrunchRunCommand: "crunch-run", + CrunchRunArgumentsList: []string{"--foo", "--extra='args'"}, + DispatchPrivateKey: string(dispatchprivraw), + StaleLockTimeout: arvados.Duration(5 * time.Millisecond), CloudVMs: arvados.CloudVMsConfig{ Driver: "test", SyncInterval: arvados.Duration(10 * time.Millisecond), @@ -161,6 +163,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) { stubvm.CrunchRunDetachDelay = time.Duration(rand.Int63n(int64(10 * time.Millisecond))) stubvm.ExecuteContainer = executeContainer stubvm.CrashRunningContainer = finishContainer + stubvm.ExtraCrunchRunArgs = "'--foo' '--extra='\\''args'\\'''" switch n % 7 { case 0: stubvm.Broken = time.Now().Add(time.Duration(rand.Int63n(90)) * time.Millisecond) diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go index 4d32cf221c..1b31a71a26 100644 --- a/lib/dispatchcloud/test/stub_driver.go +++ b/lib/dispatchcloud/test/stub_driver.go @@ -193,6 +193,7 @@ type StubVM struct { ArvMountDeadlockRate float64 ExecuteContainer func(arvados.Container) int CrashRunningContainer func(arvados.Container) + ExtraCrunchRunArgs string // extra args expected after "crunch-run --detach --stdin-env " sis *StubInstanceSet id cloud.InstanceID @@ -251,7 +252,7 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader, fmt.Fprint(stderr, "crunch-run: command not found\n") return 1 } - if strings.HasPrefix(command, "crunch-run --detach --stdin-env ") { + if strings.HasPrefix(command, "crunch-run --detach --stdin-env "+svm.ExtraCrunchRunArgs) { var stdinKV map[string]string err := json.Unmarshal(stdinData, &stdinKV) if err != nil { diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go index 6a74280ca4..7289179fd6 100644 --- a/lib/dispatchcloud/worker/pool.go +++ b/lib/dispatchcloud/worker/pool.go @@ -121,6 +121,8 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe systemRootToken: cluster.SystemRootToken, installPublicKey: installPublicKey, tagKeyPrefix: cluster.Containers.CloudVMs.TagKeyPrefix, + runnerCmdDefault: cluster.Containers.CrunchRunCommand, + runnerArgs: cluster.Containers.CrunchRunArgumentsList, stop: make(chan bool), } wp.registerMetrics(reg) @@ -160,6 +162,8 @@ type Pool struct { systemRootToken string installPublicKey ssh.PublicKey tagKeyPrefix string + runnerCmdDefault string // crunch-run command to use if not deploying a binary + runnerArgs []string // extra args passed to crunch-run // private state subscribers map[<-chan struct{}]chan<- struct{} @@ -881,7 +885,7 @@ func (wp *Pool) loadRunnerData() error { if wp.runnerData != nil { return nil } else if wp.runnerSource == "" { - wp.runnerCmd = "crunch-run" + wp.runnerCmd = wp.runnerCmdDefault wp.runnerData = []byte{} return nil } diff --git a/lib/dispatchcloud/worker/pool_test.go b/lib/dispatchcloud/worker/pool_test.go index a85f7383ab..0f5c5ee196 100644 --- a/lib/dispatchcloud/worker/pool_test.go +++ b/lib/dispatchcloud/worker/pool_test.go @@ -72,8 +72,8 @@ func (suite *PoolSuite) TestResumeAfterRestart(c *check.C) { newExecutor := func(cloud.Instance) Executor { return &stubExecutor{ response: map[string]stubResp{ - "crunch-run --list": {}, - "true": {}, + "crunch-run-custom --list": {}, + "true": {}, }, } } @@ -87,6 +87,7 @@ func (suite *PoolSuite) TestResumeAfterRestart(c *check.C) { SyncInterval: arvados.Duration(time.Millisecond * 10), TagKeyPrefix: "testprefix:", }, + CrunchRunCommand: "crunch-run-custom", }, InstanceTypes: arvados.InstanceTypeMap{ type1.Name: type1, diff --git a/lib/dispatchcloud/worker/runner.go b/lib/dispatchcloud/worker/runner.go index 0fd99aeeef..63561874c9 100644 --- a/lib/dispatchcloud/worker/runner.go +++ b/lib/dispatchcloud/worker/runner.go @@ -9,6 +9,7 @@ import ( "encoding/json" "fmt" "net" + "strings" "syscall" "time" @@ -22,6 +23,7 @@ type remoteRunner struct { executor Executor envJSON json.RawMessage runnerCmd string + runnerArgs []string remoteUser string timeoutTERM time.Duration timeoutSignal time.Duration @@ -64,6 +66,7 @@ func newRemoteRunner(uuid string, wkr *worker) *remoteRunner { executor: wkr.executor, envJSON: envJSON, runnerCmd: wkr.wp.runnerCmd, + runnerArgs: wkr.wp.runnerArgs, remoteUser: wkr.instance.RemoteUser(), timeoutTERM: wkr.wp.timeoutTERM, timeoutSignal: wkr.wp.timeoutSignal, @@ -81,7 +84,11 @@ func newRemoteRunner(uuid string, wkr *worker) *remoteRunner { // assume the remote process _might_ have started, at least until it // probes the worker and finds otherwise. func (rr *remoteRunner) Start() { - cmd := rr.runnerCmd + " --detach --stdin-env '" + rr.uuid + "'" + cmd := rr.runnerCmd + " --detach --stdin-env" + for _, arg := range rr.runnerArgs { + cmd += " '" + strings.Replace(arg, "'", "'\\''", -1) + "'" + } + cmd += " '" + rr.uuid + "'" if rr.remoteUser != "root" { cmd = "sudo " + cmd } diff --git a/lib/dispatchcloud/worker/worker_test.go b/lib/dispatchcloud/worker/worker_test.go index cfb7a1bfb7..4134788b2e 100644 --- a/lib/dispatchcloud/worker/worker_test.go +++ b/lib/dispatchcloud/worker/worker_test.go @@ -236,6 +236,8 @@ func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) { timeoutBooting: bootTimeout, timeoutProbe: probeTimeout, exited: map[string]time.Time{}, + runnerCmdDefault: "crunch-run", + runnerArgs: []string{"--args=not used with --list"}, runnerCmd: "crunch-run", runnerData: trial.deployRunner, runnerMD5: md5.Sum(trial.deployRunner), -- 2.30.2