17384: Respect CrunchRunCommand and CrunchRunArgumentsList in a-d-c.
authorTom Clegg <tom@curii.com>
Mon, 15 Feb 2021 16:06:35 +0000 (11:06 -0500)
committerTom Clegg <tom@curii.com>
Mon, 15 Feb 2021 16:06:35 +0000 (11:06 -0500)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

lib/config/config.default.yml
lib/config/generated_config.go
lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/test/stub_driver.go
lib/dispatchcloud/worker/pool.go
lib/dispatchcloud/worker/pool_test.go
lib/dispatchcloud/worker/runner.go
lib/dispatchcloud/worker/worker_test.go

index 68e518732d6f85b8ef377a4f22ea1efebf16af46..c644de3741923bfc90bcf7371cfcac7265bcb95f 100644 (file)
@@ -831,7 +831,11 @@ Clusters:
       # stale locks from a previous dispatch process.
       StaleLockTimeout: 1m
 
-      # The crunch-run command to manage the container on a node
+      # The crunch-run command used to start a container on a worker node.
+      #
+      # When dispatching to cloud VMs, this is used only if
+      # DeployRunnerBinary in the CloudVMs section is set to the empty
+      # string.
       CrunchRunCommand: "crunch-run"
 
       # Extra arguments to add to crunch-run invocation
@@ -1052,7 +1056,7 @@ Clusters:
         #
         # Use the empty string to disable this step: nothing will be
         # copied, and cloud instances are assumed to have a suitable
-        # version of crunch-run installed.
+        # version of crunch-run installed; see CrunchRunCommand above.
         DeployRunnerBinary: "/proc/self/exe"
 
         # Tags to add on all resources (VMs, NICs, disks) created by
index 8ef787771ebb9986f3b88f52ec69a6851d2eb8d2..8354102c2a321d062ca204e7eec7280568dcb117 100644 (file)
@@ -837,7 +837,11 @@ Clusters:
       # stale locks from a previous dispatch process.
       StaleLockTimeout: 1m
 
-      # The crunch-run command to manage the container on a node
+      # The crunch-run command used to start a container on a worker node.
+      #
+      # When dispatching to cloud VMs, this is used only if
+      # DeployRunnerBinary in the CloudVMs section is set to the empty
+      # string.
       CrunchRunCommand: "crunch-run"
 
       # Extra arguments to add to crunch-run invocation
@@ -1058,7 +1062,7 @@ Clusters:
         #
         # Use the empty string to disable this step: nothing will be
         # copied, and cloud instances are assumed to have a suitable
-        # version of crunch-run installed.
+        # version of crunch-run installed; see CrunchRunCommand above.
         DeployRunnerBinary: "/proc/self/exe"
 
         # Tags to add on all resources (VMs, NICs, disks) created by
index d5d90bf3518b75fb548e810e2ad8a7cc2c9867ba..8752ee054456bf1a2a2fc5b8030e8a7eeaa691b1 100644 (file)
@@ -52,8 +52,10 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
        s.cluster = &arvados.Cluster{
                ManagementToken: "test-management-token",
                Containers: arvados.ContainersConfig{
-                       DispatchPrivateKey: string(dispatchprivraw),
-                       StaleLockTimeout:   arvados.Duration(5 * time.Millisecond),
+                       CrunchRunCommand:       "crunch-run",
+                       CrunchRunArgumentsList: []string{"--foo", "--extra='args'"},
+                       DispatchPrivateKey:     string(dispatchprivraw),
+                       StaleLockTimeout:       arvados.Duration(5 * time.Millisecond),
                        CloudVMs: arvados.CloudVMsConfig{
                                Driver:               "test",
                                SyncInterval:         arvados.Duration(10 * time.Millisecond),
@@ -161,6 +163,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
                stubvm.CrunchRunDetachDelay = time.Duration(rand.Int63n(int64(10 * time.Millisecond)))
                stubvm.ExecuteContainer = executeContainer
                stubvm.CrashRunningContainer = finishContainer
+               stubvm.ExtraCrunchRunArgs = "'--foo' '--extra='\\''args'\\'''"
                switch n % 7 {
                case 0:
                        stubvm.Broken = time.Now().Add(time.Duration(rand.Int63n(90)) * time.Millisecond)
index 4d32cf221ce49461e092a834ad192460bc37a49d..1b31a71a264fabf865f981f5f94eab1649847ac4 100644 (file)
@@ -193,6 +193,7 @@ type StubVM struct {
        ArvMountDeadlockRate  float64
        ExecuteContainer      func(arvados.Container) int
        CrashRunningContainer func(arvados.Container)
+       ExtraCrunchRunArgs    string // extra args expected after "crunch-run --detach --stdin-env "
 
        sis          *StubInstanceSet
        id           cloud.InstanceID
@@ -251,7 +252,7 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
                fmt.Fprint(stderr, "crunch-run: command not found\n")
                return 1
        }
-       if strings.HasPrefix(command, "crunch-run --detach --stdin-env ") {
+       if strings.HasPrefix(command, "crunch-run --detach --stdin-env "+svm.ExtraCrunchRunArgs) {
                var stdinKV map[string]string
                err := json.Unmarshal(stdinData, &stdinKV)
                if err != nil {
index 6a74280ca452e9b365f6a976f96e04ef03edc7e6..7289179fd6e4526ecfc7204d970172b42018af59 100644 (file)
@@ -121,6 +121,8 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
                systemRootToken:                cluster.SystemRootToken,
                installPublicKey:               installPublicKey,
                tagKeyPrefix:                   cluster.Containers.CloudVMs.TagKeyPrefix,
+               runnerCmdDefault:               cluster.Containers.CrunchRunCommand,
+               runnerArgs:                     cluster.Containers.CrunchRunArgumentsList,
                stop:                           make(chan bool),
        }
        wp.registerMetrics(reg)
@@ -160,6 +162,8 @@ type Pool struct {
        systemRootToken                string
        installPublicKey               ssh.PublicKey
        tagKeyPrefix                   string
+       runnerCmdDefault               string   // crunch-run command to use if not deploying a binary
+       runnerArgs                     []string // extra args passed to crunch-run
 
        // private state
        subscribers  map[<-chan struct{}]chan<- struct{}
@@ -881,7 +885,7 @@ func (wp *Pool) loadRunnerData() error {
        if wp.runnerData != nil {
                return nil
        } else if wp.runnerSource == "" {
-               wp.runnerCmd = "crunch-run"
+               wp.runnerCmd = wp.runnerCmdDefault
                wp.runnerData = []byte{}
                return nil
        }
index a85f7383ab3cdc59fcc1bd0e7ad936703666ca2f..0f5c5ee196d2866269f2bb999292e8a9672c3e47 100644 (file)
@@ -72,8 +72,8 @@ func (suite *PoolSuite) TestResumeAfterRestart(c *check.C) {
        newExecutor := func(cloud.Instance) Executor {
                return &stubExecutor{
                        response: map[string]stubResp{
-                               "crunch-run --list": {},
-                               "true":              {},
+                               "crunch-run-custom --list": {},
+                               "true":                     {},
                        },
                }
        }
@@ -87,6 +87,7 @@ func (suite *PoolSuite) TestResumeAfterRestart(c *check.C) {
                                SyncInterval:       arvados.Duration(time.Millisecond * 10),
                                TagKeyPrefix:       "testprefix:",
                        },
+                       CrunchRunCommand: "crunch-run-custom",
                },
                InstanceTypes: arvados.InstanceTypeMap{
                        type1.Name: type1,
index 0fd99aeeef136cdc1113f55466b1342b7c975cc1..63561874c9c5e570187048922addbbc4e4ece502 100644 (file)
@@ -9,6 +9,7 @@ import (
        "encoding/json"
        "fmt"
        "net"
+       "strings"
        "syscall"
        "time"
 
@@ -22,6 +23,7 @@ type remoteRunner struct {
        executor      Executor
        envJSON       json.RawMessage
        runnerCmd     string
+       runnerArgs    []string
        remoteUser    string
        timeoutTERM   time.Duration
        timeoutSignal time.Duration
@@ -64,6 +66,7 @@ func newRemoteRunner(uuid string, wkr *worker) *remoteRunner {
                executor:      wkr.executor,
                envJSON:       envJSON,
                runnerCmd:     wkr.wp.runnerCmd,
+               runnerArgs:    wkr.wp.runnerArgs,
                remoteUser:    wkr.instance.RemoteUser(),
                timeoutTERM:   wkr.wp.timeoutTERM,
                timeoutSignal: wkr.wp.timeoutSignal,
@@ -81,7 +84,11 @@ func newRemoteRunner(uuid string, wkr *worker) *remoteRunner {
 // assume the remote process _might_ have started, at least until it
 // probes the worker and finds otherwise.
 func (rr *remoteRunner) Start() {
-       cmd := rr.runnerCmd + " --detach --stdin-env '" + rr.uuid + "'"
+       cmd := rr.runnerCmd + " --detach --stdin-env"
+       for _, arg := range rr.runnerArgs {
+               cmd += " '" + strings.Replace(arg, "'", "'\\''", -1) + "'"
+       }
+       cmd += " '" + rr.uuid + "'"
        if rr.remoteUser != "root" {
                cmd = "sudo " + cmd
        }
index cfb7a1bfb7a72b8924d5950deb7fc478f20873b0..4134788b2e27b00544151b857282d376c57a0ccf 100644 (file)
@@ -236,6 +236,8 @@ func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
                        timeoutBooting:   bootTimeout,
                        timeoutProbe:     probeTimeout,
                        exited:           map[string]time.Time{},
+                       runnerCmdDefault: "crunch-run",
+                       runnerArgs:       []string{"--args=not used with --list"},
                        runnerCmd:        "crunch-run",
                        runnerData:       trial.deployRunner,
                        runnerMD5:        md5.Sum(trial.deployRunner),