Merge branch '10690-config-dump' refs #10690
[arvados.git] / services / crunch-dispatch-slurm / crunch-dispatch-slurm.go
index f45cc823ec7c87d310609f6d33073c4d82f8b662..e768b509cd6f2c69bb529d9e9a90e2d923e422ce 100644 (file)
@@ -79,9 +79,9 @@ func doMain() error {
                // propagated to crunch-run via SLURM.
                os.Setenv("ARVADOS_API_HOST", theConfig.Client.APIHost)
                os.Setenv("ARVADOS_API_TOKEN", theConfig.Client.AuthToken)
-               os.Setenv("ARVADOS_API_INSECURE", "")
+               os.Setenv("ARVADOS_API_HOST_INSECURE", "")
                if theConfig.Client.Insecure {
-                       os.Setenv("ARVADOS_API_INSECURE", "1")
+                       os.Setenv("ARVADOS_API_HOST_INSECURE", "1")
                }
                os.Setenv("ARVADOS_KEEP_SERVICES", strings.Join(theConfig.Client.KeepServiceURIs, " "))
                os.Setenv("ARVADOS_EXTERNAL_CLIENT", "")
@@ -105,7 +105,7 @@ func doMain() error {
                PollInterval:   time.Duration(theConfig.PollPeriod),
                DoneProcessing: make(chan struct{})}
 
-       if _, err := daemon.SdNotify("READY=1"); err != nil {
+       if _, err := daemon.SdNotify(false, "READY=1"); err != nil {
                log.Printf("Error notifying init daemon: %v", err)
        }
 
@@ -127,8 +127,8 @@ func sbatchFunc(container arvados.Container) *exec.Cmd {
        sbatchArgs = append(sbatchArgs, fmt.Sprintf("--job-name=%s", container.UUID))
        sbatchArgs = append(sbatchArgs, fmt.Sprintf("--mem-per-cpu=%d", int(memPerCPU)))
        sbatchArgs = append(sbatchArgs, fmt.Sprintf("--cpus-per-task=%d", container.RuntimeConstraints.VCPUs))
-       if container.RuntimeConstraints.Partition != nil {
-               sbatchArgs = append(sbatchArgs, fmt.Sprintf("--partition=%s", strings.Join(container.RuntimeConstraints.Partition, ",")))
+       if container.SchedulingParameters.Partitions != nil {
+               sbatchArgs = append(sbatchArgs, fmt.Sprintf("--partition=%s", strings.Join(container.SchedulingParameters.Partitions, ",")))
        }
 
        return exec.Command("sbatch", sbatchArgs...)
@@ -183,9 +183,10 @@ func submit(dispatcher *dispatch.Dispatcher,
        squeueUpdater.SlurmLock.Lock()
        defer squeueUpdater.SlurmLock.Unlock()
 
+       log.Printf("sbatch starting: %+q", cmd.Args)
        err := cmd.Start()
        if err != nil {
-               submitErr = fmt.Errorf("Error starting %v: %v", cmd.Args, err)
+               submitErr = fmt.Errorf("Error starting sbatch: %v", err)
                return
        }
 
@@ -194,6 +195,7 @@ func submit(dispatcher *dispatch.Dispatcher,
                b, _ := ioutil.ReadAll(stdoutReader)
                stdoutReader.Close()
                stdoutChan <- b
+               close(stdoutChan)
        }()
 
        stderrChan := make(chan []byte)
@@ -201,6 +203,7 @@ func submit(dispatcher *dispatch.Dispatcher,
                b, _ := ioutil.ReadAll(stderrReader)
                stderrReader.Close()
                stderrChan <- b
+               close(stderrChan)
        }()
 
        // Send a tiny script on stdin to execute the crunch-run command
@@ -208,13 +211,10 @@ func submit(dispatcher *dispatch.Dispatcher,
        io.WriteString(stdinWriter, execScript(append(crunchRunCommand, container.UUID)))
        stdinWriter.Close()
 
-       err = cmd.Wait()
-
        stdoutMsg := <-stdoutChan
        stderrmsg := <-stderrChan
 
-       close(stdoutChan)
-       close(stderrChan)
+       err = cmd.Wait()
 
        if err != nil {
                submitErr = fmt.Errorf("Container submission failed: %v: %v (stderr: %q)", cmd.Args, err, stderrmsg)
@@ -301,12 +301,13 @@ func run(dispatcher *dispatch.Dispatcher,
 
                                // Mutex between squeue sync and running sbatch or scancel.
                                squeueUpdater.SlurmLock.Lock()
-                               err := scancelCmd(container).Run()
+                               cmd := scancelCmd(container)
+                               msg, err := cmd.CombinedOutput()
                                squeueUpdater.SlurmLock.Unlock()
 
                                if err != nil {
-                                       log.Printf("Error stopping container %s with scancel: %v",
-                                               container.UUID, err)
+                                       log.Printf("Error stopping container %s with %v %v: %v %v",
+                                               container.UUID, cmd.Path, cmd.Args, err, string(msg))
                                        if squeueUpdater.CheckSqueue(container.UUID) {
                                                log.Printf("Container %s is still in squeue after scancel.",
                                                        container.UUID)