Merge branch '10690-config-dump' refs #10690
[arvados.git] / services / crunch-dispatch-slurm / crunch-dispatch-slurm.go
index 0c1ce49592a6b08223271d440dca41f3a5d8fd46..e768b509cd6f2c69bb529d9e9a90e2d923e422ce 100644 (file)
@@ -105,7 +105,7 @@ func doMain() error {
                PollInterval:   time.Duration(theConfig.PollPeriod),
                DoneProcessing: make(chan struct{})}
 
-       if _, err := daemon.SdNotify("READY=1"); err != nil {
+       if _, err := daemon.SdNotify(false, "READY=1"); err != nil {
                log.Printf("Error notifying init daemon: %v", err)
        }
 
@@ -127,8 +127,8 @@ func sbatchFunc(container arvados.Container) *exec.Cmd {
        sbatchArgs = append(sbatchArgs, fmt.Sprintf("--job-name=%s", container.UUID))
        sbatchArgs = append(sbatchArgs, fmt.Sprintf("--mem-per-cpu=%d", int(memPerCPU)))
        sbatchArgs = append(sbatchArgs, fmt.Sprintf("--cpus-per-task=%d", container.RuntimeConstraints.VCPUs))
-       if container.RuntimeConstraints.Partition != nil {
-               sbatchArgs = append(sbatchArgs, fmt.Sprintf("--partition=%s", strings.Join(container.RuntimeConstraints.Partition, ",")))
+       if container.SchedulingParameters.Partitions != nil {
+               sbatchArgs = append(sbatchArgs, fmt.Sprintf("--partition=%s", strings.Join(container.SchedulingParameters.Partitions, ",")))
        }
 
        return exec.Command("sbatch", sbatchArgs...)
@@ -195,6 +195,7 @@ func submit(dispatcher *dispatch.Dispatcher,
                b, _ := ioutil.ReadAll(stdoutReader)
                stdoutReader.Close()
                stdoutChan <- b
+               close(stdoutChan)
        }()
 
        stderrChan := make(chan []byte)
@@ -202,6 +203,7 @@ func submit(dispatcher *dispatch.Dispatcher,
                b, _ := ioutil.ReadAll(stderrReader)
                stderrReader.Close()
                stderrChan <- b
+               close(stderrChan)
        }()
 
        // Send a tiny script on stdin to execute the crunch-run command
@@ -209,13 +211,10 @@ func submit(dispatcher *dispatch.Dispatcher,
        io.WriteString(stdinWriter, execScript(append(crunchRunCommand, container.UUID)))
        stdinWriter.Close()
 
-       err = cmd.Wait()
-
        stdoutMsg := <-stdoutChan
        stderrmsg := <-stderrChan
 
-       close(stdoutChan)
-       close(stderrChan)
+       err = cmd.Wait()
 
        if err != nil {
                submitErr = fmt.Errorf("Container submission failed: %v: %v (stderr: %q)", cmd.Args, err, stderrmsg)
@@ -302,12 +301,13 @@ func run(dispatcher *dispatch.Dispatcher,
 
                                // Mutex between squeue sync and running sbatch or scancel.
                                squeueUpdater.SlurmLock.Lock()
-                               err := scancelCmd(container).Run()
+                               cmd := scancelCmd(container)
+                               msg, err := cmd.CombinedOutput()
                                squeueUpdater.SlurmLock.Unlock()
 
                                if err != nil {
-                                       log.Printf("Error stopping container %s with scancel: %v",
-                                               container.UUID, err)
+                                       log.Printf("Error stopping container %s with %v %v: %v %v",
+                                               container.UUID, cmd.Path, cmd.Args, err, string(msg))
                                        if squeueUpdater.CheckSqueue(container.UUID) {
                                                log.Printf("Container %s is still in squeue after scancel.",
                                                        container.UUID)