X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/94588fc14c200e4163144cd53bfa548a27fff7e0..ba57b6537679889b42693ecd839a94d59c716aaf:/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go index f793805a8b..e768b509cd 100644 --- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go +++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go @@ -79,9 +79,9 @@ func doMain() error { // propagated to crunch-run via SLURM. os.Setenv("ARVADOS_API_HOST", theConfig.Client.APIHost) os.Setenv("ARVADOS_API_TOKEN", theConfig.Client.AuthToken) - os.Setenv("ARVADOS_API_INSECURE", "") + os.Setenv("ARVADOS_API_HOST_INSECURE", "") if theConfig.Client.Insecure { - os.Setenv("ARVADOS_API_INSECURE", "1") + os.Setenv("ARVADOS_API_HOST_INSECURE", "1") } os.Setenv("ARVADOS_KEEP_SERVICES", strings.Join(theConfig.Client.KeepServiceURIs, " ")) os.Setenv("ARVADOS_EXTERNAL_CLIENT", "") @@ -105,7 +105,7 @@ func doMain() error { PollInterval: time.Duration(theConfig.PollPeriod), DoneProcessing: make(chan struct{})} - if _, err := daemon.SdNotify("READY=1"); err != nil { + if _, err := daemon.SdNotify(false, "READY=1"); err != nil { log.Printf("Error notifying init daemon: %v", err) } @@ -127,8 +127,8 @@ func sbatchFunc(container arvados.Container) *exec.Cmd { sbatchArgs = append(sbatchArgs, fmt.Sprintf("--job-name=%s", container.UUID)) sbatchArgs = append(sbatchArgs, fmt.Sprintf("--mem-per-cpu=%d", int(memPerCPU))) sbatchArgs = append(sbatchArgs, fmt.Sprintf("--cpus-per-task=%d", container.RuntimeConstraints.VCPUs)) - if container.RuntimeConstraints.Partition != nil { - sbatchArgs = append(sbatchArgs, fmt.Sprintf("--partition=%s", strings.Join(container.RuntimeConstraints.Partition, ","))) + if container.SchedulingParameters.Partitions != nil { + sbatchArgs = append(sbatchArgs, fmt.Sprintf("--partition=%s", strings.Join(container.SchedulingParameters.Partitions, ","))) } return exec.Command("sbatch", sbatchArgs...) @@ -195,6 +195,7 @@ func submit(dispatcher *dispatch.Dispatcher, b, _ := ioutil.ReadAll(stdoutReader) stdoutReader.Close() stdoutChan <- b + close(stdoutChan) }() stderrChan := make(chan []byte) @@ -202,6 +203,7 @@ func submit(dispatcher *dispatch.Dispatcher, b, _ := ioutil.ReadAll(stderrReader) stderrReader.Close() stderrChan <- b + close(stderrChan) }() // Send a tiny script on stdin to execute the crunch-run command @@ -209,13 +211,10 @@ func submit(dispatcher *dispatch.Dispatcher, io.WriteString(stdinWriter, execScript(append(crunchRunCommand, container.UUID))) stdinWriter.Close() - err = cmd.Wait() - stdoutMsg := <-stdoutChan stderrmsg := <-stderrChan - close(stdoutChan) - close(stderrChan) + err = cmd.Wait() if err != nil { submitErr = fmt.Errorf("Container submission failed: %v: %v (stderr: %q)", cmd.Args, err, stderrmsg) @@ -302,12 +301,13 @@ func run(dispatcher *dispatch.Dispatcher, // Mutex between squeue sync and running sbatch or scancel. squeueUpdater.SlurmLock.Lock() - err := scancelCmd(container).Run() + cmd := scancelCmd(container) + msg, err := cmd.CombinedOutput() squeueUpdater.SlurmLock.Unlock() if err != nil { - log.Printf("Error stopping container %s with scancel: %v", - container.UUID, err) + log.Printf("Error stopping container %s with %v %v: %v %v", + container.UUID, cmd.Path, cmd.Args, err, string(msg)) if squeueUpdater.CheckSqueue(container.UUID) { log.Printf("Container %s is still in squeue after scancel.", container.UUID)