1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
// Slurm is the set of SLURM operations used by callers of this
// package. The slurmCLI type in this file implements it by invoking
// the SLURM command-line tools.
type Slurm interface {
	// Batch submits the job script read from script, passing args
	// through to the submission command (slurmCLI uses "sbatch").
	Batch(script io.Reader, args []string) error

	// Cancel cancels the job with the given name (slurmCLI uses
	// "scancel --name=<name>").
	Cancel(name string) error

	// QueueCommand returns a command for inspecting the queue with
	// the given arguments (slurmCLI returns an unstarted "squeue"
	// command).
	QueueCommand(args []string) *exec.Cmd

	// Release releases the job with the given name (slurmCLI uses
	// "scontrol release Name=<name>").
	Release(name string) error

	// Renice sets the nice value of the job with the given name
	// (slurmCLI uses "scontrol update JobName=<name> Nice=<nice>").
	Renice(name string, nice int64) error
}
// slurmCLI is a stateless type whose methods run the SLURM
// command-line programs (sbatch, scancel, squeue, scontrol).
type slurmCLI struct{}
25 func (scli *slurmCLI) Batch(script io.Reader, args []string) error {
26 return scli.run(script, "sbatch", args)
// Cancel runs "scancel" for the job called name, once for each
// state-specific argument set listed in the loop header below.
//
// NOTE(review): some lines of this function (at least one argument
// set, the error check and the final return) are not visible in this
// excerpt; going by the comment further down, a non-nil error from
// scancel is presumably treated as a real failure -- confirm.
29 func (scli *slurmCLI) Cancel(name string) error {
30 for _, args := range [][]string{
31 // If the slurm job hasn't started yet, remove it from
34 // If the slurm job has started, send SIGTERM. If we
35 // cancel a running job without a --signal argument,
36 // slurm will send SIGTERM and then (after some
37 // site-configured interval) SIGKILL. This would kill
38 // crunch-run without stopping the container, which we
40 {"--batch", "--signal=TERM", "--state=running"},
41 {"--batch", "--signal=TERM", "--state=suspended"},
// Every invocation starts with --name=<name>, followed by the
// flags of the current argument set.
43 err := scli.run(nil, "scancel", append([]string{"--name=" + name}, args...))
45 // scancel exits 0 if no job matches the given
46 // name and state. Any error from scancel here
47 // really indicates something is wrong.
54 func (scli *slurmCLI) QueueCommand(args []string) *exec.Cmd {
55 return exec.Command("squeue", args...)
58 func (scli *slurmCLI) Release(name string) error {
59 return scli.run(nil, "scontrol", []string{"release", "Name=" + name})
62 func (scli *slurmCLI) Renice(name string, nice int64) error {
63 return scli.run(nil, "scontrol", []string{"update", "JobName=" + name, fmt.Sprintf("Nice=%d", nice)})
66 func (scli *slurmCLI) run(stdin io.Reader, prog string, args []string) error {
67 cmd := exec.Command(prog, args...)
69 out, err := cmd.CombinedOutput()
70 outTrim := strings.TrimSpace(string(out))
71 if err != nil || len(out) > 0 {
72 log.Printf("%q %q: %q", cmd.Path, cmd.Args, outTrim)
75 err = fmt.Errorf("%s: %s (%q)", cmd.Path, err, outTrim)