1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
// Slurm is the set of slurm operations that slurmCLI implements by
// invoking the slurm command-line tools.
type Slurm interface {
	// Batch submits a job. slurmCLI does this by running "sbatch"
	// with the given args, passing script along as the command's
	// input reader.
	Batch(script io.Reader, args []string) error

	// Cancel stops the job(s) with the given name. slurmCLI does
	// this by running "scancel --name=<name>" with state-specific
	// options.
	Cancel(name string) error

	// QueueCommand returns a command, not yet started, that the
	// caller can run to query the queue. slurmCLI returns an
	// "squeue" command with the given args.
	QueueCommand(args []string) *exec.Cmd

	// Renice sets the nice value of the job(s) with the given
	// name. slurmCLI does this by running
	// "scontrol update JobName=<name> Nice=<nice>".
	Renice(name string, nice int) error
}
// slurmCLI provides the Slurm operations by executing the slurm
// command-line programs (sbatch, scancel, scontrol, squeue). It holds
// no state.
type slurmCLI struct{}
24 func (scli *slurmCLI) Batch(script io.Reader, args []string) error {
25 return scli.run(script, "sbatch", args)
28 func (scli *slurmCLI) Cancel(name string) error {
29 for _, args := range [][]string{
30 // If the slurm job hasn't started yet, remove it from
33 // If the slurm job has started, send SIGTERM. If we
34 // cancel a running job without a --signal argument,
35 // slurm will send SIGTERM and then (after some
36 // site-configured interval) SIGKILL. This would kill
37 // crunch-run without stopping the container, which we
39 {"--batch", "--signal=TERM", "--state=running"},
40 {"--batch", "--signal=TERM", "--state=suspended"},
42 err := scli.run(nil, "scancel", append([]string{"--name=" + name}, args...))
44 // scancel exits 0 if no job matches the given
45 // name and state. Any error from scancel here
46 // really indicates something is wrong.
53 func (scli *slurmCLI) QueueCommand(args []string) *exec.Cmd {
54 return exec.Command("squeue", args...)
57 func (scli *slurmCLI) Renice(name string, nice int) error {
58 return scli.run(nil, "scontrol", []string{"update", "JobName=" + name, fmt.Sprintf("Nice=%d", nice)})
61 func (scli *slurmCLI) run(stdin io.Reader, prog string, args []string) error {
62 cmd := exec.Command(prog, args...)
64 out, err := cmd.CombinedOutput()
65 outTrim := strings.TrimSpace(string(out))
66 if err != nil || len(out) > 0 {
67 log.Printf("%q %q: %q", cmd.Path, cmd.Args, outTrim)
70 err = fmt.Errorf("%s: %s (%q)", cmd.Path, err, outTrim)