X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/d14421423f2bd72b2c3eb95bdebe85a210972a12..2e03d03bc55b5a612c2bf04d878a72f2ee420d99:/services/crunch-dispatch-slurm/slurm.go diff --git a/services/crunch-dispatch-slurm/slurm.go b/services/crunch-dispatch-slurm/slurm.go index f675f6c4d8..e59826f763 100644 --- a/services/crunch-dispatch-slurm/slurm.go +++ b/services/crunch-dispatch-slurm/slurm.go @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: AGPL-3.0 -package main +package dispatchslurm import ( "fmt" @@ -13,31 +13,67 @@ import ( ) type Slurm interface { + Batch(script io.Reader, args []string) error Cancel(name string) error - Renice(name string, nice int) error QueueCommand(args []string) *exec.Cmd - Batch(script io.Reader, args []string) error + Release(name string) error + Renice(name string, nice int64) error } -type slurmCLI struct{} +type slurmCLI struct { + runSemaphore chan bool +} + +func NewSlurmCLI() *slurmCLI { + return &slurmCLI{ + runSemaphore: make(chan bool, 3), + } +} func (scli *slurmCLI) Batch(script io.Reader, args []string) error { return scli.run(script, "sbatch", args) } func (scli *slurmCLI) Cancel(name string) error { - return scli.run(nil, "scancel", []string{"--name=" + name}) + for _, args := range [][]string{ + // If the slurm job hasn't started yet, remove it from + // the queue. + {"--state=pending"}, + // If the slurm job has started, send SIGTERM. If we + // cancel a running job without a --signal argument, + // slurm will send SIGTERM and then (after some + // site-configured interval) SIGKILL. This would kill + // crunch-run without stopping the container, which we + // don't want. + {"--batch", "--signal=TERM", "--state=running"}, + {"--batch", "--signal=TERM", "--state=suspended"}, + } { + err := scli.run(nil, "scancel", append([]string{"--name=" + name}, args...)) + if err != nil { + // scancel exits 0 if no job matches the given + // name and state. Any error from scancel here + // really indicates something is wrong. + return err + } + } + return nil } func (scli *slurmCLI) QueueCommand(args []string) *exec.Cmd { return exec.Command("squeue", args...) } -func (scli *slurmCLI) Renice(name string, nice int) error { +func (scli *slurmCLI) Release(name string) error { + return scli.run(nil, "scontrol", []string{"release", "Name=" + name}) +} + +func (scli *slurmCLI) Renice(name string, nice int64) error { return scli.run(nil, "scontrol", []string{"update", "JobName=" + name, fmt.Sprintf("Nice=%d", nice)}) } func (scli *slurmCLI) run(stdin io.Reader, prog string, args []string) error { + scli.runSemaphore <- true + defer func() { <-scli.runSemaphore }() cmd := exec.Command(prog, args...) cmd.Stdin = stdin out, err := cmd.CombinedOutput()