From 3194c1b24ffe6fff5fcb2f620ca6ee43741e3462 Mon Sep 17 00:00:00 2001
From: Tom Clegg
Date: Thu, 18 Jan 2018 16:39:16 -0500
Subject: [PATCH] 12891: Don't use SIGKILL when telling crunch-run to cancel.

Arvados-DCO-1.1-Signed-off-by: Tom Clegg

---
 services/crunch-dispatch-slurm/slurm.go | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/services/crunch-dispatch-slurm/slurm.go b/services/crunch-dispatch-slurm/slurm.go
index f675f6c4d8..bd193778b3 100644
--- a/services/crunch-dispatch-slurm/slurm.go
+++ b/services/crunch-dispatch-slurm/slurm.go
@@ -26,7 +26,28 @@ func (scli *slurmCLI) Batch(script io.Reader, args []string) error {
 }
 
 func (scli *slurmCLI) Cancel(name string) error {
-	return scli.run(nil, "scancel", []string{"--name=" + name})
+	for _, args := range [][]string{
+		// If the slurm job hasn't started yet, remove it from
+		// the queue.
+		{"--state=pending"},
+		// If the slurm job has started, send SIGTERM. If we
+		// cancel a running job without a --signal argument,
+		// slurm will send SIGTERM and then (after some
+		// site-configured interval) SIGKILL. This would kill
+		// crunch-run without stopping the container, which we
+		// don't want.
+		{"--batch", "--signal=TERM", "--state=running"},
+		{"--batch", "--signal=TERM", "--state=suspended"},
+	} {
+		err := scli.run(nil, "scancel", append([]string{"--name=" + name}, args...))
+		if err != nil {
+			// scancel exits 0 if no job matches the given
+			// name and state. Any error from scancel here
+			// really indicates something is wrong.
+			return err
+		}
+	}
+	return nil
 }
 
 func (scli *slurmCLI) QueueCommand(args []string) *exec.Cmd {
-- 
2.30.2