12891: Don't use SIGKILL when telling crunch-run to cancel.
author Tom Clegg <tclegg@veritasgenetics.com>
Thu, 18 Jan 2018 21:39:16 +0000 (16:39 -0500)
committer Tom Clegg <tclegg@veritasgenetics.com>
Mon, 22 Jan 2018 17:34:03 +0000 (12:34 -0500)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

services/crunch-dispatch-slurm/slurm.go

index f675f6c4d8dbf65e68332c446270a6f02ab9876e..bd193778b38c2172b13987945cfd2df1c58e22ce 100644 (file)
@@ -26,7 +26,28 @@ func (scli *slurmCLI) Batch(script io.Reader, args []string) error {
 }
 
 func (scli *slurmCLI) Cancel(name string) error {
-       return scli.run(nil, "scancel", []string{"--name=" + name})
+       for _, args := range [][]string{
+               // If the slurm job hasn't started yet, remove it from
+               // the queue.
+               {"--state=pending"},
+               // If the slurm job has started, send SIGTERM. If we
+               // cancel a running job without a --signal argument,
+               // slurm will send SIGTERM and then (after some
+               // site-configured interval) SIGKILL. This would kill
+               // crunch-run without stopping the container, which we
+               // don't want.
+               {"--batch", "--signal=TERM", "--state=running"},
+               {"--batch", "--signal=TERM", "--state=suspended"},
+       } {
+               err := scli.run(nil, "scancel", append([]string{"--name=" + name}, args...))
+               if err != nil {
+                       // scancel exits 0 even when no job matches
+                       // this name and state, so any error is a real
+                       // failure: stop without trying later states.
+                       return err
+               }
+       }
+       return nil
 }
 
 func (scli *slurmCLI) QueueCommand(args []string) *exec.Cmd {