15345: Add .../containers/kill management API to dispatcher.
[arvados.git] / lib / dispatchcloud / worker / runner.go
index bf1632a6a245e2d7edaf4b1e67486fa657cabe9a..c30ff9f2b7608410bbcf3450c62708fa3e5e2d09 100644 (file)
@@ -1,3 +1,7 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
 package worker
 
 import (
@@ -19,14 +23,13 @@ type remoteRunner struct {
        arvClient     *arvados.Client
        remoteUser    string
        timeoutTERM   time.Duration
-       timeoutKILL   time.Duration
        timeoutSignal time.Duration
-       onUnkillable  func(uuid string) // callback invoked when giving up on SIGKILL
-       onKilled      func(uuid string) // callback invoked when process exits after SIGTERM/SIGKILL
+       onUnkillable  func(uuid string) // callback invoked when giving up on SIGTERM
+       onKilled      func(uuid string) // callback invoked when process exits after SIGTERM
        logger        logrus.FieldLogger
 
        stopping bool          // true if Stop() has been called
-       sentKILL bool          // true if SIGKILL has been sent
+       givenup  bool          // true if timeoutTERM has been reached
        closed   chan struct{} // channel is closed if Close() has been called
 }
 
@@ -39,7 +42,6 @@ func newRemoteRunner(uuid string, wkr *worker) *remoteRunner {
                arvClient:     wkr.wp.arvClient,
                remoteUser:    wkr.instance.RemoteUser(),
                timeoutTERM:   wkr.wp.timeoutTERM,
-               timeoutKILL:   wkr.wp.timeoutKILL,
                timeoutSignal: wkr.wp.timeoutSignal,
                onUnkillable:  wkr.onUnkillable,
                onKilled:      wkr.onKilled,
@@ -88,9 +90,13 @@ func (rr *remoteRunner) Close() {
        close(rr.closed)
 }
 
-// Kill starts a background task to kill the remote process,
-// escalating from SIGTERM to SIGKILL to onUnkillable() according to
-// the configured timeouts.
+// Kill starts a background task to kill the remote process, first
+// trying SIGTERM until reaching timeoutTERM, then calling
+// onUnkillable().
+//
+// SIGKILL is not used. It would merely kill the crunch-run supervisor
+// and thereby make the docker container, arv-mount, etc. invisible to
+// us without actually stopping them.
 //
 // Once Kill has been called, calling it again has no effect.
 func (rr *remoteRunner) Kill(reason string) {
@@ -101,19 +107,17 @@ func (rr *remoteRunner) Kill(reason string) {
        rr.logger.WithField("Reason", reason).Info("killing crunch-run process")
        go func() {
                termDeadline := time.Now().Add(rr.timeoutTERM)
-               killDeadline := termDeadline.Add(rr.timeoutKILL)
                t := time.NewTicker(rr.timeoutSignal)
                defer t.Stop()
                for range t.C {
                        switch {
                        case rr.isClosed():
                                return
-                       case time.Now().After(killDeadline):
+                       case time.Now().After(termDeadline):
+                               rr.logger.Debug("giving up")
+                               rr.givenup = true
                                rr.onUnkillable(rr.uuid)
                                return
-                       case time.Now().After(termDeadline):
-                               rr.sentKILL = true
-                               rr.kill(syscall.SIGKILL)
                        default:
                                rr.kill(syscall.SIGTERM)
                        }
@@ -134,7 +138,7 @@ func (rr *remoteRunner) kill(sig syscall.Signal) {
                        "stderr": string(stderr),
                        "stdout": string(stdout),
                        "error":  err,
-               }).Info("kill failed")
+               }).Info("kill attempt unsuccessful")
                return
        }
        rr.onKilled(rr.uuid)