14807: Don't send SIGKILL to crunch-run processes.
author    Tom Clegg <tclegg@veritasgenetics.com>
          Tue, 19 Mar 2019 17:28:39 +0000 (13:28 -0400)
committer Tom Clegg <tclegg@veritasgenetics.com>
          Tue, 19 Mar 2019 17:28:39 +0000 (13:28 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/worker/pool.go
lib/dispatchcloud/worker/runner.go
lib/dispatchcloud/worker/worker.go
sdk/go/arvados/config.go

diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index d0b4efefa3578c214345a477a1fa25f8e5ab8dc8..7268f106a9f36ba933da51ecba4465ba760a8820 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -64,7 +64,6 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
                        MaxProbesPerSecond: 1000,
                        TimeoutSignal:      arvados.Duration(3 * time.Millisecond),
                        TimeoutTERM:        arvados.Duration(20 * time.Millisecond),
-                       TimeoutKILL:        arvados.Duration(20 * time.Millisecond),
                },
                InstanceTypes: arvados.InstanceTypeMap{
                        test.InstanceType(1).Name:  test.InstanceType(1),
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index d13e29c1e89487df597a387f11129a6c7535e48b..81a658535ea593a7a0a0c2d9231fd66f3055bdbd 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -69,7 +69,6 @@ const (
        defaultTimeoutProbe       = time.Minute * 10
        defaultTimeoutShutdown    = time.Second * 10
        defaultTimeoutTERM        = time.Minute * 2
-       defaultTimeoutKILL        = time.Second * 20
        defaultTimeoutSignal      = time.Second * 5
 
        // Time after a quota error to try again anyway, even if no
@@ -109,7 +108,6 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
                timeoutProbe:       duration(cluster.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
                timeoutShutdown:    duration(cluster.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
                timeoutTERM:        duration(cluster.Dispatch.TimeoutTERM, defaultTimeoutTERM),
-               timeoutKILL:        duration(cluster.Dispatch.TimeoutKILL, defaultTimeoutKILL),
                timeoutSignal:      duration(cluster.Dispatch.TimeoutSignal, defaultTimeoutSignal),
                installPublicKey:   installPublicKey,
                stop:               make(chan bool),
@@ -143,7 +141,6 @@ type Pool struct {
        timeoutProbe       time.Duration
        timeoutShutdown    time.Duration
        timeoutTERM        time.Duration
-       timeoutKILL        time.Duration
        timeoutSignal      time.Duration
        installPublicKey   ssh.PublicKey
 
diff --git a/lib/dispatchcloud/worker/runner.go b/lib/dispatchcloud/worker/runner.go
index bf1632a6a245e2d7edaf4b1e67486fa657cabe9a..69048096f0bf9dc563997e46532318a158014c12 100644
--- a/lib/dispatchcloud/worker/runner.go
+++ b/lib/dispatchcloud/worker/runner.go
@@ -1,3 +1,7 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
 package worker
 
 import (
@@ -19,14 +23,13 @@ type remoteRunner struct {
        arvClient     *arvados.Client
        remoteUser    string
        timeoutTERM   time.Duration
-       timeoutKILL   time.Duration
        timeoutSignal time.Duration
-       onUnkillable  func(uuid string) // callback invoked when giving up on SIGKILL
-       onKilled      func(uuid string) // callback invoked when process exits after SIGTERM/SIGKILL
+       onUnkillable  func(uuid string) // callback invoked when giving up on SIGTERM
+       onKilled      func(uuid string) // callback invoked when process exits after SIGTERM
        logger        logrus.FieldLogger
 
        stopping bool          // true if Stop() has been called
-       sentKILL bool          // true if SIGKILL has been sent
+       givenup  bool          // true if timeoutTERM has been reached
        closed   chan struct{} // channel is closed if Close() has been called
 }
 
@@ -39,7 +42,6 @@ func newRemoteRunner(uuid string, wkr *worker) *remoteRunner {
                arvClient:     wkr.wp.arvClient,
                remoteUser:    wkr.instance.RemoteUser(),
                timeoutTERM:   wkr.wp.timeoutTERM,
-               timeoutKILL:   wkr.wp.timeoutKILL,
                timeoutSignal: wkr.wp.timeoutSignal,
                onUnkillable:  wkr.onUnkillable,
                onKilled:      wkr.onKilled,
@@ -88,9 +90,13 @@ func (rr *remoteRunner) Close() {
        close(rr.closed)
 }
 
-// Kill starts a background task to kill the remote process,
-// escalating from SIGTERM to SIGKILL to onUnkillable() according to
-// the configured timeouts.
+// Kill starts a background task to kill the remote process, first
+// trying SIGTERM until reaching timeoutTERM, then calling
+// onUnkillable().
+//
+// SIGKILL is not used. It would merely kill the crunch-run supervisor
+// and thereby make the docker container, arv-mount, etc. invisible to
+// us without actually stopping them.
 //
 // Once Kill has been called, calling it again has no effect.
 func (rr *remoteRunner) Kill(reason string) {
@@ -101,19 +107,17 @@ func (rr *remoteRunner) Kill(reason string) {
        rr.logger.WithField("Reason", reason).Info("killing crunch-run process")
        go func() {
                termDeadline := time.Now().Add(rr.timeoutTERM)
-               killDeadline := termDeadline.Add(rr.timeoutKILL)
                t := time.NewTicker(rr.timeoutSignal)
                defer t.Stop()
                for range t.C {
                        switch {
                        case rr.isClosed():
                                return
-                       case time.Now().After(killDeadline):
+                       case time.Now().After(termDeadline):
+                               rr.logger.Debug("giving up")
+                               rr.givenup = true
                                rr.onUnkillable(rr.uuid)
                                return
-                       case time.Now().After(termDeadline):
-                               rr.sentKILL = true
-                               rr.kill(syscall.SIGKILL)
                        default:
                                rr.kill(syscall.SIGTERM)
                        }
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index b0b030a40daf5e73bd639040191d1b77d358eb11..41117c1d4eafb5aa2a92c163d3f79d72ace443d3 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -394,12 +394,12 @@ func (wkr *worker) eligibleForShutdown() bool {
                        return false
                }
                for _, rr := range wkr.running {
-                       if !rr.sentKILL {
+                       if !rr.givenup {
                                return false
                        }
                }
                for _, rr := range wkr.starting {
-                       if !rr.sentKILL {
+                       if !rr.givenup {
                                return false
                        }
                }
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 8bd29097f42f4af78e727d46b69f62caf7512040..73addb739cd0e25b47212300d420fa4f2b8c8e7d 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -123,14 +123,11 @@ type Dispatch struct {
        // Maximum total worker probes per second
        MaxProbesPerSecond int
 
-       // Time before repeating TERM/KILL signal
+       // Time before repeating SIGTERM when killing a container
        TimeoutSignal Duration
 
-       // Time to give up on TERM and move to KILL
+       // Time to give up on SIGTERM and write off the worker
        TimeoutTERM Duration
-
-       // Time to give up on KILL and write off the worker
-       TimeoutKILL Duration
 }
 
 type CloudVMs struct {
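
For scale: with the defaults retained in pool.go (TimeoutSignal = 5s, TimeoutTERM = 2m), a container that ignores SIGTERM is signaled roughly 2m / 5s = 24 times before the dispatcher gives up and writes off the worker, making the instance eligible for shutdown per the worker.go change above.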