14807: Don't send SIGKILL to crunch-run processes.

author Tom Clegg <tclegg@veritasgenetics.com>

Tue, 19 Mar 2019 17:28:39 +0000 (13:28 -0400)

committer Tom Clegg <tclegg@veritasgenetics.com>

Tue, 19 Mar 2019 17:28:39 +0000 (13:28 -0400)
author Tom Clegg <tclegg@veritasgenetics.com>
Tue, 19 Mar 2019 17:28:39 +0000 (13:28 -0400)
committer Tom Clegg <tclegg@veritasgenetics.com>
Tue, 19 Mar 2019 17:28:39 +0000 (13:28 -0400)
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go

index d0b4efefa3578c214345a477a1fa25f8e5ab8dc8..7268f106a9f36ba933da51ecba4465ba760a8820 100644 (file)
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -64,7 +64,6 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
                         MaxProbesPerSecond: 1000,
                         TimeoutSignal:      arvados.Duration(3 * time.Millisecond),
                         TimeoutTERM:        arvados.Duration(20 * time.Millisecond),
                         MaxProbesPerSecond: 1000,
                         TimeoutSignal:      arvados.Duration(3 * time.Millisecond),
                         TimeoutTERM:        arvados.Duration(20 * time.Millisecond),
-                       TimeoutKILL:        arvados.Duration(20 * time.Millisecond),
                 },
                 InstanceTypes: arvados.InstanceTypeMap{
                         test.InstanceType(1).Name:  test.InstanceType(1),
                 },
                 InstanceTypes: arvados.InstanceTypeMap{
                         test.InstanceType(1).Name:  test.InstanceType(1),
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go

index d13e29c1e89487df597a387f11129a6c7535e48b..81a658535ea593a7a0a0c2d9231fd66f3055bdbd 100644 (file)
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -69,7 +69,6 @@ const (
         defaultTimeoutProbe       = time.Minute * 10
         defaultTimeoutShutdown    = time.Second * 10
         defaultTimeoutTERM        = time.Minute * 2
         defaultTimeoutProbe       = time.Minute * 10
         defaultTimeoutShutdown    = time.Second * 10
         defaultTimeoutTERM        = time.Minute * 2
-       defaultTimeoutKILL        = time.Second * 20
         defaultTimeoutSignal      = time.Second * 5
  
         // Time after a quota error to try again anyway, even if no
         defaultTimeoutSignal      = time.Second * 5
  
         // Time after a quota error to try again anyway, even if no
@@ -109,7 +108,6 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
                 timeoutProbe:       duration(cluster.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
                 timeoutShutdown:    duration(cluster.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
                 timeoutTERM:        duration(cluster.Dispatch.TimeoutTERM, defaultTimeoutTERM),
                 timeoutProbe:       duration(cluster.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
                 timeoutShutdown:    duration(cluster.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
                 timeoutTERM:        duration(cluster.Dispatch.TimeoutTERM, defaultTimeoutTERM),
-               timeoutKILL:        duration(cluster.Dispatch.TimeoutKILL, defaultTimeoutKILL),
                 timeoutSignal:      duration(cluster.Dispatch.TimeoutSignal, defaultTimeoutSignal),
                 installPublicKey:   installPublicKey,
                 stop:               make(chan bool),
                 timeoutSignal:      duration(cluster.Dispatch.TimeoutSignal, defaultTimeoutSignal),
                 installPublicKey:   installPublicKey,
                 stop:               make(chan bool),
@@ -143,7 +141,6 @@ type Pool struct {
         timeoutProbe       time.Duration
         timeoutShutdown    time.Duration
         timeoutTERM        time.Duration
         timeoutProbe       time.Duration
         timeoutShutdown    time.Duration
         timeoutTERM        time.Duration
-       timeoutKILL        time.Duration
         timeoutSignal      time.Duration
         installPublicKey   ssh.PublicKey
  
         timeoutSignal      time.Duration
         installPublicKey   ssh.PublicKey
  
diff --git a/lib/dispatchcloud/worker/runner.go b/lib/dispatchcloud/worker/runner.go

index bf1632a6a245e2d7edaf4b1e67486fa657cabe9a..69048096f0bf9dc563997e46532318a158014c12 100644 (file)
--- a/lib/dispatchcloud/worker/runner.go
+++ b/lib/dispatchcloud/worker/runner.go
@@ -1,3 +1,7 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
  package worker
  
  import (
  package worker
  
  import (
@@ -19,14 +23,13 @@ type remoteRunner struct {
         arvClient     *arvados.Client
         remoteUser    string
         timeoutTERM   time.Duration
         arvClient     *arvados.Client
         remoteUser    string
         timeoutTERM   time.Duration
-       timeoutKILL   time.Duration
         timeoutSignal time.Duration
         timeoutSignal time.Duration
-       onUnkillable  func(uuid string) // callback invoked when giving up on SIGKILL
-       onKilled      func(uuid string) // callback invoked when process exits after SIGTERM/SIGKILL
+       onUnkillable  func(uuid string) // callback invoked when giving up on SIGTERM
+       onKilled      func(uuid string) // callback invoked when process exits after SIGTERM
         logger        logrus.FieldLogger
  
         stopping bool          // true if Stop() has been called
         logger        logrus.FieldLogger
  
         stopping bool          // true if Stop() has been called
-       sentKILL bool          // true if SIGKILL has been sent
+       givenup  bool          // true if timeoutTERM has been reached
         closed   chan struct{} // channel is closed if Close() has been called
  }
  
         closed   chan struct{} // channel is closed if Close() has been called
  }
  
@@ -39,7 +42,6 @@ func newRemoteRunner(uuid string, wkr *worker) *remoteRunner {
                 arvClient:     wkr.wp.arvClient,
                 remoteUser:    wkr.instance.RemoteUser(),
                 timeoutTERM:   wkr.wp.timeoutTERM,
                 arvClient:     wkr.wp.arvClient,
                 remoteUser:    wkr.instance.RemoteUser(),
                 timeoutTERM:   wkr.wp.timeoutTERM,
-               timeoutKILL:   wkr.wp.timeoutKILL,
                 timeoutSignal: wkr.wp.timeoutSignal,
                 onUnkillable:  wkr.onUnkillable,
                 onKilled:      wkr.onKilled,
                 timeoutSignal: wkr.wp.timeoutSignal,
                 onUnkillable:  wkr.onUnkillable,
                 onKilled:      wkr.onKilled,
@@ -88,9 +90,13 @@ func (rr *remoteRunner) Close() {
         close(rr.closed)
  }
  
         close(rr.closed)
  }
  
-// Kill starts a background task to kill the remote process,
-// escalating from SIGTERM to SIGKILL to onUnkillable() according to
-// the configured timeouts.
+// Kill starts a background task to kill the remote process, first
+// trying SIGTERM until reaching timeoutTERM, then calling
+// onUnkillable().
+//
+// SIGKILL is not used. It would merely kill the crunch-run supervisor
+// and thereby make the docker container, arv-mount, etc. invisible to
+// us without actually stopping them.
  //
  // Once Kill has been called, calling it again has no effect.
  func (rr *remoteRunner) Kill(reason string) {
  //
  // Once Kill has been called, calling it again has no effect.
  func (rr *remoteRunner) Kill(reason string) {
@@ -101,19 +107,17 @@ func (rr *remoteRunner) Kill(reason string) {
         rr.logger.WithField("Reason", reason).Info("killing crunch-run process")
         go func() {
                 termDeadline := time.Now().Add(rr.timeoutTERM)
         rr.logger.WithField("Reason", reason).Info("killing crunch-run process")
         go func() {
                 termDeadline := time.Now().Add(rr.timeoutTERM)
-               killDeadline := termDeadline.Add(rr.timeoutKILL)
                 t := time.NewTicker(rr.timeoutSignal)
                 defer t.Stop()
                 for range t.C {
                         switch {
                         case rr.isClosed():
                                 return
                 t := time.NewTicker(rr.timeoutSignal)
                 defer t.Stop()
                 for range t.C {
                         switch {
                         case rr.isClosed():
                                 return
-                       case time.Now().After(killDeadline):
+                       case time.Now().After(termDeadline):
+                               rr.logger.Debug("giving up")
+                               rr.givenup = true
                                 rr.onUnkillable(rr.uuid)
                                 return
                                 rr.onUnkillable(rr.uuid)
                                 return
-                       case time.Now().After(termDeadline):
-                               rr.sentKILL = true
-                               rr.kill(syscall.SIGKILL)
                         default:
                                 rr.kill(syscall.SIGTERM)
                         }
                         default:
                                 rr.kill(syscall.SIGTERM)
                         }
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go

index b0b030a40daf5e73bd639040191d1b77d358eb11..41117c1d4eafb5aa2a92c163d3f79d72ace443d3 100644 (file)
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -394,12 +394,12 @@ func (wkr *worker) eligibleForShutdown() bool {
                         return false
                 }
                 for _, rr := range wkr.running {
                         return false
                 }
                 for _, rr := range wkr.running {
-                       if !rr.sentKILL {
+                       if !rr.givenup {
                                 return false
                         }
                 }
                 for _, rr := range wkr.starting {
                                 return false
                         }
                 }
                 for _, rr := range wkr.starting {
-                       if !rr.sentKILL {
+                       if !rr.givenup {
                                 return false
                         }
                 }
                                 return false
                         }
                 }
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go

index 8bd29097f42f4af78e727d46b69f62caf7512040..73addb739cd0e25b47212300d420fa4f2b8c8e7d 100644 (file)
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -123,14 +123,11 @@ type Dispatch struct {
         // Maximum total worker probes per second
         MaxProbesPerSecond int
  
         // Maximum total worker probes per second
         MaxProbesPerSecond int
  
-       // Time before repeating TERM/KILL signal
+       // Time before repeating SIGTERM when killing a container
         TimeoutSignal Duration
  
         TimeoutSignal Duration
  
-       // Time to give up on TERM and move to KILL
+       // Time to give up on SIGTERM and write off the worker
         TimeoutTERM Duration
         TimeoutTERM Duration
-
-       // Time to give up on KILL and write off the worker
-       TimeoutKILL Duration
  }
  
  type CloudVMs struct {
  }
  
  type CloudVMs struct {
author	Tom Clegg <tclegg@veritasgenetics.com>
	Tue, 19 Mar 2019 17:28:39 +0000 (13:28 -0400)
committer	Tom Clegg <tclegg@veritasgenetics.com>
	Tue, 19 Mar 2019 17:28:39 +0000 (13:28 -0400)
lib/dispatchcloud/dispatcher_test.go		patch \| blob \| history
lib/dispatchcloud/worker/pool.go		patch \| blob \| history
lib/dispatchcloud/worker/runner.go		patch \| blob \| history
lib/dispatchcloud/worker/worker.go		patch \| blob \| history
sdk/go/arvados/config.go		patch \| blob \| history