Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>
MaxProbesPerSecond: 1000,
TimeoutSignal: arvados.Duration(3 * time.Millisecond),
TimeoutTERM: arvados.Duration(20 * time.Millisecond),
MaxProbesPerSecond: 1000,
TimeoutSignal: arvados.Duration(3 * time.Millisecond),
TimeoutTERM: arvados.Duration(20 * time.Millisecond),
- TimeoutKILL: arvados.Duration(20 * time.Millisecond),
},
InstanceTypes: arvados.InstanceTypeMap{
test.InstanceType(1).Name: test.InstanceType(1),
},
InstanceTypes: arvados.InstanceTypeMap{
test.InstanceType(1).Name: test.InstanceType(1),
defaultTimeoutProbe = time.Minute * 10
defaultTimeoutShutdown = time.Second * 10
defaultTimeoutTERM = time.Minute * 2
defaultTimeoutProbe = time.Minute * 10
defaultTimeoutShutdown = time.Second * 10
defaultTimeoutTERM = time.Minute * 2
- defaultTimeoutKILL = time.Second * 20
defaultTimeoutSignal = time.Second * 5
// Time after a quota error to try again anyway, even if no
defaultTimeoutSignal = time.Second * 5
// Time after a quota error to try again anyway, even if no
timeoutProbe: duration(cluster.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
timeoutShutdown: duration(cluster.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
timeoutTERM: duration(cluster.Dispatch.TimeoutTERM, defaultTimeoutTERM),
timeoutProbe: duration(cluster.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
timeoutShutdown: duration(cluster.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
timeoutTERM: duration(cluster.Dispatch.TimeoutTERM, defaultTimeoutTERM),
- timeoutKILL: duration(cluster.Dispatch.TimeoutKILL, defaultTimeoutKILL),
timeoutSignal: duration(cluster.Dispatch.TimeoutSignal, defaultTimeoutSignal),
installPublicKey: installPublicKey,
stop: make(chan bool),
timeoutSignal: duration(cluster.Dispatch.TimeoutSignal, defaultTimeoutSignal),
installPublicKey: installPublicKey,
stop: make(chan bool),
timeoutProbe time.Duration
timeoutShutdown time.Duration
timeoutTERM time.Duration
timeoutProbe time.Duration
timeoutShutdown time.Duration
timeoutTERM time.Duration
- timeoutKILL time.Duration
timeoutSignal time.Duration
installPublicKey ssh.PublicKey
timeoutSignal time.Duration
installPublicKey ssh.PublicKey
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
arvClient *arvados.Client
remoteUser string
timeoutTERM time.Duration
arvClient *arvados.Client
remoteUser string
timeoutTERM time.Duration
- timeoutKILL time.Duration
timeoutSignal time.Duration
timeoutSignal time.Duration
- onUnkillable func(uuid string) // callback invoked when giving up on SIGKILL
- onKilled func(uuid string) // callback invoked when process exits after SIGTERM/SIGKILL
+ onUnkillable func(uuid string) // callback invoked when giving up on SIGTERM
+ onKilled func(uuid string) // callback invoked when process exits after SIGTERM
logger logrus.FieldLogger
stopping bool // true if Stop() has been called
logger logrus.FieldLogger
stopping bool // true if Stop() has been called
- sentKILL bool // true if SIGKILL has been sent
+ givenup bool // true if timeoutTERM has been reached
closed chan struct{} // channel is closed if Close() has been called
}
closed chan struct{} // channel is closed if Close() has been called
}
arvClient: wkr.wp.arvClient,
remoteUser: wkr.instance.RemoteUser(),
timeoutTERM: wkr.wp.timeoutTERM,
arvClient: wkr.wp.arvClient,
remoteUser: wkr.instance.RemoteUser(),
timeoutTERM: wkr.wp.timeoutTERM,
- timeoutKILL: wkr.wp.timeoutKILL,
timeoutSignal: wkr.wp.timeoutSignal,
onUnkillable: wkr.onUnkillable,
onKilled: wkr.onKilled,
timeoutSignal: wkr.wp.timeoutSignal,
onUnkillable: wkr.onUnkillable,
onKilled: wkr.onKilled,
-// Kill starts a background task to kill the remote process,
-// escalating from SIGTERM to SIGKILL to onUnkillable() according to
-// the configured timeouts.
+// Kill starts a background task to kill the remote process: it sends
+// SIGTERM every timeoutSignal until the runner is closed or timeoutTERM
+// elapses; on reaching timeoutTERM it gives up and calls onUnkillable().
+//
+// SIGKILL is not used. It would merely kill the crunch-run supervisor
+// and thereby make the docker container, arv-mount, etc. invisible to
+// us without actually stopping them.
//
// Once Kill has been called, calling it again has no effect.
func (rr *remoteRunner) Kill(reason string) {
//
// Once Kill has been called, calling it again has no effect.
func (rr *remoteRunner) Kill(reason string) {
rr.logger.WithField("Reason", reason).Info("killing crunch-run process")
go func() {
termDeadline := time.Now().Add(rr.timeoutTERM)
rr.logger.WithField("Reason", reason).Info("killing crunch-run process")
go func() {
termDeadline := time.Now().Add(rr.timeoutTERM)
- killDeadline := termDeadline.Add(rr.timeoutKILL)
t := time.NewTicker(rr.timeoutSignal)
defer t.Stop()
for range t.C {
switch {
case rr.isClosed():
return
t := time.NewTicker(rr.timeoutSignal)
defer t.Stop()
for range t.C {
switch {
case rr.isClosed():
return
- case time.Now().After(killDeadline):
+ case time.Now().After(termDeadline):
+ rr.logger.Debug("giving up")
+ rr.givenup = true
rr.onUnkillable(rr.uuid)
return
rr.onUnkillable(rr.uuid)
return
- case time.Now().After(termDeadline):
- rr.sentKILL = true
- rr.kill(syscall.SIGKILL)
default:
rr.kill(syscall.SIGTERM)
}
default:
rr.kill(syscall.SIGTERM)
}
return false
}
for _, rr := range wkr.running {
return false
}
for _, rr := range wkr.running {
return false
}
}
for _, rr := range wkr.starting {
return false
}
}
for _, rr := range wkr.starting {
// Maximum total worker probes per second
MaxProbesPerSecond int
// Maximum total worker probes per second
MaxProbesPerSecond int
- // Time before repeating TERM/KILL signal
+ // Time before repeating SIGTERM when killing a container
- // Time to give up on TERM and move to KILL
+ // Time to give up on SIGTERM and write off the worker
-
- // Time to give up on KILL and write off the worker
- TimeoutKILL Duration