1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
14 "git.curoverse.com/arvados.git/sdk/go/arvados"
15 "github.com/sirupsen/logrus"
18 // remoteRunner handles the starting and stopping of a crunch-run
19 // process on a remote machine.
//
// NOTE(review): this view of the struct is partial. newRemoteRunner
// also sets executor and remoteUser, and the methods read rr.uuid, so
// those fields are presumably declared on lines not shown here.
20 type remoteRunner struct {
// arvClient supplies the API host, auth token, and insecure flag
// that Start passes to the remote crunch-run process.
23 arvClient *arvados.Client
// timeoutTERM is added to the current time when Kill starts, giving
// the deadline after which Kill gives up.
25 timeoutTERM time.Duration
// timeoutSignal is the period of the ticker created by Kill
// (presumably the interval between SIGTERM attempts).
26 timeoutSignal time.Duration
27 onUnkillable func(uuid string) // callback invoked when giving up on SIGTERM
28 onKilled func(uuid string) // callback invoked when process exits after SIGTERM
// logger is created by newRemoteRunner with a ContainerUUID field
// already attached.
29 logger logrus.FieldLogger
// NOTE(review): no Stop() method is visible in this view; the comment
// on stopping may refer to Kill() -- confirm.
31 stopping bool // true if Stop() has been called
32 givenup bool // true if timeoutTERM has been reached
33 closed chan struct{} // channel is closed if Close() has been called
36 // newRemoteRunner returns a new remoteRunner. Caller should ensure
37 // Close() is called to release resources.
//
// uuid is the container UUID; it is attached to the runner's logger.
// Everything else is taken from wkr: the executor, instance, logger,
// and kill callbacks come from the worker itself, while the API
// client and timeouts come from its pool (wkr.wp).
38 func newRemoteRunner(uuid string, wkr *worker) *remoteRunner {
41 executor: wkr.executor,
// API client and both timeouts are pool-wide settings.
42 arvClient: wkr.wp.arvClient,
// remoteUser is compared against "root" by Start and kill.
43 remoteUser: wkr.instance.RemoteUser(),
44 timeoutTERM: wkr.wp.timeoutTERM,
45 timeoutSignal: wkr.wp.timeoutSignal,
46 onUnkillable: wkr.onUnkillable,
47 onKilled: wkr.onKilled,
// Tag every log entry from this runner with the container UUID.
48 logger: wkr.logger.WithField("ContainerUUID", uuid),
// Unbuffered signalling channel; starts open.
49 closed: make(chan struct{}),
54 // Start a crunch-run process on the remote host.
56 // Start does not return any error encountered. The caller should
57 // assume the remote process _might_ have started, at least until it
58 // probes the worker and finds otherwise.
59 func (rr *remoteRunner) Start() {
// The API credentials are collected in a map, JSON-encoded, and
// handed to the executor as the command's stdin; only the container
// UUID appears in the command string itself.
60 env := map[string]string{
61 "ARVADOS_API_HOST": rr.arvClient.APIHost,
62 "ARVADOS_API_TOKEN": rr.arvClient.AuthToken,
// The insecure flag is only added to the environment when it is set.
64 if rr.arvClient.Insecure {
65 env["ARVADOS_API_HOST_INSECURE"] = "1"
67 envJSON, err := json.Marshal(env)
71 stdin := bytes.NewBuffer(envJSON)
// The UUID is wrapped in single quotes here.
// NOTE(review): kill() inserts the UUID into its command unquoted;
// confirm the UUID format makes both forms safe.
72 cmd := "crunch-run --detach --stdin-env '" + rr.uuid + "'"
// NOTE(review): the body of this branch is not visible in this view;
// presumably it adjusts cmd when the remote user is not root -- confirm.
73 if rr.remoteUser != "root" {
76 stdout, stderr, err := rr.executor.Execute(nil, cmd, stdin)
// The command's captured stdout and stderr are attached to the
// error-level log entry; nothing is returned to the caller.
78 rr.logger.WithField("stdout", string(stdout)).
79 WithField("stderr", string(stderr)).
81 Error("error starting crunch-run process")
84 rr.logger.Info("crunch-run process started")
87 // Close abandons the remote process (if any) and releases
88 // resources. Close must not be called more than once.
//
// NOTE(review): the body is not visible in this view. The comment on
// the struct's closed field says that channel is closed once Close()
// has been called; if Close closes it directly, a second call would
// panic (closing an already-closed channel), which would explain the
// once-only rule -- confirm.
89 func (rr *remoteRunner) Close() {
93 // Kill starts a background task to kill the remote process, first
94 // trying SIGTERM until reaching timeoutTERM, then calling
// onUnkillable with the container UUID.
//
97 // SIGKILL is not used. It would merely kill the crunch-run supervisor
98 // and thereby make the docker container, arv-mount, etc. invisible to
99 // us without actually stopping them.
//
101 // Once Kill has been called, calling it again has no effect.
//
// reason is only used for logging: it is attached to the "killing
// crunch-run process" log entry as the Reason field.
102 func (rr *remoteRunner) Kill(reason string) {
107 rr.logger.WithField("Reason", reason).Info("killing crunch-run process")
// termDeadline is the wall-clock time after which the task gives up.
109 termDeadline := time.Now().Add(rr.timeoutTERM)
// The ticker fires every timeoutSignal.
// NOTE(review): no t.Stop() is visible in this view; confirm the
// ticker is stopped when the background task ends.
110 t := time.NewTicker(rr.timeoutSignal)
// Deadline passed: log and report the container as unkillable.
116 case time.Now().After(termDeadline):
117 rr.logger.Debug("giving up")
119 rr.onUnkillable(rr.uuid)
// Presumably the remaining case: make another SIGTERM attempt.
122 rr.kill(syscall.SIGTERM)
// kill makes one attempt to signal the remote crunch-run process by
// running "crunch-run --kill <signal number> <uuid>" through the
// executor, and logs the outcome. It returns nothing.
128 func (rr *remoteRunner) kill(sig syscall.Signal) {
// Attach the numeric signal value to every log entry for this attempt.
129 logger := rr.logger.WithField("Signal", int(sig))
130 logger.Info("sending signal")
// %d renders sig as its signal number. The UUID is inserted unquoted.
// NOTE(review): Start single-quotes the UUID in its command; confirm
// whether the two should agree.
131 cmd := fmt.Sprintf("crunch-run --kill %d %s", sig, rr.uuid)
// NOTE(review): the body of this branch is not visible in this view;
// presumably it adjusts cmd when the remote user is not root -- confirm.
132 if rr.remoteUser != "root" {
// No stdin is supplied for the kill command.
135 stdout, stderr, err := rr.executor.Execute(nil, cmd, nil)
// An unsuccessful attempt is logged at Info level, with the command's
// captured output attached.
137 logger.WithFields(logrus.Fields{
138 "stderr": string(stderr),
139 "stdout": string(stdout),
141 }).Info("kill attempt unsuccessful")
147 func (rr *remoteRunner) isClosed() bool {