From: Tom Clegg Date: Fri, 2 Sep 2022 14:06:03 +0000 (-0400) Subject: 19437: Don't cancel until 3 consecutive docker-inspect failures. X-Git-Tag: 2.5.0~74^2 X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/9c254acbd78ed50e1e9fec508fb9ec4164867dda 19437: Don't cancel until 3 consecutive docker-inspect failures. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- diff --git a/lib/crunchrun/docker.go b/lib/crunchrun/docker.go index 54d0e680fe..bb63526586 100644 --- a/lib/crunchrun/docker.go +++ b/lib/crunchrun/docker.go @@ -31,6 +31,11 @@ const minDockerRAM = int64(16 * 1024 * 1024) // https://docs.docker.com/engine/api/. const DockerAPIVersion = "1.35" +// Number of consecutive "inspect container" failures before +// concluding Docker is unresponsive, giving up, and cancelling the +// container. +const dockerWatchdogThreshold = 3 + type dockerExecutor struct { containerUUID string logf func(string, ...interface{}) @@ -225,17 +230,17 @@ func (e *dockerExecutor) Wait(ctx context.Context) (int, error) { // kill it. return } else if err != nil { - e.logf("Error inspecting container: %s", err) - watchdogErr <- err - return + watchdogErr <- fmt.Errorf("error inspecting container: %s", err) } else if ctr.State == nil || !(ctr.State.Running || ctr.State.Status == "created") { - watchdogErr <- fmt.Errorf("Container is not running: State=%v", ctr.State) - return + watchdogErr <- fmt.Errorf("container is not running: State=%v", ctr.State) + } else { + watchdogErr <- nil } } }() waitOk, waitErr := e.dockerclient.ContainerWait(ctx, e.containerID, dockercontainer.WaitConditionNotRunning) + errors := 0 for { select { case waitBody := <-waitOk: @@ -250,7 +255,16 @@ func (e *dockerExecutor) Wait(ctx context.Context) (int, error) { return -1, ctx.Err() case err := <-watchdogErr: - return -1, err + if err == nil { + errors = 0 + } else { + e.logf("docker watchdog: %s", err) + errors++ + if errors >= dockerWatchdogThreshold { + e.logf("docker watchdog: giving up") + return -1, err + } + } } } }