19437: Don't cancel until 3 consecutive docker-inspect failures. 19437-docker-watchdog
author Tom Clegg <tom@curii.com>
Fri, 2 Sep 2022 14:06:03 +0000 (10:06 -0400)
committer Tom Clegg <tom@curii.com>
Fri, 2 Sep 2022 14:06:03 +0000 (10:06 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

lib/crunchrun/docker.go

index 54d0e680fe6af83494707f58298629db74ded486..bb635265862fff8a4bf36127000297bbad16e051 100644 (file)
@@ -31,6 +31,11 @@ const minDockerRAM = int64(16 * 1024 * 1024)
 // https://docs.docker.com/engine/api/.
 const DockerAPIVersion = "1.35"
 
+// dockerWatchdogThreshold is the number of consecutive "inspect
+// container" failures before concluding Docker is unresponsive,
+// giving up, and cancelling the container.
+const dockerWatchdogThreshold = 3
+
 type dockerExecutor struct {
        containerUUID    string
        logf             func(string, ...interface{})
@@ -225,17 +230,17 @@ func (e *dockerExecutor) Wait(ctx context.Context) (int, error) {
                                // kill it.
                                return
                        } else if err != nil {
-                               e.logf("Error inspecting container: %s", err)
-                               watchdogErr <- err
-                               return
+                               watchdogErr <- fmt.Errorf("error inspecting container: %s", err)
                        } else if ctr.State == nil || !(ctr.State.Running || ctr.State.Status == "created") {
-                               watchdogErr <- fmt.Errorf("Container is not running: State=%v", ctr.State)
-                               return
+                               watchdogErr <- fmt.Errorf("container is not running: State=%v", ctr.State)
+                       } else {
+                               watchdogErr <- nil
                        }
                }
        }()
 
        waitOk, waitErr := e.dockerclient.ContainerWait(ctx, e.containerID, dockercontainer.WaitConditionNotRunning)
+       errors := 0
        for {
                select {
                case waitBody := <-waitOk:
@@ -250,7 +255,16 @@ func (e *dockerExecutor) Wait(ctx context.Context) (int, error) {
                        return -1, ctx.Err()
 
                case err := <-watchdogErr:
-                       return -1, err
+                       if err == nil {
+                               errors = 0
+                       } else {
+                               e.logf("docker watchdog: %s", err)
+                               errors++
+                               if errors >= dockerWatchdogThreshold {
+                                       e.logf("docker watchdog: giving up")
+                                       return -1, err
+                               }
+                       }
                }
        }
 }