Merge branch '21535-multi-wf-delete'
[arvados.git] / lib / crunchrun / docker.go
index 54d0e680fe6af83494707f58298629db74ded486..4f449133f3a18014500c0ebd7d0fd78ae8e3b6af 100644 (file)
@@ -4,6 +4,7 @@
 package crunchrun
 
 import (
+       "context"
        "fmt"
        "io"
        "io/ioutil"
@@ -17,7 +18,6 @@ import (
        dockertypes "github.com/docker/docker/api/types"
        dockercontainer "github.com/docker/docker/api/types/container"
        dockerclient "github.com/docker/docker/client"
-       "golang.org/x/net/context"
 )
 
 // Docker daemon won't let you set a limit less than ~10 MiB
@@ -31,6 +31,11 @@ const minDockerRAM = int64(16 * 1024 * 1024)
 // https://docs.docker.com/engine/api/.
 const DockerAPIVersion = "1.35"
 
+// dockerWatchdogThreshold is the number of consecutive failed watchdog
+// checks (inspect error, or container not running) before concluding
+// Docker is unresponsive, giving up, and cancelling the container.
+const dockerWatchdogThreshold = 5
+
 type dockerExecutor struct {
        containerUUID    string
        logf             func(string, ...interface{})
@@ -47,7 +52,7 @@ func newDockerExecutor(containerUUID string, logf func(string, ...interface{}),
        // currently the minimum version we want to support.
        client, err := dockerclient.NewClient(dockerclient.DefaultDockerHost, DockerAPIVersion, nil, nil)
        if watchdogInterval < 1 {
-               watchdogInterval = time.Minute
+               watchdogInterval = time.Minute * 2
        }
        return &dockerExecutor{
                containerUUID:    containerUUID,
@@ -182,7 +187,7 @@ func (e *dockerExecutor) config(spec containerSpec) (dockercontainer.Config, doc
 
 func (e *dockerExecutor) Create(spec containerSpec) error {
        cfg, hostCfg := e.config(spec)
-       created, err := e.dockerclient.ContainerCreate(context.TODO(), &cfg, &hostCfg, nil, e.containerUUID)
+       created, err := e.dockerclient.ContainerCreate(context.TODO(), &cfg, &hostCfg, nil, nil, e.containerUUID)
        if err != nil {
                return fmt.Errorf("While creating container: %v", err)
        }
@@ -190,8 +195,15 @@ func (e *dockerExecutor) Create(spec containerSpec) error {
        return e.startIO(spec.Stdin, spec.Stdout, spec.Stderr)
 }
 
-func (e *dockerExecutor) CgroupID() string {
-       return e.containerID
+func (e *dockerExecutor) Pid() int {
+       ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(10*time.Second))
+       defer cancel()
+       ctr, err := e.dockerclient.ContainerInspect(ctx, e.containerID)
+       if err == nil && ctr.State != nil {
+               return ctr.State.Pid
+       } else {
+               return 0
+       }
 }
 
 func (e *dockerExecutor) Start() error {
@@ -225,17 +237,17 @@ func (e *dockerExecutor) Wait(ctx context.Context) (int, error) {
                                // kill it.
                                return
                        } else if err != nil {
-                               e.logf("Error inspecting container: %s", err)
-                               watchdogErr <- err
-                               return
+                               watchdogErr <- fmt.Errorf("error inspecting container: %s", err)
                        } else if ctr.State == nil || !(ctr.State.Running || ctr.State.Status == "created") {
-                               watchdogErr <- fmt.Errorf("Container is not running: State=%v", ctr.State)
-                               return
+                               watchdogErr <- fmt.Errorf("container is not running: State=%v", ctr.State)
+                       } else {
+                               watchdogErr <- nil
                        }
                }
        }()
 
        waitOk, waitErr := e.dockerclient.ContainerWait(ctx, e.containerID, dockercontainer.WaitConditionNotRunning)
+       errors := 0
        for {
                select {
                case waitBody := <-waitOk:
@@ -250,7 +262,16 @@ func (e *dockerExecutor) Wait(ctx context.Context) (int, error) {
                        return -1, ctx.Err()
 
                case err := <-watchdogErr:
-                       return -1, err
+                       if err == nil {
+                               errors = 0
+                       } else {
+                               e.logf("docker watchdog: %s", err)
+                               errors++
+                               if errors >= dockerWatchdogThreshold {
+                                       e.logf("docker watchdog: giving up")
+                                       return -1, err
+                               }
+                       }
                }
        }
 }