X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/58e6402a72e9ac1a210b2d318591f973a37e1e57..ee9d1e39b5d469a827be5a719c9c0860914ab2a8:/lib/crunchrun/docker.go diff --git a/lib/crunchrun/docker.go b/lib/crunchrun/docker.go index 54d0e680fe..4f449133f3 100644 --- a/lib/crunchrun/docker.go +++ b/lib/crunchrun/docker.go @@ -4,6 +4,7 @@ package crunchrun import ( + "context" "fmt" "io" "io/ioutil" @@ -17,7 +18,6 @@ import ( dockertypes "github.com/docker/docker/api/types" dockercontainer "github.com/docker/docker/api/types/container" dockerclient "github.com/docker/docker/client" - "golang.org/x/net/context" ) // Docker daemon won't let you set a limit less than ~10 MiB @@ -31,6 +31,11 @@ const minDockerRAM = int64(16 * 1024 * 1024) // https://docs.docker.com/engine/api/. const DockerAPIVersion = "1.35" +// Number of consecutive "inspect container" failures before +// concluding Docker is unresponsive, giving up, and cancelling the +// container. +const dockerWatchdogThreshold = 5 + type dockerExecutor struct { containerUUID string logf func(string, ...interface{}) @@ -47,7 +52,7 @@ func newDockerExecutor(containerUUID string, logf func(string, ...interface{}), // currently the minimum version we want to support. client, err := dockerclient.NewClient(dockerclient.DefaultDockerHost, DockerAPIVersion, nil, nil) if watchdogInterval < 1 { - watchdogInterval = time.Minute + watchdogInterval = time.Minute * 2 } return &dockerExecutor{ containerUUID: containerUUID, @@ -182,7 +187,7 @@ func (e *dockerExecutor) config(spec containerSpec) (dockercontainer.Config, doc func (e *dockerExecutor) Create(spec containerSpec) error { cfg, hostCfg := e.config(spec) - created, err := e.dockerclient.ContainerCreate(context.TODO(), &cfg, &hostCfg, nil, e.containerUUID) + created, err := e.dockerclient.ContainerCreate(context.TODO(), &cfg, &hostCfg, nil, nil, e.containerUUID) if err != nil { return fmt.Errorf("While creating container: %v", err) } @@ -190,8 +195,15 @@ func (e *dockerExecutor) Create(spec containerSpec) error { return e.startIO(spec.Stdin, spec.Stdout, spec.Stderr) } -func (e *dockerExecutor) CgroupID() string { - return e.containerID +func (e *dockerExecutor) Pid() int { + ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(10*time.Second)) + defer cancel() + ctr, err := e.dockerclient.ContainerInspect(ctx, e.containerID) + if err == nil && ctr.State != nil { + return ctr.State.Pid + } else { + return 0 + } } func (e *dockerExecutor) Start() error { @@ -225,17 +237,17 @@ func (e *dockerExecutor) Wait(ctx context.Context) (int, error) { // kill it. return } else if err != nil { - e.logf("Error inspecting container: %s", err) - watchdogErr <- err - return + watchdogErr <- fmt.Errorf("error inspecting container: %s", err) } else if ctr.State == nil || !(ctr.State.Running || ctr.State.Status == "created") { - watchdogErr <- fmt.Errorf("Container is not running: State=%v", ctr.State) - return + watchdogErr <- fmt.Errorf("container is not running: State=%v", ctr.State) + } else { + watchdogErr <- nil } } }() waitOk, waitErr := e.dockerclient.ContainerWait(ctx, e.containerID, dockercontainer.WaitConditionNotRunning) + errors := 0 for { select { case waitBody := <-waitOk: @@ -250,7 +262,16 @@ func (e *dockerExecutor) Wait(ctx context.Context) (int, error) { return -1, ctx.Err() case err := <-watchdogErr: - return -1, err + if err == nil { + errors = 0 + } else { + e.logf("docker watchdog: %s", err) + errors++ + if errors >= dockerWatchdogThreshold { + e.logf("docker watchdog: giving up") + return -1, err + } + } } } }