X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/2130de30acf0a3b89e06494f957aacb350c15067..afe579c7b3c03b56fa8cc2c97f44134642a9458f:/services/crunch-run/crunchrun.go?ds=sidebyside diff --git a/services/crunch-run/crunchrun.go b/services/crunch-run/crunchrun.go index be98a3ee11..d38808f468 100644 --- a/services/crunch-run/crunchrun.go +++ b/services/crunch-run/crunchrun.go @@ -79,6 +79,7 @@ type ThinDockerClient interface { ContainerStart(ctx context.Context, container string, options dockertypes.ContainerStartOptions) error ContainerRemove(ctx context.Context, container string, options dockertypes.ContainerRemoveOptions) error ContainerWait(ctx context.Context, container string, condition dockercontainer.WaitCondition) (<-chan dockercontainer.ContainerWaitOKBody, <-chan error) + ContainerInspect(ctx context.Context, id string) (dockertypes.ContainerJSON, error) ImageInspectWithRaw(ctx context.Context, image string) (dockertypes.ImageInspect, []byte, error) ImageLoad(ctx context.Context, input io.Reader, quiet bool) (dockertypes.ImageLoadResponse, error) ImageRemove(ctx context.Context, image string, options dockertypes.ImageRemoveOptions) ([]dockertypes.ImageDeleteResponseItem, error) @@ -149,11 +150,14 @@ type ContainerRunner struct { cStateLock sync.Mutex cCancelled bool // StopContainer() invoked + cRemoved bool // docker confirmed the container no longer exists enableNetwork string // one of "default" or "always" networkMode string // passed through to HostConfig.NetworkMode arvMountLog *ThrottledLogger checkContainerd time.Duration + + containerWatchdogInterval time.Duration } // setupSignals sets up signal handling to gracefully terminate the underlying @@ -187,6 +191,9 @@ func (runner *ContainerRunner) stop(sig os.Signal) { if err != nil { runner.CrunchLog.Printf("error removing container: %s", err) } + if err == nil || strings.Contains(err.Error(), "No such container: "+runner.ContainerID) { + runner.cRemoved = true + } } var errorBlacklist = []string{ @@ -1124,6 +1131,32 @@ func (runner *ContainerRunner) WaitFinish() error { runTimeExceeded = time.After(time.Duration(timeout) * time.Second) } + containerGone := make(chan struct{}) + go func() { + defer close(containerGone) + if runner.containerWatchdogInterval < 1 { + runner.containerWatchdogInterval = time.Minute + } + for range time.NewTicker(runner.containerWatchdogInterval).C { + ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(runner.containerWatchdogInterval)) + ctr, err := runner.Docker.ContainerInspect(ctx, runner.ContainerID) + cancel() + runner.cStateLock.Lock() + done := runner.cRemoved || runner.ExitCode != nil + runner.cStateLock.Unlock() + if done { + return + } else if err != nil { + runner.CrunchLog.Printf("Error inspecting container: %s", err) + runner.checkBrokenNode(err) + return + } else if ctr.State == nil || !(ctr.State.Running || ctr.State.Status == "created") { + runner.CrunchLog.Printf("Container is not running: State=%v", ctr.State) + return + } + } + }() + containerdGone := make(chan error) defer close(containerdGone) if runner.checkContainerd > 0 { @@ -1171,6 +1204,9 @@ func (runner *ContainerRunner) WaitFinish() error { runner.stop(nil) runTimeExceeded = nil + case <-containerGone: + return errors.New("docker client never returned status") + case err := <-containerdGone: return err }