14328: Check ContainerInspect instead of List, give up on error.
authorTom Clegg <tclegg@veritasgenetics.com>
Thu, 25 Oct 2018 21:44:14 +0000 (17:44 -0400)
committerTom Clegg <tclegg@veritasgenetics.com>
Thu, 25 Oct 2018 21:44:14 +0000 (17:44 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

services/crunch-run/crunchrun.go
services/crunch-run/crunchrun_test.go

index 39cf4408203dfda83463b7a3518a243cb4314e3b..d38808f468aa3b50aaed51358d6845c7cb665123 100644 (file)
@@ -79,7 +79,7 @@ type ThinDockerClient interface {
        ContainerStart(ctx context.Context, container string, options dockertypes.ContainerStartOptions) error
        ContainerRemove(ctx context.Context, container string, options dockertypes.ContainerRemoveOptions) error
        ContainerWait(ctx context.Context, container string, condition dockercontainer.WaitCondition) (<-chan dockercontainer.ContainerWaitOKBody, <-chan error)
-       ContainerList(ctx context.Context, opts dockertypes.ContainerListOptions) ([]dockertypes.Container, error)
+       ContainerInspect(ctx context.Context, id string) (dockertypes.ContainerJSON, error)
        ImageInspectWithRaw(ctx context.Context, image string) (dockertypes.ImageInspect, []byte, error)
        ImageLoad(ctx context.Context, input io.Reader, quiet bool) (dockertypes.ImageLoadResponse, error)
        ImageRemove(ctx context.Context, image string, options dockertypes.ImageRemoveOptions) ([]dockertypes.ImageDeleteResponseItem, error)
@@ -157,7 +157,7 @@ type ContainerRunner struct {
        arvMountLog     *ThrottledLogger
        checkContainerd time.Duration
 
-       containerWaitGracePeriod time.Duration
+       containerWatchdogInterval time.Duration
 }
 
 // setupSignals sets up signal handling to gracefully terminate the underlying
@@ -1134,38 +1134,24 @@ func (runner *ContainerRunner) WaitFinish() error {
        containerGone := make(chan struct{})
        go func() {
                defer close(containerGone)
-               if runner.containerWaitGracePeriod < 1 {
-                       runner.containerWaitGracePeriod = 30 * time.Second
+               if runner.containerWatchdogInterval < 1 {
+                       runner.containerWatchdogInterval = time.Minute
                }
-               found := time.Now()
-       polling:
-               for range time.NewTicker(runner.containerWaitGracePeriod / 30).C {
-                       ctrs, err := runner.Docker.ContainerList(context.Background(), dockertypes.ContainerListOptions{})
-                       if err != nil {
-                               runner.CrunchLog.Printf("error checking container list: %s", err)
-                               if runner.checkBrokenNode(err) {
-                                       return
-                               }
-                               continue polling
-                       }
-                       for _, ctr := range ctrs {
-                               if ctr.ID == runner.ContainerID {
-                                       found = time.Now()
-                                       continue polling
-                               }
-                       }
+               for range time.NewTicker(runner.containerWatchdogInterval).C {
+                       ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(runner.containerWatchdogInterval))
+                       ctr, err := runner.Docker.ContainerInspect(ctx, runner.ContainerID)
+                       cancel()
                        runner.cStateLock.Lock()
                        done := runner.cRemoved || runner.ExitCode != nil
                        runner.cStateLock.Unlock()
                        if done {
-                               // Skip the grace period and warning
-                               // log if the container disappeared
-                               // because it finished, or we removed
-                               // it ourselves.
                                return
-                       }
-                       if time.Since(found) > runner.containerWaitGracePeriod {
-                               runner.CrunchLog.Printf("container %s no longer exists", runner.ContainerID)
+                       } else if err != nil {
+                               runner.CrunchLog.Printf("Error inspecting container: %s", err)
+                               runner.checkBrokenNode(err)
+                               return
+                       } else if ctr.State == nil || !(ctr.State.Running || ctr.State.Status == "created") {
+                               runner.CrunchLog.Printf("Container is not running: State=%v", ctr.State)
                                return
                        }
                }
index 0e48a50aa196d41908d0b9e14eb844193ff3bbc7..eb4f220e227d399b31b0669b15ba77c75449e557 100644 (file)
@@ -178,12 +178,15 @@ func (t *TestDockerClient) ContainerWait(ctx context.Context, container string,
        return body, err
 }
 
-func (t *TestDockerClient) ContainerList(ctx context.Context, options dockertypes.ContainerListOptions) ([]dockertypes.Container, error) {
+func (t *TestDockerClient) ContainerInspect(ctx context.Context, id string) (c dockertypes.ContainerJSON, err error) {
+       c.ContainerJSONBase = &dockertypes.ContainerJSONBase{}
+       c.ID = "abcde"
        if t.ctrExited {
-               return nil, nil
+               c.State = &dockertypes.ContainerState{Status: "exited", Dead: true}
        } else {
-               return []dockertypes.Container{{ID: "abcde"}}, nil
+               c.State = &dockertypes.ContainerState{Status: "running", Pid: 1234, Running: true}
        }
+       return
 }
 
 func (t *TestDockerClient) ImageInspectWithRaw(ctx context.Context, image string) (dockertypes.ImageInspect, []byte, error) {
@@ -748,6 +751,7 @@ func (s *TestSuite) fullRunHelper(c *C, record string, extraMounts []string, exi
        c.Assert(err, IsNil)
        s.runner = cr
        cr.statInterval = 100 * time.Millisecond
+       cr.containerWatchdogInterval = time.Second
        am := &ArvMountCmdLine{}
        cr.RunArvMount = am.ArvMountTest
 
@@ -850,15 +854,13 @@ func (s *TestSuite) TestContainerWaitFails(c *C) {
     "output_path": "/tmp",
     "priority": 1
 }`, nil, 0, func(t *TestDockerClient) {
-               s.runner.containerWaitGracePeriod = time.Second
                t.ctrExited = true
                time.Sleep(10 * time.Second)
                t.logWriter.Close()
        })
 
        c.Check(api.CalledWith("container.state", "Cancelled"), NotNil)
-       c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*container.*no longer exists.*")
-       c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*docker client never returned status.*")
+       c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*Container is not running.*")
 }
 
 func (s *TestSuite) TestCrunchstat(c *C) {