14406: Copy remote blocks to local storage during MarshalManifest.
[arvados.git] / services / crunch-run / crunchrun.go
index 39cf4408203dfda83463b7a3518a243cb4314e3b..800556866a43e985c9107370ff83d9691158aef1 100644 (file)
@@ -61,6 +61,7 @@ type IKeepClient interface {
        PutB(buf []byte) (string, int, error)
        ReadAt(locator string, p []byte, off int) (int, error)
        ManifestFileReader(m manifest.Manifest, filename string) (arvados.File, error)
+       LocalLocator(locator string) (string, error)
        ClearBlockCache()
 }
 
@@ -79,7 +80,7 @@ type ThinDockerClient interface {
        ContainerStart(ctx context.Context, container string, options dockertypes.ContainerStartOptions) error
        ContainerRemove(ctx context.Context, container string, options dockertypes.ContainerRemoveOptions) error
        ContainerWait(ctx context.Context, container string, condition dockercontainer.WaitCondition) (<-chan dockercontainer.ContainerWaitOKBody, <-chan error)
-       ContainerList(ctx context.Context, opts dockertypes.ContainerListOptions) ([]dockertypes.Container, error)
+       ContainerInspect(ctx context.Context, id string) (dockertypes.ContainerJSON, error)
        ImageInspectWithRaw(ctx context.Context, image string) (dockertypes.ImageInspect, []byte, error)
        ImageLoad(ctx context.Context, input io.Reader, quiet bool) (dockertypes.ImageLoadResponse, error)
        ImageRemove(ctx context.Context, image string, options dockertypes.ImageRemoveOptions) ([]dockertypes.ImageDeleteResponseItem, error)
@@ -157,7 +158,7 @@ type ContainerRunner struct {
        arvMountLog     *ThrottledLogger
        checkContainerd time.Duration
 
-       containerWaitGracePeriod time.Duration
+       containerWatchdogInterval time.Duration
 }
 
 // setupSignals sets up signal handling to gracefully terminate the underlying
@@ -1134,38 +1135,24 @@ func (runner *ContainerRunner) WaitFinish() error {
        containerGone := make(chan struct{})
        go func() {
                defer close(containerGone)
-               if runner.containerWaitGracePeriod < 1 {
-                       runner.containerWaitGracePeriod = 30 * time.Second
+               if runner.containerWatchdogInterval < 1 {
+                       runner.containerWatchdogInterval = time.Minute
                }
-               found := time.Now()
-       polling:
-               for range time.NewTicker(runner.containerWaitGracePeriod / 30).C {
-                       ctrs, err := runner.Docker.ContainerList(context.Background(), dockertypes.ContainerListOptions{})
-                       if err != nil {
-                               runner.CrunchLog.Printf("error checking container list: %s", err)
-                               if runner.checkBrokenNode(err) {
-                                       return
-                               }
-                               continue polling
-                       }
-                       for _, ctr := range ctrs {
-                               if ctr.ID == runner.ContainerID {
-                                       found = time.Now()
-                                       continue polling
-                               }
-                       }
+               for range time.NewTicker(runner.containerWatchdogInterval).C {
+                       ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(runner.containerWatchdogInterval))
+                       ctr, err := runner.Docker.ContainerInspect(ctx, runner.ContainerID)
+                       cancel()
                        runner.cStateLock.Lock()
                        done := runner.cRemoved || runner.ExitCode != nil
                        runner.cStateLock.Unlock()
                        if done {
-                               // Skip the grace period and warning
-                               // log if the container disappeared
-                               // because it finished, or we removed
-                               // it ourselves.
                                return
-                       }
-                       if time.Since(found) > runner.containerWaitGracePeriod {
-                               runner.CrunchLog.Printf("container %s no longer exists", runner.ContainerID)
+                       } else if err != nil {
+                               runner.CrunchLog.Printf("Error inspecting container: %s", err)
+                               runner.checkBrokenNode(err)
+                               return
+                       } else if ctr.State == nil || !(ctr.State.Running || ctr.State.Status == "created") {
+                               runner.CrunchLog.Printf("Container is not running: State=%v", ctr.State)
                                return
                        }
                }
@@ -1307,6 +1294,17 @@ func (runner *ContainerRunner) CaptureOutput() error {
        if err != nil {
                return err
        }
+       if n := len(regexp.MustCompile(` [0-9a-f]+\+\S*\+R`).FindAllStringIndex(txt, -1)); n > 0 {
+               runner.CrunchLog.Printf("Copying %d data blocks from remote input collections...", n)
+               fs, err := (&arvados.Collection{ManifestText: txt}).FileSystem(runner.client, runner.Kc)
+               if err != nil {
+                       return err
+               }
+               txt, err = fs.MarshalManifest(".")
+               if err != nil {
+                       return err
+               }
+       }
        var resp arvados.Collection
        err = runner.ArvClient.Create("collections", arvadosclient.Dict{
                "ensure_unique_name": true,
@@ -1482,7 +1480,7 @@ func (runner *ContainerRunner) ContainerToken() (string, error) {
        if err != nil {
                return "", err
        }
-       runner.token = auth.APIToken
+       runner.token = fmt.Sprintf("v2/%s/%s/%s", auth.UUID, auth.APIToken, runner.Container.UUID)
        return runner.token, nil
 }