X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/aa9fbcd3fbcc653fef23ca9f583d16e535c6ffb9..cb9c8f800495b8a499348d931326a06f020b8ffa:/services/crunch-run/crunchrun.go diff --git a/services/crunch-run/crunchrun.go b/services/crunch-run/crunchrun.go index 0f687142a2..c983cbd37b 100644 --- a/services/crunch-run/crunchrun.go +++ b/services/crunch-run/crunchrun.go @@ -32,6 +32,7 @@ import ( "git.curoverse.com/arvados.git/sdk/go/arvadosclient" "git.curoverse.com/arvados.git/sdk/go/keepclient" "git.curoverse.com/arvados.git/sdk/go/manifest" + arvadosVersion "git.curoverse.com/arvados.git/sdk/go/version" dockertypes "github.com/docker/docker/api/types" dockercontainer "github.com/docker/docker/api/types/container" @@ -182,6 +183,7 @@ type ContainerRunner struct { enableNetwork string // one of "default" or "always" networkMode string // passed through to HostConfig.NetworkMode + arvMountLog *ThrottledLogger } // setupSignals sets up signal handling to gracefully terminate the underlying @@ -193,7 +195,10 @@ func (runner *ContainerRunner) setupSignals() { signal.Notify(runner.SigChan, syscall.SIGQUIT) go func(sig chan os.Signal) { - <-sig + s := <-sig + if s != nil { + runner.CrunchLog.Printf("Caught signal %v", s) + } runner.stop() }(runner.SigChan) } @@ -210,12 +215,14 @@ func (runner *ContainerRunner) stop() { timeout := time.Duration(10) err := runner.Docker.ContainerStop(context.TODO(), runner.ContainerID, &(timeout)) if err != nil { - log.Printf("StopContainer failed: %s", err) + runner.CrunchLog.Printf("StopContainer failed: %s", err) } + // Suppress multiple calls to stop() + runner.cStarted = false } } -func (runner *ContainerRunner) teardown() { +func (runner *ContainerRunner) stopSignals() { if runner.SigChan != nil { signal.Stop(runner.SigChan) close(runner.SigChan) @@ -291,9 +298,11 @@ func (runner *ContainerRunner) ArvMountCmd(arvMountCmd []string, token string) ( } c.Env = append(c.Env, "ARVADOS_API_TOKEN="+token) - nt := NewThrottledLogger(runner.NewLogWriter("arv-mount")) - c.Stdout = nt - c.Stderr = nt + runner.arvMountLog = NewThrottledLogger(runner.NewLogWriter("arv-mount")) + c.Stdout = runner.arvMountLog + c.Stderr = runner.arvMountLog + + runner.CrunchLog.Printf("Running %v", c.Args) err = c.Start() if err != nil { @@ -317,7 +326,11 @@ func (runner *ContainerRunner) ArvMountCmd(arvMountCmd []string, token string) ( }() go func() { - runner.ArvMountExit <- c.Wait() + mnterr := c.Wait() + if mnterr != nil { + runner.CrunchLog.Printf("Arv-mount exit error: %v", mnterr) + } + runner.ArvMountExit <- mnterr close(runner.ArvMountExit) }() @@ -346,11 +359,13 @@ func (runner *ContainerRunner) SetupMounts() (err error) { return fmt.Errorf("While creating keep mount temp dir: %v", err) } - runner.CleanupTempDir = append(runner.CleanupTempDir, runner.ArvMountPoint) - pdhOnly := true tmpcount := 0 - arvMountCmd := []string{"--foreground", "--allow-other", "--read-write"} + arvMountCmd := []string{ + "--foreground", + "--allow-other", + "--read-write", + fmt.Sprintf("--crunchstat-interval=%v", runner.statInterval.Seconds())} if runner.Container.RuntimeConstraints.KeepCacheRAM > 0 { arvMountCmd = append(arvMountCmd, "--file-cache", fmt.Sprintf("%d", runner.Container.RuntimeConstraints.KeepCacheRAM)) @@ -891,12 +906,23 @@ func (runner *ContainerRunner) WaitFinish() (err error) { waitOk, waitErr := runner.Docker.ContainerWait(context.TODO(), runner.ContainerID, "not-running") + go func() { + <-runner.ArvMountExit + if runner.cStarted { + runner.CrunchLog.Printf("arv-mount exited while container is still running. Stopping container.") + runner.stop() + } + }() + var waitBody dockercontainer.ContainerWaitOKBody select { case waitBody = <-waitOk: case err = <-waitErr: } + // Container isn't running any more + runner.cStarted = false + if err != nil { return fmt.Errorf("container wait: %v", err) } @@ -905,65 +931,33 @@ func (runner *ContainerRunner) WaitFinish() (err error) { code := int(waitBody.StatusCode) runner.ExitCode = &code - waitMount := runner.ArvMountExit - select { - case err = <-waitMount: - runner.CrunchLog.Printf("arv-mount exited before container finished: %v", err) - waitMount = nil - runner.stop() - default: - } - // wait for stdout/stderr to complete <-runner.loggingDone return nil } -// UploadFile uploads files within the output directory, with special handling -// for symlinks. If the symlink leads to a keep mount, copy the manifest text -// from the keep mount into the output manifestText. Ensure that whether -// symlinks are relative or absolute, they must remain within the output -// directory. -// -// Assumes initial value of "path" is absolute, and located within runner.HostOutputDir. -func (runner *ContainerRunner) UploadOutputFile( - path string, - info os.FileInfo, - infoerr error, - binds []string, - walkUpload *WalkUpload, - relocateFrom string, - relocateTo string) (manifestText string, err error) { - - if infoerr != nil { - return "", infoerr - } - - // When following symlinks, the source path may need to be logically - // relocated to some other path within the output collection. Remove - // the relocateFrom prefix and replace it with relocateTo. - relocated := relocateTo + path[len(relocateFrom):] - - if info.Mode().IsRegular() { - return "", walkUpload.UploadFile(relocated, path) - } - - // Not a regular file, try to follow symlinks - var nextlink = path - for n := 0; n < 32; n++ { - if info.Mode()&os.ModeSymlink == 0 { - // Not a symlink, don't do anything +var ErrNotInOutputDir = fmt.Errorf("Must point to path within the output directory") + +func (runner *ContainerRunner) derefOutputSymlink(path string, startinfo os.FileInfo) (tgt string, readlinktgt string, info os.FileInfo, err error) { + // Follow symlinks if necessary + info = startinfo + tgt = path + readlinktgt = "" + nextlink := path + for followed := 0; info.Mode()&os.ModeSymlink != 0; followed++ { + if followed >= limitFollowSymlinks { + // Got stuck in a loop or just a pathological number of links, give up. + err = fmt.Errorf("Followed more than %v symlinks from path %q", limitFollowSymlinks, path) return } - var readlinktgt string readlinktgt, err = os.Readlink(nextlink) if err != nil { return } - tgt := readlinktgt + tgt = readlinktgt if !strings.HasPrefix(tgt, "/") { // Relative symlink, resolve it to host path tgt = filepath.Join(filepath.Dir(path), tgt) @@ -972,68 +966,121 @@ func (runner *ContainerRunner) UploadOutputFile( // Absolute symlink to container output path, adjust it to host output path. tgt = filepath.Join(runner.HostOutputDir, tgt[len(runner.Container.OutputPath):]) } - - // go through mounts and try reverse map to collection reference - for _, bind := range binds { - mnt := runner.Container.Mounts[bind] - if tgt == bind || strings.HasPrefix(tgt, bind+"/") { - // get path relative to bind - targetSuffix := tgt[len(bind):] - - // Copy mount and adjust the path to add path relative to the bind - adjustedMount := mnt - adjustedMount.Path = filepath.Join(adjustedMount.Path, targetSuffix) - - // Terminates in this keep mount, so add the - // manifest text at appropriate location. - outputSuffix := path[len(runner.HostOutputDir):] - manifestText, err = runner.getCollectionManifestForPath(adjustedMount, outputSuffix) - return - } - } - - // If target is not a collection mount, it must be within the - // output directory, otherwise it is an error. if !strings.HasPrefix(tgt, runner.HostOutputDir+"/") { - err = fmt.Errorf("Output directory symlink %q points to invalid location %q, must point to mount or output directory.", - path[len(runner.HostOutputDir):], readlinktgt) + // After dereferencing, symlink target must either be + // within output directory, or must point to a + // collection mount. + err = ErrNotInOutputDir return } info, err = os.Lstat(tgt) if err != nil { - // tgt doesn't exist or lacks permissions - err = fmt.Errorf("Output directory symlink %q points to invalid location %q, must point to mount or output directory.", - path[len(runner.HostOutputDir):], readlinktgt) + // tgt + err = fmt.Errorf("Symlink in output %q points to invalid location %q: %v", + path[len(runner.HostOutputDir):], readlinktgt, err) return } - if info.Mode().IsRegular() { - // Symlink leads to regular file. Need to read from - // the target but upload it at the original path. - return "", walkUpload.UploadFile(relocated, tgt) - } + nextlink = tgt + } - if info.Mode().IsDir() { - // Symlink leads to directory. Walk() doesn't follow - // directory symlinks, so we walk the target directory - // instead. Within the walk, file paths are relocated - // so they appear under the original symlink path. - err = filepath.Walk(tgt, func(walkpath string, walkinfo os.FileInfo, walkerr error) error { - var m string - m, walkerr = runner.UploadOutputFile(walkpath, walkinfo, walkerr, binds, walkUpload, tgt, relocated) - if walkerr == nil { - manifestText = manifestText + m - } - return walkerr - }) + return +} + +var limitFollowSymlinks = 10 + +// UploadFile uploads files within the output directory, with special handling +// for symlinks. If the symlink leads to a keep mount, copy the manifest text +// from the keep mount into the output manifestText. Ensure that whether +// symlinks are relative or absolute, every symlink target (even targets that +// are symlinks themselves) must point to a path in either the output directory +// or a collection mount. +// +// Assumes initial value of "path" is absolute, and located within runner.HostOutputDir. +func (runner *ContainerRunner) UploadOutputFile( + path string, + info os.FileInfo, + infoerr error, + binds []string, + walkUpload *WalkUpload, + relocateFrom string, + relocateTo string, + followed int) (manifestText string, err error) { + + if info.Mode().IsDir() { + return + } + + if infoerr != nil { + return "", infoerr + } + + if followed >= limitFollowSymlinks { + // Got stuck in a loop or just a pathological number of + // directory links, give up. + err = fmt.Errorf("Followed more than %v symlinks from path %q", limitFollowSymlinks, path) + return + } + + // When following symlinks, the source path may need to be logically + // relocated to some other path within the output collection. Remove + // the relocateFrom prefix and replace it with relocateTo. + relocated := relocateTo + path[len(relocateFrom):] + + tgt, readlinktgt, info, derefErr := runner.derefOutputSymlink(path, info) + if derefErr != nil && derefErr != ErrNotInOutputDir { + return "", derefErr + } + + // go through mounts and try reverse map to collection reference + for _, bind := range binds { + mnt := runner.Container.Mounts[bind] + if tgt == bind || strings.HasPrefix(tgt, bind+"/") { + // get path relative to bind + targetSuffix := tgt[len(bind):] + + // Copy mount and adjust the path to add path relative to the bind + adjustedMount := mnt + adjustedMount.Path = filepath.Join(adjustedMount.Path, targetSuffix) + + // Terminates in this keep mount, so add the + // manifest text at appropriate location. + outputSuffix := path[len(runner.HostOutputDir):] + manifestText, err = runner.getCollectionManifestForPath(adjustedMount, outputSuffix) return } + } - nextlink = tgt + // If target is not a collection mount, it must be located within the + // output directory, otherwise it is an error. + if derefErr == ErrNotInOutputDir { + err = fmt.Errorf("Symlink in output %q points to invalid location %q, must point to path within the output directory.", + path[len(runner.HostOutputDir):], readlinktgt) + return + } + + if info.Mode().IsRegular() { + return "", walkUpload.UploadFile(relocated, tgt) + } + + if info.Mode().IsDir() { + // Symlink leads to directory. Walk() doesn't follow + // directory symlinks, so we walk the target directory + // instead. Within the walk, file paths are relocated + // so they appear under the original symlink path. + err = filepath.Walk(tgt, func(walkpath string, walkinfo os.FileInfo, walkerr error) error { + var m string + m, walkerr = runner.UploadOutputFile(walkpath, walkinfo, walkerr, + binds, walkUpload, tgt, relocated, followed+1) + if walkerr == nil { + manifestText = manifestText + m + } + return walkerr + }) + return } - // Got stuck in a loop or just a pathological number of links, give up. - err = fmt.Errorf("Followed too many symlinks from path %q", path) + return } @@ -1088,7 +1135,7 @@ func (runner *ContainerRunner) CaptureOutput() error { var m string err = filepath.Walk(runner.HostOutputDir, func(path string, info os.FileInfo, err error) error { - m, err = runner.UploadOutputFile(path, info, err, binds, walkUpload, "", "") + m, err = runner.UploadOutputFile(path, info, err, binds, walkUpload, "", "", 0) if err == nil { manifestText = manifestText + m } @@ -1207,21 +1254,55 @@ func (runner *ContainerRunner) getCollectionManifestForPath(mnt arvados.Mount, b func (runner *ContainerRunner) CleanupDirs() { if runner.ArvMount != nil { - umount := exec.Command("fusermount", "-z", "-u", runner.ArvMountPoint) - umnterr := umount.Run() + var delay int64 = 8 + umount := exec.Command("arv-mount", fmt.Sprintf("--unmount-timeout=%d", delay), "--unmount", runner.ArvMountPoint) + umount.Stdout = runner.CrunchLog + umount.Stderr = runner.CrunchLog + runner.CrunchLog.Printf("Running %v", umount.Args) + umnterr := umount.Start() + if umnterr != nil { - runner.CrunchLog.Printf("While running fusermount: %v", umnterr) + runner.CrunchLog.Printf("Error unmounting: %v", umnterr) + } else { + // If arv-mount --unmount gets stuck for any reason, we + // don't want to wait for it forever. Do Wait() in a goroutine + // so it doesn't block crunch-run. + umountExit := make(chan error) + go func() { + mnterr := umount.Wait() + if mnterr != nil { + runner.CrunchLog.Printf("Error unmounting: %v", mnterr) + } + umountExit <- mnterr + }() + + for again := true; again; { + again = false + select { + case <-umountExit: + umount = nil + again = true + case <-runner.ArvMountExit: + break + case <-time.After(time.Duration((delay + 1) * int64(time.Second))): + runner.CrunchLog.Printf("Timed out waiting for unmount") + if umount != nil { + umount.Process.Kill() + } + runner.ArvMount.Process.Kill() + } + } } + } - mnterr := <-runner.ArvMountExit - if mnterr != nil { - runner.CrunchLog.Printf("Arv-mount exit error: %v", mnterr) + if runner.ArvMountPoint != "" { + if rmerr := os.Remove(runner.ArvMountPoint); rmerr != nil { + runner.CrunchLog.Printf("While cleaning up arv-mount directory %s: %v", runner.ArvMountPoint, rmerr) } } for _, tmpdir := range runner.CleanupTempDir { - rmerr := os.RemoveAll(tmpdir) - if rmerr != nil { + if rmerr := os.RemoveAll(tmpdir); rmerr != nil { runner.CrunchLog.Printf("While cleaning up temporary directory %s: %v", tmpdir, rmerr) } } @@ -1230,14 +1311,19 @@ func (runner *ContainerRunner) CleanupDirs() { // CommitLogs posts the collection containing the final container logs. func (runner *ContainerRunner) CommitLogs() error { runner.CrunchLog.Print(runner.finalState) + + if runner.arvMountLog != nil { + runner.arvMountLog.Close() + } runner.CrunchLog.Close() - // Closing CrunchLog above allows it to be committed to Keep at this + // Closing CrunchLog above allows them to be committed to Keep at this // point, but re-open crunch log with ArvClient in case there are any - // other further (such as failing to write the log to Keep!) while - // shutting down + // other further errors (such as failing to write the log to Keep!) + // while shutting down runner.CrunchLog = NewThrottledLogger(&ArvLogWriter{ArvClient: runner.ArvClient, UUID: runner.Container.UUID, loggingStream: "crunch-run", writeCloser: nil}) + runner.CrunchLog.Immediate = log.New(os.Stderr, runner.Container.UUID+" ", 0) if runner.LogsPDH != nil { // If we have already assigned something to LogsPDH, @@ -1324,8 +1410,11 @@ func (runner *ContainerRunner) IsCancelled() bool { // NewArvLogWriter creates an ArvLogWriter func (runner *ContainerRunner) NewArvLogWriter(name string) io.WriteCloser { - return &ArvLogWriter{ArvClient: runner.ArvClient, UUID: runner.Container.UUID, loggingStream: name, - writeCloser: runner.LogCollection.Open(name + ".txt")} + return &ArvLogWriter{ + ArvClient: runner.ArvClient, + UUID: runner.Container.UUID, + loggingStream: name, + writeCloser: runner.LogCollection.Open(name + ".txt")} } // Run the full container lifecycle. @@ -1339,12 +1428,16 @@ func (runner *ContainerRunner) Run() (err error) { runner.CrunchLog.Printf("Executing on host '%s'", hostname) } - // Clean up temporary directories _after_ finalizing - // everything (if we've made any by then) - defer runner.CleanupDirs() - runner.finalState = "Queued" + defer func() { + runner.stopSignals() + runner.CleanupDirs() + + runner.CrunchLog.Printf("crunch-run finished") + runner.CrunchLog.Close() + }() + defer func() { // checkErr prints e (unless it's nil) and sets err to // e (unless err is already non-nil). Thus, if err @@ -1369,7 +1462,6 @@ func (runner *ContainerRunner) Run() (err error) { checkErr(err) if runner.finalState == "Queued" { - runner.CrunchLog.Close() runner.UpdateContainerFinal() return } @@ -1383,13 +1475,6 @@ func (runner *ContainerRunner) Run() (err error) { checkErr(runner.CaptureOutput()) checkErr(runner.CommitLogs()) checkErr(runner.UpdateContainerFinal()) - - // The real log is already closed, but then we opened - // a new one in case we needed to log anything while - // finalizing. - runner.CrunchLog.Close() - - runner.teardown() }() err = runner.fetchContainerRecord() @@ -1509,8 +1594,17 @@ func main() { `Set networking mode for container. Corresponds to Docker network mode (--net). `) memprofile := flag.String("memprofile", "", "write memory profile to `file` after running container") + getVersion := flag.Bool("version", false, "Print version information and exit.") flag.Parse() + // Print version information if requested + if *getVersion { + fmt.Printf("Version: %s\n", arvadosVersion.GetVersion()) + os.Exit(0) + } + + log.Printf("crunch-run %q started", arvadosVersion.GetVersion()) + containerId := flag.Arg(0) if *caCertsPath != "" {