Merge branch '12840-cancelled-pipeline'
[arvados.git] / services / crunch-run / crunchrun.go
index 18957014681b8f841ac955a78adf298f6c36f8fc..653e0b4949da882cbcc185894ffdaee369c19318 100644 (file)
@@ -103,16 +103,16 @@ type ContainerRunner struct {
        LogsPDH       *string
        RunArvMount
        MkTempDir
-       ArvMount       *exec.Cmd
-       ArvMountPoint  string
-       HostOutputDir  string
-       CleanupTempDir []string
-       Binds          []string
-       Volumes        map[string]struct{}
-       OutputPDH      *string
-       SigChan        chan os.Signal
-       ArvMountExit   chan error
-       finalState     string
+       ArvMount      *exec.Cmd
+       ArvMountPoint string
+       HostOutputDir string
+       Binds         []string
+       Volumes       map[string]struct{}
+       OutputPDH     *string
+       SigChan       chan os.Signal
+       ArvMountExit  chan error
+       finalState    string
+       parentTemp    string
 
        statLogger       io.WriteCloser
        statReporter     *crunchstat.Reporter
@@ -152,16 +152,18 @@ func (runner *ContainerRunner) setupSignals() {
 
        go func(sig chan os.Signal) {
                for s := range sig {
-                       runner.CrunchLog.Printf("caught signal: %v", s)
-                       runner.stop()
+                       runner.stop(s)
                }
        }(runner.SigChan)
 }
 
 // stop the underlying Docker container.
-func (runner *ContainerRunner) stop() {
+func (runner *ContainerRunner) stop(sig os.Signal) {
        runner.cStateLock.Lock()
        defer runner.cStateLock.Unlock()
+       if sig != nil {
+               runner.CrunchLog.Printf("caught signal: %v", sig)
+       }
        if runner.ContainerID == "" {
                return
        }
@@ -173,12 +175,6 @@ func (runner *ContainerRunner) stop() {
        }
 }
 
-func (runner *ContainerRunner) stopSignals() {
-       if runner.SigChan != nil {
-               signal.Stop(runner.SigChan)
-       }
-}
-
 var errorBlacklist = []string{
        "(?ms).*[Cc]annot connect to the Docker daemon.*",
        "(?ms).*oci runtime error.*starting container process.*container init.*mounting.*to rootfs.*no such file or directory.*",
@@ -327,7 +323,7 @@ func (runner *ContainerRunner) ArvMountCmd(arvMountCmd []string, token string) (
 
 func (runner *ContainerRunner) SetupArvMountPoint(prefix string) (err error) {
        if runner.ArvMountPoint == "" {
-               runner.ArvMountPoint, err = runner.MkTempDir("", prefix)
+               runner.ArvMountPoint, err = runner.MkTempDir(runner.parentTemp, prefix)
        }
        return
 }
@@ -490,7 +486,7 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
 
                case mnt.Kind == "tmp":
                        var tmpdir string
-                       tmpdir, err = runner.MkTempDir("", "")
+                       tmpdir, err = runner.MkTempDir(runner.parentTemp, "tmp")
                        if err != nil {
                                return fmt.Errorf("While creating mount temp dir: %v", err)
                        }
@@ -502,7 +498,6 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                        if staterr != nil {
                                return fmt.Errorf("While Chmod temp dir: %v", err)
                        }
-                       runner.CleanupTempDir = append(runner.CleanupTempDir, tmpdir)
                        runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s", tmpdir, bind))
                        if bind == runner.Container.OutputPath {
                                runner.HostOutputDir = tmpdir
@@ -518,11 +513,10 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                        // can ensure the file is world-readable
                        // inside the container, without having to
                        // make it world-readable on the docker host.
-                       tmpdir, err := runner.MkTempDir("", "")
+                       tmpdir, err := runner.MkTempDir(runner.parentTemp, "json")
                        if err != nil {
                                return fmt.Errorf("creating temp dir: %v", err)
                        }
-                       runner.CleanupTempDir = append(runner.CleanupTempDir, tmpdir)
                        tmpfn := filepath.Join(tmpdir, "mountdata.json")
                        err = ioutil.WriteFile(tmpfn, jsondata, 0644)
                        if err != nil {
@@ -531,11 +525,10 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                        runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s:ro", tmpfn, bind))
 
                case mnt.Kind == "git_tree":
-                       tmpdir, err := runner.MkTempDir("", "")
+                       tmpdir, err := runner.MkTempDir(runner.parentTemp, "git_tree")
                        if err != nil {
                                return fmt.Errorf("creating temp dir: %v", err)
                        }
-                       runner.CleanupTempDir = append(runner.CleanupTempDir, tmpdir)
                        err = gitMount(mnt).extractTree(runner.ArvClient, tmpdir, token)
                        if err != nil {
                                return err
@@ -578,25 +571,37 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
        }
 
        for _, cp := range copyFiles {
-               dir, err := os.Stat(cp.src)
+               st, err := os.Stat(cp.src)
                if err != nil {
                        return fmt.Errorf("While staging writable file from %q to %q: %v", cp.src, cp.bind, err)
                }
-               if dir.IsDir() {
+               if st.IsDir() {
                        err = filepath.Walk(cp.src, func(walkpath string, walkinfo os.FileInfo, walkerr error) error {
                                if walkerr != nil {
                                        return walkerr
                                }
+                               target := path.Join(cp.bind, walkpath[len(cp.src):])
                                if walkinfo.Mode().IsRegular() {
-                                       return copyfile(walkpath, path.Join(cp.bind, walkpath[len(cp.src):]))
+                                       copyerr := copyfile(walkpath, target)
+                                       if copyerr != nil {
+                                               return copyerr
+                                       }
+                                       return os.Chmod(target, walkinfo.Mode()|0777)
                                } else if walkinfo.Mode().IsDir() {
-                                       return os.MkdirAll(path.Join(cp.bind, walkpath[len(cp.src):]), 0777)
+                                       mkerr := os.MkdirAll(target, 0777)
+                                       if mkerr != nil {
+                                               return mkerr
+                                       }
+                                       return os.Chmod(target, walkinfo.Mode()|os.ModeSetgid|0777)
                                } else {
                                        return fmt.Errorf("Source %q is not a regular file or directory", cp.src)
                                }
                        })
-               } else {
+               } else if st.Mode().IsRegular() {
                        err = copyfile(cp.src, cp.bind)
+                       if err == nil {
+                               err = os.Chmod(cp.bind, st.Mode()|0777)
+                       }
                }
                if err != nil {
                        return fmt.Errorf("While staging writable file from %q to %q: %v", cp.src, cp.bind, err)
@@ -882,7 +887,7 @@ func (runner *ContainerRunner) AttachStreams() (err error) {
                        _, err := io.Copy(response.Conn, stdinRdr)
                        if err != nil {
                                runner.CrunchLog.Print("While writing stdin collection to docker container %q", err)
-                               runner.stop()
+                               runner.stop(nil)
                        }
                        stdinRdr.Close()
                        response.CloseWrite()
@@ -892,7 +897,7 @@ func (runner *ContainerRunner) AttachStreams() (err error) {
                        _, err := io.Copy(response.Conn, bytes.NewReader(stdinJson))
                        if err != nil {
                                runner.CrunchLog.Print("While writing stdin json to docker container %q", err)
-                               runner.stop()
+                               runner.stop(nil)
                        }
                        response.CloseWrite()
                }()
@@ -943,6 +948,7 @@ func (runner *ContainerRunner) CreateContainer() error {
 
        runner.ContainerConfig.Volumes = runner.Volumes
 
+       maxRAM := int64(runner.Container.RuntimeConstraints.RAM)
        runner.HostConfig = dockercontainer.HostConfig{
                Binds: runner.Binds,
                LogConfig: dockercontainer.LogConfig{
@@ -950,6 +956,10 @@ func (runner *ContainerRunner) CreateContainer() error {
                },
                Resources: dockercontainer.Resources{
                        CgroupParent: runner.setCgroupParent,
+                       NanoCPUs:     int64(runner.Container.RuntimeConstraints.VCPUs) * 1000000000,
+                       Memory:       maxRAM, // RAM
+                       MemorySwap:   maxRAM, // RAM+swap
+                       KernelMemory: maxRAM, // kernel portion
                },
        }
 
@@ -1032,7 +1042,7 @@ func (runner *ContainerRunner) WaitFinish() error {
 
                case <-arvMountExit:
                        runner.CrunchLog.Printf("arv-mount exited while container is still running.  Stopping container.")
-                       runner.stop()
+                       runner.stop(nil)
                        // arvMountExit will always be ready now that
                        // it's closed, but that doesn't interest us.
                        arvMountExit = nil
@@ -1427,29 +1437,33 @@ func (runner *ContainerRunner) CleanupDirs() {
                }
        }
 
-       for _, tmpdir := range runner.CleanupTempDir {
-               if rmerr := os.RemoveAll(tmpdir); rmerr != nil {
-                       runner.CrunchLog.Printf("While cleaning up temporary directory %s: %v", tmpdir, rmerr)
-               }
+       if rmerr := os.RemoveAll(runner.parentTemp); rmerr != nil {
+               runner.CrunchLog.Printf("While cleaning up temporary directory %s: %v", runner.parentTemp, rmerr)
        }
 }
 
 // CommitLogs posts the collection containing the final container logs.
 func (runner *ContainerRunner) CommitLogs() error {
-       runner.CrunchLog.Print(runner.finalState)
+       func() {
+               // Hold cStateLock to prevent races on CrunchLog (e.g., stop()).
+               runner.cStateLock.Lock()
+               defer runner.cStateLock.Unlock()
 
-       if runner.arvMountLog != nil {
-               runner.arvMountLog.Close()
-       }
-       runner.CrunchLog.Close()
+               runner.CrunchLog.Print(runner.finalState)
+
+               if runner.arvMountLog != nil {
+                       runner.arvMountLog.Close()
+               }
+               runner.CrunchLog.Close()
 
-       // Closing CrunchLog above allows them to be committed to Keep at this
-       // point, but re-open crunch log with ArvClient in case there are any
-       // other further errors (such as failing to write the log to Keep!)
-       // while shutting down
-       runner.CrunchLog = NewThrottledLogger(&ArvLogWriter{ArvClient: runner.ArvClient,
-               UUID: runner.Container.UUID, loggingStream: "crunch-run", writeCloser: nil})
-       runner.CrunchLog.Immediate = log.New(os.Stderr, runner.Container.UUID+" ", 0)
+               // Closing CrunchLog above allows the logs to be committed to Keep
+               // at this point. Re-open the crunch log with ArvClient in case
+               // there are any further errors (such as failing to write the log
+               // to Keep!) while shutting down.
+               runner.CrunchLog = NewThrottledLogger(&ArvLogWriter{ArvClient: runner.ArvClient,
+                       UUID: runner.Container.UUID, loggingStream: "crunch-run", writeCloser: nil})
+               runner.CrunchLog.Immediate = log.New(os.Stderr, runner.Container.UUID+" ", 0)
+       }()
 
        if runner.LogsPDH != nil {
                // If we have already assigned something to LogsPDH,
@@ -1558,7 +1572,6 @@ func (runner *ContainerRunner) Run() (err error) {
        runner.finalState = "Queued"
 
        defer func() {
-               runner.stopSignals()
                runner.CleanupDirs()
 
                runner.CrunchLog.Printf("crunch-run finished")
@@ -1769,6 +1782,12 @@ func main() {
                os.Exit(1)
        }
 
+       parentTemp, tmperr := cr.MkTempDir("", "crunch-run."+containerId+".")
+       if tmperr != nil {
+               log.Fatalf("%s: %v", containerId, tmperr)
+       }
+
+       cr.parentTemp = parentTemp
        cr.statInterval = *statInterval
        cr.cgroupRoot = *cgroupRoot
        cr.expectCgroupParent = *cgroupParent