10777: Close and flush logs right away instead of waiting for next tick.
[arvados.git] / services / crunch-run / crunchrun.go
index 0b4eb2bcbf4934f62a179b3037ad395155f7aeb4..47099673e2b2133c92e6205401be508e162b94a1 100644 (file)
@@ -5,12 +5,6 @@ import (
        "errors"
        "flag"
        "fmt"
-       "git.curoverse.com/arvados.git/lib/crunchstat"
-       "git.curoverse.com/arvados.git/sdk/go/arvados"
-       "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
-       "git.curoverse.com/arvados.git/sdk/go/keepclient"
-       "git.curoverse.com/arvados.git/sdk/go/manifest"
-       "github.com/curoverse/dockerclient"
        "io"
        "io/ioutil"
        "log"
@@ -19,18 +13,27 @@ import (
        "os/signal"
        "path"
        "path/filepath"
+       "sort"
        "strings"
        "sync"
        "syscall"
        "time"
+
+       "git.curoverse.com/arvados.git/lib/crunchstat"
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+       "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+       "git.curoverse.com/arvados.git/sdk/go/keepclient"
+       "git.curoverse.com/arvados.git/sdk/go/manifest"
+       "github.com/curoverse/dockerclient"
 )
 
 // IArvadosClient is the minimal set of Arvados API methods used by crunch-run.
 type IArvadosClient interface {
        Create(resourceType string, parameters arvadosclient.Dict, output interface{}) error
        Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
-       Update(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) (err error)
-       Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) (err error)
+       Update(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
+       Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) error
+       Discovery(key string) (interface{}, error) // NOTE(review): presumably looks up key in the API discovery document; confirm against arvadosclient
 }
 
 // ErrCancelled is the error returned when the container is cancelled.
@@ -39,7 +42,7 @@ var ErrCancelled = errors.New("Cancelled")
 // IKeepClient is the minimal set of Keep API methods used by crunch-run.
 type IKeepClient interface {
        PutHB(hash string, buf []byte) (string, int, error)
-       ManifestFileReader(m manifest.Manifest, filename string) (keepclient.ReadCloserWithLen, error)
+       ManifestFileReader(m manifest.Manifest, filename string) (keepclient.Reader, error)
 }
 
 // NewLogWriter is a factory function to create a new log writer.
@@ -121,20 +124,29 @@ func (runner *ContainerRunner) SetupSignals() {
        signal.Notify(runner.SigChan, syscall.SIGINT)
        signal.Notify(runner.SigChan, syscall.SIGQUIT)
 
-       go func(sig <-chan os.Signal) {
-               for range sig {
-                       if !runner.Cancelled {
-                               runner.CancelLock.Lock()
-                               runner.Cancelled = true
-                               if runner.ContainerID != "" {
-                                       runner.Docker.StopContainer(runner.ContainerID, 10)
-                               }
-                               runner.CancelLock.Unlock()
-                       }
-               }
+       go func(sig chan os.Signal) {
+               <-sig
+               runner.stop()
+               signal.Stop(sig)
        }(runner.SigChan)
 }
 
+// stop marks the runner as cancelled and, if a container ID has been
+// recorded, asks Docker to stop that container.
+//
+// CancelLock is held for the whole call, and the call returns immediately
+// when Cancelled is already set, so StopContainer is invoked at most once
+// no matter how many callers (signal handler, arv-mount exit) trigger it.
+// If stop runs before ContainerID is set, it only sets the Cancelled flag.
+// A StopContainer failure is logged with the standard logger, not returned.
+func (runner *ContainerRunner) stop() {
+       runner.CancelLock.Lock()
+       defer runner.CancelLock.Unlock()
+       if runner.Cancelled {
+               return
+       }
+       runner.Cancelled = true
+       if runner.ContainerID != "" {
+               err := runner.Docker.StopContainer(runner.ContainerID, 10) // NOTE(review): 10 is presumably the stop timeout in seconds; confirm against dockerclient
+               if err != nil {
+                       log.Printf("StopContainer failed: %s", err)
+               }
+       }
+}
+
 // LoadImage determines the docker image id from the container record and
 // checks if it is available in the local Docker image store.  If not, it loads
 // the image from Keep.
@@ -237,8 +249,15 @@ func (runner *ContainerRunner) ArvMountCmd(arvMountCmd []string, token string) (
        return c, nil
 }
 
+// SetupArvMountPoint sets runner.ArvMountPoint to the result of
+// runner.MkTempDir("", prefix), unless ArvMountPoint is already non-empty,
+// in which case it leaves the existing value untouched and returns nil.
+// Any error from MkTempDir is returned to the caller.
+func (runner *ContainerRunner) SetupArvMountPoint(prefix string) (err error) {
+       if runner.ArvMountPoint == "" {
+               runner.ArvMountPoint, err = runner.MkTempDir("", prefix)
+       }
+       return
+}
+
 func (runner *ContainerRunner) SetupMounts() (err error) {
-       runner.ArvMountPoint, err = runner.MkTempDir("", "keep")
+       err = runner.SetupArvMountPoint("keep")
        if err != nil {
                return fmt.Errorf("While creating keep mount temp dir: %v", err)
        }
@@ -248,10 +267,23 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
        pdhOnly := true
        tmpcount := 0
        arvMountCmd := []string{"--foreground", "--allow-other", "--read-write"}
+
+       if runner.Container.RuntimeConstraints.KeepCacheRAM > 0 {
+               arvMountCmd = append(arvMountCmd, "--file-cache", fmt.Sprintf("%d", runner.Container.RuntimeConstraints.KeepCacheRAM))
+       }
+
        collectionPaths := []string{}
        runner.Binds = nil
+       needCertMount := true
 
-       for bind, mnt := range runner.Container.Mounts {
+       var binds []string
+       for bind, _ := range runner.Container.Mounts {
+               binds = append(binds, bind)
+       }
+       sort.Strings(binds)
+
+       for _, bind := range binds {
+               mnt := runner.Container.Mounts[bind]
                if bind == "stdout" {
                        // Is it a "file" mount kind?
                        if mnt.Kind != "file" {
@@ -268,6 +300,16 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                        }
                }
 
+               if bind == "/etc/arvados/ca-certificates.crt" {
+                       needCertMount = false
+               }
+
+               if strings.HasPrefix(bind, runner.Container.OutputPath+"/") && bind != runner.Container.OutputPath+"/" {
+                       if mnt.Kind != "collection" {
+                               return fmt.Errorf("Only mount points of kind 'collection' are supported underneath the output_path: %v", bind)
+                       }
+               }
+
                switch {
                case mnt.Kind == "collection":
                        var src string
@@ -284,7 +326,21 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                                if mnt.Writable {
                                        return fmt.Errorf("Can never write to a collection specified by portable data hash")
                                }
+                               idx := strings.Index(mnt.PortableDataHash, "/")
+                               if idx > 0 {
+                                       mnt.Path = path.Clean(mnt.PortableDataHash[idx:])
+                                       mnt.PortableDataHash = mnt.PortableDataHash[0:idx]
+                                       runner.Container.Mounts[bind] = mnt
+                               }
                                src = fmt.Sprintf("%s/by_id/%s", runner.ArvMountPoint, mnt.PortableDataHash)
+                               if mnt.Path != "" && mnt.Path != "." {
+                                       if strings.HasPrefix(mnt.Path, "./") {
+                                               mnt.Path = mnt.Path[2:]
+                                       } else if strings.HasPrefix(mnt.Path, "/") {
+                                               mnt.Path = mnt.Path[1:]
+                                       }
+                                       src += "/" + mnt.Path
+                               }
                        } else {
                                src = fmt.Sprintf("%s/tmp%d", runner.ArvMountPoint, tmpcount)
                                arvMountCmd = append(arvMountCmd, "--mount-tmp")
@@ -294,6 +350,8 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                        if mnt.Writable {
                                if bind == runner.Container.OutputPath {
                                        runner.HostOutputDir = src
+                               } else if strings.HasPrefix(bind, runner.Container.OutputPath+"/") {
+                                       return fmt.Errorf("Writable mount points are not permitted underneath the output_path: %v", bind)
                                }
                                runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s", src, bind))
                        } else {
@@ -348,6 +406,16 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                return fmt.Errorf("Output path does not correspond to a writable mount point")
        }
 
+       if wantAPI := runner.Container.RuntimeConstraints.API; needCertMount && wantAPI != nil && *wantAPI {
+               for _, certfile := range arvadosclient.CertFiles {
+                       _, err := os.Stat(certfile)
+                       if err == nil {
+                               runner.Binds = append(runner.Binds, fmt.Sprintf("%s:/etc/arvados/ca-certificates.crt:ro", certfile))
+                               break
+                       }
+               }
+       }
+
        if pdhOnly {
                arvMountCmd = append(arvMountCmd, "--mount-by-pdh", "by_id")
        } else {
@@ -542,12 +610,22 @@ func (runner *ContainerRunner) StartContainer() error {
 func (runner *ContainerRunner) WaitFinish() error {
        runner.CrunchLog.Print("Waiting for container to finish")
 
-       result := runner.Docker.Wait(runner.ContainerID)
-       wr := <-result
-       if wr.Error != nil {
-               return fmt.Errorf("While waiting for container to finish: %v", wr.Error)
+       waitDocker := runner.Docker.Wait(runner.ContainerID)
+       waitMount := runner.ArvMountExit
+       for waitDocker != nil {
+               select {
+               case err := <-waitMount:
+                       runner.CrunchLog.Printf("arv-mount exited before container finished: %v", err)
+                       waitMount = nil
+                       runner.stop()
+               case wr := <-waitDocker:
+                       if wr.Error != nil {
+                               return fmt.Errorf("While waiting for container to finish: %v", wr.Error)
+                       }
+                       runner.ExitCode = &wr.ExitCode
+                       waitDocker = nil
+               }
        }
-       runner.ExitCode = &wr.ExitCode
 
        // wait for stdout/stderr to complete
        <-runner.loggingDone
@@ -561,6 +639,21 @@ func (runner *ContainerRunner) CaptureOutput() error {
                return nil
        }
 
+       if wantAPI := runner.Container.RuntimeConstraints.API; wantAPI != nil && *wantAPI {
+               // Output may have been set directly by the container, so
+               // refresh the container record to check.
+               err := runner.ArvClient.Get("containers", runner.Container.UUID,
+                       nil, &runner.Container)
+               if err != nil {
+                       return err
+               }
+               if runner.Container.Output != "" {
+                       // Container output is already set.
+                       runner.OutputPDH = &runner.Container.Output
+                       return nil
+               }
+       }
+
        if runner.HostOutputDir == "" {
                return nil
        }
@@ -597,20 +690,93 @@ func (runner *ContainerRunner) CaptureOutput() error {
                manifestText = rec.ManifestText
        }
 
+       // Pre-populate output from the configured mount points
+       var binds []string
+       for bind, _ := range runner.Container.Mounts {
+               binds = append(binds, bind)
+       }
+       sort.Strings(binds)
+
+       for _, bind := range binds {
+               mnt := runner.Container.Mounts[bind]
+
+               bindSuffix := strings.TrimPrefix(bind, runner.Container.OutputPath)
+
+               if bindSuffix == bind || len(bindSuffix) <= 0 {
+                       // either does not start with OutputPath or is OutputPath itself
+                       continue
+               }
+
+               if mnt.ExcludeFromOutput == true {
+                       continue
+               }
+
+               // append to manifest_text
+               m, err := runner.getCollectionManifestForPath(mnt, bindSuffix)
+               if err != nil {
+                       return err
+               }
+
+               manifestText = manifestText + m
+       }
+
+       // Save output
        var response arvados.Collection
+       manifest := manifest.Manifest{Text: manifestText}
+       manifestText = manifest.Extract(".", ".").Text
        err = runner.ArvClient.Create("collections",
                arvadosclient.Dict{
                        "collection": arvadosclient.Dict{
+                               "is_trashed":    true,
+                               "name":          "output for " + runner.Container.UUID,
                                "manifest_text": manifestText}},
                &response)
        if err != nil {
                return fmt.Errorf("While creating output collection: %v", err)
        }
+       runner.OutputPDH = &response.PortableDataHash
+       return nil
+}
 
-       runner.OutputPDH = new(string)
-       *runner.OutputPDH = response.PortableDataHash
+// outputCollections caches collection records fetched by
+// getCollectionManifestForPath, keyed by portable data hash.
+// NOTE(review): package-level map with no locking visible here; confirm it
+// is only ever accessed from a single goroutine.
+var outputCollections = make(map[string]arvados.Collection)
+
+// getCollectionManifestForPath fetches the collection identified by
+// mnt.PortableDataHash (reusing a cached copy from outputCollections when
+// one exists) and returns the manifest_text fragment that
+// manifest.Extract(mnt.Path, bindSuffix) produces for it.
+//
+// It returns "" with a nil error, after logging to CrunchLog, when the
+// collection has no manifest text. It returns an error when the collection
+// cannot be fetched or when Extract reports an error.
+//
+// Intended rewriting, as described by the original author:
+//    If mnt.Path is not specified,
+//      return the entire manifest_text after replacing any "." with bindSuffix
+//    If mnt.Path corresponds to one stream,
+//      return the manifest_text for that stream after replacing that stream name with bindSuffix
+//    Otherwise, check if a filename in any one stream is being sought. Return the manifest_text
+//      for that stream after replacing stream name with bindSuffix minus the last word
+//      and the file name with last word of the bindSuffix
+//  Allowed path examples:
+//    "path":"/"
+//    "path":"/subdir1"
+//    "path":"/subdir1/subdir2"
+//    "path":"/subdir/filename" etc
+func (runner *ContainerRunner) getCollectionManifestForPath(mnt arvados.Mount, bindSuffix string) (string, error) {
+       collection := outputCollections[mnt.PortableDataHash]
+       if collection.PortableDataHash == "" { // zero-value record: not cached yet
+               err := runner.ArvClient.Get("collections", mnt.PortableDataHash, nil, &collection)
+               if err != nil {
+                       return "", fmt.Errorf("While getting collection for %v: %v", mnt.PortableDataHash, err)
+               }
+               outputCollections[mnt.PortableDataHash] = collection
+       }
 
-       return nil
+       if collection.ManifestText == "" {
+               runner.CrunchLog.Printf("No manifest text for collection %v", collection.PortableDataHash)
+               return "", nil
+       }
+
+       mft := manifest.Manifest{Text: collection.ManifestText}
+       extracted := mft.Extract(mnt.Path, bindSuffix)
+       if extracted.Err != nil {
+               return "", fmt.Errorf("Error parsing manifest for %v: %v", mnt.PortableDataHash, extracted.Err.Error())
+       }
+       return extracted.Text, nil
 }
 
 func (runner *ContainerRunner) CleanupDirs() {
@@ -665,15 +831,14 @@ func (runner *ContainerRunner) CommitLogs() error {
        err = runner.ArvClient.Create("collections",
                arvadosclient.Dict{
                        "collection": arvadosclient.Dict{
+                               "is_trashed":    true,
                                "name":          "logs for " + runner.Container.UUID,
                                "manifest_text": mt}},
                &response)
        if err != nil {
                return fmt.Errorf("While creating log collection: %v", err)
        }
-
        runner.LogsPDH = &response.PortableDataHash
-
        return nil
 }
 
@@ -709,10 +874,10 @@ func (runner *ContainerRunner) ContainerToken() (string, error) {
 func (runner *ContainerRunner) UpdateContainerFinal() error {
        update := arvadosclient.Dict{}
        update["state"] = runner.finalState
+       if runner.LogsPDH != nil {
+               update["log"] = *runner.LogsPDH
+       }
        if runner.finalState == "Complete" {
-               if runner.LogsPDH != nil {
-                       update["log"] = *runner.LogsPDH
-               }
                if runner.ExitCode != nil {
                        update["exit_code"] = *runner.ExitCode
                }
@@ -772,6 +937,7 @@ func (runner *ContainerRunner) Run() (err error) {
                checkErr(err)
 
                if runner.finalState == "Queued" {
+                       runner.CrunchLog.Close()
                        runner.UpdateContainerFinal()
                        return
                }
@@ -804,6 +970,7 @@ func (runner *ContainerRunner) Run() (err error) {
        // check for and/or load image
        err = runner.LoadImage()
        if err != nil {
+               runner.finalState = "Cancelled"
                err = fmt.Errorf("While loading container image: %v", err)
                return
        }
@@ -811,6 +978,7 @@ func (runner *ContainerRunner) Run() (err error) {
        // set up FUSE mount and binds
        err = runner.SetupMounts()
        if err != nil {
+               runner.finalState = "Cancelled"
                err = fmt.Errorf("While setting up mounts: %v", err)
                return
        }
@@ -866,10 +1034,15 @@ func main() {
        cgroupRoot := flag.String("cgroup-root", "/sys/fs/cgroup", "path to sysfs cgroup tree")
        cgroupParent := flag.String("cgroup-parent", "docker", "name of container's parent cgroup (ignored if -cgroup-parent-subsystem is used)")
        cgroupParentSubsystem := flag.String("cgroup-parent-subsystem", "", "use current cgroup for given subsystem as parent cgroup for container")
+       caCertsPath := flag.String("ca-certs", "", "Path to TLS root certificates")
        flag.Parse()
 
        containerId := flag.Arg(0)
 
+       if *caCertsPath != "" {
+               arvadosclient.CertFiles = []string{*caCertsPath}
+       }
+
        api, err := arvadosclient.MakeArvadosClient()
        if err != nil {
                log.Fatalf("%s: %v", containerId, err)
@@ -877,7 +1050,7 @@ func main() {
        api.Retries = 8
 
        var kc *keepclient.KeepClient
-       kc, err = keepclient.MakeKeepClient(&api)
+       kc, err = keepclient.MakeKeepClient(api)
        if err != nil {
                log.Fatalf("%s: %v", containerId, err)
        }