11170: Treat the squeue/scancel calls as files instead of treating them as processes...
[arvados.git] / services / crunch-run / crunchrun.go
index f2b427764d658fb58b8cf6f5a2735b63977befae..3b3cdf1c14a872dacbc76b5679db9970609907dd 100644 (file)
@@ -5,12 +5,6 @@ import (
        "errors"
        "flag"
        "fmt"
-       "git.curoverse.com/arvados.git/lib/crunchstat"
-       "git.curoverse.com/arvados.git/sdk/go/arvados"
-       "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
-       "git.curoverse.com/arvados.git/sdk/go/keepclient"
-       "git.curoverse.com/arvados.git/sdk/go/manifest"
-       "github.com/curoverse/dockerclient"
        "io"
        "io/ioutil"
        "log"
@@ -24,6 +18,13 @@ import (
        "sync"
        "syscall"
        "time"
+
+       "git.curoverse.com/arvados.git/lib/crunchstat"
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+       "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+       "git.curoverse.com/arvados.git/sdk/go/keepclient"
+       "git.curoverse.com/arvados.git/sdk/go/manifest"
+       "github.com/curoverse/dockerclient"
 )
 
 // IArvadosClient is the minimal Arvados API methods used by crunch-run.
@@ -41,7 +42,7 @@ var ErrCancelled = errors.New("Cancelled")
 // IKeepClient is the minimal Keep API methods used by crunch-run.
 type IKeepClient interface {
        PutHB(hash string, buf []byte) (string, int, error)
-       ManifestFileReader(m manifest.Manifest, filename string) (keepclient.ReadCloserWithLen, error)
+       ManifestFileReader(m manifest.Manifest, filename string) (keepclient.Reader, error)
 }
 
 // NewLogWriter is a factory function to create a new log writer.
@@ -95,7 +96,6 @@ type ContainerRunner struct {
        SigChan        chan os.Signal
        ArvMountExit   chan error
        finalState     string
-       trashLifetime  time.Duration
 
        statLogger   io.WriteCloser
        statReporter *crunchstat.Reporter
@@ -124,20 +124,29 @@ func (runner *ContainerRunner) SetupSignals() {
        signal.Notify(runner.SigChan, syscall.SIGINT)
        signal.Notify(runner.SigChan, syscall.SIGQUIT)
 
-       go func(sig <-chan os.Signal) {
-               for range sig {
-                       if !runner.Cancelled {
-                               runner.CancelLock.Lock()
-                               runner.Cancelled = true
-                               if runner.ContainerID != "" {
-                                       runner.Docker.StopContainer(runner.ContainerID, 10)
-                               }
-                               runner.CancelLock.Unlock()
-                       }
-               }
+       go func(sig chan os.Signal) {
+               <-sig
+               runner.stop()
+               signal.Stop(sig)
        }(runner.SigChan)
 }
 
+// stop the underlying Docker container.
+func (runner *ContainerRunner) stop() {
+       runner.CancelLock.Lock()
+       defer runner.CancelLock.Unlock()
+       if runner.Cancelled {
+               return
+       }
+       runner.Cancelled = true
+       if runner.ContainerID != "" {
+               err := runner.Docker.StopContainer(runner.ContainerID, 10)
+               if err != nil {
+                       log.Printf("StopContainer failed: %s", err)
+               }
+       }
+}
+
 // LoadImage determines the docker image id from the container record and
 // checks if it is available in the local Docker image store.  If not, it loads
 // the image from Keep.
@@ -240,8 +249,6 @@ func (runner *ContainerRunner) ArvMountCmd(arvMountCmd []string, token string) (
        return c, nil
 }
 
-var tmpBackedOutputDir = false
-
 func (runner *ContainerRunner) SetupArvMountPoint(prefix string) (err error) {
        if runner.ArvMountPoint == "" {
                runner.ArvMountPoint, err = runner.MkTempDir("", prefix)
@@ -319,7 +326,21 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                                if mnt.Writable {
                                        return fmt.Errorf("Can never write to a collection specified by portable data hash")
                                }
+                               idx := strings.Index(mnt.PortableDataHash, "/")
+                               if idx > 0 {
+                                       mnt.Path = path.Clean(mnt.PortableDataHash[idx:])
+                                       mnt.PortableDataHash = mnt.PortableDataHash[0:idx]
+                                       runner.Container.Mounts[bind] = mnt
+                               }
                                src = fmt.Sprintf("%s/by_id/%s", runner.ArvMountPoint, mnt.PortableDataHash)
+                               if mnt.Path != "" && mnt.Path != "." {
+                                       if strings.HasPrefix(mnt.Path, "./") {
+                                               mnt.Path = mnt.Path[2:]
+                                       } else if strings.HasPrefix(mnt.Path, "/") {
+                                               mnt.Path = mnt.Path[1:]
+                                       }
+                                       src += "/" + mnt.Path
+                               }
                        } else {
                                src = fmt.Sprintf("%s/tmp%d", runner.ArvMountPoint, tmpcount)
                                arvMountCmd = append(arvMountCmd, "--mount-tmp")
@@ -353,7 +374,6 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                        }
                        runner.CleanupTempDir = append(runner.CleanupTempDir, runner.HostOutputDir)
                        runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s", runner.HostOutputDir, bind))
-                       tmpBackedOutputDir = true
 
                case mnt.Kind == "tmp":
                        runner.Binds = append(runner.Binds, bind)
@@ -590,12 +610,22 @@ func (runner *ContainerRunner) StartContainer() error {
 func (runner *ContainerRunner) WaitFinish() error {
        runner.CrunchLog.Print("Waiting for container to finish")
 
-       result := runner.Docker.Wait(runner.ContainerID)
-       wr := <-result
-       if wr.Error != nil {
-               return fmt.Errorf("While waiting for container to finish: %v", wr.Error)
+       waitDocker := runner.Docker.Wait(runner.ContainerID)
+       waitMount := runner.ArvMountExit
+       for waitDocker != nil {
+               select {
+               case err := <-waitMount:
+                       runner.CrunchLog.Printf("arv-mount exited before container finished: %v", err)
+                       waitMount = nil
+                       runner.stop()
+               case wr := <-waitDocker:
+                       if wr.Error != nil {
+                               return fmt.Errorf("While waiting for container to finish: %v", wr.Error)
+                       }
+                       runner.ExitCode = &wr.ExitCode
+                       waitDocker = nil
+               }
        }
-       runner.ExitCode = &wr.ExitCode
 
        // wait for stdout/stderr to complete
        <-runner.loggingDone
@@ -639,7 +669,7 @@ func (runner *ContainerRunner) CaptureOutput() error {
        _, err = os.Stat(collectionMetafile)
        if err != nil {
                // Regular directory
-               cw := CollectionWriter{runner.Kc, nil, sync.Mutex{}}
+               cw := CollectionWriter{0, runner.Kc, nil, nil, sync.Mutex{}}
                manifestText, err = cw.WriteTree(runner.HostOutputDir, runner.CrunchLog.Logger)
                if err != nil {
                        return fmt.Errorf("While uploading output files: %v", err)
@@ -673,34 +703,14 @@ func (runner *ContainerRunner) CaptureOutput() error {
                bindSuffix := strings.TrimPrefix(bind, runner.Container.OutputPath)
 
                if bindSuffix == bind || len(bindSuffix) <= 0 {
-                       // either doesn't start with OutputPath or is OutputPath itself
+                       // either does not start with OutputPath or is OutputPath itself
                        continue
                }
 
-               if strings.Index(bindSuffix, "/") != 0 {
-                       return fmt.Errorf("Expected bind to be of the format '%v/*' but found: %v", runner.Container.OutputPath, bind)
-               }
-
-               jsondata, err := json.Marshal(mnt.Content)
-               if err != nil {
-                       return fmt.Errorf("While marshal of mount content: %v", err)
-               }
-               var content map[string]interface{}
-               err = json.Unmarshal(jsondata, &content)
-               if err != nil {
-                       return fmt.Errorf("While unmarshal of mount content: %v", err)
-               }
-
-               if content["exclude_from_output"] == true {
+               if mnt.ExcludeFromOutput == true {
                        continue
                }
 
-               idx := strings.Index(mnt.PortableDataHash, "/")
-               if idx > 0 {
-                       mnt.Path = mnt.PortableDataHash[idx:]
-                       mnt.PortableDataHash = mnt.PortableDataHash[0 : idx-1]
-               }
-
                // append to manifest_text
                m, err := runner.getCollectionManifestForPath(mnt, bindSuffix)
                if err != nil {
@@ -712,10 +722,13 @@ func (runner *ContainerRunner) CaptureOutput() error {
 
        // Save output
        var response arvados.Collection
+       manifest := manifest.Manifest{Text: manifestText}
+       manifestText = manifest.Extract(".", ".").Text
        err = runner.ArvClient.Create("collections",
                arvadosclient.Dict{
+                       "ensure_unique_name": true,
                        "collection": arvadosclient.Dict{
-                               "trash_at":      time.Now().Add(runner.trashLifetime).Format(time.RFC3339),
+                               "is_trashed":    true,
                                "name":          "output for " + runner.Container.UUID,
                                "manifest_text": manifestText}},
                &response)
@@ -726,85 +739,45 @@ func (runner *ContainerRunner) CaptureOutput() error {
        return nil
 }
 
+var outputCollections = make(map[string]arvados.Collection)
+
 // Fetch the collection for the mnt.PortableDataHash
 // Return the manifest_text fragment corresponding to the specified mnt.Path
 //  after making any required updates.
 //  Ex:
-//    If mnt.Path is not speficied,
+//    If mnt.Path is not specified,
 //      return the entire manifest_text after replacing any "." with bindSuffix
 //    If mnt.Path corresponds to one stream,
-//      return the manitest_text for that stream after replacing that stream name with bindSuffix
-//    Otherwise, check if a filename in any one stream is being sought. Return the manitest_text
-//      for that stream after replacing that stream name and file name using bindSuffix components.
+//      return the manifest_text for that stream after replacing that stream name with bindSuffix
+//    Otherwise, check if a filename in any one stream is being sought. Return the manifest_text
+//      for that stream after replacing stream name with bindSuffix minus the last word
+//      and the file name with last word of the bindSuffix
+//  Allowed path examples:
+//    "path":"/"
+//    "path":"/subdir1"
+//    "path":"/subdir1/subdir2"
+//    "path":"/subdir/filename" etc
 func (runner *ContainerRunner) getCollectionManifestForPath(mnt arvados.Mount, bindSuffix string) (string, error) {
-       var collection arvados.Collection
-       err := runner.ArvClient.Get("collections", mnt.PortableDataHash, nil, &collection)
-       if err != nil {
-               return "", fmt.Errorf("While getting collection for %v: %v", mnt.PortableDataHash, err)
-       }
-
-       manifestText := ""
-       if mnt.Path == "" {
-               // no path specified; return the entire manifest text
-               manifestText = collection.ManifestText
-               manifestText = strings.Replace(manifestText, "./", "."+bindSuffix+"/", -1)
-               manifestText = strings.Replace(manifestText, ". ", "."+bindSuffix+" ", -1)
-       } else {
-               // either a stream or file from a stream is being sought
-               bindIdx := strings.LastIndex(bindSuffix, "/")
-               var bindSubdir, bindFileName string
-               if bindIdx >= 0 {
-                       bindSubdir = bindSuffix[0:bindIdx]
-                       bindFileName = bindSuffix[bindIdx+1:]
-               }
-               pathIdx := strings.LastIndex(mnt.Path, "/")
-               var pathSubdir, pathFileName string
-               if pathIdx >= 0 {
-                       pathSubdir = mnt.Path[0:pathIdx]
-                       pathFileName = mnt.Path[pathIdx+1:]
-               }
-               streams := strings.Split(collection.ManifestText, "\n")
-               for _, stream := range streams {
-                       tokens := strings.Split(stream, " ")
-                       if tokens[0] == "."+mnt.Path {
-                               // path refers to this stream
-                               adjustedStream := strings.Replace(stream, mnt.Path, bindSuffix, -1)
-                               manifestText = adjustedStream + "\n"
-                               break
-                       } else {
-                               // look for a matching file in this stream
-                               if tokens[0] == "."+pathSubdir {
-                                       // path refers to a file in this stream
-                                       for _, token := range tokens {
-                                               if strings.Index(token, ":"+pathFileName) > 0 {
-                                                       // found the file in the stream; discard all other file tokens
-                                                       for _, t := range tokens {
-                                                               if strings.Index(t, ":") == 0 {
-                                                                       manifestText = " " + t
-                                                               } else {
-                                                                       break // done reading all non-file tokens
-                                                               }
-                                                               token = strings.Replace(token, pathFileName, bindFileName, -1)
-                                                               manifestText = manifestText + token + "\n"
-                                                               manifestText = strings.Replace(manifestText, pathSubdir, bindSubdir, -1)
-                                                               break
-                                                       }
-                                               }
-                                       }
-                               }
-                       }
+       collection := outputCollections[mnt.PortableDataHash]
+       if collection.PortableDataHash == "" {
+               err := runner.ArvClient.Get("collections", mnt.PortableDataHash, nil, &collection)
+               if err != nil {
+                       return "", fmt.Errorf("While getting collection for %v: %v", mnt.PortableDataHash, err)
                }
+               outputCollections[mnt.PortableDataHash] = collection
        }
 
-       return manifestText, nil
-}
+       if collection.ManifestText == "" {
+               runner.CrunchLog.Printf("No manifest text for collection %v", collection.PortableDataHash)
+               return "", nil
+       }
 
-func (runner *ContainerRunner) loadDiscoveryVars() {
-       tl, err := runner.ArvClient.Discovery("defaultTrashLifetime")
-       if err != nil {
-               log.Fatalf("getting defaultTrashLifetime from discovery document: %s", err)
+       mft := manifest.Manifest{Text: collection.ManifestText}
+       extracted := mft.Extract(mnt.Path, bindSuffix)
+       if extracted.Err != nil {
+               return "", fmt.Errorf("Error parsing manifest for %v: %v", mnt.PortableDataHash, extracted.Err.Error())
        }
-       runner.trashLifetime = time.Duration(tl.(float64)) * time.Second
+       return extracted.Text, nil
 }
 
 func (runner *ContainerRunner) CleanupDirs() {
@@ -858,8 +831,9 @@ func (runner *ContainerRunner) CommitLogs() error {
        var response arvados.Collection
        err = runner.ArvClient.Create("collections",
                arvadosclient.Dict{
+                       "ensure_unique_name": true,
                        "collection": arvadosclient.Dict{
-                               "trash_at":      time.Now().Add(runner.trashLifetime).Format(time.RFC3339),
+                               "is_trashed":    true,
                                "name":          "logs for " + runner.Container.UUID,
                                "manifest_text": mt}},
                &response)
@@ -1050,11 +1024,10 @@ func NewContainerRunner(api IArvadosClient,
        cr.NewLogWriter = cr.NewArvLogWriter
        cr.RunArvMount = cr.ArvMountCmd
        cr.MkTempDir = ioutil.TempDir
-       cr.LogCollection = &CollectionWriter{kc, nil, sync.Mutex{}}
+       cr.LogCollection = &CollectionWriter{0, kc, nil, nil, sync.Mutex{}}
        cr.Container.UUID = containerUUID
        cr.CrunchLog = NewThrottledLogger(cr.NewLogWriter("crunch-run"))
        cr.CrunchLog.Immediate = log.New(os.Stderr, containerUUID+" ", 0)
-       cr.loadDiscoveryVars()
        return cr
 }