10218: Logging node information (cpu, mem, disk) by storing command outputs on the...

[arvados.git] / services / crunch-run / crunchrun.go
diff --git a/services/crunch-run/crunchrun.go b/services/crunch-run/crunchrun.go

index 2e475c72e64842b15aa6c7dee88446bc0056b802..88c93e56c63e883246173575e650327f5ff723e5 100644 (file)
--- a/services/crunch-run/crunchrun.go
+++ b/services/crunch-run/crunchrun.go
@@ -5,12 +5,6 @@ import (
         "errors"
         "flag"
         "fmt"
-       "git.curoverse.com/arvados.git/lib/crunchstat"
-       "git.curoverse.com/arvados.git/sdk/go/arvados"
-       "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
-       "git.curoverse.com/arvados.git/sdk/go/keepclient"
-       "git.curoverse.com/arvados.git/sdk/go/manifest"
-       "github.com/curoverse/dockerclient"
         "io"
         "io/ioutil"
         "log"
@@ -19,10 +13,18 @@ import (
         "os/signal"
         "path"
         "path/filepath"
+       "sort"
         "strings"
         "sync"
         "syscall"
         "time"
+
+       "git.curoverse.com/arvados.git/lib/crunchstat"
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
+       "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+       "git.curoverse.com/arvados.git/sdk/go/keepclient"
+       "git.curoverse.com/arvados.git/sdk/go/manifest"
+       "github.com/curoverse/dockerclient"
  )
  
  // IArvadosClient is the minimal Arvados API methods used by crunch-run.
@@ -40,7 +42,7 @@ var ErrCancelled = errors.New("Cancelled")
  // IKeepClient is the minimal Keep API methods used by crunch-run.
  type IKeepClient interface {
         PutHB(hash string, buf []byte) (string, int, error)
-       ManifestFileReader(m manifest.Manifest, filename string) (keepclient.ReadCloserWithLen, error)
+       ManifestFileReader(m manifest.Manifest, filename string) (keepclient.Reader, error)
  }
  
  // NewLogWriter is a factory function to create a new log writer.
@@ -94,8 +96,8 @@ type ContainerRunner struct {
         SigChan        chan os.Signal
         ArvMountExit   chan error
         finalState     string
-       trashLifetime  time.Duration
  
+       infoLogger   io.WriteCloser
         statLogger   io.WriteCloser
         statReporter *crunchstat.Reporter
         statInterval time.Duration
@@ -123,20 +125,29 @@ func (runner *ContainerRunner) SetupSignals() {
         signal.Notify(runner.SigChan, syscall.SIGINT)
         signal.Notify(runner.SigChan, syscall.SIGQUIT)
  
-       go func(sig <-chan os.Signal) {
-               for range sig {
-                       if !runner.Cancelled {
-                               runner.CancelLock.Lock()
-                               runner.Cancelled = true
-                               if runner.ContainerID != "" {
-                                       runner.Docker.StopContainer(runner.ContainerID, 10)
-                               }
-                               runner.CancelLock.Unlock()
-                       }
-               }
+       go func(sig chan os.Signal) {
+               <-sig
+               runner.stop()
+               signal.Stop(sig)
         }(runner.SigChan)
  }
  
+// stop cancels the run: it marks the runner as cancelled and, if a
+// Docker container ID has been recorded, asks Docker to stop that
+// container. Only the first call has any effect; later calls return
+// immediately. CancelLock is held for the duration of the call.
+func (runner *ContainerRunner) stop() {
+       runner.CancelLock.Lock()
+       defer runner.CancelLock.Unlock()
+
+       alreadyCancelled := runner.Cancelled
+       runner.Cancelled = true
+       if alreadyCancelled || runner.ContainerID == "" {
+               return
+       }
+
+       // 10 is StopContainer's timeout argument. A failure here is only
+       // logged: there is nothing further this method can do about it.
+       if err := runner.Docker.StopContainer(runner.ContainerID, 10); err != nil {
+               log.Printf("StopContainer failed: %s", err)
+       }
+}
+
  // LoadImage determines the docker image id from the container record and
  // checks if it is available in the local Docker image store.  If not, it loads
  // the image from Keep.
@@ -239,8 +250,15 @@ func (runner *ContainerRunner) ArvMountCmd(arvMountCmd []string, token string) (
         return c, nil
  }
  
+// SetupArvMountPoint creates a temporary directory with the given name
+// prefix via runner.MkTempDir and records its path in
+// runner.ArvMountPoint. If ArvMountPoint is already set, it is left
+// untouched and nil is returned.
+func (runner *ContainerRunner) SetupArvMountPoint(prefix string) (err error) {
+       if runner.ArvMountPoint != "" {
+               return nil
+       }
+       runner.ArvMountPoint, err = runner.MkTempDir("", prefix)
+       return err
+}
+
  func (runner *ContainerRunner) SetupMounts() (err error) {
-       runner.ArvMountPoint, err = runner.MkTempDir("", "keep")
+       err = runner.SetupArvMountPoint("keep")
         if err != nil {
                 return fmt.Errorf("While creating keep mount temp dir: %v", err)
         }
@@ -257,8 +275,16 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
  
         collectionPaths := []string{}
         runner.Binds = nil
+       needCertMount := true
+
+       var binds []string
+       for bind, _ := range runner.Container.Mounts {
+               binds = append(binds, bind)
+       }
+       sort.Strings(binds)
  
-       for bind, mnt := range runner.Container.Mounts {
+       for _, bind := range binds {
+               mnt := runner.Container.Mounts[bind]
                 if bind == "stdout" {
                         // Is it a "file" mount kind?
                         if mnt.Kind != "file" {
@@ -275,6 +301,16 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                         }
                 }
  
+               if bind == "/etc/arvados/ca-certificates.crt" {
+                       needCertMount = false
+               }
+
+               if strings.HasPrefix(bind, runner.Container.OutputPath+"/") && bind != runner.Container.OutputPath+"/" {
+                       if mnt.Kind != "collection" {
+                               return fmt.Errorf("Only mount points of kind 'collection' are supported underneath the output_path: %v", bind)
+                       }
+               }
+
                 switch {
                 case mnt.Kind == "collection":
                         var src string
@@ -291,7 +327,21 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                                 if mnt.Writable {
                                         return fmt.Errorf("Can never write to a collection specified by portable data hash")
                                 }
+                               idx := strings.Index(mnt.PortableDataHash, "/")
+                               if idx > 0 {
+                                       mnt.Path = path.Clean(mnt.PortableDataHash[idx:])
+                                       mnt.PortableDataHash = mnt.PortableDataHash[0:idx]
+                                       runner.Container.Mounts[bind] = mnt
+                               }
                                 src = fmt.Sprintf("%s/by_id/%s", runner.ArvMountPoint, mnt.PortableDataHash)
+                               if mnt.Path != "" && mnt.Path != "." {
+                                       if strings.HasPrefix(mnt.Path, "./") {
+                                               mnt.Path = mnt.Path[2:]
+                                       } else if strings.HasPrefix(mnt.Path, "/") {
+                                               mnt.Path = mnt.Path[1:]
+                                       }
+                                       src += "/" + mnt.Path
+                               }
                         } else {
                                 src = fmt.Sprintf("%s/tmp%d", runner.ArvMountPoint, tmpcount)
                                 arvMountCmd = append(arvMountCmd, "--mount-tmp")
@@ -301,6 +351,8 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                         if mnt.Writable {
                                 if bind == runner.Container.OutputPath {
                                         runner.HostOutputDir = src
+                               } else if strings.HasPrefix(bind, runner.Container.OutputPath+"/") {
+                                       return fmt.Errorf("Writable mount points are not permitted underneath the output_path: %v", bind)
                                 }
                                 runner.Binds = append(runner.Binds, fmt.Sprintf("%s:%s", src, bind))
                         } else {
@@ -355,6 +407,16 @@ func (runner *ContainerRunner) SetupMounts() (err error) {
                 return fmt.Errorf("Output path does not correspond to a writable mount point")
         }
  
+       if wantAPI := runner.Container.RuntimeConstraints.API; needCertMount && wantAPI != nil && *wantAPI {
+               for _, certfile := range arvadosclient.CertFiles {
+                       _, err := os.Stat(certfile)
+                       if err == nil {
+                               runner.Binds = append(runner.Binds, fmt.Sprintf("%s:/etc/arvados/ca-certificates.crt:ro", certfile))
+                               break
+                       }
+               }
+       }
+
         if pdhOnly {
                 arvMountCmd = append(arvMountCmd, "--mount-by-pdh", "by_id")
         } else {
@@ -443,6 +505,51 @@ func (runner *ContainerRunner) StartCrunchstat() {
         runner.statReporter.Start()
  }
  
+// infoCommand describes one external command whose output is recorded
+// in the node-info log.
+type infoCommand struct {
+       label   string   // heading printed before the command's output
+       command string   // executable name or path
+       args    []string // arguments passed to the executable
+}
+
+// newInfoCommand builds an infoCommand from a label and a
+// whitespace-separated command line. The first word is the executable
+// and the remaining words are its arguments; no shell quoting is
+// interpreted. An empty or all-whitespace command line yields an
+// infoCommand with an empty command and no arguments.
+func newInfoCommand(label string, command string) infoCommand {
+       // strings.Fields (unlike strings.Split on " ") ignores leading,
+       // trailing and repeated whitespace, so no empty-string arguments
+       // are handed to the executable.
+       words := strings.Fields(command)
+       if len(words) == 0 {
+               return infoCommand{label: label}
+       }
+       return infoCommand{
+               label:   label,
+               command: words[0],
+               args:    words[1:],
+       }
+}
+
+// LogNodeInfo gathers node information (host, CPU, memory and disk
+// space) by running a fixed list of commands and writes each command's
+// output, preceded by its label, to the "node-info" log for debugging
+// purposes.
+//
+// It returns an error if any command fails to run or if the log writer
+// cannot be closed. Commands after a failing one are not run.
+func (runner *ContainerRunner) LogNodeInfo() (err error) {
+       w := runner.NewLogWriter("node-info")
+       // Close the writer on every path, including when a command fails,
+       // so it is never left open. A close error is reported only when
+       // no earlier error is already being returned.
+       defer func() {
+               if closeErr := w.Close(); closeErr != nil && err == nil {
+                       err = fmt.Errorf("While closing node-info logs: %v", closeErr)
+               }
+       }()
+       logger := log.New(w, "node-info", 0)
+
+       commands := []infoCommand{
+               newInfoCommand("Host Information", "uname -a"),
+               newInfoCommand("CPU Information", "cat /proc/cpuinfo"),
+               newInfoCommand("Memory Information", "cat /proc/meminfo"),
+               newInfoCommand("Disk Space", "df -m"),
+       }
+
+       for _, command := range commands {
+               out, cmdErr := exec.Command(command.command, command.args...).Output()
+               if cmdErr != nil {
+                       return fmt.Errorf("While running command '%s': %v",
+                               command.command, cmdErr)
+               }
+               logger.Printf("%s:\n%s\n", command.label, out)
+       }
+       return nil
+}
+
  // AttachLogs connects the docker container stdout and stderr logs to the
  // Arvados logger which logs to Keep and the API server logs table.
  func (runner *ContainerRunner) AttachStreams() (err error) {
@@ -549,12 +656,22 @@ func (runner *ContainerRunner) StartContainer() error {
  func (runner *ContainerRunner) WaitFinish() error {
         runner.CrunchLog.Print("Waiting for container to finish")
  
-       result := runner.Docker.Wait(runner.ContainerID)
-       wr := <-result
-       if wr.Error != nil {
-               return fmt.Errorf("While waiting for container to finish: %v", wr.Error)
+       waitDocker := runner.Docker.Wait(runner.ContainerID)
+       waitMount := runner.ArvMountExit
+       for waitDocker != nil {
+               select {
+               case err := <-waitMount:
+                       runner.CrunchLog.Printf("arv-mount exited before container finished: %v", err)
+                       waitMount = nil
+                       runner.stop()
+               case wr := <-waitDocker:
+                       if wr.Error != nil {
+                               return fmt.Errorf("While waiting for container to finish: %v", wr.Error)
+                       }
+                       runner.ExitCode = &wr.ExitCode
+                       waitDocker = nil
+               }
         }
-       runner.ExitCode = &wr.ExitCode
  
         // wait for stdout/stderr to complete
         <-runner.loggingDone
@@ -598,7 +715,7 @@ func (runner *ContainerRunner) CaptureOutput() error {
         _, err = os.Stat(collectionMetafile)
         if err != nil {
                 // Regular directory
-               cw := CollectionWriter{runner.Kc, nil, sync.Mutex{}}
+               cw := CollectionWriter{0, runner.Kc, nil, nil, sync.Mutex{}}
                 manifestText, err = cw.WriteTree(runner.HostOutputDir, runner.CrunchLog.Logger)
                 if err != nil {
                         return fmt.Errorf("While uploading output files: %v", err)
@@ -619,11 +736,45 @@ func (runner *ContainerRunner) CaptureOutput() error {
                 manifestText = rec.ManifestText
         }
  
+       // Pre-populate output from the configured mount points
+       var binds []string
+       for bind, _ := range runner.Container.Mounts {
+               binds = append(binds, bind)
+       }
+       sort.Strings(binds)
+
+       for _, bind := range binds {
+               mnt := runner.Container.Mounts[bind]
+
+               bindSuffix := strings.TrimPrefix(bind, runner.Container.OutputPath)
+
+               if bindSuffix == bind || len(bindSuffix) <= 0 {
+                       // either does not start with OutputPath or is OutputPath itself
+                       continue
+               }
+
+               if mnt.ExcludeFromOutput == true {
+                       continue
+               }
+
+               // append to manifest_text
+               m, err := runner.getCollectionManifestForPath(mnt, bindSuffix)
+               if err != nil {
+                       return err
+               }
+
+               manifestText = manifestText + m
+       }
+
+       // Save output
         var response arvados.Collection
+       manifest := manifest.Manifest{Text: manifestText}
+       manifestText = manifest.Extract(".", ".").Text
         err = runner.ArvClient.Create("collections",
                 arvadosclient.Dict{
+                       "ensure_unique_name": true,
                         "collection": arvadosclient.Dict{
-                               "expires_at":    time.Now().Add(runner.trashLifetime).Format(time.RFC3339),
+                               "is_trashed":    true,
                                 "name":          "output for " + runner.Container.UUID,
                                 "manifest_text": manifestText}},
                 &response)
@@ -634,12 +785,45 @@ func (runner *ContainerRunner) CaptureOutput() error {
         return nil
  }
  
-func (runner *ContainerRunner) loadDiscoveryVars() {
-       tl, err := runner.ArvClient.Discovery("defaultTrashLifetime")
-       if err != nil {
-               log.Fatalf("getting defaultTrashLifetime from discovery document: %s", err)
+// outputCollections caches collection records fetched by
+// getCollectionManifestForPath, keyed by portable data hash, so each
+// collection is requested from the API at most once.
+var outputCollections = make(map[string]arvados.Collection)
+
+// getCollectionManifestForPath fetches the collection for
+// mnt.PortableDataHash (using the outputCollections cache) and returns
+// the manifest_text fragment corresponding to mnt.Path, after making
+// any required updates.
+//  Ex:
+//    If mnt.Path is not specified,
+//      return the entire manifest_text after replacing any "." with bindSuffix
+//    If mnt.Path corresponds to one stream,
+//      return the manifest_text for that stream after replacing that stream name with bindSuffix
+//    Otherwise, check if a filename in any one stream is being sought. Return the manifest_text
+//      for that stream after replacing stream name with bindSuffix minus the last word
+//      and the file name with last word of the bindSuffix
+//  Allowed path examples:
+//    "path":"/"
+//    "path":"/subdir1"
+//    "path":"/subdir1/subdir2"
+//    "path":"/subdir/filename" etc
+//
+// An error is returned if the collection cannot be retrieved or if
+// extracting the requested path from its manifest fails. A collection
+// with an empty manifest text is logged and yields "" with a nil error.
+func (runner *ContainerRunner) getCollectionManifestForPath(mnt arvados.Mount, bindSuffix string) (string, error) {
+       // An empty PortableDataHash on the looked-up value means the
+       // collection is not in the cache yet.
+       collection := outputCollections[mnt.PortableDataHash]
+       if collection.PortableDataHash == "" {
+               err := runner.ArvClient.Get("collections", mnt.PortableDataHash, nil, &collection)
+               if err != nil {
+                       return "", fmt.Errorf("While getting collection for %v: %v", mnt.PortableDataHash, err)
+               }
+               outputCollections[mnt.PortableDataHash] = collection
+       }
+
+       if collection.ManifestText == "" {
+               runner.CrunchLog.Printf("No manifest text for collection %v", collection.PortableDataHash)
+               return "", nil
         }
-       runner.trashLifetime = time.Duration(tl.(float64)) * time.Second
+
+       // Extract rewrites the part of the manifest selected by mnt.Path
+       // so that it appears under bindSuffix; failures are reported
+       // through the Err field of the result.
+       mft := manifest.Manifest{Text: collection.ManifestText}
+       extracted := mft.Extract(mnt.Path, bindSuffix)
+       if extracted.Err != nil {
+               return "", fmt.Errorf("Error parsing manifest for %v: %v", mnt.PortableDataHash, extracted.Err.Error())
+       }
+       return extracted.Text, nil
  }
  
  func (runner *ContainerRunner) CleanupDirs() {
@@ -693,8 +877,9 @@ func (runner *ContainerRunner) CommitLogs() error {
         var response arvados.Collection
         err = runner.ArvClient.Create("collections",
                 arvadosclient.Dict{
+                       "ensure_unique_name": true,
                         "collection": arvadosclient.Dict{
-                               "expires_at":    time.Now().Add(runner.trashLifetime).Format(time.RFC3339),
+                               "is_trashed":    true,
                                 "name":          "logs for " + runner.Container.UUID,
                                 "manifest_text": mt}},
                 &response)
@@ -737,10 +922,10 @@ func (runner *ContainerRunner) ContainerToken() (string, error) {
  func (runner *ContainerRunner) UpdateContainerFinal() error {
         update := arvadosclient.Dict{}
         update["state"] = runner.finalState
+       if runner.LogsPDH != nil {
+               update["log"] = *runner.LogsPDH
+       }
         if runner.finalState == "Complete" {
-               if runner.LogsPDH != nil {
-                       update["log"] = *runner.LogsPDH
-               }
                 if runner.ExitCode != nil {
                         update["exit_code"] = *runner.ExitCode
                 }
@@ -851,6 +1036,12 @@ func (runner *ContainerRunner) Run() (err error) {
                 return
         }
  
+       // Gather and record node information
+       err = runner.LogNodeInfo()
+       if err != nil {
+               return
+       }
+
         runner.StartCrunchstat()
  
         if runner.IsCancelled() {
@@ -885,11 +1076,10 @@ func NewContainerRunner(api IArvadosClient,
         cr.NewLogWriter = cr.NewArvLogWriter
         cr.RunArvMount = cr.ArvMountCmd
         cr.MkTempDir = ioutil.TempDir
-       cr.LogCollection = &CollectionWriter{kc, nil, sync.Mutex{}}
+       cr.LogCollection = &CollectionWriter{0, kc, nil, nil, sync.Mutex{}}
         cr.Container.UUID = containerUUID
         cr.CrunchLog = NewThrottledLogger(cr.NewLogWriter("crunch-run"))
         cr.CrunchLog.Immediate = log.New(os.Stderr, containerUUID+" ", 0)
-       cr.loadDiscoveryVars()
         return cr
  }
  
@@ -898,10 +1088,15 @@ func main() {
         cgroupRoot := flag.String("cgroup-root", "/sys/fs/cgroup", "path to sysfs cgroup tree")
         cgroupParent := flag.String("cgroup-parent", "docker", "name of container's parent cgroup (ignored if -cgroup-parent-subsystem is used)")
         cgroupParentSubsystem := flag.String("cgroup-parent-subsystem", "", "use current cgroup for given subsystem as parent cgroup for container")
+       caCertsPath := flag.String("ca-certs", "", "Path to TLS root certificates")
         flag.Parse()
  
         containerId := flag.Arg(0)
  
+       if *caCertsPath != "" {
+               arvadosclient.CertFiles = []string{*caCertsPath}
+       }
+
         api, err := arvadosclient.MakeArvadosClient()
         if err != nil {
                 log.Fatalf("%s: %v", containerId, err)