X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/f6bdb550ec87fd38f528f5eb67925d6bcf5af22f..1230d8a106c5c62edcbb9fcf6d1b94585e5596b2:/services/crunch-run/crunchrun.go?ds=sidebyside diff --git a/services/crunch-run/crunchrun.go b/services/crunch-run/crunchrun.go index 0b59f7df91..88c93e56c6 100644 --- a/services/crunch-run/crunchrun.go +++ b/services/crunch-run/crunchrun.go @@ -5,12 +5,6 @@ import ( "errors" "flag" "fmt" - "git.curoverse.com/arvados.git/lib/crunchstat" - "git.curoverse.com/arvados.git/sdk/go/arvados" - "git.curoverse.com/arvados.git/sdk/go/arvadosclient" - "git.curoverse.com/arvados.git/sdk/go/keepclient" - "git.curoverse.com/arvados.git/sdk/go/manifest" - "github.com/curoverse/dockerclient" "io" "io/ioutil" "log" @@ -24,6 +18,13 @@ import ( "sync" "syscall" "time" + + "git.curoverse.com/arvados.git/lib/crunchstat" + "git.curoverse.com/arvados.git/sdk/go/arvados" + "git.curoverse.com/arvados.git/sdk/go/arvadosclient" + "git.curoverse.com/arvados.git/sdk/go/keepclient" + "git.curoverse.com/arvados.git/sdk/go/manifest" + "github.com/curoverse/dockerclient" ) // IArvadosClient is the minimal Arvados API methods used by crunch-run. @@ -96,6 +97,7 @@ type ContainerRunner struct { ArvMountExit chan error finalState string + infoLogger io.WriteCloser statLogger io.WriteCloser statReporter *crunchstat.Reporter statInterval time.Duration @@ -123,20 +125,29 @@ func (runner *ContainerRunner) SetupSignals() { signal.Notify(runner.SigChan, syscall.SIGINT) signal.Notify(runner.SigChan, syscall.SIGQUIT) - go func(sig <-chan os.Signal) { - for range sig { - if !runner.Cancelled { - runner.CancelLock.Lock() - runner.Cancelled = true - if runner.ContainerID != "" { - runner.Docker.StopContainer(runner.ContainerID, 10) - } - runner.CancelLock.Unlock() - } - } + go func(sig chan os.Signal) { + <-sig + runner.stop() + signal.Stop(sig) }(runner.SigChan) } +// stop the underlying Docker container. +func (runner *ContainerRunner) stop() { + runner.CancelLock.Lock() + defer runner.CancelLock.Unlock() + if runner.Cancelled { + return + } + runner.Cancelled = true + if runner.ContainerID != "" { + err := runner.Docker.StopContainer(runner.ContainerID, 10) + if err != nil { + log.Printf("StopContainer failed: %s", err) + } + } +} + // LoadImage determines the docker image id from the container record and // checks if it is available in the local Docker image store. If not, it loads // the image from Keep. @@ -494,6 +505,51 @@ func (runner *ContainerRunner) StartCrunchstat() { runner.statReporter.Start() } +type infoCommand struct { + label string + command string + args []string +} + +func newInfoCommand(label string, command string) infoCommand { + cmd := strings.Split(command, " ") + return infoCommand{ + label: label, + command: cmd[0], + args: cmd[1:], + } +} + +// Gather node information and store it on the log for debugging +// purposes. +func (runner *ContainerRunner) LogNodeInfo() (err error) { + w := runner.NewLogWriter("node-info") + logger := log.New(w, "node-info", 0) + + commands := []infoCommand{ + newInfoCommand("Host Information", "uname -a"), + newInfoCommand("CPU Information", "cat /proc/cpuinfo"), + newInfoCommand("Memory Information", "cat /proc/meminfo"), + newInfoCommand("Disk Space", "df -m"), + } + + var out []byte + for _, command := range commands { + out, err = exec.Command(command.command, command.args...).Output() + if err != nil { + return fmt.Errorf("While running command '%s': %v", + command.command, err) + } + logger.Printf("%s:\n%s\n", command.label, out) + } + + err = w.Close() + if err != nil { + return fmt.Errorf("While closing node-info logs: %v", err) + } + return nil +} + // AttachLogs connects the docker container stdout and stderr logs to the // Arvados logger which logs to Keep and the API server logs table. func (runner *ContainerRunner) AttachStreams() (err error) { @@ -600,12 +656,22 @@ func (runner *ContainerRunner) StartContainer() error { func (runner *ContainerRunner) WaitFinish() error { runner.CrunchLog.Print("Waiting for container to finish") - result := runner.Docker.Wait(runner.ContainerID) - wr := <-result - if wr.Error != nil { - return fmt.Errorf("While waiting for container to finish: %v", wr.Error) + waitDocker := runner.Docker.Wait(runner.ContainerID) + waitMount := runner.ArvMountExit + for waitDocker != nil { + select { + case err := <-waitMount: + runner.CrunchLog.Printf("arv-mount exited before container finished: %v", err) + waitMount = nil + runner.stop() + case wr := <-waitDocker: + if wr.Error != nil { + return fmt.Errorf("While waiting for container to finish: %v", wr.Error) + } + runner.ExitCode = &wr.ExitCode + waitDocker = nil + } } - runner.ExitCode = &wr.ExitCode // wait for stdout/stderr to complete <-runner.loggingDone @@ -649,7 +715,7 @@ func (runner *ContainerRunner) CaptureOutput() error { _, err = os.Stat(collectionMetafile) if err != nil { // Regular directory - cw := CollectionWriter{runner.Kc, nil, sync.Mutex{}} + cw := CollectionWriter{0, runner.Kc, nil, nil, sync.Mutex{}} manifestText, err = cw.WriteTree(runner.HostOutputDir, runner.CrunchLog.Logger) if err != nil { return fmt.Errorf("While uploading output files: %v", err) @@ -706,6 +772,7 @@ func (runner *ContainerRunner) CaptureOutput() error { manifestText = manifest.Extract(".", ".").Text err = runner.ArvClient.Create("collections", arvadosclient.Dict{ + "ensure_unique_name": true, "collection": arvadosclient.Dict{ "is_trashed": true, "name": "output for " + runner.Container.UUID, @@ -810,6 +877,7 @@ func (runner *ContainerRunner) CommitLogs() error { var response arvados.Collection err = runner.ArvClient.Create("collections", arvadosclient.Dict{ + "ensure_unique_name": true, "collection": arvadosclient.Dict{ "is_trashed": true, "name": "logs for " + runner.Container.UUID, @@ -968,6 +1036,12 @@ func (runner *ContainerRunner) Run() (err error) { return } + // Gather and record node information + err = runner.LogNodeInfo() + if err != nil { + return + } + runner.StartCrunchstat() if runner.IsCancelled() { @@ -1002,7 +1076,7 @@ func NewContainerRunner(api IArvadosClient, cr.NewLogWriter = cr.NewArvLogWriter cr.RunArvMount = cr.ArvMountCmd cr.MkTempDir = ioutil.TempDir - cr.LogCollection = &CollectionWriter{kc, nil, sync.Mutex{}} + cr.LogCollection = &CollectionWriter{0, kc, nil, nil, sync.Mutex{}} cr.Container.UUID = containerUUID cr.CrunchLog = NewThrottledLogger(cr.NewLogWriter("crunch-run")) cr.CrunchLog.Immediate = log.New(os.Stderr, containerUUID+" ", 0)