X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/3fa4a2b6138e3e9e468dd885a743ca38f08f0755..a54e88868ac259443e2cd8d5f6fddb4b8154acb9:/services/crunch-run/crunchrun.go diff --git a/services/crunch-run/crunchrun.go b/services/crunch-run/crunchrun.go index 0b59f7df91..e0d707a5a5 100644 --- a/services/crunch-run/crunchrun.go +++ b/services/crunch-run/crunchrun.go @@ -5,12 +5,6 @@ import ( "errors" "flag" "fmt" - "git.curoverse.com/arvados.git/lib/crunchstat" - "git.curoverse.com/arvados.git/sdk/go/arvados" - "git.curoverse.com/arvados.git/sdk/go/arvadosclient" - "git.curoverse.com/arvados.git/sdk/go/keepclient" - "git.curoverse.com/arvados.git/sdk/go/manifest" - "github.com/curoverse/dockerclient" "io" "io/ioutil" "log" @@ -24,6 +18,13 @@ import ( "sync" "syscall" "time" + + "git.curoverse.com/arvados.git/lib/crunchstat" + "git.curoverse.com/arvados.git/sdk/go/arvados" + "git.curoverse.com/arvados.git/sdk/go/arvadosclient" + "git.curoverse.com/arvados.git/sdk/go/keepclient" + "git.curoverse.com/arvados.git/sdk/go/manifest" + "github.com/curoverse/dockerclient" ) // IArvadosClient is the minimal Arvados API methods used by crunch-run. @@ -32,6 +33,7 @@ type IArvadosClient interface { Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error Update(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) error + CallRaw(method string, resourceType string, uuid string, action string, parameters arvadosclient.Dict) (reader io.ReadCloser, err error) Discovery(key string) (interface{}, error) } @@ -90,8 +92,6 @@ type ContainerRunner struct { CleanupTempDir []string Binds []string OutputPDH *string - CancelLock sync.Mutex - Cancelled bool SigChan chan os.Signal ArvMountExit chan error finalState string @@ -113,6 +113,10 @@ type ContainerRunner struct { // parent to be X" feature even on sites where the "specify // cgroup parent" feature breaks. setCgroupParent string + + cStateLock sync.Mutex + cStarted bool // StartContainer() succeeded + cCancelled bool // StopContainer() invoked } // SetupSignals sets up signal handling to gracefully terminate the underlying @@ -123,20 +127,29 @@ func (runner *ContainerRunner) SetupSignals() { signal.Notify(runner.SigChan, syscall.SIGINT) signal.Notify(runner.SigChan, syscall.SIGQUIT) - go func(sig <-chan os.Signal) { - for range sig { - if !runner.Cancelled { - runner.CancelLock.Lock() - runner.Cancelled = true - if runner.ContainerID != "" { - runner.Docker.StopContainer(runner.ContainerID, 10) - } - runner.CancelLock.Unlock() - } - } + go func(sig chan os.Signal) { + <-sig + runner.stop() + signal.Stop(sig) }(runner.SigChan) } +// stop the underlying Docker container. +func (runner *ContainerRunner) stop() { + runner.cStateLock.Lock() + defer runner.cStateLock.Unlock() + if runner.cCancelled { + return + } + runner.cCancelled = true + if runner.cStarted { + err := runner.Docker.StopContainer(runner.ContainerID, 10) + if err != nil { + log.Printf("StopContainer failed: %s", err) + } + } +} + // LoadImage determines the docker image id from the container record and // checks if it is available in the local Docker image store. If not, it loads // the image from Keep. @@ -494,6 +507,105 @@ func (runner *ContainerRunner) StartCrunchstat() { runner.statReporter.Start() } +type infoCommand struct { + label string + cmd []string +} + +// Gather node information and store it on the log for debugging +// purposes. +func (runner *ContainerRunner) LogNodeInfo() (err error) { + w := runner.NewLogWriter("node-info") + logger := log.New(w, "node-info", 0) + + commands := []infoCommand{ + infoCommand{ + label: "Host Information", + cmd: []string{"uname", "-a"}, + }, + infoCommand{ + label: "CPU Information", + cmd: []string{"cat", "/proc/cpuinfo"}, + }, + infoCommand{ + label: "Memory Information", + cmd: []string{"cat", "/proc/meminfo"}, + }, + infoCommand{ + label: "Disk Space", + cmd: []string{"df", "-m", "/"}, + }, + infoCommand{ + label: "Disk Space", + cmd: []string{"df", "-m", os.TempDir()}, + }, + infoCommand{ + label: "Disk INodes", + cmd: []string{"df", "-i", "/"}, + }, + infoCommand{ + label: "Disk INodes", + cmd: []string{"df", "-i", os.TempDir()}, + }, + } + + // Run commands with informational output to be logged. + var out []byte + for _, command := range commands { + out, err = exec.Command(command.cmd[0], command.cmd[1:]...).CombinedOutput() + if err != nil { + return fmt.Errorf("While running command %q: %v", + command.cmd, err) + } + logger.Println(command.label) + for _, line := range strings.Split(string(out), "\n") { + logger.Println(" ", line) + } + } + + err = w.Close() + if err != nil { + return fmt.Errorf("While closing node-info logs: %v", err) + } + return nil +} + +// Get and save the raw JSON container record from the API server +func (runner *ContainerRunner) LogContainerRecord() (err error) { + w := &ArvLogWriter{ + runner.ArvClient, + runner.Container.UUID, + "container", + runner.LogCollection.Open("container.json"), + } + // Get Container record JSON from the API Server + reader, err := runner.ArvClient.CallRaw("GET", "containers", runner.Container.UUID, "", nil) + if err != nil { + return fmt.Errorf("While retrieving container record from the API server: %v", err) + } + // Read the API server response as []byte + json_bytes, err := ioutil.ReadAll(reader) + if err != nil { + return fmt.Errorf("While reading container record API server response: %v", err) + } + // Decode the JSON []byte + var cr map[string]interface{} + if err = json.Unmarshal(json_bytes, &cr); err != nil { + return fmt.Errorf("While decoding the container record JSON response: %v", err) + } + // Re-encode it using indentation to improve readability + enc := json.NewEncoder(w) + enc.SetIndent("", " ") + if err = enc.Encode(cr); err != nil { + return fmt.Errorf("While logging the JSON container record: %v", err) + } + err = w.Close() + if err != nil { + return fmt.Errorf("While closing container.json log: %v", err) + } + return nil +} + // AttachLogs connects the docker container stdout and stderr logs to the // Arvados logger which logs to Keep and the API server logs table. func (runner *ContainerRunner) AttachStreams() (err error) { @@ -588,10 +700,16 @@ func (runner *ContainerRunner) CreateContainer() error { // StartContainer starts the docker container created by CreateContainer. func (runner *ContainerRunner) StartContainer() error { runner.CrunchLog.Printf("Starting Docker container id '%s'", runner.ContainerID) + runner.cStateLock.Lock() + defer runner.cStateLock.Unlock() + if runner.cCancelled { + return ErrCancelled + } err := runner.Docker.StartContainer(runner.ContainerID, &runner.HostConfig) if err != nil { return fmt.Errorf("could not start container: %v", err) } + runner.cStarted = true return nil } @@ -600,12 +718,22 @@ func (runner *ContainerRunner) StartContainer() error { func (runner *ContainerRunner) WaitFinish() error { runner.CrunchLog.Print("Waiting for container to finish") - result := runner.Docker.Wait(runner.ContainerID) - wr := <-result - if wr.Error != nil { - return fmt.Errorf("While waiting for container to finish: %v", wr.Error) + waitDocker := runner.Docker.Wait(runner.ContainerID) + waitMount := runner.ArvMountExit + for waitDocker != nil { + select { + case err := <-waitMount: + runner.CrunchLog.Printf("arv-mount exited before container finished: %v", err) + waitMount = nil + runner.stop() + case wr := <-waitDocker: + if wr.Error != nil { + return fmt.Errorf("While waiting for container to finish: %v", wr.Error) + } + runner.ExitCode = &wr.ExitCode + waitDocker = nil + } } - runner.ExitCode = &wr.ExitCode // wait for stdout/stderr to complete <-runner.loggingDone @@ -649,7 +777,7 @@ func (runner *ContainerRunner) CaptureOutput() error { _, err = os.Stat(collectionMetafile) if err != nil { // Regular directory - cw := CollectionWriter{runner.Kc, nil, sync.Mutex{}} + cw := CollectionWriter{0, runner.Kc, nil, nil, sync.Mutex{}} manifestText, err = cw.WriteTree(runner.HostOutputDir, runner.CrunchLog.Logger) if err != nil { return fmt.Errorf("While uploading output files: %v", err) @@ -706,6 +834,7 @@ func (runner *ContainerRunner) CaptureOutput() error { manifestText = manifest.Extract(".", ".").Text err = runner.ArvClient.Create("collections", arvadosclient.Dict{ + "ensure_unique_name": true, "collection": arvadosclient.Dict{ "is_trashed": true, "name": "output for " + runner.Container.UUID, @@ -810,6 +939,7 @@ func (runner *ContainerRunner) CommitLogs() error { var response arvados.Collection err = runner.ArvClient.Create("collections", arvadosclient.Dict{ + "ensure_unique_name": true, "collection": arvadosclient.Dict{ "is_trashed": true, "name": "logs for " + runner.Container.UUID, @@ -824,9 +954,9 @@ func (runner *ContainerRunner) CommitLogs() error { // UpdateContainerRunning updates the container state to "Running" func (runner *ContainerRunner) UpdateContainerRunning() error { - runner.CancelLock.Lock() - defer runner.CancelLock.Unlock() - if runner.Cancelled { + runner.cStateLock.Lock() + defer runner.cStateLock.Unlock() + if runner.cCancelled { return ErrCancelled } return runner.ArvClient.Update("containers", runner.Container.UUID, @@ -870,9 +1000,9 @@ func (runner *ContainerRunner) UpdateContainerFinal() error { // IsCancelled returns the value of Cancelled, with goroutine safety. func (runner *ContainerRunner) IsCancelled() bool { - runner.CancelLock.Lock() - defer runner.CancelLock.Unlock() - return runner.Cancelled + runner.cStateLock.Lock() + defer runner.cStateLock.Unlock() + return runner.cCancelled } // NewArvLogWriter creates an ArvLogWriter @@ -968,6 +1098,17 @@ func (runner *ContainerRunner) Run() (err error) { return } + // Gather and record node information + err = runner.LogNodeInfo() + if err != nil { + return + } + // Save container.json record on log collection + err = runner.LogContainerRecord() + if err != nil { + return + } + runner.StartCrunchstat() if runner.IsCancelled() { @@ -1002,7 +1143,7 @@ func NewContainerRunner(api IArvadosClient, cr.NewLogWriter = cr.NewArvLogWriter cr.RunArvMount = cr.ArvMountCmd cr.MkTempDir = ioutil.TempDir - cr.LogCollection = &CollectionWriter{kc, nil, sync.Mutex{}} + cr.LogCollection = &CollectionWriter{0, kc, nil, nil, sync.Mutex{}} cr.Container.UUID = containerUUID cr.CrunchLog = NewThrottledLogger(cr.NewLogWriter("crunch-run")) cr.CrunchLog.Immediate = log.New(os.Stderr, containerUUID+" ", 0)