10218: Merge branch 'master' into 10218-record-node-info
authorLucas Di Pentima <lucas@curoverse.com>
Mon, 20 Mar 2017 19:07:40 +0000 (16:07 -0300)
committerLucas Di Pentima <lucas@curoverse.com>
Mon, 20 Mar 2017 19:07:40 +0000 (16:07 -0300)
services/crunch-run/crunchrun.go
services/crunch-run/crunchrun_test.go

index 3b3cdf1c14a872dacbc76b5679db9970609907dd..a3338ec87feec5049370cfcab2433cb6962953e8 100644 (file)
@@ -91,8 +91,6 @@ type ContainerRunner struct {
        CleanupTempDir []string
        Binds          []string
        OutputPDH      *string
-       CancelLock     sync.Mutex
-       Cancelled      bool
        SigChan        chan os.Signal
        ArvMountExit   chan error
        finalState     string
@@ -114,6 +112,10 @@ type ContainerRunner struct {
        // parent to be X" feature even on sites where the "specify
        // cgroup parent" feature breaks.
        setCgroupParent string
+
+       cStateLock sync.Mutex
+       cStarted   bool // StartContainer() succeeded
+       cCancelled bool // StopContainer() invoked
 }
 
 // SetupSignals sets up signal handling to gracefully terminate the underlying
@@ -126,6 +128,7 @@ func (runner *ContainerRunner) SetupSignals() {
 
        go func(sig chan os.Signal) {
                <-sig
+               log.Print("signal handler calling runner.stop()")
                runner.stop()
                signal.Stop(sig)
        }(runner.SigChan)
@@ -133,13 +136,13 @@ func (runner *ContainerRunner) SetupSignals() {
 
 // stop the underlying Docker container.
 func (runner *ContainerRunner) stop() {
-       runner.CancelLock.Lock()
-       defer runner.CancelLock.Unlock()
-       if runner.Cancelled {
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       if runner.cCancelled {
                return
        }
-       runner.Cancelled = true
-       if runner.ContainerID != "" {
+       runner.cCancelled = true
+       if runner.cStarted {
                err := runner.Docker.StopContainer(runner.ContainerID, 10)
                if err != nil {
                        log.Printf("StopContainer failed: %s", err)
@@ -504,6 +507,85 @@ func (runner *ContainerRunner) StartCrunchstat() {
        runner.statReporter.Start()
 }
 
+type infoCommand struct {
+       label string
+       cmd   []string
+}
+
+// Gather node information and store it on the log for debugging
+// purposes.
+func (runner *ContainerRunner) LogNodeInfo() (err error) {
+       w := runner.NewLogWriter("node-info")
+       logger := log.New(w, "node-info", 0)
+
+       commands := []infoCommand{
+               infoCommand{
+                       label: "Host Information",
+                       cmd:   []string{"uname", "-a"},
+               },
+               infoCommand{
+                       label: "CPU Information",
+                       cmd:   []string{"cat", "/proc/cpuinfo"},
+               },
+               infoCommand{
+                       label: "Memory Information",
+                       cmd:   []string{"cat", "/proc/meminfo"},
+               },
+               infoCommand{
+                       label: "Disk Space",
+                       cmd:   []string{"df", "-m"},
+               },
+       }
+
+       // Run commands with informational output to be logged.
+       var out []byte
+       for _, command := range commands {
+               out, err = exec.Command(command.cmd[0], command.cmd[1:]...).Output()
+               if err != nil {
+                       return fmt.Errorf("While running command '%s': %v",
+                               command.cmd[0], err)
+               }
+               logger.Println(command.label)
+               for _, line := range strings.Split(string(out), "\n") {
+                       logger.Println(" ", line)
+               }
+       }
+
+       err = w.Close()
+       if err != nil {
+               return fmt.Errorf("While closing node-info logs: %v", err)
+       }
+       return nil
+}
+
+// Get and save the raw JSON container record from the API server
+func (runner *ContainerRunner) LogContainerRecord() (err error) {
+       w := &ArvLogWriter{
+               runner.ArvClient,
+               runner.Container.UUID,
+               "container",
+               runner.LogCollection.Open("container.json"),
+       }
+       logger := log.New(w, "container", 0)
+
+       // Convert container record to pretty-printed JSON []byte
+       rec, err := json.MarshalIndent(runner.Container, "", "    ")
+       if err != nil {
+               return fmt.Errorf("While converting container record to JSON: %v", err)
+       }
+
+       // Write JSON record line-by-line
+       for _, line := range strings.Split(string(rec), "\n") {
+               logger.Println(line)
+       }
+
+       err = w.Close()
+       if err != nil {
+               return fmt.Errorf("While closing container.json log: %v", err)
+       }
+       return nil
+}
+
 // AttachLogs connects the docker container stdout and stderr logs to the
 // Arvados logger which logs to Keep and the API server logs table.
 func (runner *ContainerRunner) AttachStreams() (err error) {
@@ -598,10 +680,16 @@ func (runner *ContainerRunner) CreateContainer() error {
 // StartContainer starts the docker container created by CreateContainer.
 func (runner *ContainerRunner) StartContainer() error {
        runner.CrunchLog.Printf("Starting Docker container id '%s'", runner.ContainerID)
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       if runner.cCancelled {
+               return ErrCancelled
+       }
        err := runner.Docker.StartContainer(runner.ContainerID, &runner.HostConfig)
        if err != nil {
                return fmt.Errorf("could not start container: %v", err)
        }
+       runner.cStarted = true
        return nil
 }
 
@@ -846,9 +934,9 @@ func (runner *ContainerRunner) CommitLogs() error {
 
 // UpdateContainerRunning updates the container state to "Running"
 func (runner *ContainerRunner) UpdateContainerRunning() error {
-       runner.CancelLock.Lock()
-       defer runner.CancelLock.Unlock()
-       if runner.Cancelled {
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       if runner.cCancelled {
                return ErrCancelled
        }
        return runner.ArvClient.Update("containers", runner.Container.UUID,
@@ -892,9 +980,9 @@ func (runner *ContainerRunner) UpdateContainerFinal() error {
 
 // IsCancelled returns the value of Cancelled, with goroutine safety.
 func (runner *ContainerRunner) IsCancelled() bool {
-       runner.CancelLock.Lock()
-       defer runner.CancelLock.Unlock()
-       return runner.Cancelled
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       return runner.cCancelled
 }
 
 // NewArvLogWriter creates an ArvLogWriter
@@ -990,6 +1078,17 @@ func (runner *ContainerRunner) Run() (err error) {
                return
        }
 
+       // Gather and record node information
+       err = runner.LogNodeInfo()
+       if err != nil {
+               return
+       }
+       // Save container.json record on log collection
+       err = runner.LogContainerRecord()
+       if err != nil {
+               return
+       }
+
        runner.StartCrunchstat()
 
        if runner.IsCancelled() {
index 36ddd36d31525533e604d3d9d521de404206c9d8..c26830da552560c6427b8dcf8f30d82fb690bef3 100644 (file)
@@ -11,6 +11,7 @@ import (
        "os"
        "os/exec"
        "path/filepath"
+       "runtime/pprof"
        "sort"
        "strings"
        "sync"
@@ -525,7 +526,7 @@ func (s *TestSuite) TestUpdateContainerCancelled(c *C) {
        api := &ArvTestClient{}
        kc := &KeepTestClient{}
        cr := NewContainerRunner(api, kc, nil, "zzzzz-zzzzz-zzzzzzzzzzzzzzz")
-       cr.Cancelled = true
+       cr.cCancelled = true
        cr.finalState = "Cancelled"
 
        err := cr.UpdateContainerFinal()
@@ -650,6 +651,55 @@ func (s *TestSuite) TestCrunchstat(c *C) {
        c.Check(api.Logs["crunchstat"].String(), Matches, `(?ms).*cgroup stats files never appeared for abcde\n`)
 }
 
+func (s *TestSuite) TestNodeInfoLog(c *C) {
+       api, _, _ := FullRunHelper(c, `{
+               "command": ["sleep", "1"],
+               "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+               "cwd": ".",
+               "environment": {},
+               "mounts": {"/tmp": {"kind": "tmp"} },
+               "output_path": "/tmp",
+               "priority": 1,
+               "runtime_constraints": {}
+       }`, nil, func(t *TestDockerClient) {
+               time.Sleep(time.Second)
+               t.logWriter.Close()
+               t.finish <- dockerclient.WaitResult{}
+       })
+
+       c.Check(api.CalledWith("container.exit_code", 0), NotNil)
+       c.Check(api.CalledWith("container.state", "Complete"), NotNil)
+
+       c.Assert(api.Logs["node-info"], NotNil)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Host Information.*`)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*CPU Information.*`)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Memory Information.*`)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Disk Space.*`)
+}
+
+func (s *TestSuite) TestContainerRecordLog(c *C) {
+       api, _, _ := FullRunHelper(c, `{
+               "command": ["sleep", "1"],
+               "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+               "cwd": ".",
+               "environment": {},
+               "mounts": {"/tmp": {"kind": "tmp"} },
+               "output_path": "/tmp",
+               "priority": 1,
+               "runtime_constraints": {}
+       }`, nil, func(t *TestDockerClient) {
+               time.Sleep(time.Second)
+               t.logWriter.Close()
+               t.finish <- dockerclient.WaitResult{}
+       })
+
+       c.Check(api.CalledWith("container.exit_code", 0), NotNil)
+       c.Check(api.CalledWith("container.state", "Complete"), NotNil)
+
+       c.Assert(api.Logs["container"], NotNil)
+       c.Check(api.Logs["container"].String(), Matches, `(?ms).*container_image.*`)
+}
+
 func (s *TestSuite) TestFullRunStderr(c *C) {
        api, _, _ := FullRunHelper(c, `{
     "command": ["/bin/sh", "-c", "echo hello ; echo world 1>&2 ; exit 1"],
@@ -722,7 +772,7 @@ func (s *TestSuite) TestFullRunSetCwd(c *C) {
 func (s *TestSuite) TestStopOnSignal(c *C) {
        s.testStopContainer(c, func(cr *ContainerRunner) {
                go func() {
-                       for cr.ContainerID == "" {
+                       for !cr.cStarted {
                                time.Sleep(time.Millisecond)
                        }
                        cr.SigChan <- syscall.SIGINT
@@ -776,6 +826,7 @@ func (s *TestSuite) testStopContainer(c *C, setup func(cr *ContainerRunner)) {
        }()
        select {
        case <-time.After(20 * time.Second):
+               pprof.Lookup("goroutine").WriteTo(os.Stderr, 1)
                c.Fatal("timed out")
        case err = <-done:
                c.Check(err, IsNil)