CleanupTempDir []string
Binds []string
OutputPDH *string
- CancelLock sync.Mutex
- Cancelled bool
SigChan chan os.Signal
ArvMountExit chan error
finalState string
// parent to be X" feature even on sites where the "specify
// cgroup parent" feature breaks.
setCgroupParent string
+
+ cStateLock sync.Mutex
+ cStarted bool // StartContainer() succeeded
+ cCancelled bool // StopContainer() invoked
}
// SetupSignals sets up signal handling to gracefully terminate the underlying
go func(sig chan os.Signal) {
<-sig
+ log.Print("signal handler calling runner.stop()")
runner.stop()
signal.Stop(sig)
}(runner.SigChan)
// stop the underlying Docker container.
func (runner *ContainerRunner) stop() {
- runner.CancelLock.Lock()
- defer runner.CancelLock.Unlock()
- if runner.Cancelled {
+ runner.cStateLock.Lock()
+ defer runner.cStateLock.Unlock()
+ if runner.cCancelled {
return
}
- runner.Cancelled = true
- if runner.ContainerID != "" {
+ runner.cCancelled = true
+ if runner.cStarted {
err := runner.Docker.StopContainer(runner.ContainerID, 10)
if err != nil {
log.Printf("StopContainer failed: %s", err)
runner.statReporter.Start()
}
+// infoCommand pairs a human-readable section label with the argv of a
+// host command whose stdout is captured into the node-info log.
+type infoCommand struct {
+ label string
+ cmd []string
+}
+
+// Gather node information and store it on the log for debugging
+// purposes.
+// LogNodeInfo runs a fixed set of host commands (uname, /proc/cpuinfo,
+// /proc/meminfo, df) and writes their labeled output to a "node-info"
+// log stream. Returns an error if any command fails or if closing the
+// log writer fails.
+func (runner *ContainerRunner) LogNodeInfo() (err error) {
+ // New log stream named "node-info"; logger prefixes each line.
+ w := runner.NewLogWriter("node-info")
+ logger := log.New(w, "node-info", 0)
+
+ commands := []infoCommand{
+ infoCommand{
+ label: "Host Information",
+ cmd: []string{"uname", "-a"},
+ },
+ infoCommand{
+ label: "CPU Information",
+ cmd: []string{"cat", "/proc/cpuinfo"},
+ },
+ infoCommand{
+ label: "Memory Information",
+ cmd: []string{"cat", "/proc/meminfo"},
+ },
+ infoCommand{
+ label: "Disk Space",
+ cmd: []string{"df", "-m"},
+ },
+ }
+
+ // Run commands with informational output to be logged.
+ var out []byte
+ for _, command := range commands {
+ // .Output() captures stdout only; a non-zero exit aborts the whole log.
+ out, err = exec.Command(command.cmd[0], command.cmd[1:]...).Output()
+ if err != nil {
+ return fmt.Errorf("While running command '%s': %v",
+ command.cmd[0], err)
+ }
+ // Section header, then each output line indented beneath it.
+ logger.Println(command.label)
+ for _, line := range strings.Split(string(out), "\n") {
+ logger.Println(" ", line)
+ }
+ }
+
+ // Flush/close the log stream; a close failure is reported to the caller.
+ err = w.Close()
+ if err != nil {
+ return fmt.Errorf("While closing node-info logs: %v", err)
+ }
+ return nil
+}
+
+// Get and save the raw JSON container record from the API server
+// LogContainerRecord serializes the in-memory container record as
+// pretty-printed JSON and writes it line-by-line to a "container" log
+// stream backed by the file container.json in the log collection.
+// Returns an error if marshalling or closing the writer fails.
+func (runner *ContainerRunner) LogContainerRecord() (err error) {
+ // Positional ArvLogWriter fields: client, container UUID, log stream
+ // name, and the destination file in the log collection.
+ w := &ArvLogWriter{
+ runner.ArvClient,
+ runner.Container.UUID,
+ "container",
+ runner.LogCollection.Open("container.json"),
+ }
+ logger := log.New(w, "container", 0)
+
+ // Convert container record to pretty-printed JSON []byte
+ rec, err := json.MarshalIndent(runner.Container, "", " ")
+ if err != nil {
+ return fmt.Errorf("While converting container record to JSON: %v", err)
+ }
+
+ // Write JSON record line-by-line
+ for _, line := range strings.Split(string(rec), "\n") {
+ logger.Println(line)
+ }
+
+ err = w.Close()
+ if err != nil {
+ return fmt.Errorf("While closing container.json log: %v", err)
+ }
+ return nil
+}
+
// AttachLogs connects the docker container stdout and stderr logs to the
// Arvados logger which logs to Keep and the API server logs table.
func (runner *ContainerRunner) AttachStreams() (err error) {
// StartContainer starts the docker container created by CreateContainer.
func (runner *ContainerRunner) StartContainer() error {
runner.CrunchLog.Printf("Starting Docker container id '%s'", runner.ContainerID)
+ // Hold cStateLock across the cancellation check, the Docker start call,
+ // and the cStarted flag update, so a concurrent stop() cannot slip in
+ // between checking cCancelled and marking the container as started.
+ runner.cStateLock.Lock()
+ defer runner.cStateLock.Unlock()
+ if runner.cCancelled {
+ return ErrCancelled
+ }
err := runner.Docker.StartContainer(runner.ContainerID, &runner.HostConfig)
if err != nil {
return fmt.Errorf("could not start container: %v", err)
}
+ // Only set after a successful start; stop() uses this to decide
+ // whether StopContainer needs to be called.
+ runner.cStarted = true
return nil
}
// UpdateContainerRunning updates the container state to "Running"
func (runner *ContainerRunner) UpdateContainerRunning() error {
- runner.CancelLock.Lock()
- defer runner.CancelLock.Unlock()
- if runner.Cancelled {
+ runner.cStateLock.Lock()
+ defer runner.cStateLock.Unlock()
+ if runner.cCancelled {
return ErrCancelled
}
return runner.ArvClient.Update("containers", runner.Container.UUID,
// IsCancelled returns the value of Cancelled, with goroutine safety.
func (runner *ContainerRunner) IsCancelled() bool {
- runner.CancelLock.Lock()
- defer runner.CancelLock.Unlock()
- return runner.Cancelled
+ // Read cCancelled under cStateLock so the snapshot is consistent with
+ // concurrent stop() / StartContainer() updates.
+ runner.cStateLock.Lock()
+ defer runner.cStateLock.Unlock()
+ return runner.cCancelled
}
// NewArvLogWriter creates an ArvLogWriter
return
}
+ // Gather and record node information
+ err = runner.LogNodeInfo()
+ if err != nil {
+ return
+ }
+ // Save container.json record on log collection
+ err = runner.LogContainerRecord()
+ if err != nil {
+ return
+ }
+
runner.StartCrunchstat()
if runner.IsCancelled() {
"os"
"os/exec"
"path/filepath"
+ "runtime/pprof"
"sort"
"strings"
"sync"
api := &ArvTestClient{}
kc := &KeepTestClient{}
cr := NewContainerRunner(api, kc, nil, "zzzzz-zzzzz-zzzzzzzzzzzzzzz")
- cr.Cancelled = true
+ cr.cCancelled = true
cr.finalState = "Cancelled"
err := cr.UpdateContainerFinal()
c.Check(api.Logs["crunchstat"].String(), Matches, `(?ms).*cgroup stats files never appeared for abcde\n`)
}
+// TestNodeInfoLog runs a trivial container to completion and verifies
+// that a "node-info" log was recorded containing each expected section
+// header produced by LogNodeInfo.
+func (s *TestSuite) TestNodeInfoLog(c *C) {
+ api, _, _ := FullRunHelper(c, `{
+ "command": ["sleep", "1"],
+ "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+ "cwd": ".",
+ "environment": {},
+ "mounts": {"/tmp": {"kind": "tmp"} },
+ "output_path": "/tmp",
+ "priority": 1,
+ "runtime_constraints": {}
+ }`, nil, func(t *TestDockerClient) {
+ time.Sleep(time.Second)
+ t.logWriter.Close()
+ t.finish <- dockerclient.WaitResult{}
+ })
+
+ // The fake container must have completed successfully.
+ c.Check(api.CalledWith("container.exit_code", 0), NotNil)
+ c.Check(api.CalledWith("container.state", "Complete"), NotNil)
+
+ // Each LogNodeInfo section label must appear in the node-info log.
+ c.Assert(api.Logs["node-info"], NotNil)
+ c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Host Information.*`)
+ c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*CPU Information.*`)
+ c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Memory Information.*`)
+ c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Disk Space.*`)
+}
+
+// TestContainerRecordLog runs a trivial container to completion and
+// verifies that LogContainerRecord saved a "container" log containing
+// the JSON container record (spot-checked via the container_image key).
+func (s *TestSuite) TestContainerRecordLog(c *C) {
+ api, _, _ := FullRunHelper(c, `{
+ "command": ["sleep", "1"],
+ "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+ "cwd": ".",
+ "environment": {},
+ "mounts": {"/tmp": {"kind": "tmp"} },
+ "output_path": "/tmp",
+ "priority": 1,
+ "runtime_constraints": {}
+ }`, nil, func(t *TestDockerClient) {
+ time.Sleep(time.Second)
+ t.logWriter.Close()
+ t.finish <- dockerclient.WaitResult{}
+ })
+
+ // The fake container must have completed successfully.
+ c.Check(api.CalledWith("container.exit_code", 0), NotNil)
+ c.Check(api.CalledWith("container.state", "Complete"), NotNil)
+
+ // The serialized record must be present in the "container" log.
+ c.Assert(api.Logs["container"], NotNil)
+ c.Check(api.Logs["container"].String(), Matches, `(?ms).*container_image.*`)
+}
+
func (s *TestSuite) TestFullRunStderr(c *C) {
api, _, _ := FullRunHelper(c, `{
"command": ["/bin/sh", "-c", "echo hello ; echo world 1>&2 ; exit 1"],
func (s *TestSuite) TestStopOnSignal(c *C) {
s.testStopContainer(c, func(cr *ContainerRunner) {
go func() {
- for cr.ContainerID == "" {
+ for !cr.cStarted {
time.Sleep(time.Millisecond)
}
cr.SigChan <- syscall.SIGINT
}()
select {
case <-time.After(20 * time.Second):
+ pprof.Lookup("goroutine").WriteTo(os.Stderr, 1)
c.Fatal("timed out")
case err = <-done:
c.Check(err, IsNil)