11255: Fix whitespace
[arvados.git] / services / crunch-run / crunchrun.go
index d42fe0baae5de647c96b5ac0bfeb775b602f09fa..062126d6ff42e710523c5779ac2441c109f77f65 100644 (file)
@@ -33,6 +33,7 @@ type IArvadosClient interface {
        Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
        Update(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
        Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) error
+       CallRaw(method string, resourceType string, uuid string, action string, parameters arvadosclient.Dict) (reader io.ReadCloser, err error)
        Discovery(key string) (interface{}, error)
 }
 
@@ -91,8 +92,6 @@ type ContainerRunner struct {
        CleanupTempDir []string
        Binds          []string
        OutputPDH      *string
-       CancelLock     sync.Mutex
-       Cancelled      bool
        SigChan        chan os.Signal
        ArvMountExit   chan error
        finalState     string
@@ -114,6 +113,13 @@ type ContainerRunner struct {
        // parent to be X" feature even on sites where the "specify
        // cgroup parent" feature breaks.
        setCgroupParent string
+
+       cStateLock sync.Mutex
+       cStarted   bool // StartContainer() succeeded
+       cCancelled bool // StopContainer() invoked
+
+       enableNetwork string // one of "default" or "always"
+       networkMode   string // passed through to HostConfig.NetworkMode
 }
 
 // SetupSignals sets up signal handling to gracefully terminate the underlying
@@ -133,13 +139,13 @@ func (runner *ContainerRunner) SetupSignals() {
 
 // stop the underlying Docker container.
 func (runner *ContainerRunner) stop() {
-       runner.CancelLock.Lock()
-       defer runner.CancelLock.Unlock()
-       if runner.Cancelled {
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       if runner.cCancelled {
                return
        }
-       runner.Cancelled = true
-       if runner.ContainerID != "" {
+       runner.cCancelled = true
+       if runner.cStarted {
                err := runner.Docker.StopContainer(runner.ContainerID, 10)
                if err != nil {
                        log.Printf("StopContainer failed: %s", err)
@@ -504,6 +510,98 @@ func (runner *ContainerRunner) StartCrunchstat() {
        runner.statReporter.Start()
 }
 
+type infoCommand struct {
+       label string
+       cmd   []string
+}
+
+// Gather node information and store it on the log for debugging
+// purposes.
+func (runner *ContainerRunner) LogNodeInfo() (err error) {
+       w := runner.NewLogWriter("node-info")
+       logger := log.New(w, "node-info", 0)
+
+       commands := []infoCommand{
+               infoCommand{
+                       label: "Host Information",
+                       cmd:   []string{"uname", "-a"},
+               },
+               infoCommand{
+                       label: "CPU Information",
+                       cmd:   []string{"cat", "/proc/cpuinfo"},
+               },
+               infoCommand{
+                       label: "Memory Information",
+                       cmd:   []string{"cat", "/proc/meminfo"},
+               },
+               infoCommand{
+                       label: "Disk Space",
+                       cmd:   []string{"df", "-m", "/", os.TempDir()},
+               },
+               infoCommand{
+                       label: "Disk INodes",
+                       cmd:   []string{"df", "-i", "/", os.TempDir()},
+               },
+       }
+
+       // Run commands with informational output to be logged.
+       var out []byte
+       for _, command := range commands {
+               out, err = exec.Command(command.cmd[0], command.cmd[1:]...).CombinedOutput()
+               if err != nil {
+                       return fmt.Errorf("While running command %q: %v",
+                               command.cmd, err)
+               }
+               logger.Println(command.label)
+               for _, line := range strings.Split(string(out), "\n") {
+                       logger.Println(" ", line)
+               }
+       }
+
+       err = w.Close()
+       if err != nil {
+               return fmt.Errorf("While closing node-info logs: %v", err)
+       }
+       return nil
+}
+
+// Get and save the raw JSON container record from the API server
+func (runner *ContainerRunner) LogContainerRecord() (err error) {
+       w := &ArvLogWriter{
+               runner.ArvClient,
+               runner.Container.UUID,
+               "container",
+               runner.LogCollection.Open("container.json"),
+       }
+       // Get Container record JSON from the API Server
+       reader, err := runner.ArvClient.CallRaw("GET", "containers", runner.Container.UUID, "", nil)
+       if err != nil {
+               return fmt.Errorf("While retrieving container record from the API server: %v", err)
+       }
+       defer reader.Close()
+       // Read the API server response as []byte
+       json_bytes, err := ioutil.ReadAll(reader)
+       if err != nil {
+               return fmt.Errorf("While reading container record API server response: %v", err)
+       }
+       // Decode the JSON []byte
+       var cr map[string]interface{}
+       if err = json.Unmarshal(json_bytes, &cr); err != nil {
+               return fmt.Errorf("While decoding the container record JSON response: %v", err)
+       }
+       // Re-encode it using indentation to improve readability
+       enc := json.NewEncoder(w)
+       enc.SetIndent("", "    ")
+       if err = enc.Encode(cr); err != nil {
+               return fmt.Errorf("While logging the JSON container record: %v", err)
+       }
+       err = w.Close()
+       if err != nil {
+               return fmt.Errorf("While closing container.json log: %v", err)
+       }
+       return nil
+}
+
 // AttachLogs connects the docker container stdout and stderr logs to the
 // Arvados logger which logs to Keep and the API server logs table.
 func (runner *ContainerRunner) AttachStreams() (err error) {
@@ -563,6 +661,15 @@ func (runner *ContainerRunner) CreateContainer() error {
        for k, v := range runner.Container.Environment {
                runner.ContainerConfig.Env = append(runner.ContainerConfig.Env, k+"="+v)
        }
+
+       runner.HostConfig = dockerclient.HostConfig{
+               Binds:        runner.Binds,
+               CgroupParent: runner.setCgroupParent,
+               LogConfig: dockerclient.LogConfig{
+                       Type: "none",
+               },
+       }
+
        if wantAPI := runner.Container.RuntimeConstraints.API; wantAPI != nil && *wantAPI {
                tok, err := runner.ContainerToken()
                if err != nil {
@@ -573,9 +680,13 @@ func (runner *ContainerRunner) CreateContainer() error {
                        "ARVADOS_API_HOST="+os.Getenv("ARVADOS_API_HOST"),
                        "ARVADOS_API_HOST_INSECURE="+os.Getenv("ARVADOS_API_HOST_INSECURE"),
                )
-               runner.ContainerConfig.NetworkDisabled = false
+               runner.HostConfig.NetworkMode = runner.networkMode
        } else {
-               runner.ContainerConfig.NetworkDisabled = true
+               if runner.enableNetwork == "always" {
+                       runner.HostConfig.NetworkMode = runner.networkMode
+               } else {
+                       runner.HostConfig.NetworkMode = "none"
+               }
        }
 
        var err error
@@ -584,24 +695,22 @@ func (runner *ContainerRunner) CreateContainer() error {
                return fmt.Errorf("While creating container: %v", err)
        }
 
-       runner.HostConfig = dockerclient.HostConfig{
-               Binds:        runner.Binds,
-               CgroupParent: runner.setCgroupParent,
-               LogConfig: dockerclient.LogConfig{
-                       Type: "none",
-               },
-       }
-
        return runner.AttachStreams()
 }
 
 // StartContainer starts the docker container created by CreateContainer.
 func (runner *ContainerRunner) StartContainer() error {
        runner.CrunchLog.Printf("Starting Docker container id '%s'", runner.ContainerID)
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       if runner.cCancelled {
+               return ErrCancelled
+       }
        err := runner.Docker.StartContainer(runner.ContainerID, &runner.HostConfig)
        if err != nil {
                return fmt.Errorf("could not start container: %v", err)
        }
+       runner.cStarted = true
        return nil
 }
 
@@ -726,6 +835,7 @@ func (runner *ContainerRunner) CaptureOutput() error {
        manifestText = manifest.Extract(".", ".").Text
        err = runner.ArvClient.Create("collections",
                arvadosclient.Dict{
+                       "ensure_unique_name": true,
                        "collection": arvadosclient.Dict{
                                "is_trashed":    true,
                                "name":          "output for " + runner.Container.UUID,
@@ -830,6 +940,7 @@ func (runner *ContainerRunner) CommitLogs() error {
        var response arvados.Collection
        err = runner.ArvClient.Create("collections",
                arvadosclient.Dict{
+                       "ensure_unique_name": true,
                        "collection": arvadosclient.Dict{
                                "is_trashed":    true,
                                "name":          "logs for " + runner.Container.UUID,
@@ -844,9 +955,9 @@ func (runner *ContainerRunner) CommitLogs() error {
 
 // UpdateContainerRunning updates the container state to "Running"
 func (runner *ContainerRunner) UpdateContainerRunning() error {
-       runner.CancelLock.Lock()
-       defer runner.CancelLock.Unlock()
-       if runner.Cancelled {
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       if runner.cCancelled {
                return ErrCancelled
        }
        return runner.ArvClient.Update("containers", runner.Container.UUID,
@@ -890,9 +1001,9 @@ func (runner *ContainerRunner) UpdateContainerFinal() error {
 
 // IsCancelled returns the value of Cancelled, with goroutine safety.
 func (runner *ContainerRunner) IsCancelled() bool {
-       runner.CancelLock.Lock()
-       defer runner.CancelLock.Unlock()
-       return runner.Cancelled
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       return runner.cCancelled
 }
 
 // NewArvLogWriter creates an ArvLogWriter
@@ -988,6 +1099,17 @@ func (runner *ContainerRunner) Run() (err error) {
                return
        }
 
+       // Gather and record node information
+       err = runner.LogNodeInfo()
+       if err != nil {
+               return
+       }
+       // Save container.json record on log collection
+       err = runner.LogContainerRecord()
+       if err != nil {
+               return
+       }
+
        runner.StartCrunchstat()
 
        if runner.IsCancelled() {
@@ -1035,6 +1157,14 @@ func main() {
        cgroupParent := flag.String("cgroup-parent", "docker", "name of container's parent cgroup (ignored if -cgroup-parent-subsystem is used)")
        cgroupParentSubsystem := flag.String("cgroup-parent-subsystem", "", "use current cgroup for given subsystem as parent cgroup for container")
        caCertsPath := flag.String("ca-certs", "", "Path to TLS root certificates")
+       enableNetwork := flag.String("container-enable-networking", "default",
+               `Specify if networking should be enabled for container.  One of 'default', 'always':
+       default: only enable networking if container requests it.
+       always:  containers always have networking enabled
+       `)
+       networkMode := flag.String("container-network-mode", "default",
+               `Set networking mode for container.  Corresponds to Docker network mode (--net).
+       `)
        flag.Parse()
 
        containerId := flag.Arg(0)
@@ -1066,6 +1196,8 @@ func main() {
        cr.statInterval = *statInterval
        cr.cgroupRoot = *cgroupRoot
        cr.expectCgroupParent = *cgroupParent
+       cr.enableNetwork = *enableNetwork
+       cr.networkMode = *networkMode
        if *cgroupParentSubsystem != "" {
                p := findCgroup(*cgroupParentSubsystem)
                cr.setCgroupParent = p