Merge branch '18657-passenger-binstub' refs #18657
[arvados.git] / lib / crunchrun / crunchrun.go
index 589a046a34c0edb94ed97c48b7776cd63c1091cd..af0d49c80e61e65a1120d6aa07ecd1601a064534 100644 (file)
@@ -36,6 +36,7 @@ import (
        "git.arvados.org/arvados.git/sdk/go/arvadosclient"
        "git.arvados.org/arvados.git/sdk/go/keepclient"
        "git.arvados.org/arvados.git/sdk/go/manifest"
+       "golang.org/x/sys/unix"
 )
 
 type command struct{}
@@ -986,6 +987,11 @@ func (runner *ContainerRunner) CreateContainer(imageID string, bindmounts map[st
        runner.executorStdin = stdin
        runner.executorStdout = stdout
        runner.executorStderr = stderr
+
+       if runner.Container.RuntimeConstraints.CUDA.DeviceCount > 0 {
+               nvidiaModprobe(runner.CrunchLog)
+       }
+
        return runner.executor.Create(containerSpec{
                Image:           imageID,
                VCPUs:           runner.Container.RuntimeConstraints.VCPUs,
@@ -995,7 +1001,7 @@ func (runner *ContainerRunner) CreateContainer(imageID string, bindmounts map[st
                BindMounts:      bindmounts,
                Command:         runner.Container.Command,
                EnableNetwork:   enableNetwork,
-               CUDADeviceCount: runner.Container.RuntimeConstraints.CUDADeviceCount,
+               CUDADeviceCount: runner.Container.RuntimeConstraints.CUDA.DeviceCount,
                NetworkMode:     runner.networkMode,
                CgroupParent:    runner.setCgroupParent,
                Stdin:           stdin,
@@ -1051,6 +1057,20 @@ func (runner *ContainerRunner) WaitFinish() error {
        }
        runner.ExitCode = &exitcode
 
+       extra := ""
+       if exitcode&0x80 != 0 {
+               // Convert raw exit status (0x80 + signal number) to a
+               // string to log after the code, like " (signal 101)"
+               // or " (signal 9, SIGKILL)"
+               sig := syscall.WaitStatus(exitcode).Signal()
+               if name := unix.SignalName(sig); name != "" {
+                       extra = fmt.Sprintf(" (signal %d, %s)", sig, name)
+               } else {
+                       extra = fmt.Sprintf(" (signal %d)", sig)
+               }
+       }
+       runner.CrunchLog.Printf("Container exited with status code %d%s", exitcode, extra)
+
        var returnErr error
        if err = runner.executorStdin.Close(); err != nil {
                err = fmt.Errorf("error closing container stdin: %s", err)