Merge branch '12630-nvidia-devices' refs #12630
author    Peter Amstutz <peter.amstutz@curii.com>
          Mon, 10 Jan 2022 22:01:25 +0000 (17:01 -0500)
committer Peter Amstutz <peter.amstutz@curii.com>
          Mon, 10 Jan 2022 22:01:25 +0000 (17:01 -0500)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

lib/crunchrun/crunchrun.go
lib/crunchrun/cuda.go [new file with mode: 0644]
lib/crunchrun/docker.go
lib/crunchrun/docker_test.go
lib/crunchrun/singularity.go

diff --git a/lib/crunchrun/crunchrun.go b/lib/crunchrun/crunchrun.go
index b237d9fa590cef847366be35a4b728e1442e7941..fb2200a56b8760e5e2e655ce82aacb8fc2d56815 100644
--- a/lib/crunchrun/crunchrun.go
+++ b/lib/crunchrun/crunchrun.go
@@ -987,6 +987,10 @@ func (runner *ContainerRunner) CreateContainer(imageID string, bindmounts map[st
        runner.executorStdout = stdout
        runner.executorStderr = stderr
 
+       if runner.Container.RuntimeConstraints.CUDA.DeviceCount > 0 {
+               nvidiaModprobe(runner.CrunchLog)
+       }
+
        return runner.executor.Create(containerSpec{
                Image:           imageID,
                VCPUs:           runner.Container.RuntimeConstraints.VCPUs,
diff --git a/lib/crunchrun/cuda.go b/lib/crunchrun/cuda.go
new file mode 100644
index 0000000..c693dbc
--- /dev/null
+++ b/lib/crunchrun/cuda.go
@@ -0,0 +1,69 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package crunchrun
+
+import (
+       "os/exec"
+)
+
+// nvidiaModprobe makes sure all the nvidia kernel modules and devices
+// are set up.  If the modules/devices are not all set up, CUDA
+// programs fail with "CUDA_ERROR_UNKNOWN".
+func nvidiaModprobe(writer *ThrottledLogger) {
+       // The underlying problem is that, when running directly on
+       // the host, the CUDA SDK automatically detects and sets up
+       // the devices on demand.  However, when running inside a
+       // container, it lacks sufficient permissions to do that, so
+       // the devices need to be set up before the container starts.
+       //
+       // The Singularity documentation hints about this but isn't
+       // very helpful with a solution.
+       // https://sylabs.io/guides/3.7/user-guide/gpu.html#cuda-error-unknown-when-everything-seems-to-be-correctly-configured
+       //
+       // If we're running "nvidia-persistenced", it sets up most of
+       // these things on system boot.
+       //
+       // However, that doesn't seem to include /dev/nvidia-uvm.
+       // We're also not guaranteed to be running
+       // "nvidia-persistenced" or otherwise have the devices set up
+       // for us.  So the most robust solution is to do it ourselves.
+       //
+       // These are idempotent operations so it is harmless in the
+       // case that everything was actually already set up.
+
+       // Running nvidia-smi the first time loads the core 'nvidia'
+       // kernel module and creates /dev/nvidiactl and the per-GPU
+       // /dev/nvidia* devices.
+       nvidiaSmi := exec.Command("nvidia-smi", "-L")
+       nvidiaSmi.Stdout = writer
+       nvidiaSmi.Stderr = writer
+       err := nvidiaSmi.Run()
+       if err != nil {
+               writer.Printf("Warning %v: %v", nvidiaSmi.Args, err)
+       }
+
+       // Load the kernel modules & devices associated with
+       // /dev/nvidia-modeset, /dev/nvidia-nvlink, /dev/nvidia-uvm
+       // and /dev/nvidia-uvm-tools (-m, -l and -u).  Annoyingly,
+       // these don't have multiple devices but you need to supply
+       // "-c0" anyway or it won't make the device file.
+
+       // Nvswitch devices are multi-GPU interconnects for up to 16
+       // GPUs.  The "-c0 -s" flag will create /dev/nvidia-nvswitch0.
+       // If someone runs Arvados on a system with multiple
+       // nvswitches (i.e. more than 16 GPUs) they'll have to ensure
+       // that all the /dev/nvidia-nvswitch* devices exist before
+       // crunch-run starts.
+       for _, opt := range []string{"-m", "-l", "-u", "-s"} {
+               nvmodprobe := exec.Command("nvidia-modprobe", "-c0", opt)
+               nvmodprobe.Stdout = writer
+               nvmodprobe.Stderr = writer
+               err = nvmodprobe.Run()
+               if err != nil {
+                       writer.Printf("Warning %v: %v", nvmodprobe.Args, err)
+               }
+       }
+}
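
For reference, here is a minimal standalone sketch of the same probe sequence outside crunch-run, using a plain io.Writer in place of Arvados' ThrottledLogger. The function and program below are illustrative only, not part of this commit:

package main

import (
	"io"
	"os"
	"os/exec"
)

// probeNvidiaDevices runs nvidia-smi to load the core driver and create
// /dev/nvidiactl and the per-GPU /dev/nvidia* devices, then runs
// nvidia-modprobe for the remaining device files.  Failures are logged
// and ignored (best effort), as in nvidiaModprobe above.
func probeNvidiaDevices(logw io.Writer) {
	smi := exec.Command("nvidia-smi", "-L")
	smi.Stdout = logw
	smi.Stderr = logw
	if err := smi.Run(); err != nil {
		io.WriteString(logw, "warning: nvidia-smi -L: "+err.Error()+"\n")
	}
	for _, opt := range []string{"-m", "-l", "-u", "-s"} {
		cmd := exec.Command("nvidia-modprobe", "-c0", opt)
		cmd.Stdout = logw
		cmd.Stderr = logw
		if err := cmd.Run(); err != nil {
			io.WriteString(logw, "warning: nvidia-modprobe "+opt+": "+err.Error()+"\n")
		}
	}
}

func main() {
	probeNvidiaDevices(os.Stderr)
}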
diff --git a/lib/crunchrun/docker.go b/lib/crunchrun/docker.go
index ab00273ef3a8eda38fccfc67b2a6e0b3c00659d3..06e5b5b1ece5794881e8a5cc04c05059563e358d 100644
--- a/lib/crunchrun/docker.go
+++ b/lib/crunchrun/docker.go
@@ -107,10 +107,40 @@ func (e *dockerExecutor) config(spec containerSpec) (dockercontainer.Config, doc
                },
        }
        if spec.CUDADeviceCount != 0 {
+               var deviceIds []string
+               if cudaVisibleDevices := os.Getenv("CUDA_VISIBLE_DEVICES"); cudaVisibleDevices != "" {
+                       // If a resource manager such as slurm or LSF told us
+                       // to select specific devices, we need to propagate that.
+                       deviceIds = strings.Split(cudaVisibleDevices, ",")
+               }
+
+               deviceCount := spec.CUDADeviceCount
+               if len(deviceIds) > 0 {
+                       // Docker won't accept both non-empty
+                       // DeviceIDs and a non-zero Count
+                       //
+                       // (it turns out "Count" is a dumb fallback
+                       // that just allocates device 0, 1, 2, ...,
+                       // Count-1)
+                       deviceCount = 0
+               }
+
+               // Capabilities are confusing.  The driver has generic
+               // capabilities "gpu" and "nvidia", but there are also
+               // additional capabilities "compute" and "utility"
+               // that are passed to nvidia-container-cli.
+               //
+               // "compute" means include the CUDA libraries and
+               // "utility" means include the CUDA utility programs
+               // (like nvidia-smi).
+               //
+               // https://github.com/moby/moby/blob/7b9275c0da707b030e62c96b679a976f31f929d3/daemon/nvidia_linux.go#L37
+               // https://github.com/containerd/containerd/blob/main/contrib/nvidia/nvidia.go
                hostCfg.Resources.DeviceRequests = append(hostCfg.Resources.DeviceRequests, dockercontainer.DeviceRequest{
                        Driver:       "nvidia",
-                       Count:        spec.CUDADeviceCount,
-                       Capabilities: [][]string{[]string{"gpu", "nvidia", "compute"}},
+                       Count:        deviceCount,
+                       DeviceIDs:    deviceIds,
+                       Capabilities: [][]string{[]string{"gpu", "nvidia", "compute", "utility"}},
                })
        }
        for path, mount := range spec.BindMounts {
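
To illustrate the resulting request: with CUDA_VISIBLE_DEVICES=1,3 and a requested count of 2, the logic above yields DeviceIDs ["1","3"] with Count 0; with the variable unset it yields Count 2 and no DeviceIDs. Below is a minimal sketch of that mapping using the Docker API types directly; the helper name cudaDeviceRequest is made up for illustration and does not appear in the commit:

package main

import (
	"fmt"
	"os"
	"strings"

	dockercontainer "github.com/docker/docker/api/types/container"
)

// cudaDeviceRequest mirrors the selection logic above: prefer explicit
// device IDs taken from CUDA_VISIBLE_DEVICES and zero out Count, otherwise
// fall back to a bare device count.  Docker rejects requests that set both.
func cudaDeviceRequest(count int) dockercontainer.DeviceRequest {
	var deviceIDs []string
	if v := os.Getenv("CUDA_VISIBLE_DEVICES"); v != "" {
		deviceIDs = strings.Split(v, ",")
	}
	if len(deviceIDs) > 0 {
		count = 0
	}
	return dockercontainer.DeviceRequest{
		Driver:       "nvidia",
		Count:        count,
		DeviceIDs:    deviceIDs,
		Capabilities: [][]string{{"gpu", "nvidia", "compute", "utility"}},
	}
}

func main() {
	os.Setenv("CUDA_VISIBLE_DEVICES", "1,3")
	// Prints a request with DeviceIDs [1 3] and Count 0.
	fmt.Printf("%+v\n", cudaDeviceRequest(2))
}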
diff --git a/lib/crunchrun/docker_test.go b/lib/crunchrun/docker_test.go
index 9a1573193b6801d7ffd1c32ae73a2d7a9c188784..53201b8d513766f8a2c6a1e17fd715aa65135586 100644
--- a/lib/crunchrun/docker_test.go
+++ b/lib/crunchrun/docker_test.go
@@ -59,6 +59,6 @@ func (s *dockerStubSuite) TestDockerContainerConfig(c *C) {
        c.Check(hostCfg.Resources.DeviceRequests, DeepEquals, []dockercontainer.DeviceRequest{{
                Driver:       "nvidia",
                Count:        3,
-               Capabilities: [][]string{{"gpu", "nvidia", "compute"}},
+               Capabilities: [][]string{{"gpu", "nvidia", "compute", "utility"}},
        }})
 }
diff --git a/lib/crunchrun/singularity.go b/lib/crunchrun/singularity.go
index cda10aa611b7cf8d3d1d03822495f91a9ee0f8d6..64a3773250701ecd62832e52e88a5fcf8a2b3da2 100644
--- a/lib/crunchrun/singularity.go
+++ b/lib/crunchrun/singularity.go
@@ -284,6 +284,15 @@ func (e *singularityExecutor) execCmd(path string) *exec.Cmd {
                env = append(env, "SINGULARITYENV_"+k+"="+v)
        }
 
+       // Singularity always makes all nvidia devices visible to the
+       // container.  If a resource manager such as slurm or LSF told
+       // us to select specific devices, we need to propagate that.
+       if cudaVisibleDevices := os.Getenv("CUDA_VISIBLE_DEVICES"); cudaVisibleDevices != "" {
+               env = append(env, "SINGULARITYENV_CUDA_VISIBLE_DEVICES="+cudaVisibleDevices)
+       }
+
        args = append(args, e.imageFilename)
        args = append(args, e.spec.Command...)
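
For context on the Singularity side: variables prefixed with SINGULARITYENV_ are injected into the container's environment by singularity, so forwarding the host's CUDA_VISIBLE_DEVICES narrows which of the always-visible GPUs the containerized CUDA runtime will use. Below is a minimal sketch of invoking singularity that way outside crunch-run; the image name is a placeholder, and "--nv" is shown only to illustrate typical GPU usage, not this executor's exact flags:

package main

import (
	"fmt"
	"os"
	"os/exec"
)

func main() {
	env := os.Environ()
	// Propagate the resource manager's device selection, if any, the
	// same way the singularity executor above does.
	if v := os.Getenv("CUDA_VISIBLE_DEVICES"); v != "" {
		env = append(env, "SINGULARITYENV_CUDA_VISIBLE_DEVICES="+v)
	}
	// "--nv" asks singularity to bind the host's NVIDIA libraries and
	// devices into the container; "image.sif" is a placeholder image.
	cmd := exec.Command("singularity", "exec", "--nv", "image.sif", "nvidia-smi", "-L")
	cmd.Env = env
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		fmt.Fprintln(os.Stderr, "singularity exec failed:", err)
	}
}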