// Copyright (C) The Arvados Authors. All rights reserved.
//
// SPDX-License-Identifier: AGPL-3.0

package crunchrun

import (
	"os/exec"
)
// nvidiaModprobe makes sure all the nvidia kernel modules and devices
// are set up. If we don't have all the modules/devices set up we get
// "CUDA_ERROR_UNKNOWN".
func nvidiaModprobe(writer *ThrottledLogger) {
	// The underlying problem is that when normally running
	// directly on the host, the CUDA SDK will automatically
	// detect and set up the devices on demand. However, when
	// running inside a container, it lacks sufficient permissions
	// to do that. So, it needs to be set up before the container
	// launches.
	//
	// The Singularity documentation hints about this but isn't
	// very helpful with a solution.
	// https://sylabs.io/guides/3.7/user-guide/gpu.html#cuda-error-unknown-when-everything-seems-to-be-correctly-configured
	// If we're running "nvidia-persistenced", it sets up most of
	// these things on system boot.
	//
	// However, it seems that doesn't include /dev/nvidia-uvm.
	// We're also not guaranteed to be running
	// "nvidia-persistenced" or otherwise have the devices set up
	// for us. So the most robust solution is to do it ourselves.
	//
	// These are idempotent operations so it is harmless in the
	// case that everything was actually already set up.
	// Running nvidia-smi the first time loads the core 'nvidia'
	// kernel module and creates /dev/nvidiactl and the per-GPU
	// /dev/nvidia* devices.
	nvidiaSmi := exec.Command("nvidia-smi", "-L")
	nvidiaSmi.Stdout = writer
	nvidiaSmi.Stderr = writer
	err := nvidiaSmi.Run()
	if err != nil {
		writer.Printf("Warning %v: %v", nvidiaSmi.Args, err)
	}
	// Load the kernel modules & devices associated with
	// /dev/nvidia-modeset, /dev/nvidia-nvlink, /dev/nvidia-uvm
	// and /dev/nvidia-uvm-tools (-m, -l and -u). Annoyingly,
	// these don't have multiple devices but you need to supply
	// "-c0" anyway or it won't make the device file.
	//
	// Nvswitch devices are multi-GPU interconnects for up to 16
	// GPUs. The "-c0 -s" flag will create /dev/nvidia-nvswitch0.
	// If someone runs Arvados on a system with multiple
	// nvswitches (i.e. more than 16 GPUs) they'll have to ensure
	// that all the /dev/nvidia-nvswitch* devices exist before
	// crunch-run starts.
	for _, opt := range []string{"-m", "-l", "-u", "-s"} {
		nvmodprobe := exec.Command("nvidia-modprobe", "-c0", opt)
		nvmodprobe.Stdout = writer
		nvmodprobe.Stderr = writer
		err = nvmodprobe.Run()
		if err != nil {
			writer.Printf("Warning %v: %v", nvmodprobe.Args, err)
		}
	}
}
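
// Illustrative sketch (not part of the original file): a caller would
// typically run nvidiaModprobe on the host before launching a GPU
// container, so the /dev/nvidia* device files already exist when the
// container runtime binds them in. The gpuRequested flag and the
// setupGPUDevices name below are hypothetical stand-ins for however the
// caller decides that GPUs are needed.
func setupGPUDevices(gpuRequested bool, logger *ThrottledLogger) {
	if !gpuRequested {
		// No GPUs requested, nothing to set up.
		return
	}
	// Load the kernel modules and create the device files on the host,
	// where we have sufficient permissions, before the container starts.
	nvidiaModprobe(logger)
}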