1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
13 // nvidiaModprobe makes sure all the nvidia kernel modules and devices
14 // are set up. If we don't have all the modules/devices set up we get
15 // "CUDA_ERROR_UNKNOWN".
16 func nvidiaModprobe(writer io.Writer) {
17 // The underlying problem is that when normally running
18 // directly on the host, the CUDA SDK will automatically
19 // detect and set up the devices on demand. However, when
20 // running inside a container, it lacks sufficient permissions
21 // to do that. So, it needs to be set up before the container
24 // The Singularity documentation hints about this but isn't
25 // very helpful with a solution.
26 // https://sylabs.io/guides/3.7/user-guide/gpu.html#cuda-error-unknown-when-everything-seems-to-be-correctly-configured
28 // If we're running "nvidia-persistenced", it sets up most of
29 // these things on system boot.
31 // However, it seems that doesn't include /dev/nvidia-uvm.
32 // We're also not guaranteed to be running
33 // "nvidia-persistenced" or otherwise have the devices set up
34 // for us. So the most robust solution is to do it ourselves.
36 // These are idempotent operations so it is harmless in the
37 // case that everything was actually already set up.
39 // Running nvidia-smi the first time loads the core 'nvidia'
40 // kernel module and creates /dev/nvidiactl and the per-GPU
41 // /dev/nvidia* devices.
42 nvidiaSmi := exec.Command("nvidia-smi", "-L")
43 nvidiaSmi.Stdout = writer
44 nvidiaSmi.Stderr = writer
45 err := nvidiaSmi.Run()
47 fmt.Fprintf(writer, "Warning %v: %v\n", nvidiaSmi.Args, err)
50 // Load the kernel modules & devices associated with
51 // /dev/nvidia-modeset, /dev/nvidia-nvlink, /dev/nvidia-uvm
52 // and /dev/nvidia-uvm-tools (-m, -l and -u). Annoyingly,
53 // these don't have multiple devices but you need to supply
54 // "-c0" anyway or it won't make the device file.
56 // Nvswitch devices are multi-GPU interconnects for up to 16
57 // GPUs. The "-c0 -s" flag will create /dev/nvidia-nvswitch0.
58 // If someone runs Arvados on a system with multiple
59 // nvswitches (i.e. more than 16 GPUs) they'll have to ensure
60 // that all the /dev/nvidia-nvswitch* devices exist before
62 for _, opt := range []string{"-m", "-l", "-u", "-s"} {
63 nvmodprobe := exec.Command("nvidia-modprobe", "-c0", opt)
64 nvmodprobe.Stdout = writer
65 nvmodprobe.Stderr = writer
66 err = nvmodprobe.Run()
68 fmt.Fprintf(writer, "Warning %v: %v\n", nvmodprobe.Args, err)