+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package crunchrun
+
+import (
+ "fmt"
+ "io"
+ "os/exec"
+)
+
+// nvidiaModprobe makes sure all the nvidia kernel modules and devices
+// are set up. If we don't have all the modules/devices set up, we get
+// "CUDA_ERROR_UNKNOWN".
+func nvidiaModprobe(writer io.Writer) {
+ // The underlying problem is that when running directly on the
+ // host, the CUDA SDK will automatically detect and set up the
+ // devices on demand. However, when running inside a container,
+ // it lacks sufficient permissions to do that, so the devices
+ // need to be set up before the container can be started.
+ //
+ // The Singularity documentation hints at this but isn't very
+ // helpful with a solution.
+ // https://sylabs.io/guides/3.7/user-guide/gpu.html#cuda-error-unknown-when-everything-seems-to-be-correctly-configured
+ //
+ // If we're running "nvidia-persistenced", it sets up most of
+ // these things on system boot.
+ //
+ // However, it seems that doesn't include /dev/nvidia-uvm.
+ // We're also not guaranteed to be running
+ // "nvidia-persistenced" or to otherwise have the devices set
+ // up for us, so the most robust solution is to do it ourselves.
+ //
+ // These are idempotent operations, so they are harmless if
+ // everything was already set up.
+
+ // Running nvidia-smi the first time loads the core 'nvidia'
+ // kernel module and creates /dev/nvidiactl and the per-GPU
+ // /dev/nvidia* devices.
+ nvidiaSmi := exec.Command("nvidia-smi", "-L")
+ nvidiaSmi.Stdout = writer
+ nvidiaSmi.Stderr = writer
+ // Report a failure to the log instead of silently dropping it.
+ if err := nvidiaSmi.Run(); err != nil {
+ 	fmt.Fprintf(writer, "warning: %v: %v\n", nvidiaSmi.Args, err)
+ }
+
+ // Load the kernel modules & devices associated with
+ // /dev/nvidia-modeset, /dev/nvidia-nvlink, /dev/nvidia-uvm
+ // and /dev/nvidia-uvm-tools (-m, -l and -u). Annoyingly, even
+ // though these don't have multiple devices, you need to supply
+ // "-c0" anyway or nvidia-modprobe won't create the device file.
+ for _, opt := range []string{"-m", "-l", "-u"} {
+ 	exec.Command("nvidia-modprobe", "-c0", opt).Run()
+ }
+
+ // Nvswitch devices are multi-GPU interconnects for up to 16
+ // GPUs. Here we'll create /dev/nvidia-nvswitch0. If someone
+ // runs Arvados on a system with multiple nvswitches
+ // (i.e. more than 16 GPUs) they can either ensure that the
+ // additional /dev/nvidia-nvswitch* devices exist before
+ // crunch-run starts or pay for support (because they clearly
+ // have the budget for it).
+ exec.Command("nvidia-modprobe", "-c0", "-s").Run()
+}
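+
+// A minimal usage sketch (hypothetical: "gpuRequested" and "logWriter"
+// stand in for whatever the real crunch-run call site uses to decide
+// whether GPUs are needed and where log output should go):
+//
+//	if gpuRequested {
+//		nvidiaModprobe(logWriter)
+//	}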