X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/042f47a2c6b5f3db80142164b6493c873aca0b26..13f6d45704efc68ca8419e8917376aa44fdee1be:/lib/crunchrun/cuda.go diff --git a/lib/crunchrun/cuda.go b/lib/crunchrun/cuda.go index 91949c588f..c693dbcb96 100644 --- a/lib/crunchrun/cuda.go +++ b/lib/crunchrun/cuda.go @@ -5,14 +5,13 @@ package crunchrun import ( - "io" "os/exec" ) // nvidiaModprobe makes sure all the nvidia kernel modules and devices // are set up. If we don't have all the modules/devices set up we get // "CUDA_ERROR_UNKNOWN". -func nvidiaModprobe(writer io.Writer) { +func nvidiaModprobe(writer *ThrottledLogger) { // The underlying problem is that when normally running // directly on the host, the CUDA SDK will automatically // detect and set up the devices on demand. However, when @@ -41,23 +40,30 @@ func nvidiaModprobe(writer io.Writer) { nvidiaSmi := exec.Command("nvidia-smi", "-L") nvidiaSmi.Stdout = writer nvidiaSmi.Stderr = writer - nvidiaSmi.Run() + err := nvidiaSmi.Run() + if err != nil { + writer.Printf("Warning %v: %v", nvidiaSmi.Args, err) + } // Load the kernel modules & devices associated with // /dev/nvidia-modeset, /dev/nvidia-nvlink, /dev/nvidia-uvm - // and /dev/nvidia-uvm-tools (-m, -l and -u). Annoyingly, you - // don't have multiple devices but you need to supply "-c0" - // anyway or it won't make the device file. - exec.Command("nvidia-modprobe", "-c0", "-m").Run() - exec.Command("nvidia-modprobe", "-c0", "-l").Run() - exec.Command("nvidia-modprobe", "-c0", "-u").Run() + // and /dev/nvidia-uvm-tools (-m, -l and -u). Annoyingly, + // these don't have multiple devices but you need to supply + // "-c0" anyway or it won't make the device file. // Nvswitch devices are multi-GPU interconnects for up to 16 - // GPUs. Here we'll create /dev/nvidia-nvswitch0. If someone - // runs Arvados on a system with multiple nvswitches - // (i.e. more than 16 GPUs) they can either ensure that the - // additional /dev/nvidia-nvswitch* devices exist before - // crunch-run starts or pay for support (because they clearly - // have the budget for it). - exec.Command("nvidia-modprobe", "-c0", "-s").Run() + // GPUs. The "-c0 -s" flag will create /dev/nvidia-nvswitch0. + // If someone runs Arvados on a system with multiple + // nvswitches (i.e. more than 16 GPUs) they'll have to ensure + // that all the /dev/nvidia-nvswitch* devices exist before + // crunch-run starts. + for _, opt := range []string{"-m", "-l", "-u", "-s"} { + nvmodprobe := exec.Command("nvidia-modprobe", "-c0", opt) + nvmodprobe.Stdout = writer + nvmodprobe.Stderr = writer + err = nvmodprobe.Run() + if err != nil { + writer.Printf("Warning %v: %v", nvmodprobe.Args, err) + } + } }