X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/042f47a2c6b5f3db80142164b6493c873aca0b26..13f6d45704efc68ca8419e8917376aa44fdee1be:/lib/crunchrun/cuda.go

diff --git a/lib/crunchrun/cuda.go b/lib/crunchrun/cuda.go
index 91949c588f..c693dbcb96 100644
--- a/lib/crunchrun/cuda.go
+++ b/lib/crunchrun/cuda.go
@@ -5,14 +5,13 @@
 package crunchrun
 
 import (
-	"io"
 	"os/exec"
 )
 
 // nvidiaModprobe makes sure all the nvidia kernel modules and devices
 // are set up.  If we don't have all the modules/devices set up we get
 // "CUDA_ERROR_UNKNOWN".
-func nvidiaModprobe(writer io.Writer) {
+func nvidiaModprobe(writer *ThrottledLogger) {
 	// The underlying problem is that when normally running
 	// directly on the host, the CUDA SDK will automatically
 	// detect and set up the devices on demand.  However, when
@@ -41,23 +40,30 @@ func nvidiaModprobe(writer io.Writer) {
 	nvidiaSmi := exec.Command("nvidia-smi", "-L")
 	nvidiaSmi.Stdout = writer
 	nvidiaSmi.Stderr = writer
-	nvidiaSmi.Run()
+	err := nvidiaSmi.Run()
+	if err != nil {
+		writer.Printf("Warning %v: %v", nvidiaSmi.Args, err)
+	}
 
 	// Load the kernel modules & devices associated with
 	// /dev/nvidia-modeset, /dev/nvidia-nvlink, /dev/nvidia-uvm
-	// and /dev/nvidia-uvm-tools (-m, -l and -u).  Annoyingly, you
-	// don't have multiple devices but you need to supply "-c0"
-	// anyway or it won't make the device file.
-	exec.Command("nvidia-modprobe", "-c0", "-m").Run()
-	exec.Command("nvidia-modprobe", "-c0", "-l").Run()
-	exec.Command("nvidia-modprobe", "-c0", "-u").Run()
+	// and /dev/nvidia-uvm-tools (-m, -l and -u).  Annoyingly,
+	// these don't have multiple devices but you need to supply
+	// "-c0" anyway or it won't make the device file.
 
 	// Nvswitch devices are multi-GPU interconnects for up to 16
-	// GPUs.  Here we'll create /dev/nvidia-nvswitch0.  If someone
-	// runs Arvados on a system with multiple nvswitches
-	// (i.e. more than 16 GPUs) they can either ensure that the
-	// additional /dev/nvidia-nvswitch* devices exist before
-	// crunch-run starts or pay for support (because they clearly
-	// have the budget for it).
-	exec.Command("nvidia-modprobe", "-c0", "-s").Run()
+	// GPUs.  The "-c0 -s" flags will create /dev/nvidia-nvswitch0.
+	// If someone runs Arvados on a system with multiple
+	// nvswitches (i.e. more than 16 GPUs) they'll have to ensure
+	// that all the /dev/nvidia-nvswitch* devices exist before
+	// crunch-run starts.
+	for _, opt := range []string{"-m", "-l", "-u", "-s"} {
+		nvmodprobe := exec.Command("nvidia-modprobe", "-c0", opt)
+		nvmodprobe.Stdout = writer
+		nvmodprobe.Stderr = writer
+		err = nvmodprobe.Run()
+		if err != nil {
+			writer.Printf("Warning %v: %v", nvmodprobe.Args, err)
+		}
+	}
 }