12630: Report errors from nvidia-modprobe & use Getenv

author Peter Amstutz <peter.amstutz@curii.com>

Mon, 10 Jan 2022 19:11:37 +0000 (14:11 -0500)

committer Peter Amstutz <peter.amstutz@curii.com>

Mon, 10 Jan 2022 19:11:37 +0000 (14:11 -0500)
author Peter Amstutz <peter.amstutz@curii.com>
Mon, 10 Jan 2022 19:11:37 +0000 (14:11 -0500)
committer Peter Amstutz <peter.amstutz@curii.com>
Mon, 10 Jan 2022 19:11:37 +0000 (14:11 -0500)
diff --git a/lib/crunchrun/cuda.go b/lib/crunchrun/cuda.go

index 91949c588feca8123af4f50621a5178b5f6c1e29..8282359eab486277ff8c548e83cc2fae84d514a2 100644 (file)
--- a/lib/crunchrun/cuda.go
+++ b/lib/crunchrun/cuda.go
@@ -5,6 +5,7 @@
  package crunchrun
  
  import (
+       "fmt"
         "io"
         "os/exec"
  )
@@ -41,23 +42,30 @@ func nvidiaModprobe(writer io.Writer) {
         nvidiaSmi := exec.Command("nvidia-smi", "-L")
         nvidiaSmi.Stdout = writer
         nvidiaSmi.Stderr = writer
-       nvidiaSmi.Run()
+       err := nvidiaSmi.Run()
+       if err != nil {
+               writer.Write([]byte(fmt.Sprintf("nvidia-smi error: %v\n", err)))
+       }
  
         // Load the kernel modules & devices associated with
         // /dev/nvidia-modeset, /dev/nvidia-nvlink, /dev/nvidia-uvm
-       // and /dev/nvidia-uvm-tools (-m, -l and -u).  Annoyingly, you
-       // don't have multiple devices but you need to supply "-c0"
-       // anyway or it won't make the device file.
-       exec.Command("nvidia-modprobe", "-c0", "-m").Run()
-       exec.Command("nvidia-modprobe", "-c0", "-l").Run()
-       exec.Command("nvidia-modprobe", "-c0", "-u").Run()
+       // and /dev/nvidia-uvm-tools (-m, -l and -u).  Annoyingly,
+       // these don't have multiple devices but you need to supply
+       // "-c0" anyway or it won't make the device file.
  
         // Nvswitch devices are multi-GPU interconnects for up to 16
-       // GPUs.  Here we'll create /dev/nvidia-nvswitch0.  If someone
-       // runs Arvados on a system with multiple nvswitches
-       // (i.e. more than 16 GPUs) they can either ensure that the
-       // additional /dev/nvidia-nvswitch* devices exist before
-       // crunch-run starts or pay for support (because they clearly
-       // have the budget for it).
-       exec.Command("nvidia-modprobe", "-c0", "-s").Run()
+       // GPUs.  The "-c0 -s" flag will create /dev/nvidia-nvswitch0.
+       // If someone runs Arvados on a system with multiple
+       // nvswitches (i.e. more than 16 GPUs) they'll have to ensure
+       // that all the /dev/nvidia-nvswitch* devices exist before
+       // crunch-run starts.
+       for _, opt := range []string{"-m", "-l", "-u", "-s"} {
+               nvmodprobe := exec.Command("nvidia-modprobe", "-c0", opt)
+               nvmodprobe.Stdout = writer
+               nvmodprobe.Stderr = writer
+               err = nvmodprobe.Run()
+               if err != nil {
+                       writer.Write([]byte(fmt.Sprintf("nvidia-modprobe error: %v\n", err)))
+               }
+       }
  }
diff --git a/lib/crunchrun/docker.go b/lib/crunchrun/docker.go

index c20f78bb1af0cfb6b318f7e9c519657d5079d1c2..06e5b5b1ece5794881e8a5cc04c05059563e358d 100644 (file)
--- a/lib/crunchrun/docker.go
+++ b/lib/crunchrun/docker.go
@@ -108,13 +108,12 @@ func (e *dockerExecutor) config(spec containerSpec) (dockercontainer.Config, doc
         }
         if spec.CUDADeviceCount != 0 {
                 var deviceIds []string
-               for _, s := range os.Environ() {
+               if cudaVisibleDevices := os.Getenv("CUDA_VISIBLE_DEVICES"); cudaVisibleDevices != "" {
                         // If a resource manager such as slurm or LSF told
                         // us to select specific devices we need to propagate that.
-                       if strings.HasPrefix(s, "CUDA_VISIBLE_DEVICES=") {
-                               deviceIds = strings.Split(strings.SplitN(s, "=", 2)[1], ",")
-                       }
+                       deviceIds = strings.Split(cudaVisibleDevices, ",")
                 }
+
                 deviceCount := spec.CUDADeviceCount
                 if len(deviceIds) > 0 {
                         // Docker won't accept both non-empty
diff --git a/lib/crunchrun/singularity.go b/lib/crunchrun/singularity.go

index 942de4300e087a95306fc2de1d20288535581a67..64a3773250701ecd62832e52e88a5fcf8a2b3da2 100644 (file)
--- a/lib/crunchrun/singularity.go
+++ b/lib/crunchrun/singularity.go
@@ -10,7 +10,6 @@ import (
         "os"
         "os/exec"
         "sort"
-       "strings"
         "syscall"
         "time"
  
@@ -288,10 +287,10 @@ func (e *singularityExecutor) execCmd(path string) *exec.Cmd {
         // Singularity always makes all nvidia devices visible to the
         // container.  If a resource manager such as slurm or LSF told
         // us to select specific devices we need to propagate that.
-       for _, s := range os.Environ() {
-               if strings.HasPrefix(s, "CUDA_VISIBLE_DEVICES=") {
-                       env = append(env, "SINGULARITYENV_"+s)
-               }
+       if cudaVisibleDevices := os.Getenv("CUDA_VISIBLE_DEVICES"); cudaVisibleDevices != "" {
+               // If a resource manager such as slurm or LSF told
+               // us to select specific devices we need to propagate that.
+               env = append(env, "SINGULARITYENV_CUDA_VISIBLE_DEVICES="+cudaVisibleDevices)
         }
  
         args = append(args, e.imageFilename)
author	Peter Amstutz <peter.amstutz@curii.com>
	Mon, 10 Jan 2022 19:11:37 +0000 (14:11 -0500)
committer	Peter Amstutz <peter.amstutz@curii.com>
	Mon, 10 Jan 2022 19:11:37 +0000 (14:11 -0500)
lib/crunchrun/cuda.go		patch | blob | history
lib/crunchrun/docker.go		patch | blob | history
lib/crunchrun/singularity.go		patch | blob | history