package crunchrun
import (
+ "fmt"
"io"
"os/exec"
)
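+ // Run "nvidia-smi -L" to list the GPUs the driver can see,
+ // sending its output and any error to the supplied writer so
+ // problems show up in the logs.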
nvidiaSmi := exec.Command("nvidia-smi", "-L")
nvidiaSmi.Stdout = writer
nvidiaSmi.Stderr = writer
- nvidiaSmi.Run()
+ err := nvidiaSmi.Run()
+ if err != nil {
+ writer.Write([]byte(fmt.Sprintf("nvidia-smi error: %v\n", err)))
+ }
// Load the kernel modules & devices associated with
// /dev/nvidia-modeset, /dev/nvidia-nvlink, /dev/nvidia-uvm
- // and /dev/nvidia-uvm-tools (-m, -l and -u). Annoyingly, you
- // don't have multiple devices but you need to supply "-c0"
- // anyway or it won't make the device file.
- exec.Command("nvidia-modprobe", "-c0", "-m").Run()
- exec.Command("nvidia-modprobe", "-c0", "-l").Run()
- exec.Command("nvidia-modprobe", "-c0", "-u").Run()
+ // and /dev/nvidia-uvm-tools (-m, -l and -u). Annoyingly,
+ // these don't have multiple devices but you need to supply
+ // "-c0" anyway or it won't make the device file.
// Nvswitch devices are multi-GPU interconnects for up to 16
- // GPUs. Here we'll create /dev/nvidia-nvswitch0. If someone
- // runs Arvados on a system with multiple nvswitches
- // (i.e. more than 16 GPUs) they can either ensure that the
- // additional /dev/nvidia-nvswitch* devices exist before
- // crunch-run starts or pay for support (because they clearly
- // have the budget for it).
- exec.Command("nvidia-modprobe", "-c0", "-s").Run()
+ // GPUs. The "-c0 -s" flag will create /dev/nvidia-nvswitch0.
+ // If someone runs Arvados on a system with multiple
+ // nvswitches (i.e. more than 16 GPUs) they'll have to ensure
+ // that all the /dev/nvidia-nvswitch* devices exist before
+ // crunch-run starts.
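+ // Run nvidia-modprobe once per flag; a failure is logged but
+ // the remaining flags are still attempted.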
+ for _, opt := range []string{"-m", "-l", "-u", "-s"} {
+ nvmodprobe := exec.Command("nvidia-modprobe", "-c0", opt)
+ nvmodprobe.Stdout = writer
+ nvmodprobe.Stderr = writer
+ err = nvmodprobe.Run()
+ if err != nil {
+ writer.Write([]byte(fmt.Sprintf("nvidia-modprobe error: %v\n", err)))
+ }
+ }
}
}
if spec.CUDADeviceCount != 0 {
var deviceIds []string
- for _, s := range os.Environ() {
+ if cudaVisibleDevices := os.Getenv("CUDA_VISIBLE_DEVICES"); cudaVisibleDevices != "" {
// If a resource manager such as slurm or LSF told
// us to select specific devices we need to propagate that.
- if strings.HasPrefix(s, "CUDA_VISIBLE_DEVICES=") {
- deviceIds = strings.Split(strings.SplitN(s, "=", 2)[1], ",")
- }
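+ // CUDA_VISIBLE_DEVICES is a comma-separated list of device
+ // indices (or GPU UUIDs).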
+ deviceIds = strings.Split(cudaVisibleDevices, ",")
}
+
deviceCount := spec.CUDADeviceCount
if len(deviceIds) > 0 {
// Docker won't accept both non-empty
"os"
"os/exec"
"sort"
- "strings"
"syscall"
"time"
// Singularity always makes all nvidia devices visible to the
// container. If a resource manager such as slurm or LSF told
// us to select specific devices we need to propagate that.
- for _, s := range os.Environ() {
- if strings.HasPrefix(s, "CUDA_VISIBLE_DEVICES=") {
- env = append(env, "SINGULARITYENV_"+s)
- }
+ if cudaVisibleDevices := os.Getenv("CUDA_VISIBLE_DEVICES"); cudaVisibleDevices != "" {
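+ // Singularity copies SINGULARITYENV_-prefixed variables into
+ // the container's environment with the prefix stripped.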
+ env = append(env, "SINGULARITYENV_CUDA_VISIBLE_DEVICES="+cudaVisibleDevices)
}
args = append(args, e.imageFilename)