lib/crunchrun/cuda.go

   1 // Copyright (C) The Arvados Authors. All rights reserved.
   2 //
   3 // SPDX-License-Identifier: AGPL-3.0
   4
   5 package crunchrun
   6
   7 import (
   8         "io"
   9         "os/exec"
  10 )
  11
  12 // nvidiaModprobe makes sure all the nvidia kernel modules and devices
  13 // are set up.  If we don't have all the modules/devices set up we get
  14 // "CUDA_ERROR_UNKNOWN".
  15 func nvidiaModprobe(writer io.Writer) {
  16         // The underlying problem is that when normally running
  17         // directly on the host, the CUDA SDK will automatically
  18         // detect and set up the devices on demand.  However, when
  19         // running inside a container, it lacks sufficient permissions
  20         // to do that.  So, it needs to be set up before the container
  21         // can be started.
  22         //
  23         // The Singularity documentation hints about this but isn't
  24         // very helpful with a solution.
  25         // https://sylabs.io/guides/3.7/user-guide/gpu.html#cuda-error-unknown-when-everything-seems-to-be-correctly-configured
  26         //
  27         // If we're running "nvidia-persistenced", it sets up most of
  28         // these things on system boot.
  29         //
  30         // However, it seems that doesn't include /dev/nvidia-uvm
  31         // We're also no guaranteed to be running
  32         // "nvidia-persistenced" or otherwise have the devices set up
  33         // for us.  So the most robust solution is to do it ourselves.
  34         //
  35         // These are idempotent operations so it is harmless in the
  36         // case that everything was actually already set up.
  37
  38         // Running nvida-smi the first time loads the core 'nvidia'
  39         // kernel module creates /dev/nvidiactl the per-GPU
  40         // /dev/nvidia* devices
  41         nvidiaSmi := exec.Command("nvidia-smi", "-L")
  42         nvidiaSmi.Stdout = writer
  43         nvidiaSmi.Stderr = writer
  44         nvidiaSmi.Run()
  45
  46         // Load the kernel modules & devices associated with
  47         // /dev/nvidia-modeset, /dev/nvidia-nvlink, /dev/nvidia-uvm
  48         // and /dev/nvidia-uvm-tools (-m, -l and -u).  Annoyingly, you
  49         // don't have multiple devices but you need to supply "-c0"
  50         // anyway or it won't make the device file.
  51         exec.Command("nvidia-modprobe", "-c0", "-m").Run()
  52         exec.Command("nvidia-modprobe", "-c0", "-l").Run()
  53         exec.Command("nvidia-modprobe", "-c0", "-u").Run()
  54
  55         // Nvswitch devices are multi-GPU interconnects for up to 16
  56         // GPUs.  Here we'll create /dev/nvidia-nvswitch0.  If someone
  57         // runs Arvados on a system with multiple nvswitches
  58         // (i.e. more than 16 GPUs) they can either ensure that the
  59         // additional /dev/nvidia-nvswitch* devices exist before
  60         // crunch-run starts or pay for support (because they clearly
  61         // have the budget for it).
  62         exec.Command("nvidia-modprobe", "-c0", "-s").Run()
  63 }