lib/crunchrun/cuda.go

   1 // Copyright (C) The Arvados Authors. All rights reserved.
   2 //
   3 // SPDX-License-Identifier: AGPL-3.0
   4
   5 package crunchrun
   6
   7 import (
   8         "os/exec"
   9 )
  10
  11 // nvidiaModprobe makes sure all the nvidia kernel modules and devices
  12 // are set up.  If we don't have all the modules/devices set up we get
  13 // "CUDA_ERROR_UNKNOWN".
  14 func nvidiaModprobe(writer *ThrottledLogger) {
  15         // The underlying problem is that when normally running
  16         // directly on the host, the CUDA SDK will automatically
  17         // detect and set up the devices on demand.  However, when
  18         // running inside a container, it lacks sufficient permissions
  19         // to do that.  So, it needs to be set up before the container
  20         // can be started.
  21         //
  22         // The Singularity documentation hints about this but isn't
  23         // very helpful with a solution.
  24         // https://sylabs.io/guides/3.7/user-guide/gpu.html#cuda-error-unknown-when-everything-seems-to-be-correctly-configured
  25         //
  26         // If we're running "nvidia-persistenced", it sets up most of
  27         // these things on system boot.
  28         //
  29         // However, it seems that doesn't include /dev/nvidia-uvm
  30         // We're also no guaranteed to be running
  31         // "nvidia-persistenced" or otherwise have the devices set up
  32         // for us.  So the most robust solution is to do it ourselves.
  33         //
  34         // These are idempotent operations so it is harmless in the
  35         // case that everything was actually already set up.
  36
  37         // Running nvida-smi the first time loads the core 'nvidia'
  38         // kernel module creates /dev/nvidiactl the per-GPU
  39         // /dev/nvidia* devices
  40         nvidiaSmi := exec.Command("nvidia-smi", "-L")
  41         nvidiaSmi.Stdout = writer
  42         nvidiaSmi.Stderr = writer
  43         err := nvidiaSmi.Run()
  44         if err != nil {
  45                 writer.Printf("Warning %v: %v", nvidiaSmi.Args, err)
  46         }
  47
  48         // Load the kernel modules & devices associated with
  49         // /dev/nvidia-modeset, /dev/nvidia-nvlink, /dev/nvidia-uvm
  50         // and /dev/nvidia-uvm-tools (-m, -l and -u).  Annoyingly,
  51         // these don't have multiple devices but you need to supply
  52         // "-c0" anyway or it won't make the device file.
  53
  54         // Nvswitch devices are multi-GPU interconnects for up to 16
  55         // GPUs.  The "-c0 -s" flag will create /dev/nvidia-nvswitch0.
  56         // If someone runs Arvados on a system with multiple
  57         // nvswitches (i.e. more than 16 GPUs) they'll have to ensure
  58         // that all the /dev/nvidia-nvswitch* devices exist before
  59         // crunch-run starts.
  60         for _, opt := range []string{"-m", "-l", "-u", "-s"} {
  61                 nvmodprobe := exec.Command("nvidia-modprobe", "-c0", opt)
  62                 nvmodprobe.Stdout = writer
  63                 nvmodprobe.Stderr = writer
  64                 err = nvmodprobe.Run()
  65                 if err != nil {
  66                         writer.Printf("Warning %v: %v", nvmodprobe.Args, err)
  67                 }
  68         }
  69 }