12630: Use CUDADeviceCount instead of EnableCUDA
author Peter Amstutz <peter.amstutz@curii.com>
Mon, 22 Nov 2021 18:35:20 +0000 (13:35 -0500)
committer Peter Amstutz <peter.amstutz@curii.com>
Fri, 10 Dec 2021 16:24:54 +0000 (11:24 -0500)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

lib/crunchrun/crunchrun.go
lib/crunchrun/docker.go
lib/crunchrun/executor.go
lib/crunchrun/singularity.go
sdk/go/arvados/container.go

index dd0169025bb3cba58049dcdc9151738ea7b48463..589a046a34c0edb94ed97c48b7776cd63c1091cd 100644 (file)
@@ -987,19 +987,20 @@ func (runner *ContainerRunner) CreateContainer(imageID string, bindmounts map[st
        runner.executorStdout = stdout
        runner.executorStderr = stderr
        return runner.executor.Create(containerSpec{
-               Image:         imageID,
-               VCPUs:         runner.Container.RuntimeConstraints.VCPUs,
-               RAM:           ram,
-               WorkingDir:    workdir,
-               Env:           env,
-               BindMounts:    bindmounts,
-               Command:       runner.Container.Command,
-               EnableNetwork: enableNetwork,
-               NetworkMode:   runner.networkMode,
-               CgroupParent:  runner.setCgroupParent,
-               Stdin:         stdin,
-               Stdout:        stdout,
-               Stderr:        stderr,
+               Image:           imageID,
+               VCPUs:           runner.Container.RuntimeConstraints.VCPUs,
+               RAM:             ram,
+               WorkingDir:      workdir,
+               Env:             env,
+               BindMounts:      bindmounts,
+               Command:         runner.Container.Command,
+               EnableNetwork:   enableNetwork,
+               CUDADeviceCount: runner.Container.RuntimeConstraints.CUDADeviceCount,
+               NetworkMode:     runner.networkMode,
+               CgroupParent:    runner.setCgroupParent,
+               Stdin:           stdin,
+               Stdout:          stdout,
+               Stderr:          stderr,
        })
 }
 
index 573df7faf0b66ac0fa3ac97195a14b8ed5cde0cc..c64ff83da38ada46a19fc15153f11a42270e28bf 100644 (file)
@@ -106,10 +106,10 @@ func (e *dockerExecutor) Create(spec containerSpec) error {
                        KernelMemory: spec.RAM, // kernel portion
                },
        }
-       if spec.EnableCUDA {
+       if spec.CUDADeviceCount != 0 {
                hostCfg.Resources.DeviceRequests = append(hostCfg.Resources.DeviceRequests, dockercontainer.DeviceRequest{
                        Driver:       "nvidia",
-                       Count:        -1,
+                       Count:        spec.CUDADeviceCount,
                        Capabilities: [][]string{[]string{"gpu", "nvidia", "compute"}},
                })
        }
index bffd701bcd4d0246841a2dfbb4e2d9cde5b9c2ab..dc1bc20b7c3a110269d1f95441a6c7e75af48ace 100644 (file)
@@ -16,20 +16,20 @@ type bindmount struct {
 }
 
 type containerSpec struct {
-       Image         string
-       VCPUs         int
-       RAM           int64
-       WorkingDir    string
-       Env           map[string]string
-       BindMounts    map[string]bindmount
-       Command       []string
-       EnableNetwork bool
-       EnableCUDA    bool
-       NetworkMode   string // docker network mode, normally "default"
-       CgroupParent  string
-       Stdin         io.Reader
-       Stdout        io.Writer
-       Stderr        io.Writer
+       Image           string
+       VCPUs           int
+       RAM             int64
+       WorkingDir      string
+       Env             map[string]string
+       BindMounts      map[string]bindmount
+       Command         []string
+       EnableNetwork   bool
+       CUDADeviceCount int
+       NetworkMode     string // docker network mode, normally "default"
+       CgroupParent    string
+       Stdin           io.Reader
+       Stdout          io.Writer
+       Stderr          io.Writer
 }
 
 // containerExecutor is an interface to a container runtime
index 5637a9b4d924023688fc153e1f5a9ea57d199794..2128aeb2271f8d57da1fe5630d7bd7b7b105e1c6 100644 (file)
@@ -247,7 +247,7 @@ func (e *singularityExecutor) Start() error {
                args = append(args, "--net", "--network=none")
        }
 
-       if e.spec.EnableCUDA {
+       if e.spec.CUDADeviceCount != 0 {
                args = append(args, "--nv")
        }
 
index 014fd6c2bdd27512849e37abf31ad2d2a2c5b5d0..8e335f875ce71d529a37bee859ed75477ed64195 100644 (file)
@@ -100,10 +100,10 @@ type RuntimeConstraints struct {
        RAM                         int64    `json:"ram"`
        VCPUs                       int      `json:"vcpus"`
        KeepCacheRAM                int64    `json:"keep_cache_ram"`
-       CUDADriverVersion           string   `json:"cuda_driver_version"`
-       CUDACubinHardwareCapability []string `json:"cuda_cubin_hardware_capability"`
-       CUDAPTXHardwardCapability   string   `json:"cuda_ptx_hardware_capability"`
-       CUDADeviceCount             int      `json:"cuda_device_count"`
+       CUDADriverVersion           string   `json:"cuda_driver_version,omitempty"`
+       CUDACubinHardwareCapability []string `json:"cuda_cubin_hardware_capability,omitempty"`
+       CUDAPTXHardwardCapability   string   `json:"cuda_ptx_hardware_capability,omitempty"`
+       CUDADeviceCount             int      `json:"cuda_device_count,omitempty"`
 }
 
 // SchedulingParameters specify a container's scheduling parameters