From f04d5211ed026a4e0cbdca77dad447700eb88772 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Wed, 24 Nov 2021 15:26:46 -0500
Subject: [PATCH] 12630: Remove distinction between cubin/PTX hardware
 capabilities

Fix misspelled "HardwareCapability"

Add cuda_ fields to the runtime_constraints documentation.

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
 .../_container_runtime_constraints.liquid |  3 +++
 lib/crunchrun/crunchrun.go                | 14 +++++++++++++-
 sdk/go/arvados/container.go               | 15 +++++++--------
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/doc/_includes/_container_runtime_constraints.liquid b/doc/_includes/_container_runtime_constraints.liquid
index 7e0c8f18f2..dcdc29cf3f 100644
--- a/doc/_includes/_container_runtime_constraints.liquid
+++ b/doc/_includes/_container_runtime_constraints.liquid
@@ -14,3 +14,6 @@ table(table table-bordered table-condensed).
 |vcpus|integer|Number of cores to be used to run this process.|Optional. However, a ContainerRequest that is in "Committed" state must provide this.|
 |keep_cache_ram|integer|Number of keep cache bytes to be used to run this process.|Optional.|
 |API|boolean|When set, ARVADOS_API_HOST and ARVADOS_API_TOKEN will be set, and container will have networking enabled to access the Arvados API server.|Optional.|
+|cuda_driver_version|string|Minimum CUDA driver version.|Optional.|
+|cuda_hardware_capability|string|Minimum CUDA hardware capability.|Optional.|
+|cuda_device_count|integer|Number of GPUs to request.|Optional.|
diff --git a/lib/crunchrun/crunchrun.go b/lib/crunchrun/crunchrun.go
index 589a046a34..7e68dcd331 100644
--- a/lib/crunchrun/crunchrun.go
+++ b/lib/crunchrun/crunchrun.go
@@ -986,6 +986,18 @@ func (runner *ContainerRunner) CreateContainer(imageID string, bindmounts map[st
 	runner.executorStdin = stdin
 	runner.executorStdout = stdout
 	runner.executorStderr = stderr
+
+	cudaDeviceCount := 0
+	if runner.Container.RuntimeConstraints.CUDADriverVersion != "" ||
+		runner.Container.RuntimeConstraints.CUDAHardwareCapability != "" ||
+		runner.Container.RuntimeConstraints.CUDADeviceCount != 0 {
+		// If any CUDA constraint is set, enable CUDA GPU support
+		cudaDeviceCount = runner.Container.RuntimeConstraints.CUDADeviceCount
+		if cudaDeviceCount == 0 {
+			cudaDeviceCount = 1
+		}
+	}
+
 	return runner.executor.Create(containerSpec{
 		Image:           imageID,
 		VCPUs:           runner.Container.RuntimeConstraints.VCPUs,
@@ -995,7 +1007,7 @@ func (runner *ContainerRunner) CreateContainer(imageID string, bindmounts map[st
 		BindMounts:      bindmounts,
 		Command:         runner.Container.Command,
 		EnableNetwork:   enableNetwork,
-		CUDADeviceCount: runner.Container.RuntimeConstraints.CUDADeviceCount,
+		CUDADeviceCount: cudaDeviceCount,
 		NetworkMode:     runner.networkMode,
 		CgroupParent:    runner.setCgroupParent,
 		Stdin:           stdin,
diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go
index 8e335f875c..27afc1a3ab 100644
--- a/sdk/go/arvados/container.go
+++ b/sdk/go/arvados/container.go
@@ -96,14 +96,13 @@ type Mount struct {
 // RuntimeConstraints specify a container's compute resources (RAM,
 // CPU) and network connectivity.
 type RuntimeConstraints struct {
-	API                         bool     `json:"API"`
-	RAM                         int64    `json:"ram"`
-	VCPUs                       int      `json:"vcpus"`
-	KeepCacheRAM                int64    `json:"keep_cache_ram"`
-	CUDADriverVersion           string   `json:"cuda_driver_version,omitempty"`
-	CUDACubinHardwareCapability []string `json:"cuda_cubin_hardware_capability,omitempty"`
-	CUDAPTXHardwardCapability   string   `json:"cuda_ptx_hardware_capability,omitempty"`
-	CUDADeviceCount             int      `json:"cuda_device_count,omitempty"`
+	API                    bool   `json:"API"`
+	RAM                    int64  `json:"ram"`
+	VCPUs                  int    `json:"vcpus"`
+	KeepCacheRAM           int64  `json:"keep_cache_ram"`
+	CUDADriverVersion      string `json:"cuda_driver_version,omitempty"`
+	CUDAHardwareCapability string `json:"cuda_hardware_capability,omitempty"`
+	CUDADeviceCount        int    `json:"cuda_device_count,omitempty"`
 }
 
 // SchedulingParameters specify a container's scheduling parameters
-- 
2.30.2
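
Note (not part of the patch): with the cubin/PTX distinction removed, a GPU request is expressed through the single cuda_hardware_capability string plus cuda_driver_version and cuda_device_count. The sketch below shows how the updated RuntimeConstraints type might be filled in and serialized from Go; it assumes the usual Arvados SDK import path, and the constraint values shown are hypothetical.

package main

import (
	"encoding/json"
	"fmt"

	"git.arvados.org/arvados.git/sdk/go/arvados"
)

func main() {
	// Hypothetical constraints: CUDA driver >= 11.0, hardware
	// capability >= 7.0, one GPU device.
	rc := arvados.RuntimeConstraints{
		VCPUs:                  2,
		RAM:                    2 << 30,
		CUDADriverVersion:      "11.0",
		CUDAHardwareCapability: "7.0",
		CUDADeviceCount:        1,
	}
	buf, err := json.MarshalIndent(rc, "", "  ")
	if err != nil {
		panic(err)
	}
	// Prints the cuda_* keys documented in the runtime_constraints table above.
	fmt.Println(string(buf))
}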
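
The crunchrun change above also introduces an implicit default: setting any of the cuda_* constraints turns on GPU support, and an unset cuda_device_count is then treated as 1. A standalone restatement of that rule (a sketch only; the helper name is not from the patch):

// cudaDeviceCount restates the defaulting rule added to CreateContainer:
// any CUDA constraint implies GPU support with at least one device.
func cudaDeviceCount(rc arvados.RuntimeConstraints) int {
	n := rc.CUDADeviceCount
	if n == 0 && (rc.CUDADriverVersion != "" || rc.CUDAHardwareCapability != "") {
		n = 1
	}
	return n
}

So a request that sets only cuda_driver_version still gets one GPU, while leaving all three fields unset keeps the device count at 0 and GPU support disabled.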