From 7f88afd565b76903ad4b27fb896ff0cd844dfb7f Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Fri, 17 Dec 2021 15:57:16 -0500 Subject: [PATCH] 18321: Account for CUDA in container reuse Also ensure reuse across versions when CUDA isn't being used. Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- lib/dispatchcloud/node_size.go | 15 +++++---- lib/dispatchcloud/node_size_test.go | 27 +++++---------- sdk/go/arvados/container.go | 8 ++--- services/api/app/models/arvados_model.rb | 5 +++ services/api/app/models/container.rb | 17 +++++++++- services/api/app/models/container_request.rb | 16 +++++++++ .../api/test/fixtures/container_requests.yml | 4 +++ services/api/test/fixtures/containers.yml | 30 +++++++++++++++++ services/api/test/unit/container_test.rb | 33 +++++++++++++++++-- 9 files changed, 122 insertions(+), 33 deletions(-) diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go index 6831d8d8e9..7c7643bfc7 100644 --- a/lib/dispatchcloud/node_size.go +++ b/lib/dispatchcloud/node_size.go @@ -84,16 +84,16 @@ func EstimateScratchSpace(ctr *arvados.Container) (needScratch int64) { } // compareVersion returns true if vs1 < vs2, otherwise false -func versionLess(vs1 string, vs2 string) bool { +func versionLess(vs1 string, vs2 string) (bool, error) { v1, err := strconv.ParseFloat(vs1, 64) if err != nil { - return false + return false, err } v2, err := strconv.ParseFloat(vs2, 64) if err != nil { - return false + return false, err } - return v1 < v2 + return v1 < v2, nil } // ChooseInstanceType returns the cheapest available @@ -115,6 +115,9 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad ok := false for _, it := range cc.InstanceTypes { + driverInsuff, driverErr := versionLess(it.CUDA.DriverVersion, ctr.RuntimeConstraints.CUDA.DriverVersion) + capabilityInsuff, capabilityErr := versionLess(it.CUDA.HardwareCapability, ctr.RuntimeConstraints.CUDA.HardwareCapability) + switch { // reasons to reject a node case ok && it.Price > best.Price: // already selected a node, and this one is more expensive @@ -124,8 +127,8 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad case it.Preemptible != ctr.SchedulingParameters.Preemptible: // wrong preemptable setting case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs): // same price, worse specs case it.CUDA.DeviceCount < ctr.RuntimeConstraints.CUDA.DeviceCount: // insufficient CUDA devices - case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && versionLess(it.CUDA.DriverVersion, ctr.RuntimeConstraints.CUDA.DriverVersion): // insufficient driver version - case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && versionLess(it.CUDA.HardwareCapability, ctr.RuntimeConstraints.CUDA.HardwareCapability): // insufficient hardware capability + case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && (driverInsuff || driverErr != nil): // insufficient driver version + case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && (capabilityInsuff || capabilityErr != nil): // insufficient hardware capability // Don't select this node default: // Didn't reject the node, so select it diff --git a/lib/dispatchcloud/node_size_test.go b/lib/dispatchcloud/node_size_test.go index 06255462b3..eb3648e8ac 100644 --- a/lib/dispatchcloud/node_size_test.go +++ b/lib/dispatchcloud/node_size_test.go @@ -155,6 +155,7 @@ func (*NodeSizeSuite) TestChooseGPU(c *check.C) { "best": {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "11.0"}}, "low_driver": {Price: 2.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "low_driver", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "10.0"}}, "cheap_gpu": {Price: 2.0, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "cheap_gpu", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "8.0", DriverVersion: "10.0"}}, + "invalid_gpu": {Price: 1.9, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "invalid_gpu", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "12.0.12", DriverVersion: "12.0.12"}}, "non_gpu": {Price: 1.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "non_gpu"}, } @@ -201,23 +202,7 @@ func (*NodeSizeSuite) TestChooseGPU(c *check.C) { HardwareCapability: "", DriverVersion: "10.0", }, - SelectedInstance: "cheap_gpu", - }, - GPUTestCase{ - CUDA: arvados.CUDARuntimeConstraints{ - DeviceCount: 1, - HardwareCapability: "8.0", - DriverVersion: "", - }, - SelectedInstance: "cheap_gpu", - }, - GPUTestCase{ - CUDA: arvados.CUDARuntimeConstraints{ - DeviceCount: 1, - HardwareCapability: "", - DriverVersion: "", - }, - SelectedInstance: "cheap_gpu", + SelectedInstance: "", }, GPUTestCase{ CUDA: arvados.CUDARuntimeConstraints{ @@ -241,7 +226,11 @@ func (*NodeSizeSuite) TestChooseGPU(c *check.C) { CUDA: tc.CUDA, }, }) - c.Check(err, check.IsNil) - c.Check(best.Name, check.Equals, tc.SelectedInstance) + if best.Name != "" { + c.Check(err, check.IsNil) + c.Check(best.Name, check.Equals, tc.SelectedInstance) + } else { + c.Check(err, check.Not(check.IsNil)) + } } } diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go index d0b1273334..3510a6db04 100644 --- a/sdk/go/arvados/container.go +++ b/sdk/go/arvados/container.go @@ -94,9 +94,9 @@ type Mount struct { } type CUDARuntimeConstraints struct { - DriverVersion string `json:"driver_version,omitempty"` - HardwareCapability string `json:"hardware_capability,omitempty"` - DeviceCount int `json:"device_count,omitempty"` + DriverVersion string `json:"driver_version"` + HardwareCapability string `json:"hardware_capability"` + DeviceCount int `json:"device_count"` } // RuntimeConstraints specify a container's compute resources (RAM, @@ -106,7 +106,7 @@ type RuntimeConstraints struct { RAM int64 `json:"ram"` VCPUs int `json:"vcpus"` KeepCacheRAM int64 `json:"keep_cache_ram"` - CUDA CUDARuntimeConstraints `json:"cuda,omitempty"` + CUDA CUDARuntimeConstraints `json:"cuda"` } // SchedulingParameters specify a container's scheduling parameters diff --git a/services/api/app/models/arvados_model.rb b/services/api/app/models/arvados_model.rb index af226cde82..00934322d2 100644 --- a/services/api/app/models/arvados_model.rb +++ b/services/api/app/models/arvados_model.rb @@ -884,6 +884,11 @@ class ArvadosModel < ApplicationRecord def fill_container_defaults self.runtime_constraints = { 'API' => false, + 'cuda' => { + 'device_count' => 0, + 'driver_version' => '', + 'hardware_capability' => '', + }, 'keep_cache_ram' => 0, 'ram' => 0, 'vcpus' => 0, diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb index d6a44c8023..2443da4551 100644 --- a/services/api/app/models/container.rb +++ b/services/api/app/models/container.rb @@ -289,7 +289,22 @@ class Container < ArvadosModel candidates = candidates.where('secret_mounts_md5 = ?', secret_mounts_md5) log_reuse_info(candidates) { "after filtering on secret_mounts_md5 #{secret_mounts_md5.inspect}" } - candidates = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints]), md5: true) + if attrs[:runtime_constraints]['cuda'].nil? + attrs[:runtime_constraints]['cuda'] = { + 'device_count' => 0, + 'driver_version' => '', + 'hardware_capability' => '', + } + end + + candidates_inc_cuda = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints]), md5: true) + if candidates_inc_cuda.count == 0 and attrs[:runtime_constraints]['cuda']['device_count'] == 0 + # Fallback search on containers introduced before CUDA support, + # exclude empty CUDA request from query + candidates = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints].except('cuda')), md5: true) + else + candidates = candidates_inc_cuda + end log_reuse_info(candidates) { "after filtering on runtime_constraints #{attrs[:runtime_constraints].inspect}" } log_reuse_info { "checking for state=Complete with readable output and log..." } diff --git a/services/api/app/models/container_request.rb b/services/api/app/models/container_request.rb index 4a580816cd..00773fcb86 100644 --- a/services/api/app/models/container_request.rb +++ b/services/api/app/models/container_request.rb @@ -336,6 +336,22 @@ class ContainerRequest < ArvadosModel "[#{k}]=#{v.inspect} must be a positive integer") end end + if runtime_constraints['cuda'] + ['device_count'].each do |k| + v = runtime_constraints['cuda'][k] + if !v.is_a?(Integer) || v < 0 + errors.add(:runtime_constraints, + "[cuda.#{k}]=#{v.inspect} must be a positive or zero integer") + end + end + ['driver_version', 'hardware_capability'].each do |k| + v = runtime_constraints['cuda'][k] + if !v.is_a?(String) + errors.add(:runtime_constraints, + "[cuda.#{k}]=#{v.inspect} must be a string") + end + end + end end end diff --git a/services/api/test/fixtures/container_requests.yml b/services/api/test/fixtures/container_requests.yml index 5be132ac30..dca89f56d3 100644 --- a/services/api/test/fixtures/container_requests.yml +++ b/services/api/test/fixtures/container_requests.yml @@ -20,6 +20,10 @@ queued: runtime_constraints: vcpus: 1 ram: 123 + cuda: + driver_version: "" + hardware_capability: "" + device_count: 0 mounts: {} running: diff --git a/services/api/test/fixtures/containers.yml b/services/api/test/fixtures/containers.yml index e7cd0abd1f..53539ce1e0 100644 --- a/services/api/test/fixtures/containers.yml +++ b/services/api/test/fixtures/containers.yml @@ -16,6 +16,10 @@ queued: runtime_constraints: ram: 12000000000 vcpus: 4 + cuda: + driver_version: "" + hardware_capability: "" + device_count: 0 mounts: /tmp: kind: tmp @@ -449,3 +453,29 @@ runtime_token: /var/spool/cwl: kind: tmp capacity: 24000000000 + +cuda_container: + uuid: zzzzz-dz642-cudagpcontainer + owner_uuid: zzzzz-tpzed-000000000000000 + state: Complete + exit_code: 0 + priority: 1 + created_at: 2016-01-11 11:11:11.111111111 Z + updated_at: 2016-01-11 11:11:11.111111111 Z + started_at: 2016-01-11 11:11:11.111111111 Z + finished_at: 2016-01-12 11:12:13.111111111 Z + container_image: test + cwd: test + log: ea10d51bcf88862dbcc36eb292017dfd+45 + output: 1f4b0bc7583c2a7f9102c395f4ffc5e3+45 + output_path: test + command: ["echo", "hello", "/bin/sh", "-c", "'cat' '/keep/fa7aeb5140e2848d39b416daeef4ffc5+45/foobar' '/keep/fa7aeb5140e2848d39b416daeef4ffc5+45/baz' '|' 'gzip' '>' '/dev/null'"] + runtime_constraints: + ram: 12000000000 + vcpus: 4 + cuda: + driver_version: "11.0" + hardware_capability: "9.0" + device_count: 1 + secret_mounts: {} + secret_mounts_md5: 99914b932bd37a50b983c5e7c90ae93b diff --git a/services/api/test/unit/container_test.rb b/services/api/test/unit/container_test.rb index 375ab5a7bb..ac3e6bea42 100644 --- a/services/api/test/unit/container_test.rb +++ b/services/api/test/unit/container_test.rb @@ -14,7 +14,7 @@ class ContainerTest < ActiveSupport::TestCase container_image: 'fa3c1a9cb6783f85f2ecda037e07b8c3+167', output_path: '/tmp', priority: 1, - runtime_constraints: {"vcpus" => 1, "ram" => 1}, + runtime_constraints: {"vcpus" => 1, "ram" => 1, "cuda" => {"device_count":0, "driver_version": "", "hardware_capability": ""}}, } REUSABLE_COMMON_ATTRS = { @@ -26,7 +26,7 @@ class ContainerTest < ActiveSupport::TestCase "API" => false, "keep_cache_ram" => 0, "ram" => 12000000000, - "vcpus" => 4, + "vcpus" => 4 }, mounts: { "test" => {"kind" => "json"}, @@ -229,7 +229,7 @@ class ContainerTest < ActiveSupport::TestCase set_user_from_auth :active env = {"C" => "3", "B" => "2", "A" => "1"} m = {"F" => {"kind" => "3"}, "E" => {"kind" => "2"}, "D" => {"kind" => "1"}} - rc = {"vcpus" => 1, "ram" => 1, "keep_cache_ram" => 1, "API" => true} + rc = {"vcpus" => 1, "ram" => 1, "keep_cache_ram" => 1, "API" => true, "cuda" => {"device_count":0, "driver_version": "", "hardware_capability": ""}} c, _ = minimal_new(environment: env, mounts: m, runtime_constraints: rc) c.reload assert_equal Container.deep_sort_hash(env).to_json, c.environment.to_json @@ -590,6 +590,33 @@ class ContainerTest < ActiveSupport::TestCase assert_equal c1.uuid, reused.uuid end + test "find_reusable method with cuda" do + set_user_from_auth :active + # No cuda + no_cuda_attrs = REUSABLE_COMMON_ATTRS.merge({use_existing:false, priority:1, environment:{"var" => "queued"}, + runtime_constraints: {"vcpus" => 1, "ram" => 1, "keep_cache_ram"=>268435456, "API" => false, + "cuda" => {"device_count":0, "driver_version": "", "hardware_capability": ""}},}) + c1, _ = minimal_new(no_cuda_attrs) + assert_equal Container::Queued, c1.state + + # has cuda + cuda_attrs = REUSABLE_COMMON_ATTRS.merge({use_existing:false, priority:1, environment:{"var" => "queued"}, + runtime_constraints: {"vcpus" => 1, "ram" => 1, "keep_cache_ram"=>268435456, "API" => false, + "cuda" => {"device_count":1, "driver_version": "11.0", "hardware_capability": "9.0"}},}) + c2, _ = minimal_new(cuda_attrs) + assert_equal Container::Queued, c2.state + + # should find the no cuda one + reused = Container.find_reusable(no_cuda_attrs) + assert_not_nil reused + assert_equal reused.uuid, c1.uuid + + # should find the cuda one + reused = Container.find_reusable(cuda_attrs) + assert_not_nil reused + assert_equal reused.uuid, c2.uuid + end + test "Container running" do set_user_from_auth :active c, _ = minimal_new priority: 1 -- 2.30.2