From: Peter Amstutz Date: Mon, 20 Dec 2021 22:15:46 +0000 (-0500) Subject: 18321: Check runtime constraints md5 with/without empty cuda section X-Git-Tag: 2.4.0~130^2~1 X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/6fe152024269d838e31bc224adbd518c43cbfee5 18321: Check runtime constraints md5 with/without empty cuda section * config & API check format of DriverVersion and HardwareCapability * crunch-run only pays attention to CUDA.DeviceCount * update docs Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- diff --git a/doc/_includes/_container_runtime_constraints.liquid b/doc/_includes/_container_runtime_constraints.liquid index 6926b9d3dc..3b8df32d4b 100644 --- a/doc/_includes/_container_runtime_constraints.liquid +++ b/doc/_includes/_container_runtime_constraints.liquid @@ -19,6 +19,6 @@ table(table table-bordered table-condensed). h3. CUDA GPU support table(table table-bordered table-condensed). -|device_count|int|Number of GPUs to request.|Required to request a GPU node.| -|driver_version|string|Minimum CUDA driver version.|Optional.| -|hardware_capability|string|Minimum CUDA hardware capability.|Optional.| +|device_count|int|Number of GPUs to request.|Count greater than 0 enables CUDA GPU support.| +|driver_version|string|Minimum CUDA driver version, in "X.Y" format.|Required when device_count > 0| +|hardware_capability|string|Minimum CUDA hardware capability, in "X.Y" format.|Required when device_count > 0| diff --git a/lib/config/load.go b/lib/config/load.go index 956a47b1a4..888cc828af 100644 --- a/lib/config/load.go +++ b/lib/config/load.go @@ -14,6 +14,7 @@ import ( "io/ioutil" "os" "regexp" + "strconv" "strings" "git.arvados.org/arvados.git/sdk/go/arvados" @@ -299,6 +300,7 @@ func (ldr *Loader) Load() (*arvados.Config, error) { ldr.checkEmptyKeepstores(cc), ldr.checkUnlistedKeepstores(cc), ldr.checkStorageClasses(cc), + ldr.checkCUDAVersions(cc), // TODO: check non-empty Rendezvous on // services other than Keepstore } { @@ -399,6 +401,24 @@ func (ldr *Loader) checkStorageClasses(cc arvados.Cluster) error { return nil } +func (ldr *Loader) checkCUDAVersions(cc arvados.Cluster) error { + for _, it := range cc.InstanceTypes { + if it.CUDA.DeviceCount == 0 { + continue + } + + _, err := strconv.ParseFloat(it.CUDA.DriverVersion, 64) + if err != nil { + return fmt.Errorf("InstanceType %q has invalid CUDA.DriverVersion %q, expected format X.Y (%v)", it.Name, it.CUDA.DriverVersion, err) + } + _, err = strconv.ParseFloat(it.CUDA.HardwareCapability, 64) + if err != nil { + return fmt.Errorf("InstanceType %q has invalid CUDA.HardwareCapability %q, expected format X.Y (%v)", it.Name, it.CUDA.HardwareCapability, err) + } + } + return nil +} + func checkKeyConflict(label string, m map[string]string) error { saw := map[string]bool{} for k := range m { diff --git a/lib/crunchrun/crunchrun.go b/lib/crunchrun/crunchrun.go index 52d9c4b0f1..b237d9fa59 100644 --- a/lib/crunchrun/crunchrun.go +++ b/lib/crunchrun/crunchrun.go @@ -987,17 +987,6 @@ func (runner *ContainerRunner) CreateContainer(imageID string, bindmounts map[st runner.executorStdout = stdout runner.executorStderr = stderr - cudaDeviceCount := 0 - if runner.Container.RuntimeConstraints.CUDA.DriverVersion != "" || - runner.Container.RuntimeConstraints.CUDA.HardwareCapability != "" || - runner.Container.RuntimeConstraints.CUDA.DeviceCount != 0 { - // if any of these are set, enable CUDA GPU support - cudaDeviceCount = runner.Container.RuntimeConstraints.CUDA.DeviceCount - if cudaDeviceCount == 0 { - cudaDeviceCount = 1 - } - } - return runner.executor.Create(containerSpec{ Image: imageID, VCPUs: runner.Container.RuntimeConstraints.VCPUs, @@ -1007,7 +996,7 @@ func (runner *ContainerRunner) CreateContainer(imageID string, bindmounts map[st BindMounts: bindmounts, Command: runner.Container.Command, EnableNetwork: enableNetwork, - CUDADeviceCount: cudaDeviceCount, + CUDADeviceCount: runner.Container.RuntimeConstraints.CUDA.DeviceCount, NetworkMode: runner.networkMode, CgroupParent: runner.setCgroupParent, Stdin: stdin, diff --git a/services/api/app/models/arvados_model.rb b/services/api/app/models/arvados_model.rb index 00934322d2..374c6720f6 100644 --- a/services/api/app/models/arvados_model.rb +++ b/services/api/app/models/arvados_model.rb @@ -701,7 +701,7 @@ class ArvadosModel < ApplicationRecord false end - def self.where_serialized(colname, value, md5: false) + def self.where_serialized(colname, value, md5: false, multivalue: false) colsql = colname.to_s if md5 colsql = "md5(#{colsql})" @@ -714,7 +714,16 @@ class ArvadosModel < ApplicationRecord sql = "#{colsql} IN (?)" sorted = deep_sort_hash(value) end - params = [sorted.to_yaml, SafeJSON.dump(sorted)] + params = [] + if multivalue + sorted.each do |v| + params << v.to_yaml + params << SafeJSON.dump(v) + end + else + params << sorted.to_yaml + params << SafeJSON.dump(sorted) + end if md5 params = params.map { |x| Digest::MD5.hexdigest(x) } end diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb index 2443da4551..0326b12985 100644 --- a/services/api/app/models/container.rb +++ b/services/api/app/models/container.rb @@ -296,15 +296,14 @@ class Container < ArvadosModel 'hardware_capability' => '', } end - - candidates_inc_cuda = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints]), md5: true) - if candidates_inc_cuda.count == 0 and attrs[:runtime_constraints]['cuda']['device_count'] == 0 - # Fallback search on containers introduced before CUDA support, - # exclude empty CUDA request from query - candidates = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints].except('cuda')), md5: true) - else - candidates = candidates_inc_cuda + resolved_runtime_constraints = [resolve_runtime_constraints(attrs[:runtime_constraints])] + if resolved_runtime_constraints[0]['cuda']['device_count'] == 0 + # If no CUDA requested, extend search to include older container + # records that don't have a 'cuda' section in runtime_constraints + resolved_runtime_constraints << resolved_runtime_constraints[0].except('cuda') end + + candidates = candidates.where_serialized(:runtime_constraints, resolved_runtime_constraints, md5: true, multivalue: true) log_reuse_info(candidates) { "after filtering on runtime_constraints #{attrs[:runtime_constraints].inspect}" } log_reuse_info { "checking for state=Complete with readable output and log..." } diff --git a/services/api/app/models/container_request.rb b/services/api/app/models/container_request.rb index 00773fcb86..a3264e419b 100644 --- a/services/api/app/models/container_request.rb +++ b/services/api/app/models/container_request.rb @@ -346,9 +346,9 @@ class ContainerRequest < ArvadosModel end ['driver_version', 'hardware_capability'].each do |k| v = runtime_constraints['cuda'][k] - if !v.is_a?(String) + if !v.is_a?(String) || (runtime_constraints['cuda']['device_count'] > 0 && v.to_f == 0.0) errors.add(:runtime_constraints, - "[cuda.#{k}]=#{v.inspect} must be a string") + "[cuda.#{k}]=#{v.inspect} must be a string in format 'X.Y' version") end end end