18321: Check runtime constraints md5 with/without empty cuda section
author Peter Amstutz <peter.amstutz@curii.com>
Mon, 20 Dec 2021 22:15:46 +0000 (17:15 -0500)
committer Peter Amstutz <peter.amstutz@curii.com>
Mon, 20 Dec 2021 22:15:46 +0000 (17:15 -0500)
* config & API check format of DriverVersion and HardwareCapability
* crunch-run only pays attention to CUDA.DeviceCount
* update docs

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

doc/_includes/_container_runtime_constraints.liquid
lib/config/load.go
lib/crunchrun/crunchrun.go
services/api/app/models/arvados_model.rb
services/api/app/models/container.rb
services/api/app/models/container_request.rb

index 6926b9d3dcd5e7fe851dc1783069e9cda22d0222..3b8df32d4b6d4aca0357ddec6ca7c213408a251e 100644 (file)
@@ -19,6 +19,6 @@ table(table table-bordered table-condensed).
 h3. CUDA GPU support
 
 table(table table-bordered table-condensed).
-|device_count|int|Number of GPUs to request.|Required to request a GPU node.|
-|driver_version|string|Minimum CUDA driver version.|Optional.|
-|hardware_capability|string|Minimum CUDA hardware capability.|Optional.|
+|device_count|int|Number of GPUs to request.|Count greater than 0 enables CUDA GPU support.|
+|driver_version|string|Minimum CUDA driver version, in "X.Y" format.|Required when device_count > 0|
+|hardware_capability|string|Minimum CUDA hardware capability, in "X.Y" format.|Required when device_count > 0|
index 956a47b1a4ac2ef992958739d5189eaf5e519ed5..888cc828afe74909a11c22b524dd809f3ebd0259 100644 (file)
@@ -14,6 +14,7 @@ import (
        "io/ioutil"
        "os"
        "regexp"
+       "strconv"
        "strings"
 
        "git.arvados.org/arvados.git/sdk/go/arvados"
@@ -299,6 +300,7 @@ func (ldr *Loader) Load() (*arvados.Config, error) {
                        ldr.checkEmptyKeepstores(cc),
                        ldr.checkUnlistedKeepstores(cc),
                        ldr.checkStorageClasses(cc),
+                       ldr.checkCUDAVersions(cc),
                        // TODO: check non-empty Rendezvous on
                        // services other than Keepstore
                } {
@@ -399,6 +401,24 @@ func (ldr *Loader) checkStorageClasses(cc arvados.Cluster) error {
        return nil
 }
 
+func (ldr *Loader) checkCUDAVersions(cc arvados.Cluster) error {
+       for _, it := range cc.InstanceTypes {
+               if it.CUDA.DeviceCount == 0 {
+                       continue
+               }
+
+               _, err := strconv.ParseFloat(it.CUDA.DriverVersion, 64)
+               if err != nil {
+                       return fmt.Errorf("InstanceType %q has invalid CUDA.DriverVersion %q, expected format X.Y (%v)", it.Name, it.CUDA.DriverVersion, err)
+               }
+               _, err = strconv.ParseFloat(it.CUDA.HardwareCapability, 64)
+               if err != nil {
+                       return fmt.Errorf("InstanceType %q has invalid CUDA.HardwareCapability %q, expected format X.Y (%v)", it.Name, it.CUDA.HardwareCapability, err)
+               }
+       }
+       return nil
+}
+
 func checkKeyConflict(label string, m map[string]string) error {
        saw := map[string]bool{}
        for k := range m {
index 52d9c4b0f1959262f7a57094d55c0991c22315e1..b237d9fa590cef847366be35a4b728e1442e7941 100644 (file)
@@ -987,17 +987,6 @@ func (runner *ContainerRunner) CreateContainer(imageID string, bindmounts map[st
        runner.executorStdout = stdout
        runner.executorStderr = stderr
 
-       cudaDeviceCount := 0
-       if runner.Container.RuntimeConstraints.CUDA.DriverVersion != "" ||
-               runner.Container.RuntimeConstraints.CUDA.HardwareCapability != "" ||
-               runner.Container.RuntimeConstraints.CUDA.DeviceCount != 0 {
-               // if any of these are set, enable CUDA GPU support
-               cudaDeviceCount = runner.Container.RuntimeConstraints.CUDA.DeviceCount
-               if cudaDeviceCount == 0 {
-                       cudaDeviceCount = 1
-               }
-       }
-
        return runner.executor.Create(containerSpec{
                Image:           imageID,
                VCPUs:           runner.Container.RuntimeConstraints.VCPUs,
@@ -1007,7 +996,7 @@ func (runner *ContainerRunner) CreateContainer(imageID string, bindmounts map[st
                BindMounts:      bindmounts,
                Command:         runner.Container.Command,
                EnableNetwork:   enableNetwork,
-               CUDADeviceCount: cudaDeviceCount,
+               CUDADeviceCount: runner.Container.RuntimeConstraints.CUDA.DeviceCount,
                NetworkMode:     runner.networkMode,
                CgroupParent:    runner.setCgroupParent,
                Stdin:           stdin,
index 00934322d25cc5ba54ddc70a0a9ac9af6c6f70b0..374c6720f60a790970f9391a3ab14642357814f3 100644 (file)
@@ -701,7 +701,7 @@ class ArvadosModel < ApplicationRecord
     false
   end
 
-  def self.where_serialized(colname, value, md5: false)
+  def self.where_serialized(colname, value, md5: false, multivalue: false)
     colsql = colname.to_s
     if md5
       colsql = "md5(#{colsql})"
@@ -714,7 +714,16 @@ class ArvadosModel < ApplicationRecord
       sql = "#{colsql} IN (?)"
       sorted = deep_sort_hash(value)
     end
-    params = [sorted.to_yaml, SafeJSON.dump(sorted)]
+    params = []
+    if multivalue
+      sorted.each do |v|
+        params << v.to_yaml
+        params << SafeJSON.dump(v)
+      end
+    else
+      params << sorted.to_yaml
+      params << SafeJSON.dump(sorted)
+    end
     if md5
       params = params.map { |x| Digest::MD5.hexdigest(x) }
     end
index 2443da45515ba8cfdbbdaebfe21a7bca3c5dbb4e..0326b1298574f80cd94434590a2e142a694889c7 100644 (file)
@@ -296,15 +296,14 @@ class Container < ArvadosModel
         'hardware_capability' => '',
       }
     end
-
-    candidates_inc_cuda = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints]), md5: true)
-    if candidates_inc_cuda.count == 0 and attrs[:runtime_constraints]['cuda']['device_count'] == 0
-      # Fallback search on containers introduced before CUDA support,
-      # exclude empty CUDA request from query
-      candidates = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints].except('cuda')), md5: true)
-    else
-      candidates = candidates_inc_cuda
+    resolved_runtime_constraints = [resolve_runtime_constraints(attrs[:runtime_constraints])]
+    if resolved_runtime_constraints[0]['cuda']['device_count'] == 0
+      # If no CUDA requested, extend search to include older container
+      # records that don't have a 'cuda' section in runtime_constraints
+      resolved_runtime_constraints << resolved_runtime_constraints[0].except('cuda')
     end
+
+    candidates = candidates.where_serialized(:runtime_constraints, resolved_runtime_constraints, md5: true, multivalue: true)
     log_reuse_info(candidates) { "after filtering on runtime_constraints #{attrs[:runtime_constraints].inspect}" }
 
     log_reuse_info { "checking for state=Complete with readable output and log..." }
index 00773fcb86818f057170700ba3ab79e157307bb6..a3264e419b080a989e1552bc43b1ae278f37db5d 100644 (file)
@@ -346,9 +346,9 @@ class ContainerRequest < ArvadosModel
         end
         ['driver_version', 'hardware_capability'].each do |k|
           v = runtime_constraints['cuda'][k]
-          if !v.is_a?(String)
+          if !v.is_a?(String) || (runtime_constraints['cuda']['device_count'] > 0 && v.to_f == 0.0)
             errors.add(:runtime_constraints,
-                       "[cuda.#{k}]=#{v.inspect} must be a string")
+                       "[cuda.#{k}]=#{v.inspect} must be a string in format 'X.Y' version")
           end
         end
       end