18321: Account for CUDA in container reuse
authorPeter Amstutz <peter.amstutz@curii.com>
Fri, 17 Dec 2021 20:57:16 +0000 (15:57 -0500)
committerPeter Amstutz <peter.amstutz@curii.com>
Fri, 17 Dec 2021 20:57:16 +0000 (15:57 -0500)
Also ensure reuse across versions when CUDA isn't being used.

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

lib/dispatchcloud/node_size.go
lib/dispatchcloud/node_size_test.go
sdk/go/arvados/container.go
services/api/app/models/arvados_model.rb
services/api/app/models/container.rb
services/api/app/models/container_request.rb
services/api/test/fixtures/container_requests.yml
services/api/test/fixtures/containers.yml
services/api/test/unit/container_test.rb

index 6831d8d8e9a50268ebc9861fc41b73255040b65d..7c7643bfc7622fc8c876eba9c2e01d9203385074 100644 (file)
@@ -84,16 +84,16 @@ func EstimateScratchSpace(ctr *arvados.Container) (needScratch int64) {
 }
 
 // compareVersion returns true if vs1 < vs2, otherwise false
-func versionLess(vs1 string, vs2 string) bool {
+func versionLess(vs1 string, vs2 string) (bool, error) {
        v1, err := strconv.ParseFloat(vs1, 64)
        if err != nil {
-               return false
+               return false, err
        }
        v2, err := strconv.ParseFloat(vs2, 64)
        if err != nil {
-               return false
+               return false, err
        }
-       return v1 < v2
+       return v1 < v2, nil
 }
 
 // ChooseInstanceType returns the cheapest available
@@ -115,6 +115,9 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
 
        ok := false
        for _, it := range cc.InstanceTypes {
+               driverInsuff, driverErr := versionLess(it.CUDA.DriverVersion, ctr.RuntimeConstraints.CUDA.DriverVersion)
+               capabilityInsuff, capabilityErr := versionLess(it.CUDA.HardwareCapability, ctr.RuntimeConstraints.CUDA.HardwareCapability)
+
                switch {
                // reasons to reject a node
                case ok && it.Price > best.Price: // already selected a node, and this one is more expensive
@@ -124,8 +127,8 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
                case it.Preemptible != ctr.SchedulingParameters.Preemptible: // wrong preemptable setting
                case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs): // same price, worse specs
                case it.CUDA.DeviceCount < ctr.RuntimeConstraints.CUDA.DeviceCount: // insufficient CUDA devices
-               case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && versionLess(it.CUDA.DriverVersion, ctr.RuntimeConstraints.CUDA.DriverVersion): // insufficient driver version
-               case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && versionLess(it.CUDA.HardwareCapability, ctr.RuntimeConstraints.CUDA.HardwareCapability): // insufficient hardware capability
+               case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && (driverInsuff || driverErr != nil): // insufficient driver version
+               case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && (capabilityInsuff || capabilityErr != nil): // insufficient hardware capability
                        // Don't select this node
                default:
                        // Didn't reject the node, so select it
index 06255462b318621e55f7dc883e74956b5fdfaf73..eb3648e8ac13265995bf98b040c47106ea380ea3 100644 (file)
@@ -155,6 +155,7 @@ func (*NodeSizeSuite) TestChooseGPU(c *check.C) {
                "best":           {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "11.0"}},
                "low_driver":     {Price: 2.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "low_driver", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "10.0"}},
                "cheap_gpu":      {Price: 2.0, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "cheap_gpu", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "8.0", DriverVersion: "10.0"}},
+               "invalid_gpu":    {Price: 1.9, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "invalid_gpu", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "12.0.12", DriverVersion: "12.0.12"}},
                "non_gpu":        {Price: 1.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "non_gpu"},
        }
 
@@ -201,23 +202,7 @@ func (*NodeSizeSuite) TestChooseGPU(c *check.C) {
                                HardwareCapability: "",
                                DriverVersion:      "10.0",
                        },
-                       SelectedInstance: "cheap_gpu",
-               },
-               GPUTestCase{
-                       CUDA: arvados.CUDARuntimeConstraints{
-                               DeviceCount:        1,
-                               HardwareCapability: "8.0",
-                               DriverVersion:      "",
-                       },
-                       SelectedInstance: "cheap_gpu",
-               },
-               GPUTestCase{
-                       CUDA: arvados.CUDARuntimeConstraints{
-                               DeviceCount:        1,
-                               HardwareCapability: "",
-                               DriverVersion:      "",
-                       },
-                       SelectedInstance: "cheap_gpu",
+                       SelectedInstance: "",
                },
                GPUTestCase{
                        CUDA: arvados.CUDARuntimeConstraints{
@@ -241,7 +226,11 @@ func (*NodeSizeSuite) TestChooseGPU(c *check.C) {
                                CUDA:         tc.CUDA,
                        },
                })
-               c.Check(err, check.IsNil)
-               c.Check(best.Name, check.Equals, tc.SelectedInstance)
+               if best.Name != "" {
+                       c.Check(err, check.IsNil)
+                       c.Check(best.Name, check.Equals, tc.SelectedInstance)
+               } else {
+                       c.Check(err, check.Not(check.IsNil))
+               }
        }
 }
index d0b12733340080418039facd27cd0133e650a234..3510a6db048565b019cbdbfaca24d6f5540bf458 100644 (file)
@@ -94,9 +94,9 @@ type Mount struct {
 }
 
 type CUDARuntimeConstraints struct {
-       DriverVersion      string `json:"driver_version,omitempty"`
-       HardwareCapability string `json:"hardware_capability,omitempty"`
-       DeviceCount        int    `json:"device_count,omitempty"`
+       DriverVersion      string `json:"driver_version"`
+       HardwareCapability string `json:"hardware_capability"`
+       DeviceCount        int    `json:"device_count"`
 }
 
 // RuntimeConstraints specify a container's compute resources (RAM,
@@ -106,7 +106,7 @@ type RuntimeConstraints struct {
        RAM          int64                  `json:"ram"`
        VCPUs        int                    `json:"vcpus"`
        KeepCacheRAM int64                  `json:"keep_cache_ram"`
-       CUDA         CUDARuntimeConstraints `json:"cuda,omitempty"`
+       CUDA         CUDARuntimeConstraints `json:"cuda"`
 }
 
 // SchedulingParameters specify a container's scheduling parameters
index af226cde821399a0d115fdddfc698ec57b39ace9..00934322d25cc5ba54ddc70a0a9ac9af6c6f70b0 100644 (file)
@@ -884,6 +884,11 @@ class ArvadosModel < ApplicationRecord
   def fill_container_defaults
     self.runtime_constraints = {
       'API' => false,
+      'cuda' => {
+        'device_count' => 0,
+        'driver_version' => '',
+        'hardware_capability' => '',
+      },
       'keep_cache_ram' => 0,
       'ram' => 0,
       'vcpus' => 0,
index d6a44c80239d46c05d9d7ead49e14f538e19846d..2443da45515ba8cfdbbdaebfe21a7bca3c5dbb4e 100644 (file)
@@ -289,7 +289,22 @@ class Container < ArvadosModel
     candidates = candidates.where('secret_mounts_md5 = ?', secret_mounts_md5)
     log_reuse_info(candidates) { "after filtering on secret_mounts_md5 #{secret_mounts_md5.inspect}" }
 
-    candidates = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints]), md5: true)
+    if attrs[:runtime_constraints]['cuda'].nil?
+      attrs[:runtime_constraints]['cuda'] = {
+        'device_count' => 0,
+        'driver_version' => '',
+        'hardware_capability' => '',
+      }
+    end
+
+    candidates_inc_cuda = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints]), md5: true)
+    if candidates_inc_cuda.count == 0 and attrs[:runtime_constraints]['cuda']['device_count'] == 0
+      # Fallback search on containers introduced before CUDA support,
+      # exclude empty CUDA request from query
+      candidates = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints].except('cuda')), md5: true)
+    else
+      candidates = candidates_inc_cuda
+    end
     log_reuse_info(candidates) { "after filtering on runtime_constraints #{attrs[:runtime_constraints].inspect}" }
 
     log_reuse_info { "checking for state=Complete with readable output and log..." }
index 4a580816cd737674966781d63dfee9a1b4a1dbee..00773fcb86818f057170700ba3ab79e157307bb6 100644 (file)
@@ -336,6 +336,22 @@ class ContainerRequest < ArvadosModel
                      "[#{k}]=#{v.inspect} must be a positive integer")
         end
       end
+      if runtime_constraints['cuda']
+        ['device_count'].each do |k|
+          v = runtime_constraints['cuda'][k]
+          if !v.is_a?(Integer) || v < 0
+            errors.add(:runtime_constraints,
+                       "[cuda.#{k}]=#{v.inspect} must be a positive or zero integer")
+          end
+        end
+        ['driver_version', 'hardware_capability'].each do |k|
+          v = runtime_constraints['cuda'][k]
+          if !v.is_a?(String)
+            errors.add(:runtime_constraints,
+                       "[cuda.#{k}]=#{v.inspect} must be a string")
+          end
+        end
+      end
     end
   end
 
index 5be132ac30933187b998dd94b9ff975c27ee9988..dca89f56d3bb511612c93b6ce38ce750c630a2f2 100644 (file)
@@ -20,6 +20,10 @@ queued:
   runtime_constraints:
     vcpus: 1
     ram: 123
+    cuda:
+      driver_version: ""
+      hardware_capability: ""
+      device_count: 0
   mounts: {}
 
 running:
index e7cd0abd1fefb4751311cd9f91ba6f3ce572ae99..53539ce1e054d6f537675e86a6974373a98d0639 100644 (file)
@@ -16,6 +16,10 @@ queued:
   runtime_constraints:
     ram: 12000000000
     vcpus: 4
+    cuda:
+      driver_version: ""
+      hardware_capability: ""
+      device_count: 0
   mounts:
     /tmp:
       kind: tmp
@@ -449,3 +453,29 @@ runtime_token:
     /var/spool/cwl:
       kind: tmp
       capacity: 24000000000
+
+cuda_container:
+  uuid: zzzzz-dz642-cudagpcontainer
+  owner_uuid: zzzzz-tpzed-000000000000000
+  state: Complete
+  exit_code: 0
+  priority: 1
+  created_at: 2016-01-11 11:11:11.111111111 Z
+  updated_at: 2016-01-11 11:11:11.111111111 Z
+  started_at: 2016-01-11 11:11:11.111111111 Z
+  finished_at: 2016-01-12 11:12:13.111111111 Z
+  container_image: test
+  cwd: test
+  log: ea10d51bcf88862dbcc36eb292017dfd+45
+  output: 1f4b0bc7583c2a7f9102c395f4ffc5e3+45
+  output_path: test
+  command: ["echo", "hello", "/bin/sh", "-c", "'cat' '/keep/fa7aeb5140e2848d39b416daeef4ffc5+45/foobar' '/keep/fa7aeb5140e2848d39b416daeef4ffc5+45/baz' '|' 'gzip' '>' '/dev/null'"]
+  runtime_constraints:
+    ram: 12000000000
+    vcpus: 4
+    cuda:
+      driver_version: "11.0"
+      hardware_capability: "9.0"
+      device_count: 1
+  secret_mounts: {}
+  secret_mounts_md5: 99914b932bd37a50b983c5e7c90ae93b
index 375ab5a7bbb9d22a10144a6ae02e1b72a8f3da8c..ac3e6bea4280ae660042cfb4745c592dbe3c684e 100644 (file)
@@ -14,7 +14,7 @@ class ContainerTest < ActiveSupport::TestCase
     container_image: 'fa3c1a9cb6783f85f2ecda037e07b8c3+167',
     output_path: '/tmp',
     priority: 1,
-    runtime_constraints: {"vcpus" => 1, "ram" => 1},
+    runtime_constraints: {"vcpus" => 1, "ram" => 1, "cuda" => {"device_count":0, "driver_version": "", "hardware_capability": ""}},
   }
 
   REUSABLE_COMMON_ATTRS = {
@@ -26,7 +26,7 @@ class ContainerTest < ActiveSupport::TestCase
       "API" => false,
       "keep_cache_ram" => 0,
       "ram" => 12000000000,
-      "vcpus" => 4,
+      "vcpus" => 4
     },
     mounts: {
       "test" => {"kind" => "json"},
@@ -229,7 +229,7 @@ class ContainerTest < ActiveSupport::TestCase
     set_user_from_auth :active
     env = {"C" => "3", "B" => "2", "A" => "1"}
     m = {"F" => {"kind" => "3"}, "E" => {"kind" => "2"}, "D" => {"kind" => "1"}}
-    rc = {"vcpus" => 1, "ram" => 1, "keep_cache_ram" => 1, "API" => true}
+    rc = {"vcpus" => 1, "ram" => 1, "keep_cache_ram" => 1, "API" => true, "cuda" => {"device_count":0, "driver_version": "", "hardware_capability": ""}}
     c, _ = minimal_new(environment: env, mounts: m, runtime_constraints: rc)
     c.reload
     assert_equal Container.deep_sort_hash(env).to_json, c.environment.to_json
@@ -590,6 +590,33 @@ class ContainerTest < ActiveSupport::TestCase
     assert_equal c1.uuid, reused.uuid
   end
 
+  test "find_reusable method with cuda" do
+    set_user_from_auth :active
+    # No cuda
+    no_cuda_attrs = REUSABLE_COMMON_ATTRS.merge({use_existing:false, priority:1, environment:{"var" => "queued"},
+                                                runtime_constraints: {"vcpus" => 1, "ram" => 1, "keep_cache_ram"=>268435456, "API" => false,
+                                                                      "cuda" => {"device_count":0, "driver_version": "", "hardware_capability": ""}},})
+    c1, _ = minimal_new(no_cuda_attrs)
+    assert_equal Container::Queued, c1.state
+
+    # has cuda
+    cuda_attrs = REUSABLE_COMMON_ATTRS.merge({use_existing:false, priority:1, environment:{"var" => "queued"},
+                                                runtime_constraints: {"vcpus" => 1, "ram" => 1, "keep_cache_ram"=>268435456, "API" => false,
+                                                                      "cuda" => {"device_count":1, "driver_version": "11.0", "hardware_capability": "9.0"}},})
+    c2, _ = minimal_new(cuda_attrs)
+    assert_equal Container::Queued, c2.state
+
+    # should find the no cuda one
+    reused = Container.find_reusable(no_cuda_attrs)
+    assert_not_nil reused
+    assert_equal reused.uuid, c1.uuid
+
+    # should find the cuda one
+    reused = Container.find_reusable(cuda_attrs)
+    assert_not_nil reused
+    assert_equal reused.uuid, c2.uuid
+  end
+
   test "Container running" do
     set_user_from_auth :active
     c, _ = minimal_new priority: 1