Also ensure reuse across versions when CUDA isn't being used.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
}
// versionLess returns true if vs1 < vs2 when both are parsed as floats (strconv.ParseFloat), otherwise false
-func versionLess(vs1 string, vs2 string) bool {
+func versionLess(vs1 string, vs2 string) (bool, error) {
v1, err := strconv.ParseFloat(vs1, 64)
if err != nil {
- return false
+ return false, err
}
v2, err := strconv.ParseFloat(vs2, 64)
if err != nil {
- return false
+ return false, err
}
- return v1 < v2
+ return v1 < v2, nil
}
// ChooseInstanceType returns the cheapest available
ok := false
for _, it := range cc.InstanceTypes {
+ driverInsuff, driverErr := versionLess(it.CUDA.DriverVersion, ctr.RuntimeConstraints.CUDA.DriverVersion)
+ capabilityInsuff, capabilityErr := versionLess(it.CUDA.HardwareCapability, ctr.RuntimeConstraints.CUDA.HardwareCapability)
+
switch {
// reasons to reject a node
case ok && it.Price > best.Price: // already selected a node, and this one is more expensive
case it.Preemptible != ctr.SchedulingParameters.Preemptible: // wrong preemptable setting
case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs): // same price, worse specs
case it.CUDA.DeviceCount < ctr.RuntimeConstraints.CUDA.DeviceCount: // insufficient CUDA devices
- case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && versionLess(it.CUDA.DriverVersion, ctr.RuntimeConstraints.CUDA.DriverVersion): // insufficient driver version
- case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && versionLess(it.CUDA.HardwareCapability, ctr.RuntimeConstraints.CUDA.HardwareCapability): // insufficient hardware capability
+ case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && (driverInsuff || driverErr != nil): // insufficient driver version
+ case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && (capabilityInsuff || capabilityErr != nil): // insufficient hardware capability
// Don't select this node
default:
// Didn't reject the node, so select it
"best": {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "11.0"}},
"low_driver": {Price: 2.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "low_driver", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "10.0"}},
"cheap_gpu": {Price: 2.0, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "cheap_gpu", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "8.0", DriverVersion: "10.0"}},
+ "invalid_gpu": {Price: 1.9, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "invalid_gpu", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "12.0.12", DriverVersion: "12.0.12"}},
"non_gpu": {Price: 1.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "non_gpu"},
}
HardwareCapability: "",
DriverVersion: "10.0",
},
- SelectedInstance: "cheap_gpu",
- },
- GPUTestCase{
- CUDA: arvados.CUDARuntimeConstraints{
- DeviceCount: 1,
- HardwareCapability: "8.0",
- DriverVersion: "",
- },
- SelectedInstance: "cheap_gpu",
- },
- GPUTestCase{
- CUDA: arvados.CUDARuntimeConstraints{
- DeviceCount: 1,
- HardwareCapability: "",
- DriverVersion: "",
- },
- SelectedInstance: "cheap_gpu",
+ SelectedInstance: "",
},
GPUTestCase{
CUDA: arvados.CUDARuntimeConstraints{
CUDA: tc.CUDA,
},
})
- c.Check(err, check.IsNil)
- c.Check(best.Name, check.Equals, tc.SelectedInstance)
+ if best.Name != "" {
+ c.Check(err, check.IsNil)
+ c.Check(best.Name, check.Equals, tc.SelectedInstance)
+ } else {
+ c.Check(err, check.Not(check.IsNil))
+ }
}
}
}
// CUDARuntimeConstraints specify a container's CUDA requirements
// (driver version, hardware capability, and device count).
type CUDARuntimeConstraints struct {
- DriverVersion string `json:"driver_version,omitempty"`
- HardwareCapability string `json:"hardware_capability,omitempty"`
- DeviceCount int `json:"device_count,omitempty"`
+ DriverVersion string `json:"driver_version"`
+ HardwareCapability string `json:"hardware_capability"`
+ DeviceCount int `json:"device_count"`
}
// RuntimeConstraints specify a container's compute resources (RAM,
RAM int64 `json:"ram"`
VCPUs int `json:"vcpus"`
KeepCacheRAM int64 `json:"keep_cache_ram"`
- CUDA CUDARuntimeConstraints `json:"cuda,omitempty"`
+ CUDA CUDARuntimeConstraints `json:"cuda"`
}
// SchedulingParameters specify a container's scheduling parameters
def fill_container_defaults
self.runtime_constraints = {
'API' => false,
+ 'cuda' => {
+ 'device_count' => 0,
+ 'driver_version' => '',
+ 'hardware_capability' => '',
+ },
'keep_cache_ram' => 0,
'ram' => 0,
'vcpus' => 0,
candidates = candidates.where('secret_mounts_md5 = ?', secret_mounts_md5)
log_reuse_info(candidates) { "after filtering on secret_mounts_md5 #{secret_mounts_md5.inspect}" }
- candidates = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints]), md5: true)
+ if attrs[:runtime_constraints]['cuda'].nil?
+ attrs[:runtime_constraints]['cuda'] = {
+ 'device_count' => 0,
+ 'driver_version' => '',
+ 'hardware_capability' => '',
+ }
+ end
+
+ candidates_inc_cuda = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints]), md5: true)
+ if candidates_inc_cuda.count == 0 and attrs[:runtime_constraints]['cuda']['device_count'] == 0
+ # Fallback search on containers introduced before CUDA support,
+ # exclude empty CUDA request from query
+ candidates = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints].except('cuda')), md5: true)
+ else
+ candidates = candidates_inc_cuda
+ end
log_reuse_info(candidates) { "after filtering on runtime_constraints #{attrs[:runtime_constraints].inspect}" }
log_reuse_info { "checking for state=Complete with readable output and log..." }
"[#{k}]=#{v.inspect} must be a positive integer")
end
end
+ if runtime_constraints['cuda']
+ ['device_count'].each do |k|
+ v = runtime_constraints['cuda'][k]
+ if !v.is_a?(Integer) || v < 0
+ errors.add(:runtime_constraints,
+ "[cuda.#{k}]=#{v.inspect} must be a positive or zero integer")
+ end
+ end
+ ['driver_version', 'hardware_capability'].each do |k|
+ v = runtime_constraints['cuda'][k]
+ if !v.is_a?(String)
+ errors.add(:runtime_constraints,
+ "[cuda.#{k}]=#{v.inspect} must be a string")
+ end
+ end
+ end
end
end
runtime_constraints:
vcpus: 1
ram: 123
+ cuda:
+ driver_version: ""
+ hardware_capability: ""
+ device_count: 0
mounts: {}
running:
runtime_constraints:
ram: 12000000000
vcpus: 4
+ cuda:
+ driver_version: ""
+ hardware_capability: ""
+ device_count: 0
mounts:
/tmp:
kind: tmp
/var/spool/cwl:
kind: tmp
capacity: 24000000000
+
+cuda_container:
+ uuid: zzzzz-dz642-cudagpcontainer
+ owner_uuid: zzzzz-tpzed-000000000000000
+ state: Complete
+ exit_code: 0
+ priority: 1
+ created_at: 2016-01-11 11:11:11.111111111 Z
+ updated_at: 2016-01-11 11:11:11.111111111 Z
+ started_at: 2016-01-11 11:11:11.111111111 Z
+ finished_at: 2016-01-12 11:12:13.111111111 Z
+ container_image: test
+ cwd: test
+ log: ea10d51bcf88862dbcc36eb292017dfd+45
+ output: 1f4b0bc7583c2a7f9102c395f4ffc5e3+45
+ output_path: test
+ command: ["echo", "hello", "/bin/sh", "-c", "'cat' '/keep/fa7aeb5140e2848d39b416daeef4ffc5+45/foobar' '/keep/fa7aeb5140e2848d39b416daeef4ffc5+45/baz' '|' 'gzip' '>' '/dev/null'"]
+ runtime_constraints:
+ ram: 12000000000
+ vcpus: 4
+ cuda:
+ driver_version: "11.0"
+ hardware_capability: "9.0"
+ device_count: 1
+ secret_mounts: {}
+ secret_mounts_md5: 99914b932bd37a50b983c5e7c90ae93b
container_image: 'fa3c1a9cb6783f85f2ecda037e07b8c3+167',
output_path: '/tmp',
priority: 1,
- runtime_constraints: {"vcpus" => 1, "ram" => 1},
+ runtime_constraints: {"vcpus" => 1, "ram" => 1, "cuda" => {"device_count":0, "driver_version": "", "hardware_capability": ""}},
}
REUSABLE_COMMON_ATTRS = {
"API" => false,
"keep_cache_ram" => 0,
"ram" => 12000000000,
- "vcpus" => 4,
+ "vcpus" => 4
},
mounts: {
"test" => {"kind" => "json"},
set_user_from_auth :active
env = {"C" => "3", "B" => "2", "A" => "1"}
m = {"F" => {"kind" => "3"}, "E" => {"kind" => "2"}, "D" => {"kind" => "1"}}
- rc = {"vcpus" => 1, "ram" => 1, "keep_cache_ram" => 1, "API" => true}
+ rc = {"vcpus" => 1, "ram" => 1, "keep_cache_ram" => 1, "API" => true, "cuda" => {"device_count":0, "driver_version": "", "hardware_capability": ""}}
c, _ = minimal_new(environment: env, mounts: m, runtime_constraints: rc)
c.reload
assert_equal Container.deep_sort_hash(env).to_json, c.environment.to_json
assert_equal c1.uuid, reused.uuid
end
+ test "find_reusable method with cuda" do
+ set_user_from_auth :active
+ # No cuda
+ no_cuda_attrs = REUSABLE_COMMON_ATTRS.merge({use_existing:false, priority:1, environment:{"var" => "queued"},
+ runtime_constraints: {"vcpus" => 1, "ram" => 1, "keep_cache_ram"=>268435456, "API" => false,
+ "cuda" => {"device_count":0, "driver_version": "", "hardware_capability": ""}},})
+ c1, _ = minimal_new(no_cuda_attrs)
+ assert_equal Container::Queued, c1.state
+
+ # has cuda
+ cuda_attrs = REUSABLE_COMMON_ATTRS.merge({use_existing:false, priority:1, environment:{"var" => "queued"},
+ runtime_constraints: {"vcpus" => 1, "ram" => 1, "keep_cache_ram"=>268435456, "API" => false,
+ "cuda" => {"device_count":1, "driver_version": "11.0", "hardware_capability": "9.0"}},})
+ c2, _ = minimal_new(cuda_attrs)
+ assert_equal Container::Queued, c2.state
+
+ # should find the no cuda one
+ reused = Container.find_reusable(no_cuda_attrs)
+ assert_not_nil reused
+ assert_equal reused.uuid, c1.uuid
+
+ # should find the cuda one
+ reused = Container.find_reusable(cuda_attrs)
+ assert_not_nil reused
+ assert_equal reused.uuid, c2.uuid
+ end
+
test "Container running" do
set_user_from_auth :active
c, _ = minimal_new priority: 1