// compareVersion reports whether version vs1 is greater than or equal
// to version vs2.
//
// Both arguments are dotted sequences of non-negative decimal integers
// such as "11.0" or "11.4.1". Components are compared numerically from
// left to right, and a missing trailing component counts as zero, so
// "11" equals "11.0" and "11.10" is newer than "11.2". (Parsing the
// whole string as a float would read "11.10" as 11.1 and order it
// before "11.2".)
//
// If either argument is empty or is not a well-formed dotted version,
// compareVersion returns false.
func compareVersion(vs1 string, vs2 string) bool {
	v1, ok := parseVersionFields(vs1)
	if !ok {
		return false
	}
	v2, ok := parseVersionFields(vs2)
	if !ok {
		return false
	}
	for i := 0; i < len(v1) || i < len(v2); i++ {
		// Components beyond the end of the shorter version are zero.
		var a, b uint64
		if i < len(v1) {
			a = v1[i]
		}
		if i < len(v2) {
			b = v2[i]
		}
		if a != b {
			return a > b
		}
	}
	return true
}

// parseVersionFields splits vs on "." and parses every component as an
// unsigned base-10 integer. The boolean result is false if vs is empty
// or any component is empty, signed, or not a decimal number.
func parseVersionFields(vs string) ([]uint64, bool) {
	var fields []uint64
	start := 0
	for i := 0; i <= len(vs); i++ {
		if i < len(vs) && vs[i] != '.' {
			continue
		}
		// vs[start:i] is one complete component (possibly empty,
		// which ParseUint rejects).
		n, err := strconv.ParseUint(vs[start:i], 10, 64)
		if err != nil {
			return nil, false
		}
		fields = append(fields, n)
		start = i + 1
	}
	return fields, true
}
the cheapest available // arvados.InstanceType big enough to run ctr. func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvados.InstanceType, err error) { @@ -103,14 +116,19 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad ok := false for _, it := range cc.InstanceTypes { switch { - case ok && it.Price > best.Price: - case int64(it.Scratch) < needScratch: - case int64(it.RAM) < needRAM: - case it.VCPUs < needVCPUs: - case it.Preemptible != ctr.SchedulingParameters.Preemptible: - case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs): - // Equal price, but worse specs + // reasons to reject a node + case ok && it.Price > best.Price: // already selected a node, and this one is more expensive + case int64(it.Scratch) < needScratch: // insufficient scratch + case int64(it.RAM) < needRAM: // insufficient RAM + case it.VCPUs < needVCPUs: // insufficient VCPUs + case it.Preemptible != ctr.SchedulingParameters.Preemptible: // wrong preemptable setting + case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs): // same price, worse specs + case it.CUDA.DeviceCount < ctr.RuntimeConstraints.CUDADeviceCount: // insufficient CUDA devices + case it.CUDA.DeviceCount > 0 && !compareVersion(it.CUDA.DriverVersion, ctr.RuntimeConstraints.CUDADriverVersion): // insufficient driver version + case it.CUDA.DeviceCount > 0 && !compareVersion(it.CUDA.HardwareCapability, ctr.RuntimeConstraints.CUDAHardwareCapability): // insufficient hardware capability + // Don't select this node default: + // Didn't reject the node, so select it // Lower price || (same price && better specs) best = it ok = true diff --git a/lib/dispatchcloud/node_size_test.go b/lib/dispatchcloud/node_size_test.go index abd292cbaf..cdcf4033fc 100644 --- a/lib/dispatchcloud/node_size_test.go +++ b/lib/dispatchcloud/node_size_test.go @@ -147,3 +147,76 @@ func (*NodeSizeSuite) TestScratchForDockerImage(c *check.C) { // Short 
manifest will return 0 c.Check(n, check.Equals, int64(0)) } + +func (*NodeSizeSuite) TestChooseGPU(c *check.C) { + menu := map[string]arvados.InstanceType{ + "costly": {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Name: "costly", CUDA: arvados.CUDAFeatures{DeviceCount: 2, HardwareCapability: "9.0", DriverVersion: "11.0"}}, + "low_capability": {Price: 2.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "low_capability", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "8.0", DriverVersion: "11.0"}}, + "best": {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "11.0"}}, + "low_driver": {Price: 2.1, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "low_driver", CUDA: arvados.CUDAFeatures{DeviceCount: 1, HardwareCapability: "9.0", DriverVersion: "10.0"}}, + "small": {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: 2 * GiB, Name: "small"}, + } + best, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{ + Mounts: map[string]arvados.Mount{ + "/tmp": {Kind: "tmp", Capacity: 2 * int64(GiB)}, + }, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 2, + RAM: 987654321, + KeepCacheRAM: 123456789, + CUDADeviceCount: 1, + CUDAHardwareCapability: "9.0", + CUDADriverVersion: "11.0", + }, + }) + c.Check(err, check.IsNil) + c.Check(best.Name, check.Equals, "best") + c.Check(best.RAM >= 1234567890, check.Equals, true) + c.Check(best.VCPUs >= 2, check.Equals, true) + c.Check(best.CUDA.DeviceCount >= 1, check.Equals, true) + c.Check(best.CUDA.DriverVersion, check.Equals, "11.0") + c.Check(best.CUDA.HardwareCapability, check.Equals, "9.0") + + best, err = ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{ + Mounts: map[string]arvados.Mount{ + "/tmp": {Kind: "tmp", Capacity: 2 * int64(GiB)}, + }, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 2, + RAM: 987654321, + 
KeepCacheRAM: 123456789, + CUDADeviceCount: 2, + CUDAHardwareCapability: "9.0", + CUDADriverVersion: "11.0", + }, + }) + c.Check(err, check.IsNil) + c.Check(best.Name, check.Equals, "costly") + c.Check(best.RAM >= 1234567890, check.Equals, true) + c.Check(best.VCPUs >= 2, check.Equals, true) + c.Check(best.CUDA.DeviceCount >= 2, check.Equals, true) + c.Check(best.CUDA.DriverVersion, check.Equals, "11.0") + c.Check(best.CUDA.HardwareCapability, check.Equals, "9.0") + + best, err = ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{ + Mounts: map[string]arvados.Mount{ + "/tmp": {Kind: "tmp", Capacity: 2 * int64(GiB)}, + }, + RuntimeConstraints: arvados.RuntimeConstraints{ + VCPUs: 2, + RAM: 987654321, + KeepCacheRAM: 123456789, + CUDADeviceCount: 1, + CUDAHardwareCapability: "8.0", + CUDADriverVersion: "11.0", + }, + }) + c.Check(err, check.IsNil) + c.Check(best.Name, check.Equals, "low_capability") + c.Check(best.RAM >= 1234567890, check.Equals, true) + c.Check(best.VCPUs >= 2, check.Equals, true) + c.Check(best.CUDA.DeviceCount >= 1, check.Equals, true) + c.Check(best.CUDA.DriverVersion, check.Equals, "11.0") + c.Check(best.CUDA.HardwareCapability, check.Equals, "8.0") + +} diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go index 474ce33b0e..3111d61403 100644 --- a/sdk/go/arvados/config.go +++ b/sdk/go/arvados/config.go @@ -410,6 +410,12 @@ type RemoteCluster struct { ActivateUsers bool } +type CUDAFeatures struct { + DriverVersion string + HardwareCapability string + DeviceCount int +} + type InstanceType struct { Name string ProviderType string @@ -420,6 +426,7 @@ type InstanceType struct { AddedScratch ByteSize Price float64 Preemptible bool + CUDA CUDAFeatures } type ContainersConfig struct { -- 2.30.2