X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/cbf93e8d897448dbd52369afe89fef2392140ff1..HEAD:/lib/dispatchcloud/node_size.go

diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go
index 339e042c1a..802bc65c28 100644
--- a/lib/dispatchcloud/node_size.go
+++ b/lib/dispatchcloud/node_size.go
@@ -6,21 +6,17 @@ package dispatchcloud
 
 import (
 	"errors"
-	"log"
-	"os/exec"
+	"math"
 	"regexp"
 	"sort"
 	"strconv"
-	"strings"
-	"time"
 
-	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"git.arvados.org/arvados.git/sdk/go/arvados"
 )
 
-var (
-	ErrInstanceTypesNotConfigured = errors.New("site configuration does not list any instance types")
-	discountConfiguredRAMPercent  = 5
-)
+var ErrInstanceTypesNotConfigured = errors.New("site configuration does not list any instance types")
+
+var discountConfiguredRAMPercent = 5
 
 // ConstraintsNotSatisfiableError includes a list of available instance types
 // to be reported back to the user.
@@ -38,12 +34,10 @@ var pdhRegexp = regexp.MustCompile(`^[0-9a-f]{32}\+(\d+)$`)
 func estimateDockerImageSize(collectionPDH string) int64 {
 	m := pdhRegexp.FindStringSubmatch(collectionPDH)
 	if m == nil {
-		log.Printf("estimateDockerImageSize: '%v' did not match pdhRegexp, returning 0", collectionPDH)
 		return 0
 	}
 	n, err := strconv.ParseInt(m[1], 10, 64)
 	if err != nil || n < 122 {
-		log.Printf("estimateDockerImageSize: short manifest %v or error (%v), returning 0", n, err)
 		return 0
 	}
 	// To avoid having to fetch the collection, take advantage of
@@ -53,7 +47,7 @@ func estimateDockerImageSize(collectionPDH string) int64 {
 	// the size of the manifest.
 	//
 	// Use the following heuristic:
-	// - Start with the length of the mainfest (n)
+	// - Start with the length of the manifest (n)
 	// - Subtract 80 characters for the filename and file segment
 	// - Divide by 42 to get the number of block identifiers ('hash\+size\ ' is 32+1+8+1)
 	// - Assume each block is full, multiply by 64 MiB
@@ -63,7 +57,7 @@ func estimateDockerImageSize(collectionPDH string) int64 {
 // EstimateScratchSpace estimates how much available disk space (in
 // bytes) is needed to run the container by summing the capacity
 // requested by 'tmp' mounts plus disk space required to load the
-// Docker image.
+// Docker image plus arv-mount block cache.
 func EstimateScratchSpace(ctr *arvados.Container) (needScratch int64) {
 	for _, m := range ctr.Mounts {
 		if m.Kind == "tmp" {
@@ -87,15 +81,35 @@ func EstimateScratchSpace(ctr *arvados.Container) (needScratch int64) {
 	// Now reserve space for the extracted image on disk.
 	needScratch += dockerImageSize
 
+	// Now reserve space for the arv-mount disk cache
+	needScratch += ctr.RuntimeConstraints.KeepCacheDisk
+
 	return
 }
 
-// ChooseInstanceType returns the cheapest available
-// arvados.InstanceType big enough to run ctr.
-func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvados.InstanceType, err error) {
+// versionLess returns true if vs1 < vs2, otherwise false
+func versionLess(vs1 string, vs2 string) (bool, error) {
+	v1, err := strconv.ParseFloat(vs1, 64)
+	if err != nil {
+		return false, err
+	}
+	v2, err := strconv.ParseFloat(vs2, 64)
+	if err != nil {
+		return false, err
+	}
+	return v1 < v2, nil
+}
+
+// ChooseInstanceType returns the arvados.InstanceTypes eligible to
+// run ctr, i.e., those that have enough RAM, VCPUs, etc., and are not
+// too expensive according to cluster configuration.
+//
+// The returned types are sorted with lower prices first.
+//
+// The error is non-nil if and only if the returned slice is empty.
+func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) ([]arvados.InstanceType, error) {
 	if len(cc.InstanceTypes) == 0 {
-		err = ErrInstanceTypesNotConfigured
-		return
+		return nil, ErrInstanceTypesNotConfigured
 	}
 
 	needScratch := EstimateScratchSpace(ctr)
@@ -103,25 +117,42 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
 	needVCPUs := ctr.RuntimeConstraints.VCPUs
 
 	needRAM := ctr.RuntimeConstraints.RAM + ctr.RuntimeConstraints.KeepCacheRAM
+	needRAM += int64(cc.Containers.ReserveExtraRAM)
+	if cc.Containers.LocalKeepBlobBuffersPerVCPU > 0 {
+		// + 200 MiB for keepstore process + 10% for GOGC=10
+		needRAM += 220 << 20
+		// + 64 MiB for each blob buffer + 10% for GOGC=10
+		needRAM += int64(cc.Containers.LocalKeepBlobBuffersPerVCPU * needVCPUs * (1 << 26) * 11 / 10)
+	}
 	needRAM = (needRAM * 100) / int64(100-discountConfiguredRAMPercent)
 
-	ok := false
+	maxPriceFactor := math.Max(cc.Containers.MaximumPriceFactor, 1)
+	var types []arvados.InstanceType
+	var maxPrice float64
 	for _, it := range cc.InstanceTypes {
+		driverInsuff, driverErr := versionLess(it.CUDA.DriverVersion, ctr.RuntimeConstraints.CUDA.DriverVersion)
+		capabilityInsuff, capabilityErr := versionLess(it.CUDA.HardwareCapability, ctr.RuntimeConstraints.CUDA.HardwareCapability)
+
 		switch {
-		case ok && it.Price > best.Price:
-		case int64(it.Scratch) < needScratch:
-		case int64(it.RAM) < needRAM:
-		case it.VCPUs < needVCPUs:
-		case it.Preemptible != ctr.SchedulingParameters.Preemptible:
-		case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs):
-			// Equal price, but worse specs
+		// reasons to reject a node
+		case maxPrice > 0 && it.Price > maxPrice: // too expensive
+		case int64(it.Scratch) < needScratch: // insufficient scratch
+		case int64(it.RAM) < needRAM: // insufficient RAM
+		case it.VCPUs < needVCPUs: // insufficient VCPUs
+		case it.Preemptible != ctr.SchedulingParameters.Preemptible: // wrong preemptible setting
+		case it.CUDA.DeviceCount < ctr.RuntimeConstraints.CUDA.DeviceCount: // insufficient CUDA devices
+		case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && (driverInsuff || driverErr != nil): // insufficient driver version
+		case ctr.RuntimeConstraints.CUDA.DeviceCount > 0 && (capabilityInsuff || capabilityErr != nil): // insufficient hardware capability
+			// Don't select this node
 		default:
-			// Lower price || (same price && better specs)
-			best = it
-			ok = true
+			// Didn't reject the node, so select it
+			types = append(types, it)
+			if newmax := it.Price * maxPriceFactor; newmax < maxPrice || maxPrice == 0 {
+				maxPrice = newmax
+			}
 		}
 	}
-	if !ok {
+	if len(types) == 0 {
 		availableTypes := make([]arvados.InstanceType, 0, len(cc.InstanceTypes))
 		for _, t := range cc.InstanceTypes {
 			availableTypes = append(availableTypes, t)
@@ -129,69 +160,39 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
 		sort.Slice(availableTypes, func(a, b int) bool {
 			return availableTypes[a].Price < availableTypes[b].Price
 		})
-		err = ConstraintsNotSatisfiableError{
+		return nil, ConstraintsNotSatisfiableError{
 			errors.New("constraints not satisfiable by any configured instance type"),
 			availableTypes,
 		}
-		return
 	}
-	return
-}
-
-// SlurmNodeTypeFeatureKludge ensures SLURM accepts every instance
-// type name as a valid feature name, even if no instances of that
-// type have appeared yet.
-//
-// It takes advantage of some SLURM peculiarities:
-//
-// (1) A feature is valid after it has been offered by a node, even if
-// it is no longer offered by any node. So, to make a feature name
-// valid, we can add it to a dummy node ("compute0"), then remove it.
-//
-// (2) To test whether a set of feature names are valid without
-// actually submitting a job, we can call srun --test-only with the
-// desired features.
-//
-// SlurmNodeTypeFeatureKludge does a test-and-fix operation
-// immediately, and then periodically, in case slurm restarts and
-// forgets the list of valid features. It never returns (unless there
-// are no node types configured, in which case it returns
-// immediately), so it should generally be invoked with "go".
-func SlurmNodeTypeFeatureKludge(cc *arvados.Cluster) {
-	if len(cc.InstanceTypes) == 0 {
-		return
-	}
-	var features []string
-	for _, it := range cc.InstanceTypes {
-		features = append(features, "instancetype="+it.Name)
-	}
-	for {
-		slurmKludge(features)
-		time.Sleep(2 * time.Second)
-	}
-}
-
-const slurmDummyNode = "compute0"
-
-func slurmKludge(features []string) {
-	allFeatures := strings.Join(features, ",")
-
-	cmd := exec.Command("sinfo", "--nodes="+slurmDummyNode, "--format=%f", "--noheader")
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		log.Printf("running %q %q: %s (output was %q)", cmd.Path, cmd.Args, err, out)
-		return
-	}
-	if string(out) == allFeatures+"\n" {
-		// Already configured correctly, nothing to do.
-		return
-	}
-
-	log.Printf("configuring node %q with all node type features", slurmDummyNode)
-	cmd = exec.Command("scontrol", "update", "NodeName="+slurmDummyNode, "Features="+allFeatures)
-	log.Printf("running: %q %q", cmd.Path, cmd.Args)
-	out, err = cmd.CombinedOutput()
-	if err != nil {
-		log.Printf("error: scontrol: %s (output was %q)", err, out)
+	sort.Slice(types, func(i, j int) bool {
+		if types[i].Price != types[j].Price {
+			// prefer lower price
+			return types[i].Price < types[j].Price
+		}
+		if types[i].RAM != types[j].RAM {
+			// if same price, prefer more RAM
+			return types[i].RAM > types[j].RAM
+		}
+		if types[i].VCPUs != types[j].VCPUs {
+			// if same price and RAM, prefer more VCPUs
+			return types[i].VCPUs > types[j].VCPUs
+		}
+		if types[i].Scratch != types[j].Scratch {
+			// if same price and RAM and VCPUs, prefer more scratch
+			return types[i].Scratch > types[j].Scratch
+		}
+		// no preference, just sort the same way each time
+		return types[i].Name < types[j].Name
+	})
+	// Truncate types at maxPrice. We rejected it.Price>maxPrice
+	// in the loop above, but at that point maxPrice wasn't
+	// necessarily the final (lowest) maxPrice.
+	for i, it := range types {
+		if i > 0 && it.Price > maxPrice {
+			types = types[:i]
+			break
+		}
 	}
+	return types, nil
 }
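
Worked example (standalone sketch, not part of the diff above): the manifest-length heuristic in estimateDockerImageSize. For a hypothetical portable data hash whose manifest length suffix is 500, the estimate is (500-80)/42 = 10 block identifiers, assumed to be 10 full 64 MiB blocks:

package main

import "fmt"

func main() {
	n := int64(500)                  // hypothetical manifest length from the PDH suffix
	blocks := (n - 80) / 42          // drop ~80 chars of filename/segment, 42 chars per block id
	fmt.Println(blocks * (64 << 20)) // assume each block is a full 64 MiB: prints 671088640
}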
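
A second sketch of the RAM headroom arithmetic the new ChooseInstanceType applies before comparing against each type's advertised RAM. The input values are hypothetical; the constants (220 MiB keepstore overhead, 64 MiB per blob buffer plus 10% for GOGC=10, and the 5% discountConfiguredRAMPercent) come from the diff:

package main

import "fmt"

func main() {
	const (
		containerRAM  = int64(8) << 30   // hypothetical RuntimeConstraints.RAM
		keepCacheRAM  = int64(256) << 20 // hypothetical RuntimeConstraints.KeepCacheRAM
		vcpus         = int64(4)         // hypothetical RuntimeConstraints.VCPUs
		buffersPerCPU = int64(1)         // hypothetical Containers.LocalKeepBlobBuffersPerVCPU
		discount      = 5                // discountConfiguredRAMPercent
	)
	needRAM := containerRAM + keepCacheRAM
	needRAM += 220 << 20                                   // keepstore process: 200 MiB + 10% for GOGC=10
	needRAM += buffersPerCPU * vcpus * (1 << 26) * 11 / 10 // 64 MiB per blob buffer + 10%
	needRAM = needRAM * 100 / (100 - discount)             // node must advertise this much
	fmt.Printf("instance type must advertise >= %d MiB RAM\n", needRAM>>20)
}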
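
Finally, a sketch of the MaximumPriceFactor window the diff introduces: eligible types are sorted cheapest-first, and anything costing more than (cheapest eligible price) * factor is truncated from the result, while the cheapest type is always kept. The instance type names and prices here are made up, and the helper condenses the diff's select-then-truncate logic into one function:

package main

import (
	"fmt"
	"sort"
)

type instanceType struct {
	Name  string
	Price float64
}

// truncateByPriceFactor sorts eligible types by price and drops any
// type whose price exceeds the cheapest type's price times factor.
func truncateByPriceFactor(types []instanceType, factor float64) []instanceType {
	if len(types) == 0 {
		return types
	}
	sort.Slice(types, func(i, j int) bool { return types[i].Price < types[j].Price })
	maxPrice := types[0].Price * factor // window anchored at the cheapest eligible type
	for i, it := range types {
		if i > 0 && it.Price > maxPrice { // i > 0: always keep at least one type
			return types[:i]
		}
	}
	return types
}

func main() {
	eligible := []instanceType{
		{"m5.large", 1.00},
		{"m5.xlarge", 1.40},
		{"m5.2xlarge", 2.80},
	}
	// With factor 1.5 the window is 1.00*1.5 = 1.50, so m5.2xlarge is dropped.
	for _, it := range truncateByPriceFactor(eligible, 1.5) {
		fmt.Println(it.Name, it.Price)
	}
}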