X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/88a29cd091468feb98e5cd541c560f4d35bca716..1980d00a9afeaea0bc6f266892e43de14ccd297e:/lib/dispatchcloud/node_size.go diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go index 34f83a6efd..1c36d6cf5b 100644 --- a/lib/dispatchcloud/node_size.go +++ b/lib/dispatchcloud/node_size.go @@ -5,10 +5,10 @@ package dispatchcloud import ( - "bytes" "errors" "log" "os/exec" + "sort" "strings" "time" @@ -16,11 +16,17 @@ import ( ) var ( - ErrConstraintsNotSatisfiable = errors.New("constraints not satisfiable by any configured instance type") ErrInstanceTypesNotConfigured = errors.New("site configuration does not list any instance types") discountConfiguredRAMPercent = 5 ) +// ConstraintsNotSatisfiableError includes a list of available instance types +// to be reported back to the user. +type ConstraintsNotSatisfiableError struct { + error + AvailableTypes []arvados.InstanceType +} + // ChooseInstanceType returns the cheapest available // arvados.InstanceType big enough to run ctr. func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvados.InstanceType, err error) { @@ -41,20 +47,35 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad needRAM := ctr.RuntimeConstraints.RAM + ctr.RuntimeConstraints.KeepCacheRAM needRAM = (needRAM * 100) / int64(100-discountConfiguredRAMPercent) - err = ErrConstraintsNotSatisfiable + ok := false for _, it := range cc.InstanceTypes { switch { - case err == nil && it.Price > best.Price: - case it.Scratch < needScratch: - case it.RAM < needRAM: + case ok && it.Price > best.Price: + case int64(it.Scratch) < needScratch: + case int64(it.RAM) < needRAM: case it.VCPUs < needVCPUs: + case it.Preemptible != ctr.SchedulingParameters.Preemptible: case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs): // Equal price, but worse specs default: // Lower price || (same price && better specs) best = it - err = nil + ok = true + } + } + if !ok { + availableTypes := make([]arvados.InstanceType, 0, len(cc.InstanceTypes)) + for _, t := range cc.InstanceTypes { + availableTypes = append(availableTypes, t) } + sort.Slice(availableTypes, func(a, b int) bool { + return availableTypes[a].Price < availableTypes[b].Price + }) + err = ConstraintsNotSatisfiableError{ + errors.New("constraints not satisfiable by any configured instance type"), + availableTypes, + } + return } return } @@ -69,11 +90,9 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad // it is no longer offered by any node. So, to make a feature name // valid, we can add it to a dummy node ("compute0"), then remove it. // -// (2) when srun is given an invalid --gres argument and an invalid -// --constraint argument, the error message mentions "Invalid feature -// specification". So, to test whether a feature name is valid without -// actually submitting a job, we can call srun with the feature name -// and an invalid --gres argument. +// (2) To test whether a set of feature names are valid without +// actually submitting a job, we can call srun --test-only with the +// desired features. // // SlurmNodeTypeFeatureKludge does a test-and-fix operation // immediately, and then periodically, in case slurm restarts and @@ -90,39 +109,31 @@ func SlurmNodeTypeFeatureKludge(cc *arvados.Cluster) { } for { slurmKludge(features) - time.Sleep(time.Minute) + time.Sleep(2 * time.Second) } } -var ( - slurmDummyNode = "compute0" - slurmErrBadFeature = "Invalid feature" - slurmErrBadGres = "Invalid generic resource" -) +const slurmDummyNode = "compute0" func slurmKludge(features []string) { - cmd := exec.Command("srun", "--gres=invalid-gres-specification", "--constraint="+strings.Join(features, "&"), "true") - out, err := cmd.CombinedOutput() - switch { - case err == nil: - log.Printf("warning: guaranteed-to-fail srun command did not fail: %q %q", cmd.Path, cmd.Args) - log.Printf("output was: %q", out) - - case bytes.Contains(out, []byte(slurmErrBadFeature)): - log.Printf("temporarily configuring node %q with all node type features", slurmDummyNode) - for _, nodeFeatures := range []string{strings.Join(features, ","), ""} { - cmd = exec.Command("scontrol", "update", "NodeName="+slurmDummyNode, "Features="+nodeFeatures) - log.Printf("running: %q %q", cmd.Path, cmd.Args) - out, err := cmd.CombinedOutput() - if err != nil { - log.Printf("error: scontrol: %s (output was %q)", err, out) - } - } + allFeatures := strings.Join(features, ",") - case bytes.Contains(out, []byte(slurmErrBadGres)): - // Evidently our node-type feature names are all valid. + cmd := exec.Command("sinfo", "--nodes="+slurmDummyNode, "--format=%f", "--noheader") + out, err := cmd.CombinedOutput() + if err != nil { + log.Printf("running %q %q: %s (output was %q)", cmd.Path, cmd.Args, err, out) + return + } + if string(out) == allFeatures+"\n" { + // Already configured correctly, nothing to do. + return + } - default: - log.Printf("warning: expected srun error %q or %q, but output was %q", slurmErrBadFeature, slurmErrBadGres, out) + log.Printf("configuring node %q with all node type features", slurmDummyNode) + cmd = exec.Command("scontrol", "update", "NodeName="+slurmDummyNode, "Features="+allFeatures) + log.Printf("running: %q %q", cmd.Path, cmd.Args) + out, err = cmd.CombinedOutput() + if err != nil { + log.Printf("error: scontrol: %s (output was %q)", err, out) } }