X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/03f277bf4b616f41ef7ed4b195d44dab83d16144..73ad2ee9af3b97c46293bdfc9e2925a67726b786:/lib/dispatchcloud/node_size.go

diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go
index 3b72c4aeeb..b5fd0262a8 100644
--- a/lib/dispatchcloud/node_size.go
+++ b/lib/dispatchcloud/node_size.go
@@ -5,10 +5,10 @@
 package dispatchcloud
 
 import (
-	"bytes"
 	"errors"
 	"log"
 	"os/exec"
+	"sort"
 	"strings"
 	"time"
 
@@ -16,30 +16,53 @@ import (
 )
 
 var (
-	ErrConstraintsNotSatisfiable  = errors.New("constraints not satisfiable by any configured instance type")
 	ErrInstanceTypesNotConfigured = errors.New("site configuration does not list any instance types")
 	discountConfiguredRAMPercent  = 5
 )
 
+// ConstraintsNotSatisfiableError includes a list of available instance types
+// to be reported back to the user.
+type ConstraintsNotSatisfiableError struct {
+	error
+	AvailableTypes []arvados.InstanceType
+}
+
 // ChooseInstanceType returns the cheapest available
 // arvados.InstanceType big enough to run ctr.
 func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvados.InstanceType, err error) {
-	needVCPUs := ctr.RuntimeConstraints.VCPUs
-	needRAM := ctr.RuntimeConstraints.RAM + ctr.RuntimeConstraints.KeepCacheRAM
-
-	needRAM = (needRAM * 100) / int64(100-discountConfiguredRAMPercent)
-
 	if len(cc.InstanceTypes) == 0 {
 		err = ErrInstanceTypesNotConfigured
 		return
 	}
 
-	err = ErrConstraintsNotSatisfiable
+	needScratch := int64(0)
+	for _, m := range ctr.Mounts {
+		if m.Kind == "tmp" {
+			needScratch += m.Capacity
+		}
+	}
+
+	needVCPUs := ctr.RuntimeConstraints.VCPUs
+
+	needRAM := ctr.RuntimeConstraints.RAM + ctr.RuntimeConstraints.KeepCacheRAM
+	needRAM = (needRAM * 100) / int64(100-discountConfiguredRAMPercent)
+
+	availableTypes := make([]arvados.InstanceType, len(cc.InstanceTypes))
+	copy(availableTypes, cc.InstanceTypes)
+	sort.Slice(availableTypes, func(a, b int) bool {
+		return availableTypes[a].Price < availableTypes[b].Price
+	})
+	err = ConstraintsNotSatisfiableError{
+		errors.New("constraints not satisfiable by any configured instance type"),
+		availableTypes,
+	}
 	for _, it := range cc.InstanceTypes {
 		switch {
 		case err == nil && it.Price > best.Price:
+		case it.Scratch < needScratch:
 		case it.RAM < needRAM:
 		case it.VCPUs < needVCPUs:
+		case it.Preemptable != ctr.SchedulingParameters.Preemptable:
 		case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs):
 			// Equal price, but worse specs
 		default:
@@ -61,11 +84,9 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad
 // it is no longer offered by any node. So, to make a feature name
 // valid, we can add it to a dummy node ("compute0"), then remove it.
 //
-// (2) when srun is given an invalid --gres argument and an invalid
-// --constraint argument, the error message mentions "Invalid feature
-// specification". So, to test whether a feature name is valid without
-// actually submitting a job, we can call srun with the feature name
-// and an invalid --gres argument.
+// (2) To test whether a set of feature names are valid without
+// actually submitting a job, we can call srun --test-only with the
+// desired features.
 //
 // SlurmNodeTypeFeatureKludge does a test-and-fix operation
 // immediately, and then periodically, in case slurm restarts and
@@ -82,39 +103,31 @@ func SlurmNodeTypeFeatureKludge(cc *arvados.Cluster) {
 	}
 	for {
 		slurmKludge(features)
-		time.Sleep(time.Minute)
+		time.Sleep(2 * time.Second)
 	}
 }
 
-var (
-	slurmDummyNode     = "compute0"
-	slurmErrBadFeature = "Invalid feature"
-	slurmErrBadGres    = "Invalid generic resource"
-)
+const slurmDummyNode = "compute0"
 
 func slurmKludge(features []string) {
-	cmd := exec.Command("srun", "--gres=invalid-gres-specification", "--constraint="+strings.Join(features, "&"), "true")
-	out, err := cmd.CombinedOutput()
-	switch {
-	case err == nil:
-		log.Printf("warning: guaranteed-to-fail srun command did not fail: %q %q", cmd.Path, cmd.Args)
-		log.Printf("output was: %q", out)
+	allFeatures := strings.Join(features, ",")
 
-	case bytes.Contains(out, []byte(slurmErrBadFeature)):
-		log.Printf("temporarily configuring node %q with all node type features", slurmDummyNode)
-		for _, nodeFeatures := range []string{strings.Join(features, ","), ""} {
-			cmd = exec.Command("scontrol", "update", "NodeName="+slurmDummyNode, "Features="+nodeFeatures)
-			log.Printf("running: %q %q", cmd.Path, cmd.Args)
-			out, err := cmd.CombinedOutput()
-			if err != nil {
-				log.Printf("error: scontrol: %s (output was %q)", err, out)
-			}
-		}
-
-	case bytes.Contains(out, []byte(slurmErrBadGres)):
-		// Evidently our node-type feature names are all valid.
+	cmd := exec.Command("sinfo", "--nodes="+slurmDummyNode, "--format=%f", "--noheader")
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		log.Printf("running %q %q: %s (output was %q)", cmd.Path, cmd.Args, err, out)
+		return
+	}
+	if string(out) == allFeatures+"\n" {
+		// Already configured correctly, nothing to do.
+		return
+	}
 
-	default:
-		log.Printf("warning: expected srun error %q or %q, but output was %q", slurmErrBadFeature, slurmErrBadGres, out)
+	log.Printf("configuring node %q with all node type features", slurmDummyNode)
+	cmd = exec.Command("scontrol", "update", "NodeName="+slurmDummyNode, "Features="+allFeatures)
+	log.Printf("running: %q %q", cmd.Path, cmd.Args)
+	out, err = cmd.CombinedOutput()
+	if err != nil {
+		log.Printf("error: scontrol: %s (output was %q)", err, out)
 	}
 }
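
Note on usage (not part of the diff above): with this change, a caller of ChooseInstanceType receives a ConstraintsNotSatisfiableError value, carrying the configured instance types sorted by price, instead of the old ErrConstraintsNotSatisfiable sentinel. The following is a minimal, hypothetical caller sketch; the import paths, the InstanceType.Name field, and all literal values are assumptions for illustration and are not part of this commit. It relies only on the fields actually read by ChooseInstanceType in the diff (InstanceTypes, RuntimeConstraints, Mounts, Price, RAM, VCPUs, Scratch).

package main

import (
	"fmt"
	"log"

	// Import paths are assumed to match this repository's layout.
	"git.arvados.org/arvados.git/lib/dispatchcloud"
	"git.arvados.org/arvados.git/sdk/go/arvados"
)

func main() {
	// Hypothetical cluster configuration with two instance types (values invented).
	// InstanceTypes is a slice in this revision, as ChooseInstanceType iterates over it.
	cc := &arvados.Cluster{
		InstanceTypes: []arvados.InstanceType{
			{Name: "small", VCPUs: 2, RAM: 4 << 30, Scratch: 100 << 30, Price: 0.05},
			{Name: "large", VCPUs: 16, RAM: 64 << 30, Scratch: 500 << 30, Price: 0.40},
		},
	}
	// Hypothetical container request; field names match those read in the diff.
	ctr := &arvados.Container{
		RuntimeConstraints: arvados.RuntimeConstraints{
			VCPUs:        4,
			RAM:          8 << 30,
			KeepCacheRAM: 256 << 20,
		},
		Mounts: map[string]arvados.Mount{
			"/tmp": {Kind: "tmp", Capacity: 200 << 30},
		},
	}

	best, err := dispatchcloud.ChooseInstanceType(cc, ctr)
	if err != nil {
		// The new error type exposes the configured types, sorted cheapest
		// first, so they can be reported back to the user.
		if cnse, ok := err.(dispatchcloud.ConstraintsNotSatisfiableError); ok {
			log.Fatalf("constraints not satisfiable; %d instance types configured", len(cnse.AvailableTypes))
		}
		log.Fatal(err)
	}
	fmt.Printf("chose %q: %d VCPUs, %v RAM, %v scratch, price %v\n",
		best.Name, best.VCPUs, best.RAM, best.Scratch, best.Price)
}

Because the error embeds the price-sorted availableTypes slice, a dispatcher can include the cheapest-first list directly in its user-facing "constraints not satisfiable" message.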