X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/01fe37c91379764afc9569b5f5b907e79e15001e..1980d00a9afeaea0bc6f266892e43de14ccd297e:/lib/dispatchcloud/node_size.go diff --git a/lib/dispatchcloud/node_size.go b/lib/dispatchcloud/node_size.go index 41c6ff4251..1c36d6cf5b 100644 --- a/lib/dispatchcloud/node_size.go +++ b/lib/dispatchcloud/node_size.go @@ -5,10 +5,10 @@ package dispatchcloud import ( - "bytes" "errors" "log" "os/exec" + "sort" "strings" "time" @@ -16,11 +16,17 @@ import ( ) var ( - ErrConstraintsNotSatisfiable = errors.New("constraints not satisfiable by any configured instance type") ErrInstanceTypesNotConfigured = errors.New("site configuration does not list any instance types") discountConfiguredRAMPercent = 5 ) +// ConstraintsNotSatisfiableError includes a list of available instance types +// to be reported back to the user. +type ConstraintsNotSatisfiableError struct { + error + AvailableTypes []arvados.InstanceType +} + // ChooseInstanceType returns the cheapest available // arvados.InstanceType big enough to run ctr. func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvados.InstanceType, err error) { @@ -41,20 +47,35 @@ func ChooseInstanceType(cc *arvados.Cluster, ctr *arvados.Container) (best arvad needRAM := ctr.RuntimeConstraints.RAM + ctr.RuntimeConstraints.KeepCacheRAM needRAM = (needRAM * 100) / int64(100-discountConfiguredRAMPercent) - err = ErrConstraintsNotSatisfiable + ok := false for _, it := range cc.InstanceTypes { switch { - case err == nil && it.Price > best.Price: - case it.Scratch < needScratch: - case it.RAM < needRAM: + case ok && it.Price > best.Price: + case int64(it.Scratch) < needScratch: + case int64(it.RAM) < needRAM: case it.VCPUs < needVCPUs: + case it.Preemptible != ctr.SchedulingParameters.Preemptible: case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs): // Equal price, but worse specs default: // Lower price || (same price && better specs) best = it - err = nil + ok = true + } + } + if !ok { + availableTypes := make([]arvados.InstanceType, 0, len(cc.InstanceTypes)) + for _, t := range cc.InstanceTypes { + availableTypes = append(availableTypes, t) } + sort.Slice(availableTypes, func(a, b int) bool { + return availableTypes[a].Price < availableTypes[b].Price + }) + err = ConstraintsNotSatisfiableError{ + errors.New("constraints not satisfiable by any configured instance type"), + availableTypes, + } + return } return } @@ -88,35 +109,31 @@ func SlurmNodeTypeFeatureKludge(cc *arvados.Cluster) { } for { slurmKludge(features) - time.Sleep(time.Minute) + time.Sleep(2 * time.Second) } } -var ( - slurmDummyNode = "compute0" - slurmErrBadFeature = "Invalid feature" - slurmErrNoNodes = "node configuration is not available" -) +const slurmDummyNode = "compute0" func slurmKludge(features []string) { - cmd := exec.Command("srun", "--test-only", "--constraint="+strings.Join(features, "&"), "false") - out, err := cmd.CombinedOutput() - switch { - case err == nil || bytes.Contains(out, []byte(slurmErrNoNodes)): - // Evidently our node-type feature names are all valid. + allFeatures := strings.Join(features, ",") - case bytes.Contains(out, []byte(slurmErrBadFeature)): - log.Printf("temporarily configuring node %q with all node type features", slurmDummyNode) - for _, nodeFeatures := range []string{strings.Join(features, ","), ""} { - cmd = exec.Command("scontrol", "update", "NodeName="+slurmDummyNode, "Features="+nodeFeatures) - log.Printf("running: %q %q", cmd.Path, cmd.Args) - out, err := cmd.CombinedOutput() - if err != nil { - log.Printf("error: scontrol: %s (output was %q)", err, out) - } - } + cmd := exec.Command("sinfo", "--nodes="+slurmDummyNode, "--format=%f", "--noheader") + out, err := cmd.CombinedOutput() + if err != nil { + log.Printf("running %q %q: %s (output was %q)", cmd.Path, cmd.Args, err, out) + return + } + if string(out) == allFeatures+"\n" { + // Already configured correctly, nothing to do. + return + } - default: - log.Printf("warning: expected srun error %q, %q, or success, but output was %q", slurmErrBadFeature, slurmErrNoNodes, out) + log.Printf("configuring node %q with all node type features", slurmDummyNode) + cmd = exec.Command("scontrol", "update", "NodeName="+slurmDummyNode, "Features="+allFeatures) + log.Printf("running: %q %q", cmd.Path, cmd.Args) + out, err = cmd.CombinedOutput() + if err != nil { + log.Printf("error: scontrol: %s (output was %q)", err, out) } }