refs #13667 "Merge branch '13667-add-version-file'".
[arvados.git] / services / crunch-dispatch-slurm / crunch-dispatch-slurm.go
index e3801df41bce1f1d1801e0a45b39cd808434ab3c..d1f19dd7b5702e2431471bc7ce2164f553a8cb11 100644 (file)
@@ -7,6 +7,7 @@ package main
 // Dispatcher service for Crunch that submits containers to the slurm queue.
 
 import (
+       "bytes"
        "context"
        "flag"
        "fmt"
@@ -202,7 +203,7 @@ func (disp *Dispatcher) checkSqueueForOrphans() {
        }
 }
 
-func (disp *Dispatcher) sbatchArgs(container arvados.Container) ([]string, error) {
+func (disp *Dispatcher) slurmConstraintArgs(container arvados.Container) []string {
        mem := int64(math.Ceil(float64(container.RuntimeConstraints.RAM+container.RuntimeConstraints.KeepCacheRAM+disp.ReserveExtraRAM) / float64(1048576)))
 
        var disk int64
@@ -212,32 +213,33 @@ func (disp *Dispatcher) sbatchArgs(container arvados.Container) ([]string, error
                }
        }
        disk = int64(math.Ceil(float64(disk) / float64(1048576)))
-
-       var args []string
-       args = append(args, disp.SbatchArguments...)
-       args = append(args,
-               fmt.Sprintf("--job-name=%s", container.UUID),
-               fmt.Sprintf("--nice=%d", initialNiceValue))
-
-       constraintArgs := []string{
+       return []string{
                fmt.Sprintf("--mem=%d", mem),
                fmt.Sprintf("--cpus-per-task=%d", container.RuntimeConstraints.VCPUs),
                fmt.Sprintf("--tmp=%d", disk),
        }
+}
+
+func (disp *Dispatcher) sbatchArgs(container arvados.Container) ([]string, error) {
+       var args []string
+       args = append(args, disp.SbatchArguments...)
+       args = append(args, "--job-name="+container.UUID, fmt.Sprintf("--nice=%d", initialNiceValue))
+
        if disp.cluster == nil {
                // no instance types configured
+               args = append(args, disp.slurmConstraintArgs(container)...)
        } else if it, err := dispatchcloud.ChooseInstanceType(disp.cluster, &container); err == dispatchcloud.ErrInstanceTypesNotConfigured {
                // ditto
+               args = append(args, disp.slurmConstraintArgs(container)...)
        } else if err != nil {
                return nil, err
        } else {
                // use instancetype constraint instead of slurm mem/cpu/tmp specs
-               constraintArgs = []string{"--constraint=instancetype=" + it.Name}
+               args = append(args, "--constraint=instancetype="+it.Name)
        }
-       args = append(args, constraintArgs...)
 
        if len(container.SchedulingParameters.Partitions) > 0 {
-               args = append(args, fmt.Sprintf("--partition=%s", strings.Join(container.SchedulingParameters.Partitions, ",")))
+               args = append(args, "--partition="+strings.Join(container.SchedulingParameters.Partitions, ","))
        }
 
        return args, nil
@@ -273,8 +275,21 @@ func (disp *Dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Contain
                log.Printf("Submitting container %s to slurm", ctr.UUID)
                if err := disp.submit(ctr, disp.CrunchRunCommand); err != nil {
                        var text string
-                       if err == dispatchcloud.ErrConstraintsNotSatisfiable {
-                               text = fmt.Sprintf("cannot run container %s: %s", ctr.UUID, err)
+                       if err, ok := err.(dispatchcloud.ConstraintsNotSatisfiableError); ok {
+                               var logBuf bytes.Buffer
+                               fmt.Fprintf(&logBuf, "cannot run container %s: %s\n", ctr.UUID, err)
+                               if len(err.AvailableTypes) == 0 {
+                                       fmt.Fprint(&logBuf, "No instance types are configured.\n")
+                               } else {
+                                       fmt.Fprint(&logBuf, "Available instance types:\n")
+                                       for _, t := range err.AvailableTypes {
+                                               fmt.Fprintf(&logBuf,
+                                                       "Type %q: %d VCPUs, %d RAM, %d Scratch, %f Price\n",
+                                                       t.Name, t.VCPUs, t.RAM, t.Scratch, t.Price,
+                                               )
+                                       }
+                               }
+                               text = logBuf.String()
                                disp.UpdateState(ctr.UUID, dispatch.Cancelled)
                        } else {
                                text = fmt.Sprintf("Error submitting container %s to slurm: %s", ctr.UUID, err)