X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/df591042778408d03d410d5c22a669d85652d1ea..e2cc6c1935565ee965f38501a4974e9d0e6b29e5:/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go index 23e4b3a8cb..d1f19dd7b5 100644 --- a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go +++ b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.go @@ -7,6 +7,7 @@ package main // Dispatcher service for Crunch that submits containers to the slurm queue. import ( + "bytes" "context" "flag" "fmt" @@ -202,7 +203,7 @@ func (disp *Dispatcher) checkSqueueForOrphans() { } } -func (disp *Dispatcher) sbatchArgs(container arvados.Container) ([]string, error) { +func (disp *Dispatcher) slurmConstraintArgs(container arvados.Container) []string { mem := int64(math.Ceil(float64(container.RuntimeConstraints.RAM+container.RuntimeConstraints.KeepCacheRAM+disp.ReserveExtraRAM) / float64(1048576))) var disk int64 @@ -212,29 +213,36 @@ func (disp *Dispatcher) sbatchArgs(container arvados.Container) ([]string, error } } disk = int64(math.Ceil(float64(disk) / float64(1048576))) - - var sbatchArgs []string - sbatchArgs = append(sbatchArgs, disp.SbatchArguments...) - sbatchArgs = append(sbatchArgs, fmt.Sprintf("--job-name=%s", container.UUID)) - sbatchArgs = append(sbatchArgs, fmt.Sprintf("--mem=%d", mem)) - sbatchArgs = append(sbatchArgs, fmt.Sprintf("--cpus-per-task=%d", container.RuntimeConstraints.VCPUs)) - sbatchArgs = append(sbatchArgs, fmt.Sprintf("--tmp=%d", disk)) - sbatchArgs = append(sbatchArgs, fmt.Sprintf("--nice=%d", initialNiceValue)) - if len(container.SchedulingParameters.Partitions) > 0 { - sbatchArgs = append(sbatchArgs, fmt.Sprintf("--partition=%s", strings.Join(container.SchedulingParameters.Partitions, ","))) + return []string{ + fmt.Sprintf("--mem=%d", mem), + fmt.Sprintf("--cpus-per-task=%d", container.RuntimeConstraints.VCPUs), + fmt.Sprintf("--tmp=%d", disk), } +} + +func (disp *Dispatcher) sbatchArgs(container arvados.Container) ([]string, error) { + var args []string + args = append(args, disp.SbatchArguments...) + args = append(args, "--job-name="+container.UUID, fmt.Sprintf("--nice=%d", initialNiceValue)) if disp.cluster == nil { // no instance types configured + args = append(args, disp.slurmConstraintArgs(container)...) } else if it, err := dispatchcloud.ChooseInstanceType(disp.cluster, &container); err == dispatchcloud.ErrInstanceTypesNotConfigured { // ditto + args = append(args, disp.slurmConstraintArgs(container)...) } else if err != nil { return nil, err } else { - sbatchArgs = append(sbatchArgs, "--constraint=instancetype="+it.Name) + // use instancetype constraint instead of slurm mem/cpu/tmp specs + args = append(args, "--constraint=instancetype="+it.Name) } - return sbatchArgs, nil + if len(container.SchedulingParameters.Partitions) > 0 { + args = append(args, "--partition="+strings.Join(container.SchedulingParameters.Partitions, ",")) + } + + return args, nil } func (disp *Dispatcher) submit(container arvados.Container, crunchRunCommand []string) error { @@ -267,8 +275,21 @@ func (disp *Dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Contain log.Printf("Submitting container %s to slurm", ctr.UUID) if err := disp.submit(ctr, disp.CrunchRunCommand); err != nil { var text string - if err == dispatchcloud.ErrConstraintsNotSatisfiable { - text = fmt.Sprintf("cannot run container %s: %s", ctr.UUID, err) + if err, ok := err.(dispatchcloud.ConstraintsNotSatisfiableError); ok { + var logBuf bytes.Buffer + fmt.Fprintf(&logBuf, "cannot run container %s: %s\n", ctr.UUID, err) + if len(err.AvailableTypes) == 0 { + fmt.Fprint(&logBuf, "No instance types are configured.\n") + } else { + fmt.Fprint(&logBuf, "Available instance types:\n") + for _, t := range err.AvailableTypes { + fmt.Fprintf(&logBuf, + "Type %q: %d VCPUs, %d RAM, %d Scratch, %f Price\n", + t.Name, t.VCPUs, t.RAM, t.Scratch, t.Price, + ) + } + } + text = logBuf.String() disp.UpdateState(ctr.UUID, dispatch.Cancelled) } else { text = fmt.Sprintf("Error submitting container %s to slurm: %s", ctr.UUID, err)