if ctr.State != dispatch.Locked {
// already started by prior invocation
- } else if _, ok := disp.lsfqueue.JobID(ctr.UUID); !ok {
+ } else if _, ok := disp.lsfqueue.Lookup(ctr.UUID); !ok {
disp.logger.Printf("Submitting container %s to LSF", ctr.UUID)
cmd := []string{disp.Cluster.Containers.CrunchRunCommand}
cmd = append(cmd, "--runtime-engine="+disp.Cluster.Containers.RuntimeEngine)
disp.logger.Printf("Start monitoring container %v in state %q", ctr.UUID, ctr.State)
defer disp.logger.Printf("Done monitoring container %s", ctr.UUID)
- // If the container disappears from the lsf queue, there is
- // no point in waiting for further dispatch updates: just
- // clean up and return.
go func(uuid string) {
+ cancelled := false
for ctx.Err() == nil {
- if _, ok := disp.lsfqueue.JobID(uuid); !ok {
+ qent, ok := disp.lsfqueue.Lookup(uuid)
+ if !ok {
+ // If the container disappears from
+ // the lsf queue, there is no point in
+ // waiting for further dispatch
+ // updates: just clean up and return.
disp.logger.Printf("container %s job disappeared from LSF queue", uuid)
cancel()
return
}
+ if !cancelled && qent.Stat == "PEND" && strings.Contains(qent.PendReason, "There are no suitable hosts for the job") {
+ disp.logger.Printf("container %s: %s", uuid, qent.PendReason)
+ err := disp.arvDispatcher.Arv.Update("containers", uuid, arvadosclient.Dict{
+ "container": map[string]interface{}{
+ "runtime_status": map[string]string{
+ "error": qent.PendReason,
+ },
+ },
+ }, nil)
+ if err != nil {
+ disp.logger.Printf("error setting runtime_status on %s: %s", uuid, err)
+ continue // retry
+ }
+ err = disp.arvDispatcher.UpdateState(uuid, dispatch.Cancelled)
+ if err != nil {
+ continue // retry (UpdateState() already logged the error)
+ }
+ cancelled = true
+ }
}
}(ctr.UUID)
// from the queue.
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
- for jobid, ok := disp.lsfqueue.JobID(ctr.UUID); ok; _, ok = disp.lsfqueue.JobID(ctr.UUID) {
- err := disp.lsfcli.Bkill(jobid)
+ for qent, ok := disp.lsfqueue.Lookup(ctr.UUID); ok; _, ok = disp.lsfqueue.Lookup(ctr.UUID) {
+ err := disp.lsfcli.Bkill(qent.ID)
if err != nil {
- disp.logger.Warnf("%s: bkill(%d): %s", ctr.UUID, jobid, err)
+ disp.logger.Warnf("%s: bkill(%s): %s", ctr.UUID, qent.ID, err)
}
<-ticker.C
}
}
+// bkill asks LSF to kill the job belonging to the given container.
+// If the container has no entry in the LSF queue, the request is
+// redundant and is only noted in the debug log. A failed bkill is
+// logged as a warning; it is not retried here.
func (disp *dispatcher) bkill(ctr arvados.Container) {
-	if jobid, ok := disp.lsfqueue.JobID(ctr.UUID); !ok {
+	qent, found := disp.lsfqueue.Lookup(ctr.UUID)
+	if !found {
		disp.logger.Debugf("bkill(%s): redundant, job not in queue", ctr.UUID)
-	} else if err := disp.lsfcli.Bkill(jobid); err != nil {
-		disp.logger.Warnf("%s: bkill(%d): %s", ctr.UUID, jobid, err)
+		return
+	}
+	if err := disp.lsfcli.Bkill(qent.ID); err != nil {
+		disp.logger.Warnf("%s: bkill(%s): %s", ctr.UUID, qent.ID, err)
	}
}
+// bsubArgTokenRE matches a two-character "%x" substitution token in a
+// bsub argument template. It is compiled once rather than on every
+// bsubArgs call. A lone "%" at the very end of an argument has no
+// following character, so it does not match and is passed through
+// unchanged.
+var bsubArgTokenRE = regexp.MustCompile(`%.`)
+
+// bsubArgs returns the command line used to submit the given
+// container to LSF.
+//
+// The command starts with "bsub", followed by each entry of the
+// configured BsubArgumentsList template and, when the container
+// requests at least one CUDA device, each entry of BsubCUDAArguments.
+// Within every template entry the following tokens are expanded:
+//
+//	%%  a literal "%"
+//	%C  number of VCPUs
+//	%M  RAM + keep cache RAM + ReserveExtraRAM, in units of 1048576 bytes, rounded up
+//	%T  estimated scratch space, in units of 1048576 bytes, rounded up
+//	%U  container UUID
+//	%G  number of CUDA devices
+//
+// Any other "%x" token is an error: all such tokens are collected and
+// reported together, and no arguments are returned. If BsubSudoUser
+// is configured, the whole command is prefixed with
+// "sudo -E -u <user>".
func (disp *dispatcher) bsubArgs(container arvados.Container) ([]string, error) {
	args := []string{"bsub"}
-	args = append(args, disp.Cluster.Containers.LSF.BsubArgumentsList...)
-	args = append(args, "-J", container.UUID)
-	args = append(args, disp.bsubConstraintArgs(container)...)
-	if u := disp.Cluster.Containers.LSF.BsubSudoUser; u != "" {
-		args = append([]string{"sudo", "-E", "-u", u}, args...)
-	}
-	return args, nil
-}
-func (disp *dispatcher) bsubConstraintArgs(container arvados.Container) []string {
-	// TODO: propagate container.SchedulingParameters.Partitions
	tmp := int64(math.Ceil(float64(dispatchcloud.EstimateScratchSpace(&container)) / 1048576))
	vcpus := container.RuntimeConstraints.VCPUs
	mem := int64(math.Ceil(float64(container.RuntimeConstraints.RAM+
		container.RuntimeConstraints.KeepCacheRAM+
		int64(disp.Cluster.Containers.ReserveExtraRAM)) / 1048576))
-	return []string{
-		"-R", fmt.Sprintf("rusage[mem=%dMB:tmp=%dMB] affinity[core(%d)]", mem, tmp, vcpus),
+
+	repl := map[string]string{
+		"%%": "%",
+		"%C": fmt.Sprintf("%d", vcpus),
+		"%M": fmt.Sprintf("%d", mem),
+		"%T": fmt.Sprintf("%d", tmp),
+		"%U": container.UUID,
+		"%G": fmt.Sprintf("%d", container.RuntimeConstraints.CUDA.DeviceCount),
+	}
+
+	// Copy the configured template into a fresh slice before
+	// extending it. Appending directly to the config's slice could
+	// write into its backing array (when it has spare capacity),
+	// mutating shared cluster config from whichever goroutine is
+	// submitting a container.
+	argumentTemplate := append([]string(nil), disp.Cluster.Containers.LSF.BsubArgumentsList...)
+	if container.RuntimeConstraints.CUDA.DeviceCount > 0 {
+		argumentTemplate = append(argumentTemplate, disp.Cluster.Containers.LSF.BsubCUDAArguments...)
+	}
+
+	var substitutionErrors []string
+	for _, a := range argumentTemplate {
+		args = append(args, bsubArgTokenRE.ReplaceAllStringFunc(a, func(s string) string {
+			// Use the comma-ok form so a known token whose
+			// value happens to be empty is not mistaken for
+			// an unknown token.
+			subst, ok := repl[s]
+			if !ok {
+				substitutionErrors = append(substitutionErrors, fmt.Sprintf("Unknown substitution parameter %s in BsubArgumentsList", s))
+			}
+			return subst
+		}))
	}
+	if len(substitutionErrors) != 0 {
+		return nil, fmt.Errorf("%s", strings.Join(substitutionErrors, ", "))
+	}
+
+	if u := disp.Cluster.Containers.LSF.BsubSudoUser; u != "" {
+		args = append([]string{"sudo", "-E", "-u", u}, args...)
+	}
+	return args, nil
}
// Check the next bjobs report, and invoke TrackContainer for all the