-// sbatchCmd
-func sbatchFunc(container arvados.Container) *exec.Cmd {
-	memPerCPU := math.Ceil(float64(container.RuntimeConstraints.RAM) / (float64(container.RuntimeConstraints.VCPUs) * 1048576))
-
-	var sbatchArgs []string
-	sbatchArgs = append(sbatchArgs, "--share")
-	sbatchArgs = append(sbatchArgs, theConfig.SbatchArguments...)
-	sbatchArgs = append(sbatchArgs, fmt.Sprintf("--job-name=%s", container.UUID))
-	sbatchArgs = append(sbatchArgs, fmt.Sprintf("--mem-per-cpu=%d", int(memPerCPU)))
-	sbatchArgs = append(sbatchArgs, fmt.Sprintf("--cpus-per-task=%d", container.RuntimeConstraints.VCPUs))
-	if len(container.SchedulingParameters.Partitions) > 0 {
-		sbatchArgs = append(sbatchArgs, fmt.Sprintf("--partition=%s", strings.Join(container.SchedulingParameters.Partitions, ",")))
-	}
-
-	return exec.Command("sbatch", sbatchArgs...)
-}
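+// containerUuidPattern matches Arvados container UUIDs, which combine a
+// 5-character cluster ID, the "dz642" container type infix, and a
+// 15-character suffix, e.g. "zzzzz-dz642-xxxxxxxxxxxxxxx".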
+var containerUuidPattern = regexp.MustCompile(`^[a-z0-9]{5}-dz642-[a-z0-9]{15}$`)
+
+// Check the next squeue report, and invoke TrackContainer for all the
+// containers in the report. This gives us a chance to cancel slurm
+// jobs started by a previous dispatch process that never released
+// their slurm allocations even though their container states are
+// Cancelled or Complete. See https://dev.arvados.org/issues/10979
+func (disp *Dispatcher) checkSqueueForOrphans() {
+	for _, uuid := range disp.sqCheck.All() {
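+		// Ignore squeue entries that are not container UUIDs
+		// belonging to this cluster.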
+		if !containerUuidPattern.MatchString(uuid) || !strings.HasPrefix(uuid, disp.cluster.ClusterID) {
+			continue
+		}
+		err := disp.TrackContainer(uuid)
+		if err != nil {
+			log.Printf("checkSqueueForOrphans: TrackContainer(%s): %s", uuid, err)
+		}
+	}
+}
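+// A minimal sketch of the intended call site (hypothetical; the actual
+// call is outside this hunk): run the check once at startup, after the
+// squeue poller has produced a report, so slurm jobs left over from a
+// previous dispatcher are re-adopted or cancelled before the normal
+// dispatch loop begins.
+//
+//	disp.checkSqueueForOrphans()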