11100: Adjust delete_at in before_validation hook instead of validation. Permit chang...
[arvados.git] / services / crunch-dispatch-slurm / crunch-dispatch-slurm.go
index 7cb14fab15cc051103ea355996b246e0e6f43645..d84d461c30184e12b45cd87611580d03004a343b 100644 (file)
@@ -123,32 +123,28 @@ func doMain() error {
                log.Printf("Error notifying init daemon: %v", err)
        }
 
-       containerTrackerTicker := trackContainers(dispatcher)
-       defer containerTrackerTicker.Stop()
+       go checkSqueueForOrphans(dispatcher, sqCheck)
 
        return dispatcher.Run(context.Background())
 }
 
-var containerUuidPattern = regexp.MustCompile(`[a-z0-9]{5}-dz642-[a-z0-9]{15}$`)
-
-// Start a goroutine to check squeue report periodically, and
-// invoke TrackContainer for all the containers in the report.
-func trackContainers(dispatcher *dispatch.Dispatcher) *time.Ticker {
-       ticker := time.NewTicker(sqCheck.Period)
-       go func() {
-               for {
-                       select {
-                       case <-ticker.C:
-                               for uuid := range sqCheck.AllUuids() {
-                                       match := containerUuidPattern.MatchString(uuid)
-                                       if match {
-                                               dispatcher.TrackContainer(uuid)
-                                       }
-                               }
-                       }
+var containerUuidPattern = regexp.MustCompile(`^[a-z0-9]{5}-dz642-[a-z0-9]{15}$`)
+
+// Check the next squeue report, and invoke TrackContainer for all the
+// containers in the report. This gives us a chance to cancel slurm
+// jobs started by a previous dispatch process that never released
+// their slurm allocations even though their container states are
+// Cancelled or Complete. See https://dev.arvados.org/issues/10979
+func checkSqueueForOrphans(dispatcher *dispatch.Dispatcher, sqCheck *SqueueChecker) {
+       for _, uuid := range sqCheck.All() {
+               if !containerUuidPattern.MatchString(uuid) {
+                       continue
+               }
+               err := dispatcher.TrackContainer(uuid)
+               if err != nil {
+                       log.Printf("checkSqueueForOrphans: TrackContainer(%s): %s", uuid, err)
                }
-       }()
-       return ticker
+       }
 }
 
 // sbatchCmd