X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/fe33cfe407912df46ae34b5419ebdc84650829f6..cd020c016106fbe844501c5f434c16f4def4e08d:/lib/dispatchcloud/scheduler/fix_stale_locks.go diff --git a/lib/dispatchcloud/scheduler/fix_stale_locks.go b/lib/dispatchcloud/scheduler/fix_stale_locks.go index 264f9e4ec6..1f9338f7b8 100644 --- a/lib/dispatchcloud/scheduler/fix_stale_locks.go +++ b/lib/dispatchcloud/scheduler/fix_stale_locks.go @@ -19,24 +19,15 @@ import ( func (sch *Scheduler) fixStaleLocks() { wp := sch.pool.Subscribe() defer sch.pool.Unsubscribe(wp) + + var stale []string timeout := time.NewTimer(sch.staleLockTimeout) waiting: - for { - unlock := false - select { - case <-wp: - // If all workers have been contacted, unlock - // containers that aren't claimed by any - // worker. - unlock = sch.pool.CountWorkers()[worker.StateUnknown] == 0 - case <-timeout.C: - // Give up and unlock the containers, even - // though they might be working. - unlock = true - } - + for sch.pool.CountWorkers()[worker.StateUnknown] > 0 { running := sch.pool.Running() qEntries, _ := sch.queue.Entries() + + stale = nil for uuid, ent := range qEntries { if ent.Container.State != arvados.ContainerStateLocked { continue @@ -44,14 +35,24 @@ waiting: if _, running := running[uuid]; running { continue } - if !unlock { - continue waiting - } - err := sch.queue.Unlock(uuid) - if err != nil { - sch.logger.Warnf("Unlock %s: %s", uuid, err) - } + stale = append(stale, uuid) + } + if len(stale) == 0 { + return + } + + select { + case <-wp: + case <-timeout.C: + // Give up. + break waiting + } + } + + for _, uuid := range stale { + err := sch.queue.Unlock(uuid) + if err != nil { + sch.logger.Warnf("Unlock %s: %s", uuid, err) } - return } }