20649: Fix panic on race, worker shutdown vs. container startup.
author Tom Clegg <tom@curii.com>
Mon, 14 Aug 2023 15:11:51 +0000 (11:11 -0400)
committer Tom Clegg <tom@curii.com>
Mon, 14 Aug 2023 15:17:12 +0000 (11:17 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

lib/dispatchcloud/worker/worker.go

index 8b9326fa9c0025467fc85578cf3b0e614dbe4a55..0b406ce61de61f248b76bfa30c5ad449b8383374 100644 (file)
@@ -188,6 +188,14 @@ func (wkr *worker) startContainer(ctr arvados.Container) {
                }
                wkr.mtx.Lock()
                defer wkr.mtx.Unlock()
+               if wkr.starting[ctr.UUID] != rr {
+                       // Someone else (e.g., wkr.probeAndUpdate() ->
+                       // wkr.updateRunning() or wkr.Close()) already
+                       // moved our runner from wkr.starting to
+                       // wkr.running or deleted it while we were in
+                       // rr.Start().
+                       return
+               }
                now := time.Now()
                wkr.updated = now
                wkr.busy = now
@@ -665,10 +673,12 @@ func (wkr *worker) Close() {
        for uuid, rr := range wkr.running {
                wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process abandoned")
                rr.Close()
+               delete(wkr.running, uuid)
        }
        for uuid, rr := range wkr.starting {
                wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process abandoned")
                rr.Close()
+               delete(wkr.starting, uuid)
        }
 }