14977: Drop containers when the controller says they don't exist.
authorTom Clegg <tclegg@veritasgenetics.com>
Fri, 15 Mar 2019 19:21:39 +0000 (15:21 -0400)
committerTom Clegg <tclegg@veritasgenetics.com>
Fri, 15 Mar 2019 19:21:39 +0000 (15:21 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

lib/dispatchcloud/container/queue.go

index bbe47625a893d6874d2c3c415952948f290de74f..9cb210d0e4ba950e3ea43acb5830b6774f7c0ee0 100644 (file)
@@ -5,6 +5,7 @@
 package container
 
 import (
+       "errors"
        "io"
        "sync"
        "time"
@@ -398,32 +399,62 @@ func (cq *Queue) poll() (map[string]*arvados.Container, error) {
        }
        apply(avail)
 
-       var missing []string
+       missing := map[string]bool{}
        cq.mtx.Lock()
        for uuid, ent := range cq.current {
                if next[uuid] == nil &&
                        ent.Container.State != arvados.ContainerStateCancelled &&
                        ent.Container.State != arvados.ContainerStateComplete {
-                       missing = append(missing, uuid)
+                       missing[uuid] = true
                }
        }
        cq.mtx.Unlock()
 
-       for i, page := 0, 20; i < len(missing); i += page {
-               batch := missing[i:]
-               if len(batch) > page {
-                       batch = batch[:page]
+       for len(missing) > 0 {
+               var batch []string
+               for uuid := range missing {
+                       batch = append(batch, uuid)
+                       if len(batch) == 20 {
+                               break
+                       }
                }
+               filters := []arvados.Filter{{"uuid", "in", batch}}
                ended, err := cq.fetchAll(arvados.ResourceListParams{
                        Select:  selectParam,
                        Order:   "uuid",
                        Count:   "none",
-                       Filters: []arvados.Filter{{"uuid", "in", batch}},
+                       Filters: filters,
                })
                if err != nil {
                        return nil, err
                }
+               if len(ended) == 0 {
+                       // This is the only case where we can conclude
+                       // a container has been deleted from the
+                       // database. A short (but non-zero) page, on
+                       // the other hand, can be caused by a response
+                       // size limit.
+                       for _, uuid := range batch {
+                               cq.logger.WithField("ContainerUUID", uuid).Warn("container not found by controller (deleted?)")
+                               delete(missing, uuid)
+                               cq.mtx.Lock()
+                               cq.delEnt(uuid, cq.current[uuid].Container.State)
+                               cq.mtx.Unlock()
+                       }
+                       continue
+               }
                apply(ended)
+               for _, ctr := range ended {
+                       if _, ok := missing[ctr.UUID]; !ok {
+                               msg := "BUG? server response did not match requested filters, erroring out rather than risk deadlock"
+                               cq.logger.WithFields(logrus.Fields{
+                                       "ContainerUUID": ctr.UUID,
+                                       "Filters":       filters,
+                               }).Error(msg)
+                               return nil, errors.New(msg)
+                       }
+                       delete(missing, ctr.UUID)
+               }
        }
        return next, nil
 }