18051: Fix premature ejection from WebDAV collection cache.
authorTom Clegg <tom@curii.com>
Wed, 15 Sep 2021 07:21:45 +0000 (03:21 -0400)
committerTom Clegg <tom@curii.com>
Wed, 15 Sep 2021 19:47:55 +0000 (15:47 -0400)
Avoid multiple concurrent cache sweeps.

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

services/keep-web/cache.go
services/keep-web/cache_test.go
services/keep-web/handler_test.go
services/keep-web/server_test.go

index a52af804841fb58f4b837893ae83e3cb76d960b4..16fbd0788cb3840f299d649aa73378aae579fce1 100644 (file)
@@ -25,9 +25,11 @@ type cache struct {
        metrics     cacheMetrics
        pdhs        *lru.TwoQueueCache
        collections *lru.TwoQueueCache
-       permissions *lru.TwoQueueCache
        sessions    *lru.TwoQueueCache
        setupOnce   sync.Once
+
+       chPruneSessions    chan struct{}
+       chPruneCollections chan struct{}
 }
 
 type cacheMetrics struct {
@@ -37,7 +39,6 @@ type cacheMetrics struct {
        sessionEntries    prometheus.Gauge
        collectionHits    prometheus.Counter
        pdhHits           prometheus.Counter
-       permissionHits    prometheus.Counter
        sessionHits       prometheus.Counter
        sessionMisses     prometheus.Counter
        apiCalls          prometheus.Counter
@@ -65,13 +66,6 @@ func (m *cacheMetrics) setup(reg *prometheus.Registry) {
                Help:      "Number of uuid-to-pdh cache hits.",
        })
        reg.MustRegister(m.pdhHits)
-       m.permissionHits = prometheus.NewCounter(prometheus.CounterOpts{
-               Namespace: "arvados",
-               Subsystem: "keepweb_collectioncache",
-               Name:      "permission_hits",
-               Help:      "Number of targetID-to-permission cache hits.",
-       })
-       reg.MustRegister(m.permissionHits)
        m.apiCalls = prometheus.NewCounter(prometheus.CounterOpts{
                Namespace: "arvados",
                Subsystem: "keepweb_collectioncache",
@@ -117,8 +111,9 @@ func (m *cacheMetrics) setup(reg *prometheus.Registry) {
 }
 
 type cachedPDH struct {
-       expire time.Time
-       pdh    string
+       expire  time.Time
+       refresh time.Time
+       pdh     string
 }
 
 type cachedCollection struct {
@@ -149,10 +144,6 @@ func (c *cache) setup() {
        if err != nil {
                panic(err)
        }
-       c.permissions, err = lru.New2Q(c.config.MaxPermissionEntries)
-       if err != nil {
-               panic(err)
-       }
        c.sessions, err = lru.New2Q(c.config.MaxSessions)
        if err != nil {
                panic(err)
@@ -168,6 +159,18 @@ func (c *cache) setup() {
                        c.updateGauges()
                }
        }()
+       c.chPruneCollections = make(chan struct{}, 1)
+       go func() {
+               for range c.chPruneCollections {
+                       c.pruneCollections()
+               }
+       }()
+       c.chPruneSessions = make(chan struct{}, 1)
+       go func() {
+               for range c.chPruneSessions {
+                       c.pruneSessions()
+               }
+       }()
 }
 
 func (c *cache) updateGauges() {
@@ -192,19 +195,25 @@ func (c *cache) Update(client *arvados.Client, coll arvados.Collection, fs arvad
        }
        coll.ManifestText = m
        var updated arvados.Collection
-       defer c.pdhs.Remove(coll.UUID)
        err = client.RequestAndDecode(&updated, "PATCH", "arvados/v1/collections/"+coll.UUID, nil, map[string]interface{}{
                "collection": map[string]string{
                        "manifest_text": coll.ManifestText,
                },
        })
-       if err == nil {
-               c.collections.Add(client.AuthToken+"\000"+updated.PortableDataHash, &cachedCollection{
-                       expire:     time.Now().Add(time.Duration(c.config.TTL)),
-                       collection: &updated,
-               })
+       if err != nil {
+               c.pdhs.Remove(coll.UUID)
+               return err
        }
-       return err
+       c.collections.Add(client.AuthToken+"\000"+updated.PortableDataHash, &cachedCollection{
+               expire:     time.Now().Add(time.Duration(c.config.TTL)),
+               collection: &updated,
+       })
+       c.pdhs.Add(coll.UUID, &cachedPDH{
+               expire:  time.Now().Add(time.Duration(c.config.TTL)),
+               refresh: time.Now().Add(time.Duration(c.config.UUIDTTL)),
+               pdh:     updated.PortableDataHash,
+       })
+       return nil
 }
 
 // ResetSession unloads any potentially stale state. Should be called
@@ -246,7 +255,10 @@ func (c *cache) GetSession(token string) (arvados.CustomFileSystem, *cachedSessi
        } else {
                c.metrics.sessionHits.Inc()
        }
-       go c.pruneSessions()
+       select {
+       case c.chPruneSessions <- struct{}{}:
+       default:
+       }
        fs, _ := sess.fs.Load().(arvados.CustomFileSystem)
        if fs != nil && !expired {
                return fs, sess, nil
@@ -302,19 +314,7 @@ func (c *cache) Get(arv *arvadosclient.ArvadosClient, targetID string, forceRelo
        c.setupOnce.Do(c.setup)
        c.metrics.requests.Inc()
 
-       permOK := false
-       permKey := arv.ApiToken + "\000" + targetID
-       if forceReload {
-       } else if ent, cached := c.permissions.Get(permKey); cached {
-               ent := ent.(*cachedPermission)
-               if ent.expire.Before(time.Now()) {
-                       c.permissions.Remove(permKey)
-               } else {
-                       permOK = true
-                       c.metrics.permissionHits.Inc()
-               }
-       }
-
+       var pdhRefresh bool
        var pdh string
        if arvadosclient.PDHMatch(targetID) {
                pdh = targetID
@@ -324,22 +324,26 @@ func (c *cache) Get(arv *arvadosclient.ArvadosClient, targetID string, forceRelo
                        c.pdhs.Remove(targetID)
                } else {
                        pdh = ent.pdh
+                       pdhRefresh = forceReload || time.Now().After(ent.refresh)
                        c.metrics.pdhHits.Inc()
                }
        }
 
-       var collection *arvados.Collection
-       if pdh != "" {
-               collection = c.lookupCollection(arv.ApiToken + "\000" + pdh)
-       }
-
-       if collection != nil && permOK {
-               return collection, nil
-       } else if collection != nil {
-               // Ask API for current PDH for this targetID. Most
-               // likely, the cached PDH is still correct; if so,
-               // _and_ the current token has permission, we can
-               // use our cached manifest.
+       if pdh == "" {
+               // UUID->PDH mapping is not cached, might as well get
+               // the whole collection record and be done (below).
+       } else if cached := c.lookupCollection(arv.ApiToken + "\000" + pdh); cached == nil {
+               // PDH->manifest is not cached, might as well get the
+               // whole collection record (below).
+       } else if !pdhRefresh {
+               // We looked up UUID->PDH very recently, and we still
+               // have the manifest for that PDH.
+               return cached, nil
+       } else {
+               // Get current PDH for this UUID (and confirm we still
+               // have read permission).  Most likely, the cached PDH
+               // is still correct, in which case we can use our
+               // cached manifest.
                c.metrics.apiCalls.Inc()
                var current arvados.Collection
                err := arv.Get("collections", targetID, selectPDH, &current)
@@ -347,47 +351,44 @@ func (c *cache) Get(arv *arvadosclient.ArvadosClient, targetID string, forceRelo
                        return nil, err
                }
                if current.PortableDataHash == pdh {
-                       c.permissions.Add(permKey, &cachedPermission{
-                               expire: time.Now().Add(time.Duration(c.config.TTL)),
-                       })
-                       if pdh != targetID {
-                               c.pdhs.Add(targetID, &cachedPDH{
-                                       expire: time.Now().Add(time.Duration(c.config.UUIDTTL)),
-                                       pdh:    pdh,
-                               })
-                       }
-                       return collection, err
+                       // PDH has not changed, cached manifest is
+                       // correct.
+                       return cached, err
                }
-               // PDH changed, but now we know we have
-               // permission -- and maybe we already have the
-               // new PDH in the cache.
-               if coll := c.lookupCollection(arv.ApiToken + "\000" + current.PortableDataHash); coll != nil {
-                       return coll, nil
+               if cached := c.lookupCollection(arv.ApiToken + "\000" + current.PortableDataHash); cached != nil {
+                       // PDH changed, and we already have the
+                       // manifest for that new PDH.
+                       return cached, nil
                }
        }
 
-       // Collection manifest is not cached.
+       // Either UUID->PDH is not cached, or PDH->manifest is not
+       // cached.
+       var retrieved arvados.Collection
        c.metrics.apiCalls.Inc()
-       err := arv.Get("collections", targetID, nil, &collection)
+       err := arv.Get("collections", targetID, nil, &retrieved)
        if err != nil {
                return nil, err
        }
        exp := time.Now().Add(time.Duration(c.config.TTL))
-       c.permissions.Add(permKey, &cachedPermission{
-               expire: exp,
-       })
-       c.pdhs.Add(targetID, &cachedPDH{
-               expire: time.Now().Add(time.Duration(c.config.UUIDTTL)),
-               pdh:    collection.PortableDataHash,
-       })
-       c.collections.Add(arv.ApiToken+"\000"+collection.PortableDataHash, &cachedCollection{
+       if targetID != retrieved.PortableDataHash {
+               c.pdhs.Add(targetID, &cachedPDH{
+                       expire:  exp,
+                       refresh: time.Now().Add(time.Duration(c.config.UUIDTTL)),
+                       pdh:     retrieved.PortableDataHash,
+               })
+       }
+       c.collections.Add(arv.ApiToken+"\000"+retrieved.PortableDataHash, &cachedCollection{
                expire:     exp,
-               collection: collection,
+               collection: &retrieved,
        })
-       if int64(len(collection.ManifestText)) > c.config.MaxCollectionBytes/int64(c.config.MaxCollectionEntries) {
-               go c.pruneCollections()
+       if int64(len(retrieved.ManifestText)) > c.config.MaxCollectionBytes/int64(c.config.MaxCollectionEntries) {
+               select {
+               case c.chPruneCollections <- struct{}{}:
+               default:
+               }
        }
-       return collection, nil
+       return &retrieved, nil
 }
 
 // pruneCollections checks the total bytes occupied by manifest_text
index 0a0eef6e44ee36d7d8f302e38d47133285efab6d..80e00b02ad5c962ec7674b82449ee18cf6ddac63 100644 (file)
@@ -51,7 +51,6 @@ func (s *UnitSuite) TestCache(c *check.C) {
        s.checkCacheMetrics(c, cache.registry,
                "requests 5",
                "hits 4",
-               "permission_hits 4",
                "pdh_hits 4",
                "api_calls 1")
 
@@ -72,7 +71,6 @@ func (s *UnitSuite) TestCache(c *check.C) {
        s.checkCacheMetrics(c, cache.registry,
                "requests 6",
                "hits 4",
-               "permission_hits 4",
                "pdh_hits 4",
                "api_calls 2")
 
@@ -85,7 +83,6 @@ func (s *UnitSuite) TestCache(c *check.C) {
        s.checkCacheMetrics(c, cache.registry,
                "requests 7",
                "hits 5",
-               "permission_hits 5",
                "pdh_hits 4",
                "api_calls 2")
 
@@ -105,7 +102,6 @@ func (s *UnitSuite) TestCache(c *check.C) {
        s.checkCacheMetrics(c, cache.registry,
                "requests 27",
                "hits 23",
-               "permission_hits 23",
                "pdh_hits 22",
                "api_calls 4")
 }
@@ -125,9 +121,8 @@ func (s *UnitSuite) TestCacheForceReloadByPDH(c *check.C) {
        s.checkCacheMetrics(c, cache.registry,
                "requests 4",
                "hits 3",
-               "permission_hits 1",
                "pdh_hits 0",
-               "api_calls 3")
+               "api_calls 1")
 }
 
 func (s *UnitSuite) TestCacheForceReloadByUUID(c *check.C) {
@@ -145,7 +140,6 @@ func (s *UnitSuite) TestCacheForceReloadByUUID(c *check.C) {
        s.checkCacheMetrics(c, cache.registry,
                "requests 4",
                "hits 3",
-               "permission_hits 1",
                "pdh_hits 3",
                "api_calls 3")
 }
index e883e806ccf509fc87a73f592300619e61901fd3..a74b85b98da6d7657b85906cb3729d865b073d67 100644 (file)
@@ -1067,7 +1067,7 @@ func (s *IntegrationSuite) TestFileContentType(c *check.C) {
                contentType string
        }{
                {"picture.txt", "BMX bikes are small this year\n", "text/plain; charset=utf-8"},
-               {"picture.bmp", "BMX bikes are small this year\n", "image/x-ms-bmp"},
+               {"picture.bmp", "BMX bikes are small this year\n", "image/(x-ms-)?bmp"},
                {"picture.jpg", "BMX bikes are small this year\n", "image/jpeg"},
                {"picture1", "BMX bikes are small this year\n", "image/bmp"},            // content sniff; "BM" is the magic signature for .bmp
                {"picture2", "Cars are small this year\n", "text/plain; charset=utf-8"}, // content sniff
@@ -1103,7 +1103,7 @@ func (s *IntegrationSuite) TestFileContentType(c *check.C) {
                resp := httptest.NewRecorder()
                s.testServer.Handler.ServeHTTP(resp, req)
                c.Check(resp.Code, check.Equals, http.StatusOK)
-               c.Check(resp.Header().Get("Content-Type"), check.Equals, trial.contentType)
+               c.Check(resp.Header().Get("Content-Type"), check.Matches, trial.contentType)
                c.Check(resp.Body.String(), check.Equals, trial.content)
        }
 }
index 21d767208740dedd148051714410747e0f0a8da2..d592295209b3c0599147b672090eadc81dc36274 100644 (file)
@@ -393,7 +393,6 @@ func (s *IntegrationSuite) TestMetrics(c *check.C) {
        c.Check(counters["arvados_keepweb_collectioncache_api_calls//"].Value, check.Equals, int64(2))
        c.Check(counters["arvados_keepweb_collectioncache_hits//"].Value, check.Equals, int64(1))
        c.Check(counters["arvados_keepweb_collectioncache_pdh_hits//"].Value, check.Equals, int64(1))
-       c.Check(counters["arvados_keepweb_collectioncache_permission_hits//"].Value, check.Equals, int64(1))
        c.Check(gauges["arvados_keepweb_collectioncache_cached_manifests//"].Value, check.Equals, float64(1))
        // FooCollection's cached manifest size is 45 ("1f4b0....+45") plus one 51-byte blob signature
        c.Check(gauges["arvados_keepweb_sessions_cached_collection_bytes//"].Value, check.Equals, float64(45+51))