From: Tom Clegg
Date: Tue, 21 Jun 2022 19:31:44 +0000 (-0400)
Subject: Merge branch '19192-fix-deadlock'
X-Git-Tag: 2.5.0~134
X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/0bab5ef7b173a6df95045cfee6b397e2a2638ccc?hp=c1125ba20b2e89c26cf6a1554f5dd0f727fa02ff

Merge branch '19192-fix-deadlock'

refs #19192

Arvados-DCO-1.1-Signed-off-by: Tom Clegg
---

diff --git a/sdk/go/arvados/fs_base.go b/sdk/go/arvados/fs_base.go
index bebb74261e..ce9253ab3d 100644
--- a/sdk/go/arvados/fs_base.go
+++ b/sdk/go/arvados/fs_base.go
@@ -415,7 +415,7 @@ func (n *treenode) MemorySize() (size int64) {
 	for _, inode := range n.inodes {
 		size += inode.MemorySize()
 	}
-	return
+	return 64 + size
 }
 
 type fileSystem struct {
diff --git a/sdk/go/arvados/fs_collection.go b/sdk/go/arvados/fs_collection.go
index f4dae746e2..ccfbdc4da2 100644
--- a/sdk/go/arvados/fs_collection.go
+++ b/sdk/go/arvados/fs_collection.go
@@ -1159,15 +1159,17 @@ func (dn *dirnode) MemorySize() (size int64) {
 		case *dirnode:
 			size += node.MemorySize()
 		case *filenode:
+			size += 64
 			for _, seg := range node.segments {
 				switch seg := seg.(type) {
 				case *memSegment:
 					size += int64(seg.Len())
 				}
+				size += 64
 			}
 		}
 	}
-	return
+	return 64 + size
 }
 
 // caller must have write lock.
diff --git a/sdk/go/arvados/fs_collection_test.go b/sdk/go/arvados/fs_collection_test.go
index b221aaa083..c2cac3c6ce 100644
--- a/sdk/go/arvados/fs_collection_test.go
+++ b/sdk/go/arvados/fs_collection_test.go
@@ -1221,7 +1221,8 @@ func (s *CollectionFSSuite) TestFlushFullBlocksOnly(c *check.C) {
 			c.Assert(err, check.IsNil)
 		}
 	}
-	c.Check(fs.MemorySize(), check.Equals, int64(nDirs*67<<20))
+	inodebytes := int64((nDirs*(67*2+1) + 1) * 64)
+	c.Check(fs.MemorySize(), check.Equals, int64(nDirs*67<<20)+inodebytes)
 	c.Check(flushed, check.Equals, int64(0))
 
 	waitForFlush := func(expectUnflushed, expectFlushed int64) {
@@ -1232,27 +1233,27 @@ func (s *CollectionFSSuite) TestFlushFullBlocksOnly(c *check.C) {
 	}
 
 	// Nothing flushed yet
-	waitForFlush((nDirs*67)<<20, 0)
+	waitForFlush((nDirs*67)<<20+inodebytes, 0)
 
 	// Flushing a non-empty dir "/" is non-recursive and there are
 	// no top-level files, so this has no effect
 	fs.Flush("/", false)
-	waitForFlush((nDirs*67)<<20, 0)
+	waitForFlush((nDirs*67)<<20+inodebytes, 0)
 
 	// Flush the full block in dir0
 	fs.Flush("dir0", false)
-	waitForFlush((nDirs*67-64)<<20, 64<<20)
+	waitForFlush((nDirs*67-64)<<20+inodebytes, 64<<20)
 
 	err = fs.Flush("dir-does-not-exist", false)
 	c.Check(err, check.NotNil)
 
 	// Flush full blocks in all dirs
 	fs.Flush("", false)
-	waitForFlush(nDirs*3<<20, nDirs*64<<20)
+	waitForFlush(nDirs*3<<20+inodebytes, nDirs*64<<20)
 
 	// Flush non-full blocks, too
 	fs.Flush("", true)
-	waitForFlush(0, nDirs*67<<20)
+	waitForFlush(inodebytes, nDirs*67<<20)
 }
 
 // Even when writing lots of files/dirs from different goroutines, as
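A note on the arithmetic above, since the test expressions are terse: under the new accounting, every inode costs a flat 64 bytes (dirnodes via the new "return 64 + size", filenodes via the "size += 64" in dirnode.MemorySize), and each file segment adds another 64 bytes on top of any in-memory data. Reading TestFlushFullBlocksOnly's expectations as nDirs directories, each holding 67 single-segment files, plus the root directory, yields its inodebytes formula. A standalone sketch of that calculation (not part of the patch; the nDirs value here is illustrative only):

package main

import "fmt"

func main() {
	// Accounting from the patch: 64 bytes per inode, 64 bytes per
	// file segment, plus any in-memory segment data.
	nDirs := int64(4)               // illustrative; the real test picks its own value
	perFile := int64(64 + 64)       // one filenode + its single memSegment
	perDir := 67*perFile + 64       // 67 files per directory + the dirnode itself
	inodebytes := nDirs*perDir + 64 // + the root dirnode
	// Same value as the test's expression (nDirs*(67*2+1) + 1) * 64:
	fmt.Println(inodebytes == (nDirs*(67*2+1)+1)*64) // true
	// Expected MemorySize before any flush: 67 MiB of file data per
	// directory, plus the inode overhead.
	fmt.Println(nDirs*67<<20 + inodebytes)
}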
diff --git a/sdk/go/arvados/fs_lookup.go b/sdk/go/arvados/fs_lookup.go
index 471dc69c82..2bb09995e1 100644
--- a/sdk/go/arvados/fs_lookup.go
+++ b/sdk/go/arvados/fs_lookup.go
@@ -6,7 +6,6 @@ package arvados
 
 import (
 	"os"
-	"sync"
 	"time"
 )
 
@@ -21,9 +20,8 @@ type lookupnode struct {
 	stale func(time.Time) bool
 
 	// internal fields
-	staleLock sync.Mutex
-	staleAll  time.Time
-	staleOne  map[string]time.Time
+	staleAll time.Time
+	staleOne map[string]time.Time
 }
 
 // Sync flushes pending writes for loaded children and, if successful,
@@ -33,29 +31,28 @@ func (ln *lookupnode) Sync() error {
 	if err != nil {
 		return err
 	}
-	ln.staleLock.Lock()
+	ln.Lock()
 	ln.staleAll = time.Time{}
 	ln.staleOne = nil
-	ln.staleLock.Unlock()
+	ln.Unlock()
 	return nil
 }
 
 func (ln *lookupnode) Readdir() ([]os.FileInfo, error) {
-	ln.staleLock.Lock()
-	defer ln.staleLock.Unlock()
+	ln.Lock()
 	checkTime := time.Now()
 	if ln.stale(ln.staleAll) {
 		all, err := ln.loadAll(ln)
 		if err != nil {
+			ln.Unlock()
 			return nil, err
 		}
 		for _, child := range all {
-			ln.treenode.Lock()
 			_, err = ln.treenode.Child(child.FileInfo().Name(), func(inode) (inode, error) {
 				return child, nil
 			})
-			ln.treenode.Unlock()
 			if err != nil {
+				ln.Unlock()
 				return nil, err
 			}
 		}
@@ -65,6 +62,7 @@ func (ln *lookupnode) Readdir() ([]os.FileInfo, error) {
 		// newer than ln.staleAll. Reclaim memory.
 		ln.staleOne = nil
 	}
+	ln.Unlock()
 	return ln.treenode.Readdir()
 }
 
@@ -72,8 +70,6 @@ func (ln *lookupnode) Readdir() ([]os.FileInfo, error) {
 // children, instead calling loadOne when a non-existing child is
 // looked up.
 func (ln *lookupnode) Child(name string, replace func(inode) (inode, error)) (inode, error) {
-	ln.staleLock.Lock()
-	defer ln.staleLock.Unlock()
 	checkTime := time.Now()
 	var existing inode
 	var err error
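These fs_lookup.go hunks are the heart of the fix named by the branch: the dedicated staleLock is gone, and the staleness bookkeeping is now guarded by the lookupnode's own embedded lock. The old Readdir held staleLock while also taking ln.treenode's lock -- two locks that other code paths could acquire in the opposite order, the classic recipe for a deadlock. A minimal sketch of the single-lock shape the patch moves to, with invented names rather than the real Arvados types:

package fsexample

import (
	"sync"
	"time"
)

// node stands in for lookupnode: the staleness fields share the node's
// one lock instead of having a mutex of their own, so no code path can
// ever hold one of these locks while waiting for the other.
type node struct {
	sync.Mutex // also guards child bookkeeping, like treenode's lock
	staleAll   time.Time
	staleOne   map[string]time.Time
}

// markFresh mirrors the patched Sync(): one lock around the staleness
// reset, leaving no lock ordering to get wrong.
func (n *node) markFresh() {
	n.Lock()
	n.staleAll = time.Time{}
	n.staleOne = nil
	n.Unlock()
}

Note also why the patched Readdir unlocks explicitly before every return instead of using defer: it finishes by delegating to ln.treenode.Readdir(), which acquires the same embedded lock itself, so holding the lock across that call would trade one deadlock for another.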
diff --git a/sdk/go/arvados/fs_site_test.go b/sdk/go/arvados/fs_site_test.go
index bf24efa7ed..3abe2b457f 100644
--- a/sdk/go/arvados/fs_site_test.go
+++ b/sdk/go/arvados/fs_site_test.go
@@ -11,6 +11,7 @@ import (
 	"net/http"
 	"os"
 	"strings"
+	"sync"
 	"syscall"
 	"time"
 
@@ -372,3 +373,117 @@ func (s *SiteFSSuite) TestSnapshotSplice(c *check.C) {
 		c.Check(string(buf), check.Equals, string(thisfile))
 	}
 }
+
+func (s *SiteFSSuite) TestLocks(c *check.C) {
+	DebugLocksPanicMode = false
+	done := make(chan struct{})
+	defer close(done)
+	ticker := time.NewTicker(2 * time.Second)
+	go func() {
+		for {
+			timeout := time.AfterFunc(5*time.Second, func() {
+				// c.FailNow() doesn't break deadlock, but this sure does
+				panic("timed out -- deadlock?")
+			})
+			select {
+			case <-done:
+				timeout.Stop()
+				return
+			case <-ticker.C:
+				c.Logf("MemorySize == %d", s.fs.MemorySize())
+			}
+			timeout.Stop()
+		}
+	}()
+	ncolls := 5
+	ndirs := 3
+	nfiles := 5
+	projects := make([]Group, 5)
+	for pnum := range projects {
+		c.Logf("make project %d", pnum)
+		err := s.client.RequestAndDecode(&projects[pnum], "POST", "arvados/v1/groups", nil, map[string]interface{}{
+			"group": map[string]string{
+				"name":        fmt.Sprintf("TestLocks project %d", pnum),
+				"owner_uuid":  fixtureAProjectUUID,
+				"group_class": "project",
+			},
+			"ensure_unique_name": true,
+		})
+		c.Assert(err, check.IsNil)
+		for cnum := 0; cnum < ncolls; cnum++ {
+			c.Logf("make project %d collection %d", pnum, cnum)
+			var coll Collection
+			err = s.client.RequestAndDecode(&coll, "POST", "arvados/v1/collections", nil, map[string]interface{}{
+				"collection": map[string]string{
+					"name":       fmt.Sprintf("TestLocks collection %d", cnum),
+					"owner_uuid": projects[pnum].UUID,
+				},
+			})
+			c.Assert(err, check.IsNil)
+			for d1num := 0; d1num < ndirs; d1num++ {
+				s.fs.Mkdir(fmt.Sprintf("/by_id/%s/dir1-%d", coll.UUID, d1num), 0777)
+				for d2num := 0; d2num < ndirs; d2num++ {
+					s.fs.Mkdir(fmt.Sprintf("/by_id/%s/dir1-%d/dir2-%d", coll.UUID, d1num, d2num), 0777)
+					for fnum := 0; fnum < nfiles; fnum++ {
+						f, err := s.fs.OpenFile(fmt.Sprintf("/by_id/%s/dir1-%d/dir2-%d/file-%d", coll.UUID, d1num, d2num, fnum), os.O_CREATE|os.O_RDWR, 0755)
+						c.Assert(err, check.IsNil)
+						f.Close()
+						f, err = s.fs.OpenFile(fmt.Sprintf("/by_id/%s/dir1-%d/file-%d", coll.UUID, d1num, fnum), os.O_CREATE|os.O_RDWR, 0755)
+						c.Assert(err, check.IsNil)
+						f.Close()
+					}
+				}
+			}
+		}
+	}
+	c.Log("sync")
+	s.fs.Sync()
+	var wg sync.WaitGroup
+	for n := 0; n < 100; n++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for pnum, project := range projects {
+				c.Logf("read project %d", pnum)
+				if pnum%2 == 0 {
+					f, err := s.fs.Open(fmt.Sprintf("/by_id/%s", project.UUID))
+					c.Assert(err, check.IsNil)
+					f.Readdir(-1)
+					f.Close()
+				}
+				for cnum := 0; cnum < ncolls; cnum++ {
+					c.Logf("read project %d collection %d", pnum, cnum)
+					if pnum%2 == 0 {
+						f, err := s.fs.Open(fmt.Sprintf("/by_id/%s/TestLocks collection %d", project.UUID, cnum))
+						c.Assert(err, check.IsNil)
+						_, err = f.Readdir(-1)
+						c.Assert(err, check.IsNil)
+						f.Close()
+					}
+					if pnum%3 == 0 {
+						for d1num := 0; d1num < ndirs; d1num++ {
+							f, err := s.fs.Open(fmt.Sprintf("/by_id/%s/TestLocks collection %d/dir1-%d", project.UUID, cnum, d1num))
+							c.Assert(err, check.IsNil)
+							fis, err := f.Readdir(-1)
+							c.Assert(err, check.IsNil)
+							c.Assert(fis, check.HasLen, ndirs+nfiles)
+							f.Close()
+						}
+					}
+					for d1num := 0; d1num < ndirs; d1num++ {
+						for d2num := 0; d2num < ndirs; d2num++ {
+							f, err := s.fs.Open(fmt.Sprintf("/by_id/%s/TestLocks collection %d/dir1-%d/dir2-%d", project.UUID, cnum, d1num, d2num))
+							c.Assert(err, check.IsNil)
+							fis, err := f.Readdir(-1)
+							c.Assert(err, check.IsNil)
+							c.Assert(fis, check.HasLen, nfiles)
+							f.Close()
+						}
+					}
+				}
+			}
+		}()
+	}
+	wg.Wait()
+	c.Logf("MemorySize == %d", s.fs.MemorySize())
+}
diff --git a/services/keep-web/server_test.go b/services/keep-web/server_test.go
index dd8ce06172..61c540808b 100644
--- a/services/keep-web/server_test.go
+++ b/services/keep-web/server_test.go
@@ -480,8 +480,10 @@ func (s *IntegrationSuite) TestMetrics(c *check.C) {
 	c.Check(counters["arvados_keepweb_collectioncache_hits//"].Value, check.Equals, int64(1))
 	c.Check(counters["arvados_keepweb_collectioncache_pdh_hits//"].Value, check.Equals, int64(1))
 	c.Check(gauges["arvados_keepweb_collectioncache_cached_manifests//"].Value, check.Equals, float64(1))
-	// FooCollection's cached manifest size is 45 ("1f4b0....+45") plus one 51-byte blob signature
-	c.Check(gauges["arvados_keepweb_sessions_cached_collection_bytes//"].Value, check.Equals, float64(45+51))
+	// FooCollection's cached manifest size is 45 ("1f4b0....+45")
+	// plus one 51-byte blob signature; session fs counts 3 inodes
+	// * 64 bytes.
+	c.Check(gauges["arvados_keepweb_sessions_cached_collection_bytes//"].Value, check.Equals, float64(45+51+64*3))
 
 	// If the Host header indicates a collection, /metrics.json
 	// refers to a file in the collection -- the metrics handler
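One technique in TestLocks above deserves a call-out: a deadlocked test cannot be stopped by c.FailNow(), so the test runs a watchdog goroutine that periodically probes the filesystem locks via s.fs.MemorySize() and panics if the probe hangs -- the panic is what actually breaks the deadlock and dumps every goroutine's stack. The same pattern, condensed into a reusable helper (the watchdog name and probe callback are invented; the logic is the test's):

package example

import "time"

// watchdog periodically runs probe -- a call that takes the locks
// under test, like s.fs.MemorySize() in TestLocks -- and panics if an
// iteration stalls, since only a panic can break a deadlocked test.
func watchdog(done <-chan struct{}, probe func()) {
	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()
	for {
		// Re-armed every iteration: fires only if the select or
		// the probe takes longer than 5 seconds.
		timeout := time.AfterFunc(5*time.Second, func() {
			panic("timed out -- deadlock?")
		})
		select {
		case <-done:
			timeout.Stop()
			return
		case <-ticker.C:
			probe()
		}
		timeout.Stop()
	}
}

A test would start it with done := make(chan struct{}); defer close(done); go watchdog(done, func() { c.Logf("MemorySize == %d", s.fs.MemorySize()) }), which is exactly what TestLocks does inline.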