From 659f946d3be9bedd1765b08c15d1de06e5070927 Mon Sep 17 00:00:00 2001
From: Tom Clegg
Date: Mon, 20 Jun 2022 21:02:06 -0400
Subject: [PATCH] 19192: Fix deadlock in lookupnode.

Readdir() was locking treenode after staleLock. Child() was locking
staleLock after treenode.

Arvados-DCO-1.1-Signed-off-by: Tom Clegg
---
 sdk/go/arvados/fs_lookup.go    |  20 +++---
 sdk/go/arvados/fs_site_test.go | 115 +++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 12 deletions(-)

diff --git a/sdk/go/arvados/fs_lookup.go b/sdk/go/arvados/fs_lookup.go
index 471dc69c82..2bb09995e1 100644
--- a/sdk/go/arvados/fs_lookup.go
+++ b/sdk/go/arvados/fs_lookup.go
@@ -6,7 +6,6 @@ package arvados
 
 import (
 	"os"
-	"sync"
 	"time"
 )
 
@@ -21,9 +20,8 @@ type lookupnode struct {
 	stale   func(time.Time) bool
 
 	// internal fields
-	staleLock sync.Mutex
-	staleAll  time.Time
-	staleOne  map[string]time.Time
+	staleAll time.Time
+	staleOne map[string]time.Time
 }
 
 // Sync flushes pending writes for loaded children and, if successful,
@@ -33,29 +31,28 @@ func (ln *lookupnode) Sync() error {
 	if err != nil {
 		return err
 	}
-	ln.staleLock.Lock()
+	ln.Lock()
 	ln.staleAll = time.Time{}
 	ln.staleOne = nil
-	ln.staleLock.Unlock()
+	ln.Unlock()
 	return nil
 }
 
 func (ln *lookupnode) Readdir() ([]os.FileInfo, error) {
-	ln.staleLock.Lock()
-	defer ln.staleLock.Unlock()
+	ln.Lock()
 	checkTime := time.Now()
 	if ln.stale(ln.staleAll) {
 		all, err := ln.loadAll(ln)
 		if err != nil {
+			ln.Unlock()
 			return nil, err
 		}
 		for _, child := range all {
-			ln.treenode.Lock()
 			_, err = ln.treenode.Child(child.FileInfo().Name(), func(inode) (inode, error) {
 				return child, nil
 			})
-			ln.treenode.Unlock()
 			if err != nil {
+				ln.Unlock()
 				return nil, err
 			}
 		}
@@ -65,6 +62,7 @@ func (ln *lookupnode) Readdir() ([]os.FileInfo, error) {
 		// newer than ln.staleAll. Reclaim memory.
 		ln.staleOne = nil
 	}
+	ln.Unlock()
 	return ln.treenode.Readdir()
 }
 
@@ -72,8 +70,6 @@ func (ln *lookupnode) Readdir() ([]os.FileInfo, error) {
 // children, instead calling loadOne when a non-existing child is
 // looked up.
 func (ln *lookupnode) Child(name string, replace func(inode) (inode, error)) (inode, error) {
-	ln.staleLock.Lock()
-	defer ln.staleLock.Unlock()
 	checkTime := time.Now()
 	var existing inode
 	var err error
diff --git a/sdk/go/arvados/fs_site_test.go b/sdk/go/arvados/fs_site_test.go
index bf24efa7ed..3abe2b457f 100644
--- a/sdk/go/arvados/fs_site_test.go
+++ b/sdk/go/arvados/fs_site_test.go
@@ -11,6 +11,7 @@ import (
 	"net/http"
 	"os"
 	"strings"
+	"sync"
 	"syscall"
 	"time"
 
@@ -372,3 +373,117 @@ func (s *SiteFSSuite) TestSnapshotSplice(c *check.C) {
 		c.Check(string(buf), check.Equals, string(thisfile))
 	}
 }
+
+func (s *SiteFSSuite) TestLocks(c *check.C) {
+	DebugLocksPanicMode = false
+	done := make(chan struct{})
+	defer close(done)
+	ticker := time.NewTicker(2 * time.Second)
+	go func() {
+		for {
+			timeout := time.AfterFunc(5*time.Second, func() {
+				// c.FailNow() doesn't break deadlock, but this sure does
+				panic("timed out -- deadlock?")
+			})
+			select {
+			case <-done:
+				timeout.Stop()
+				return
+			case <-ticker.C:
+				c.Logf("MemorySize == %d", s.fs.MemorySize())
+			}
+			timeout.Stop()
+		}
+	}()
+	ncolls := 5
+	ndirs := 3
+	nfiles := 5
+	projects := make([]Group, 5)
+	for pnum := range projects {
+		c.Logf("make project %d", pnum)
+		err := s.client.RequestAndDecode(&projects[pnum], "POST", "arvados/v1/groups", nil, map[string]interface{}{
+			"group": map[string]string{
+				"name":        fmt.Sprintf("TestLocks project %d", pnum),
+				"owner_uuid":  fixtureAProjectUUID,
+				"group_class": "project",
+			},
+			"ensure_unique_name": true,
+		})
+		c.Assert(err, check.IsNil)
+		for cnum := 0; cnum < ncolls; cnum++ {
+			c.Logf("make project %d collection %d", pnum, cnum)
+			var coll Collection
+			err = s.client.RequestAndDecode(&coll, "POST", "arvados/v1/collections", nil, map[string]interface{}{
+				"collection": map[string]string{
+					"name":       fmt.Sprintf("TestLocks collection %d", cnum),
+					"owner_uuid": projects[pnum].UUID,
+				},
+			})
+			c.Assert(err, check.IsNil)
+			for d1num := 0; d1num < ndirs; d1num++ {
+				s.fs.Mkdir(fmt.Sprintf("/by_id/%s/dir1-%d", coll.UUID, d1num), 0777)
+				for d2num := 0; d2num < ndirs; d2num++ {
+					s.fs.Mkdir(fmt.Sprintf("/by_id/%s/dir1-%d/dir2-%d", coll.UUID, d1num, d2num), 0777)
+					for fnum := 0; fnum < nfiles; fnum++ {
+						f, err := s.fs.OpenFile(fmt.Sprintf("/by_id/%s/dir1-%d/dir2-%d/file-%d", coll.UUID, d1num, d2num, fnum), os.O_CREATE|os.O_RDWR, 0755)
+						c.Assert(err, check.IsNil)
+						f.Close()
+						f, err = s.fs.OpenFile(fmt.Sprintf("/by_id/%s/dir1-%d/file-%d", coll.UUID, d1num, fnum), os.O_CREATE|os.O_RDWR, 0755)
+						c.Assert(err, check.IsNil)
+						f.Close()
+					}
+				}
+			}
+		}
+	}
+	c.Log("sync")
+	s.fs.Sync()
+	var wg sync.WaitGroup
+	for n := 0; n < 100; n++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for pnum, project := range projects {
+				c.Logf("read project %d", pnum)
+				if pnum%2 == 0 {
+					f, err := s.fs.Open(fmt.Sprintf("/by_id/%s", project.UUID))
+					c.Assert(err, check.IsNil)
+					f.Readdir(-1)
+					f.Close()
+				}
+				for cnum := 0; cnum < ncolls; cnum++ {
+					c.Logf("read project %d collection %d", pnum, cnum)
+					if pnum%2 == 0 {
+						f, err := s.fs.Open(fmt.Sprintf("/by_id/%s/TestLocks collection %d", project.UUID, cnum))
+						c.Assert(err, check.IsNil)
+						_, err = f.Readdir(-1)
+						c.Assert(err, check.IsNil)
+						f.Close()
+					}
+					if pnum%3 == 0 {
+						for d1num := 0; d1num < ndirs; d1num++ {
+							f, err := s.fs.Open(fmt.Sprintf("/by_id/%s/TestLocks collection %d/dir1-%d", project.UUID, cnum, d1num))
+							c.Assert(err, check.IsNil)
+							fis, err := f.Readdir(-1)
+							c.Assert(err, check.IsNil)
+							c.Assert(fis, check.HasLen, ndirs+nfiles)
+							f.Close()
+						}
+					}
+					for d1num := 0; d1num < ndirs; d1num++ {
+						for d2num := 0; d2num < ndirs; d2num++ {
+							f, err := s.fs.Open(fmt.Sprintf("/by_id/%s/TestLocks collection %d/dir1-%d/dir2-%d", project.UUID, cnum, d1num, d2num))
+							c.Assert(err, check.IsNil)
+							fis, err := f.Readdir(-1)
+							c.Assert(err, check.IsNil)
+							c.Assert(fis, check.HasLen, nfiles)
+							f.Close()
+						}
+					}
+				}
+			}
+		}()
+	}
+	wg.Wait()
+	c.Logf("MemorySize == %d", s.fs.MemorySize())
+}
-- 
2.30.2
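
Background note (not part of the patch): the commit message describes a classic lock-order inversion. Before this change, Readdir() acquired staleLock and then the embedded treenode's lock, while Child() took staleLock after the treenode lock; two goroutines doing this concurrently can each end up waiting on the lock the other holds. The stand-alone Go sketch below is illustrative only -- the node type and the readdir/child/staleLock/treeLock names are hypothetical stand-ins, not Arvados code.

// Illustrative sketch of the lock-order inversion; names are made up.
package main

import (
	"sync"
	"time"
)

type node struct {
	staleLock sync.Mutex // stands in for the old lookupnode.staleLock
	treeLock  sync.Mutex // stands in for the embedded treenode's lock
}

// readdir models the old Readdir() ordering: staleLock, then treeLock.
func (n *node) readdir() {
	n.staleLock.Lock()
	defer n.staleLock.Unlock()
	time.Sleep(time.Millisecond) // widen the race window for the demo
	n.treeLock.Lock()            // blocks while child() holds treeLock
	defer n.treeLock.Unlock()
}

// child models the old Child() ordering: treeLock, then staleLock.
func (n *node) child() {
	n.treeLock.Lock()
	defer n.treeLock.Unlock()
	time.Sleep(time.Millisecond)
	n.staleLock.Lock() // blocks while readdir() holds staleLock: deadlock
	defer n.staleLock.Unlock()
}

func main() {
	n := &node{}
	var wg sync.WaitGroup
	wg.Add(2)
	go func() { defer wg.Done(); n.readdir() }()
	go func() { defer wg.Done(); n.child() }()
	wg.Wait() // never completes; the runtime reports "all goroutines are asleep - deadlock!"
}

The patch removes the ordering question rather than reordering the acquisitions: staleLock is deleted and the stale-tracking fields are guarded by the lookupnode's own embedded treenode lock (ln.Lock()/ln.Unlock()), so there is a single lock and no order to get wrong. The new TestLocks test exercises this by running many concurrent Readdir() calls across projects, collections, and directories, with a watchdog timer that panics if the filesystem stops making progress.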