19192: Fix deadlock in lookupnode.
authorTom Clegg <tom@curii.com>
Tue, 21 Jun 2022 01:02:06 +0000 (21:02 -0400)
committerTom Clegg <tom@curii.com>
Tue, 21 Jun 2022 01:15:24 +0000 (21:15 -0400)
Readdir() was locking treenode after staleLock.

Child() was locking staleLock after treenode.

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

sdk/go/arvados/fs_lookup.go
sdk/go/arvados/fs_site_test.go

index 471dc69c82f75318d290eaccfa7b8d4a542540f1..2bb09995e16e476b3889abbdc1c61b6fc1abbd15 100644 (file)
@@ -6,7 +6,6 @@ package arvados
 
 import (
        "os"
-       "sync"
        "time"
 )
 
@@ -21,9 +20,8 @@ type lookupnode struct {
        stale   func(time.Time) bool
 
        // internal fields
-       staleLock sync.Mutex
-       staleAll  time.Time
-       staleOne  map[string]time.Time
+       staleAll time.Time
+       staleOne map[string]time.Time
 }
 
 // Sync flushes pending writes for loaded children and, if successful,
@@ -33,29 +31,28 @@ func (ln *lookupnode) Sync() error {
        if err != nil {
                return err
        }
-       ln.staleLock.Lock()
+       ln.Lock()
        ln.staleAll = time.Time{}
        ln.staleOne = nil
-       ln.staleLock.Unlock()
+       ln.Unlock()
        return nil
 }
 
 func (ln *lookupnode) Readdir() ([]os.FileInfo, error) {
-       ln.staleLock.Lock()
-       defer ln.staleLock.Unlock()
+       ln.Lock()
        checkTime := time.Now()
        if ln.stale(ln.staleAll) {
                all, err := ln.loadAll(ln)
                if err != nil {
+                       ln.Unlock()
                        return nil, err
                }
                for _, child := range all {
-                       ln.treenode.Lock()
                        _, err = ln.treenode.Child(child.FileInfo().Name(), func(inode) (inode, error) {
                                return child, nil
                        })
-                       ln.treenode.Unlock()
                        if err != nil {
+                               ln.Unlock()
                                return nil, err
                        }
                }
@@ -65,6 +62,7 @@ func (ln *lookupnode) Readdir() ([]os.FileInfo, error) {
                // newer than ln.staleAll. Reclaim memory.
                ln.staleOne = nil
        }
+       ln.Unlock()
        return ln.treenode.Readdir()
 }
 
@@ -72,8 +70,6 @@ func (ln *lookupnode) Readdir() ([]os.FileInfo, error) {
 // children, instead calling loadOne when a non-existing child is
 // looked up.
 func (ln *lookupnode) Child(name string, replace func(inode) (inode, error)) (inode, error) {
-       ln.staleLock.Lock()
-       defer ln.staleLock.Unlock()
        checkTime := time.Now()
        var existing inode
        var err error
index bf24efa7ed055f78fe2db76c56f43794cf0d0e0b..3abe2b457f702b510a46f97ccf528f35f337eb92 100644 (file)
@@ -11,6 +11,7 @@ import (
        "net/http"
        "os"
        "strings"
+       "sync"
        "syscall"
        "time"
 
@@ -372,3 +373,117 @@ func (s *SiteFSSuite) TestSnapshotSplice(c *check.C) {
                c.Check(string(buf), check.Equals, string(thisfile))
        }
 }
+
+func (s *SiteFSSuite) TestLocks(c *check.C) {
+       DebugLocksPanicMode = false
+       done := make(chan struct{})
+       defer close(done)
+       ticker := time.NewTicker(2 * time.Second)
+       go func() {
+               for {
+                       timeout := time.AfterFunc(5*time.Second, func() {
+                               // c.FailNow() doesn't break deadlock, but this sure does
+                               panic("timed out -- deadlock?")
+                       })
+                       select {
+                       case <-done:
+                               timeout.Stop()
+                               return
+                       case <-ticker.C:
+                               c.Logf("MemorySize == %d", s.fs.MemorySize())
+                       }
+                       timeout.Stop()
+               }
+       }()
+       ncolls := 5
+       ndirs := 3
+       nfiles := 5
+       projects := make([]Group, 5)
+       for pnum := range projects {
+               c.Logf("make project %d", pnum)
+               err := s.client.RequestAndDecode(&projects[pnum], "POST", "arvados/v1/groups", nil, map[string]interface{}{
+                       "group": map[string]string{
+                               "name":        fmt.Sprintf("TestLocks project %d", pnum),
+                               "owner_uuid":  fixtureAProjectUUID,
+                               "group_class": "project",
+                       },
+                       "ensure_unique_name": true,
+               })
+               c.Assert(err, check.IsNil)
+               for cnum := 0; cnum < ncolls; cnum++ {
+                       c.Logf("make project %d collection %d", pnum, cnum)
+                       var coll Collection
+                       err = s.client.RequestAndDecode(&coll, "POST", "arvados/v1/collections", nil, map[string]interface{}{
+                               "collection": map[string]string{
+                                       "name":       fmt.Sprintf("TestLocks collection %d", cnum),
+                                       "owner_uuid": projects[pnum].UUID,
+                               },
+                       })
+                       c.Assert(err, check.IsNil)
+                       for d1num := 0; d1num < ndirs; d1num++ {
+                               s.fs.Mkdir(fmt.Sprintf("/by_id/%s/dir1-%d", coll.UUID, d1num), 0777)
+                               for d2num := 0; d2num < ndirs; d2num++ {
+                                       s.fs.Mkdir(fmt.Sprintf("/by_id/%s/dir1-%d/dir2-%d", coll.UUID, d1num, d2num), 0777)
+                                       for fnum := 0; fnum < nfiles; fnum++ {
+                                               f, err := s.fs.OpenFile(fmt.Sprintf("/by_id/%s/dir1-%d/dir2-%d/file-%d", coll.UUID, d1num, d2num, fnum), os.O_CREATE|os.O_RDWR, 0755)
+                                               c.Assert(err, check.IsNil)
+                                               f.Close()
+                                               f, err = s.fs.OpenFile(fmt.Sprintf("/by_id/%s/dir1-%d/file-%d", coll.UUID, d1num, fnum), os.O_CREATE|os.O_RDWR, 0755)
+                                               c.Assert(err, check.IsNil)
+                                               f.Close()
+                                       }
+                               }
+                       }
+               }
+       }
+       c.Log("sync")
+       s.fs.Sync()
+       var wg sync.WaitGroup
+       for n := 0; n < 100; n++ {
+               wg.Add(1)
+               go func() {
+                       defer wg.Done()
+                       for pnum, project := range projects {
+                               c.Logf("read project %d", pnum)
+                               if pnum%2 == 0 {
+                                       f, err := s.fs.Open(fmt.Sprintf("/by_id/%s", project.UUID))
+                                       c.Assert(err, check.IsNil)
+                                       f.Readdir(-1)
+                                       f.Close()
+                               }
+                               for cnum := 0; cnum < ncolls; cnum++ {
+                                       c.Logf("read project %d collection %d", pnum, cnum)
+                                       if pnum%2 == 0 {
+                                               f, err := s.fs.Open(fmt.Sprintf("/by_id/%s/TestLocks collection %d", project.UUID, cnum))
+                                               c.Assert(err, check.IsNil)
+                                               _, err = f.Readdir(-1)
+                                               c.Assert(err, check.IsNil)
+                                               f.Close()
+                                       }
+                                       if pnum%3 == 0 {
+                                               for d1num := 0; d1num < ndirs; d1num++ {
+                                                       f, err := s.fs.Open(fmt.Sprintf("/by_id/%s/TestLocks collection %d/dir1-%d", project.UUID, cnum, d1num))
+                                                       c.Assert(err, check.IsNil)
+                                                       fis, err := f.Readdir(-1)
+                                                       c.Assert(err, check.IsNil)
+                                                       c.Assert(fis, check.HasLen, ndirs+nfiles)
+                                                       f.Close()
+                                               }
+                                       }
+                                       for d1num := 0; d1num < ndirs; d1num++ {
+                                               for d2num := 0; d2num < ndirs; d2num++ {
+                                                       f, err := s.fs.Open(fmt.Sprintf("/by_id/%s/TestLocks collection %d/dir1-%d/dir2-%d", project.UUID, cnum, d1num, d2num))
+                                                       c.Assert(err, check.IsNil)
+                                                       fis, err := f.Readdir(-1)
+                                                       c.Assert(err, check.IsNil)
+                                                       c.Assert(fis, check.HasLen, nfiles)
+                                                       f.Close()
+                                               }
+                                       }
+                               }
+                       }
+               }()
+       }
+       wg.Wait()
+       c.Logf("MemorySize == %d", s.fs.MemorySize())
+}