15942: Fix deadlock caused by unclosed "done" channel.
author Tom Clegg <tom@tomclegg.ca>
Sun, 22 Dec 2019 00:20:45 +0000 (19:20 -0500)
committer Tom Clegg <tom@tomclegg.ca>
Sun, 22 Dec 2019 00:20:45 +0000 (19:20 -0500)
Also, stop relying on the flushing goroutine to set flushing=nil when
finished; closing the channel is now enough to indicate work is done.

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>
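
As a minimal standalone sketch of the new pattern (illustrative names only, not the committed code): the only signal a flusher leaves behind is its "done" channel, and readers test it with a non-blocking select instead of expecting the flushing goroutine to reset the field to nil.

    package main

    import "fmt"

    // flusher models a memSegment whose flush progress is tracked only by
    // its "done" channel: nil or closed means no flush is in progress.
    type flusher struct {
            flushing <-chan struct{}
    }

    // unfinished reports, without blocking, whether a previously started
    // flush is still running. A closed channel counts as finished.
    func (f *flusher) unfinished() bool {
            if f.flushing == nil {
                    return false
            }
            select {
            case <-f.flushing:
                    f.flushing = nil // closed: the flush has finished
                    return false
            default:
                    return true // still open: flush in progress
            }
    }

    func main() {
            done := make(chan struct{})
            f := &flusher{flushing: done}
            fmt.Println(f.unfinished()) // true: done not closed yet

            // Whoever starts a flush must close done on every exit path,
            // including early returns -- otherwise unfinished() reports
            // true forever, later flushes skip the segment, and that is
            // the deadlock this commit fixes.
            close(done)
            fmt.Println(f.unfinished()) // false
    }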

sdk/go/arvados/fs_collection.go
sdk/go/arvados/fs_collection_test.go

diff --git a/sdk/go/arvados/fs_collection.go b/sdk/go/arvados/fs_collection.go
index d0e97f2ad184b44e7800a577247704e8af717581..37bd494914df507dc9fc193576dc9e0afcc98ea9 100644
@@ -568,7 +568,6 @@ func (fn *filenode) pruneMemSegments() {
                                // A new seg.buf has been allocated.
                                return
                        }
-                       seg.flushing = nil
                        if err != nil {
                                // TODO: stall (or return errors from)
                                // subsequent writes until flushing
@@ -671,9 +670,10 @@ func (dn *dirnode) commitBlock(ctx context.Context, refs []fnSegmentRef, bufsize
        offsets := make([]int, 0, len(refs)) // location of segment's data within block
        for _, ref := range refs {
                seg := ref.fn.segments[ref.idx].(*memSegment)
-               if seg.flushing != nil && !sync {
+               if !sync && seg.flushingUnfinished() {
                        // Let the other flushing goroutine finish. If
                        // it fails, we'll try again next time.
+                       close(done)
                        return nil
                } else {
                        // In sync mode, we proceed regardless of
@@ -700,15 +700,6 @@ func (dn *dirnode) commitBlock(ctx context.Context, refs []fnSegmentRef, bufsize
                defer close(errs)
                locator, _, err := dn.fs.PutB(block)
                dn.fs.throttle().Release()
-               {
-                       defer func() {
-                               for _, seg := range segs {
-                                       if seg.flushing == done {
-                                               seg.flushing = nil
-                                       }
-                               }
-                       }()
-               }
                if err != nil {
                        errs <- err
                        return
@@ -1198,13 +1189,26 @@ type segment interface {
 
 type memSegment struct {
        buf []byte
-       // If flushing is not nil, then a) buf is being shared by a
-       // pruneMemSegments goroutine, and must be copied on write;
-       // and b) the flushing channel will close when the goroutine
-       // finishes, whether it succeeds or not.
+       // If flushing is not nil and not ready/closed, then a) buf is
+       // being shared by a pruneMemSegments goroutine, and must be
+       // copied on write; and b) the flushing channel will close
+       // when the goroutine finishes, whether it succeeds or not.
        flushing <-chan struct{}
 }
 
+func (me *memSegment) flushingUnfinished() bool {
+       if me.flushing == nil {
+               return false
+       }
+       select {
+       case <-me.flushing:
+               me.flushing = nil
+               return false
+       default:
+               return true
+       }
+}
+
 func (me *memSegment) Len() int {
        return len(me.buf)
 }
diff --git a/sdk/go/arvados/fs_collection_test.go b/sdk/go/arvados/fs_collection_test.go
index 49fdc397daee610c4a98d1ab8753c716f22a8cba..f01369a885ece3b7c315c832b114caaf77715862 100644
@@ -17,6 +17,7 @@ import (
        "os"
        "regexp"
        "runtime"
+       "runtime/pprof"
        "strings"
        "sync"
        "sync/atomic"
@@ -1280,6 +1281,47 @@ func (s *CollectionFSSuite) TestMaxUnflushed(c *check.C) {
        fs.Flush("", true)
 }
 
+func (s *CollectionFSSuite) TestFlushStress(c *check.C) {
+       done := false
+       defer func() { done = true }()
+       time.AfterFunc(10*time.Second, func() {
+               if !done {
+                       pprof.Lookup("goroutine").WriteTo(os.Stderr, 1)
+                       panic("timeout")
+               }
+       })
+
+       wrote := 0
+       s.kc.onPut = func(p []byte) {
+               s.kc.Lock()
+               s.kc.blocks = map[string][]byte{}
+               wrote++
+               defer c.Logf("wrote block %d, %d bytes", wrote, len(p))
+               s.kc.Unlock()
+               time.Sleep(20 * time.Millisecond)
+       }
+
+       fs, err := (&Collection{}).FileSystem(s.client, s.kc)
+       c.Assert(err, check.IsNil)
+
+       data := make([]byte, 1<<20)
+       for i := 0; i < 3; i++ {
+               dir := fmt.Sprintf("dir%d", i)
+               fs.Mkdir(dir, 0755)
+               for j := 0; j < 200; j++ {
+                       data[0] = byte(j)
+                       f, err := fs.OpenFile(fmt.Sprintf("%s/file%d", dir, j), os.O_WRONLY|os.O_CREATE, 0)
+                       c.Assert(err, check.IsNil)
+                       _, err = f.Write(data)
+                       c.Assert(err, check.IsNil)
+                       f.Close()
+                       fs.Flush(dir, false)
+               }
+               _, err := fs.MarshalManifest(".")
+               c.Check(err, check.IsNil)
+       }
+}
+
 func (s *CollectionFSSuite) TestFlushShort(c *check.C) {
        s.kc.onPut = func([]byte) {
                s.kc.Lock()
@@ -1301,6 +1343,9 @@ func (s *CollectionFSSuite) TestFlushShort(c *check.C) {
                        f.Close()
                        fs.Flush(dir, false)
                }
+               fs.Flush(dir, true)
+               _, err := fs.MarshalManifest(".")
+               c.Check(err, check.IsNil)
        }
 }