X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/18c8fa2337a2db04ee6060184885731f4b5c7d7b..c502c5a50aae825683ee4cff629c6839a4209501:/sdk/go/arvados/fs_base.go?ds=sidebyside diff --git a/sdk/go/arvados/fs_base.go b/sdk/go/arvados/fs_base.go index 8d987d4cac..0cde825b38 100644 --- a/sdk/go/arvados/fs_base.go +++ b/sdk/go/arvados/fs_base.go @@ -8,6 +8,7 @@ import ( "errors" "fmt" "io" + "io/fs" "log" "net/http" "os" @@ -27,9 +28,44 @@ var ( ErrWriteOnlyMode = errors.New("file is O_WRONLY") ErrSyncNotSupported = errors.New("O_SYNC flag is not supported") ErrIsDirectory = errors.New("cannot rename file to overwrite existing directory") + ErrNotADirectory = errors.New("not a directory") ErrPermission = os.ErrPermission + DebugLocksPanicMode = false ) +type syncer interface { + Sync() error +} + +func debugPanicIfNotLocked(l sync.Locker, writing bool) { + if !DebugLocksPanicMode { + return + } + race := false + if rl, ok := l.(interface { + RLock() + RUnlock() + }); ok && writing { + go func() { + // Fail if we can grab the read lock during an + // operation that purportedly has write lock. + rl.RLock() + race = true + rl.RUnlock() + }() + } else { + go func() { + l.Lock() + race = true + l.Unlock() + }() + } + time.Sleep(100) + if race { + panic("bug: caller-must-have-lock func called, but nobody has lock") + } +} + // A File is an *os.File-like interface for reading and writing files // in a FileSystem. type File interface { @@ -42,6 +78,24 @@ type File interface { Stat() (os.FileInfo, error) Truncate(int64) error Sync() error + // Create a snapshot of a file or directory tree, which can + // then be spliced onto a different path or a different + // collection. + Snapshot() (*Subtree, error) + // Replace this file or directory with the given snapshot. + // The target must be inside a collection: Splice returns an + // error if the File is a virtual file or directory like + // by_id, a project directory, .arvados#collection, + // etc. Splice can replace directories with regular files and + // vice versa, except it cannot replace the root directory of + // a collection with a regular file. + Splice(snapshot *Subtree) error +} + +// A Subtree is a detached part of a filesystem tree that can be +// spliced into a filesystem via (File)Splice(). +type Subtree struct { + inode inode } // A FileSystem is an http.Filesystem plus Stat() and support for @@ -51,8 +105,17 @@ type FileSystem interface { http.FileSystem fsBackend - newDirnode(parent inode, name string, perm os.FileMode, modTime time.Time) (node inode, err error) - newFilenode(parent inode, name string, perm os.FileMode, modTime time.Time) (node inode, err error) + rootnode() inode + + // filesystem-wide lock: used by Rename() to prevent deadlock + // while locking multiple inodes. + locker() sync.Locker + + // throttle for limiting concurrent background writers + throttle() *throttle + + // create a new node with nil parent. + newNode(name string, perm os.FileMode, modTime time.Time) (node inode, err error) // analogous to os.Stat() Stat(name string) (os.FileInfo, error) @@ -79,27 +142,72 @@ type FileSystem interface { Remove(name string) error RemoveAll(name string) error Rename(oldname, newname string) error + + // Write buffered data from memory to storage, returning when + // all updates have been saved to persistent storage. Sync() error + + // Write buffered data from memory to storage, but don't wait + // for all writes to finish before returning. If shortBlocks + // is true, flush everything; otherwise, if there's less than + // a full block of buffered data at the end of a stream, leave + // it buffered in memory in case more data can be appended. If + // path is "", flush all dirs/streams; otherwise, flush only + // the specified dir/stream. + Flush(path string, shortBlocks bool) error + + // Estimate current memory usage. + MemorySize() int64 +} + +type fsFS struct { + FileSystem +} + +// FS returns an fs.FS interface to the given FileSystem, to enable +// the use of fs.WalkDir, etc. +func FS(fs FileSystem) fs.FS { return fsFS{fs} } +func (fs fsFS) Open(path string) (fs.File, error) { + f, err := fs.FileSystem.Open(path) + return f, err } type inode interface { + SetParent(parent inode, name string) Parent() inode FS() FileSystem Read([]byte, filenodePtr) (int, filenodePtr, error) Write([]byte, filenodePtr) (int, filenodePtr, error) Truncate(int64) error IsDir() bool - Readdir() []os.FileInfo + Readdir() ([]os.FileInfo, error) Size() int64 FileInfo() os.FileInfo + // Create a snapshot of this node and its descendants. + Snapshot() (inode, error) + // Replace this node with a copy of the provided snapshot. + // Caller may provide the same snapshot to multiple Splice + // calls, but must not modify the snapshot concurrently. + Splice(inode) error // Child() performs lookups and updates of named child nodes. // + // (The term "child" here is used strictly. This means name is + // not "." or "..", and name does not contain "/".) + // // If replace is non-nil, Child calls replace(x) where x is // the current child inode with the given name. If possible, // the child inode is replaced with the one returned by // replace(). // + // If replace(x) returns an inode (besides x or nil) that is + // subsequently returned by Child(), then Child()'s caller + // must ensure the new child's name and parent are set/updated + // to Child()'s name argument and its receiver respectively. + // This is not necessarily done before replace(x) returns, but + // it must be done before Child()'s caller releases the + // parent's lock. + // // Nil represents "no child". replace(nil) signifies that no // child with this name exists yet. If replace() returns nil, // the existing child should be deleted if possible. @@ -113,11 +221,12 @@ type inode interface { // a child was added or changed, the new child is returned. // // Caller must have lock (or rlock if replace is nil). - Child(name string, replace func(inode) inode) inode + Child(name string, replace func(inode) (inode, error)) (inode, error) sync.Locker RLock() RUnlock() + MemorySize() int64 } type fileinfo struct { @@ -125,6 +234,15 @@ type fileinfo struct { mode os.FileMode size int64 modTime time.Time + // Source data structure: *Collection, *Group, or + // nil. Currently populated only for project dirs and + // top-level collection dirs; *not* populated for + // /by_id/{uuid} dirs (only subdirs below that). Does not stay + // up to date with upstream changes. + // + // Intended to support keep-web's properties-as-s3-metadata + // feature (https://dev.arvados.org/issues/19088). + sys interface{} } // Name implements os.FileInfo. @@ -152,9 +270,9 @@ func (fi fileinfo) Size() int64 { return fi.size } -// Sys implements os.FileInfo. +// Sys implements os.FileInfo. See comment in fileinfo struct. func (fi fileinfo) Sys() interface{} { - return nil + return fi.sys } type nullnode struct{} @@ -183,12 +301,27 @@ func (*nullnode) IsDir() bool { return false } -func (*nullnode) Readdir() []os.FileInfo { - return nil +func (*nullnode) Readdir() ([]os.FileInfo, error) { + return nil, ErrInvalidOperation } -func (*nullnode) Child(name string, replace func(inode) inode) inode { - return nil +func (*nullnode) Child(name string, replace func(inode) (inode, error)) (inode, error) { + return nil, ErrNotADirectory +} + +func (*nullnode) MemorySize() int64 { + // Types that embed nullnode should report their own size, but + // if they don't, we at least report a non-zero size to ensure + // a large tree doesn't get reported as 0 bytes. + return 64 +} + +func (*nullnode) Snapshot() (inode, error) { + return nil, ErrInvalidOperation +} + +func (*nullnode) Splice(inode) error { + return ErrInvalidOperation } type treenode struct { @@ -204,6 +337,13 @@ func (n *treenode) FS() FileSystem { return n.fs } +func (n *treenode) SetParent(p inode, name string) { + n.Lock() + defer n.Unlock() + n.parent = p + n.fileinfo.name = name +} + func (n *treenode) Parent() inode { n.RLock() defer n.RUnlock() @@ -214,16 +354,28 @@ func (n *treenode) IsDir() bool { return true } -func (n *treenode) Child(name string, replace func(inode) inode) (child inode) { - // TODO: special treatment for "", ".", ".." +func (n *treenode) Child(name string, replace func(inode) (inode, error)) (child inode, err error) { + debugPanicIfNotLocked(n, false) child = n.inodes[name] - if replace != nil { - child = replace(child) - if child == nil { - delete(n.inodes, name) - } else { - n.inodes[name] = child - } + if name == "" || name == "." || name == ".." { + err = ErrInvalidArgument + return + } + if replace == nil { + return + } + newchild, err := replace(child) + if err != nil { + return + } + if newchild == nil { + debugPanicIfNotLocked(n, true) + delete(n.inodes, name) + } else if newchild != child { + debugPanicIfNotLocked(n, true) + n.inodes[name] = newchild + n.fileinfo.modTime = time.Now() + child = newchild } return } @@ -239,7 +391,7 @@ func (n *treenode) FileInfo() os.FileInfo { return n.fileinfo } -func (n *treenode) Readdir() (fi []os.FileInfo) { +func (n *treenode) Readdir() (fi []os.FileInfo, err error) { n.RLock() defer n.RUnlock() fi = make([]os.FileInfo, 0, len(n.inodes)) @@ -249,9 +401,49 @@ func (n *treenode) Readdir() (fi []os.FileInfo) { return } +func (n *treenode) Sync() error { + n.RLock() + defer n.RUnlock() + for _, inode := range n.inodes { + syncer, ok := inode.(syncer) + if !ok { + return ErrInvalidOperation + } + err := syncer.Sync() + if err != nil { + return err + } + } + return nil +} + +func (n *treenode) MemorySize() (size int64) { + n.RLock() + defer n.RUnlock() + debugPanicIfNotLocked(n, false) + for _, inode := range n.inodes { + size += inode.MemorySize() + } + return 64 + size +} + type fileSystem struct { root inode fsBackend + mutex sync.Mutex + thr *throttle +} + +func (fs *fileSystem) rootnode() inode { + return fs.root +} + +func (fs *fileSystem) throttle() *throttle { + return fs.thr +} + +func (fs *fileSystem) locker() sync.Locker { + return &fs.mutex } // OpenFile is analogous to os.OpenFile(). @@ -264,9 +456,9 @@ func (fs *fileSystem) openFile(name string, flag int, perm os.FileMode) (*fileha return nil, ErrSyncNotSupported } dirname, name := path.Split(name) - parent := rlookup(fs.root, dirname) - if parent == nil { - return nil, os.ErrNotExist + parent, err := rlookup(fs.root, dirname) + if err != nil { + return nil, err } var readable, writable bool switch flag & (os.O_RDWR | os.O_RDONLY | os.O_WRONLY) { @@ -280,43 +472,43 @@ func (fs *fileSystem) openFile(name string, flag int, perm os.FileMode) (*fileha default: return nil, fmt.Errorf("invalid flags 0x%x", flag) } - if !writable && parent.IsDir() { + if parent.IsDir() { // A directory can be opened via "foo/", "foo/.", or // "foo/..". switch name { case ".", "": - return &filehandle{inode: parent}, nil + return &filehandle{inode: parent, readable: readable, writable: writable}, nil case "..": - return &filehandle{inode: parent.Parent()}, nil + return &filehandle{inode: parent.Parent(), readable: readable, writable: writable}, nil } } createMode := flag&os.O_CREATE != 0 - if createMode { - parent.Lock() - defer parent.Unlock() - } else { - parent.RLock() - defer parent.RUnlock() - } - n := parent.Child(name, nil) - if n == nil { + // We always need to take Lock() here, not just RLock(). Even + // if we know we won't be creating a file, parent might be a + // lookupnode, which sometimes populates its inodes map during + // a Child() call. + parent.Lock() + defer parent.Unlock() + n, err := parent.Child(name, nil) + if err != nil { + return nil, err + } else if n == nil { if !createMode { return nil, os.ErrNotExist } - var err error - n = parent.Child(name, func(inode) inode { - if perm.IsDir() { - n, err = fs.newDirnode(parent, name, perm|0755, time.Now()) - } else { - n, err = fs.newFilenode(parent, name, perm|0755, time.Now()) + n, err = parent.Child(name, func(inode) (repl inode, err error) { + repl, err = parent.FS().newNode(name, perm|0755, time.Now()) + if err != nil { + return } - return n + repl.SetParent(parent, name) + return }) if err != nil { return nil, err } else if n == nil { - // parent rejected new child - return nil, ErrInvalidOperation + // Parent rejected new child, but returned no error + return nil, ErrInvalidArgument } } else if flag&os.O_EXCL != 0 { return nil, ErrFileExists @@ -345,37 +537,37 @@ func (fs *fileSystem) Create(name string) (File, error) { return fs.OpenFile(name, os.O_CREATE|os.O_RDWR|os.O_TRUNC, 0) } -func (fs *fileSystem) Mkdir(name string, perm os.FileMode) (err error) { +func (fs *fileSystem) Mkdir(name string, perm os.FileMode) error { dirname, name := path.Split(name) - n := rlookup(fs.root, dirname) - if n == nil { - return os.ErrNotExist + n, err := rlookup(fs.root, dirname) + if err != nil { + return err } n.Lock() defer n.Unlock() - if n.Child(name, nil) != nil { + if child, err := n.Child(name, nil); err != nil { + return err + } else if child != nil { return os.ErrExist } - child := n.Child(name, func(inode) (child inode) { - child, err = fs.newDirnode(n, name, perm, time.Now()) + + _, err = n.Child(name, func(inode) (repl inode, err error) { + repl, err = n.FS().newNode(name, perm|os.ModeDir, time.Now()) + if err != nil { + return + } + repl.SetParent(n, name) return }) - if err != nil { - return err - } else if child == nil { - return ErrInvalidArgument - } - return nil + return err } -func (fs *fileSystem) Stat(name string) (fi os.FileInfo, err error) { - node := rlookup(fs.root, name) - if node == nil { - err = os.ErrNotExist - } else { - fi = node.FileInfo() +func (fs *fileSystem) Stat(name string) (os.FileInfo, error) { + node, err := rlookup(fs.root, name) + if err != nil { + return nil, err } - return + return node.FileInfo(), nil } func (fs *fileSystem) Rename(oldname, newname string) error { @@ -402,14 +594,33 @@ func (fs *fileSystem) Rename(oldname, newname string) error { } defer newdirf.Close() - // When acquiring locks on multiple nodes, all common - // ancestors must be locked first in order to avoid - // deadlock. This is assured by locking the path from root to - // newdir, then locking the path from root to olddir, skipping - // any already-locked nodes. + // TODO: If the nearest common ancestor ("nca") of olddirf and + // newdirf is on a different filesystem than fs, we should + // call nca.FS().Rename() instead of proceeding. Until then + // it's awkward for filesystems to implement their own Rename + // methods effectively: the only one that runs is the one on + // the root FileSystem exposed to the caller (webdav, fuse, + // etc). + + // When acquiring locks on multiple inodes, avoid deadlock by + // locking the entire containing filesystem first. + cfs := olddirf.inode.FS() + cfs.locker().Lock() + defer cfs.locker().Unlock() + + if cfs != newdirf.inode.FS() { + // Moving inodes across filesystems is not (yet) + // supported. Locking inodes from different + // filesystems could deadlock, so we must error out + // now. + return ErrInvalidOperation + } + + // To ensure we can test reliably whether we're about to move + // a directory into itself, lock all potential common + // ancestors of olddir and newdir. needLock := []sync.Locker{} - for _, f := range []*filehandle{olddirf, newdirf} { - node := f.inode + for _, node := range []inode{olddirf.inode, newdirf.inode} { needLock = append(needLock, node) for node.Parent() != node && node.Parent().FS() == node.FS() { node = node.Parent() @@ -425,37 +636,31 @@ func (fs *fileSystem) Rename(oldname, newname string) error { } } - err = nil - olddirf.inode.Child(oldname, func(oldinode inode) inode { + _, err = olddirf.inode.Child(oldname, func(oldinode inode) (inode, error) { if oldinode == nil { - err = os.ErrNotExist - return nil + return oldinode, os.ErrNotExist + } + if locked[oldinode] { + // oldinode cannot become a descendant of itself. + return oldinode, ErrInvalidArgument } - newdirf.inode.Child(newname, func(existing inode) inode { + if oldinode.FS() != cfs && newdirf.inode != olddirf.inode { + // moving a mount point to a different parent + // is not (yet) supported. + return oldinode, ErrInvalidArgument + } + accepted, err := newdirf.inode.Child(newname, func(existing inode) (inode, error) { if existing != nil && existing.IsDir() { - err = ErrIsDirectory - return existing + return existing, ErrIsDirectory } - return oldinode + return oldinode, nil }) if err != nil { - return oldinode - } - oldinode.Lock() - defer oldinode.Unlock() - switch n := oldinode.(type) { - case *dirnode: - n.parent = newdirf.inode - n.fileinfo.name = newname - case *filenode: - n.parent = newdirf.inode - n.fileinfo.name = newname - default: - panic(fmt.Sprintf("bad inode type %T", n)) + // Leave oldinode in olddir. + return oldinode, err } - //TODO: olddirf.setModTime(time.Now()) - //TODO: newdirf.setModTime(time.Now()) - return nil + accepted.SetParent(newdirf.inode, newname) + return nil, nil }) return err } @@ -474,80 +679,51 @@ func (fs *fileSystem) RemoveAll(name string) error { return err } -func (fs *fileSystem) remove(name string, recursive bool) (err error) { +func (fs *fileSystem) remove(name string, recursive bool) error { dirname, name := path.Split(name) if name == "" || name == "." || name == ".." { return ErrInvalidArgument } - dir := rlookup(fs.root, dirname) - if dir == nil { - return os.ErrNotExist + dir, err := rlookup(fs.root, dirname) + if err != nil { + return err } dir.Lock() defer dir.Unlock() - dir.Child(name, func(node inode) inode { + _, err = dir.Child(name, func(node inode) (inode, error) { if node == nil { - err = os.ErrNotExist - return nil + return nil, os.ErrNotExist } if !recursive && node.IsDir() && node.Size() > 0 { - err = ErrDirectoryNotEmpty - return node + return node, ErrDirectoryNotEmpty } - return nil + return nil, nil }) return err } -// Caller must have parent lock, and must have already ensured -// parent.Child(name,nil) is nil. -func (fs *fileSystem) newDirnode(parent inode, name string, perm os.FileMode, modTime time.Time) (node inode, err error) { - if name == "" || name == "." || name == ".." { - return nil, ErrInvalidArgument - } - return &dirnode{ - treenode: treenode{ - fs: parent.FS(), - parent: parent, - fileinfo: fileinfo{ - name: name, - mode: perm | os.ModeDir, - modTime: modTime, - }, - inodes: make(map[string]inode), - }, - }, nil +func (fs *fileSystem) Sync() error { + if syncer, ok := fs.root.(syncer); ok { + return syncer.Sync() + } + return ErrInvalidOperation } -func (fs *fileSystem) newFilenode(parent inode, name string, perm os.FileMode, modTime time.Time) (node inode, err error) { - if name == "" || name == "." || name == ".." { - return nil, ErrInvalidArgument - } - return &filenode{ - fs: parent.FS(), - parent: parent, - fileinfo: fileinfo{ - name: name, - mode: perm & ^os.ModeDir, - modTime: modTime, - }, - }, nil +func (fs *fileSystem) Flush(string, bool) error { + log.Printf("TODO: flush fileSystem") + return ErrInvalidOperation } -func (fs *fileSystem) Sync() error { - log.Printf("TODO: sync fileSystem") - return ErrInvalidOperation +func (fs *fileSystem) MemorySize() int64 { + return fs.root.MemorySize() } // rlookup (recursive lookup) returns the inode for the file/directory // with the given name (which may contain "/" separators). If no such // file/directory exists, the returned node is nil. -func rlookup(start inode, path string) (node inode) { +func rlookup(start inode, path string) (node inode, err error) { node = start for _, name := range strings.Split(path, "/") { - if node == nil { - break - } if node.IsDir() { if name == "." || name == "" { continue @@ -557,11 +733,50 @@ func rlookup(start inode, path string) (node inode) { continue } } - node = func() inode { - node.RLock() - defer node.RUnlock() + node, err = func() (inode, error) { + node.Lock() + defer node.Unlock() return node.Child(name, nil) }() + if node == nil || err != nil { + break + } + } + if node == nil && err == nil { + err = os.ErrNotExist } return } + +func permittedName(name string) bool { + return name != "" && name != "." && name != ".." && !strings.Contains(name, "/") +} + +// Snapshot returns a Subtree that's a copy of the given path. It +// returns an error if the path is not inside a collection. +func Snapshot(fs FileSystem, path string) (*Subtree, error) { + f, err := fs.OpenFile(path, os.O_RDONLY, 0) + if err != nil { + return nil, err + } + defer f.Close() + return f.Snapshot() +} + +// Splice inserts newsubtree at the indicated target path. +// +// Splice returns an error if target is not inside a collection. +// +// Splice returns an error if target is the root of a collection and +// newsubtree is a snapshot of a file. +func Splice(fs FileSystem, target string, newsubtree *Subtree) error { + f, err := fs.OpenFile(target, os.O_WRONLY, 0) + if os.IsNotExist(err) { + f, err = fs.OpenFile(target, os.O_CREATE|os.O_WRONLY, 0700) + } + if err != nil { + return fmt.Errorf("open %s: %w", target, err) + } + defer f.Close() + return f.Splice(newsubtree) +}