"errors"
"fmt"
"io"
+ "io/fs"
"log"
"net/http"
"os"
"path"
+ "path/filepath"
"strings"
"sync"
"time"
ErrIsDirectory = errors.New("cannot rename file to overwrite existing directory")
ErrNotADirectory = errors.New("not a directory")
ErrPermission = os.ErrPermission
+ DebugLocksPanicMode = false
)
type syncer interface {
Sync() error
}
+func debugPanicIfNotLocked(l sync.Locker, writing bool) {
+ if !DebugLocksPanicMode {
+ return
+ }
+ race := false
+ if rl, ok := l.(interface {
+ RLock()
+ RUnlock()
+ }); ok && writing {
+ go func() {
+ // Fail if we can grab the read lock during an
+ // operation that purportedly has write lock.
+ rl.RLock()
+ race = true
+ rl.RUnlock()
+ }()
+ } else {
+ go func() {
+ l.Lock()
+ race = true
+ l.Unlock()
+ }()
+ }
+ time.Sleep(100)
+ if race {
+ panic("bug: caller-must-have-lock func called, but nobody has lock")
+ }
+}
+
// A File is an *os.File-like interface for reading and writing files
// in a FileSystem.
type File interface {
Stat() (os.FileInfo, error)
Truncate(int64) error
Sync() error
+ // Create a snapshot of a file or directory tree, which can
+ // then be spliced onto a different path or a different
+ // collection.
+ Snapshot() (*Subtree, error)
+ // Replace this file or directory with the given snapshot.
+ // The target must be inside a collection: Splice returns an
+ // error if the File is a virtual file or directory like
+ // by_id, a project directory, .arvados#collection,
+ // etc. Splice can replace directories with regular files and
+ // vice versa, except it cannot replace the root directory of
+ // a collection with a regular file.
+ Splice(snapshot *Subtree) error
+}
+
+// A Subtree is a detached part of a filesystem tree that can be
+// spliced into a filesystem via (File)Splice().
+type Subtree struct {
+ inode inode
}
// A FileSystem is an http.Filesystem plus Stat() and support for
MemorySize() int64
}
+type fsFS struct {
+ FileSystem
+}
+
+// FS returns an fs.FS interface to the given FileSystem, to enable
+// the use of fs.WalkDir, etc.
+func FS(fs FileSystem) fs.FS { return fsFS{fs} }
+func (fs fsFS) Open(path string) (fs.File, error) {
+ f, err := fs.FileSystem.Open(path)
+ return f, err
+}
+
type inode interface {
SetParent(parent inode, name string)
Parent() inode
Readdir() ([]os.FileInfo, error)
Size() int64
FileInfo() os.FileInfo
+ // Create a snapshot of this node and its descendants.
+ Snapshot() (inode, error)
+ // Replace this node with a copy of the provided snapshot.
+ // Caller may provide the same snapshot to multiple Splice
+ // calls, but must not modify the snapshot concurrently.
+ Splice(inode) error
// Child() performs lookups and updates of named child nodes.
//
mode os.FileMode
size int64
modTime time.Time
+ // If not nil, sys() returns the source data structure, which
+ // can be a *Collection, *Group, or nil. Currently populated
+ // only for project dirs and top-level collection dirs. Does
+ // not stay up to date with upstream changes.
+ //
+ // Intended to support keep-web's properties-as-s3-metadata
+ // feature (https://dev.arvados.org/issues/19088).
+ sys func() interface{}
}
// Name implements os.FileInfo.
return fi.size
}
-// Sys implements os.FileInfo.
+// Sys implements os.FileInfo. See comment in fileinfo struct.
func (fi fileinfo) Sys() interface{} {
- return nil
+ if fi.sys == nil {
+ return nil
+ }
+ return fi.sys()
}
type nullnode struct{}
return 64
}
+func (*nullnode) Snapshot() (inode, error) {
+ return nil, ErrInvalidOperation
+}
+
+func (*nullnode) Splice(inode) error {
+ return ErrInvalidOperation
+}
+
type treenode struct {
fs FileSystem
parent inode
}
func (n *treenode) Child(name string, replace func(inode) (inode, error)) (child inode, err error) {
+ debugPanicIfNotLocked(n, false)
child = n.inodes[name]
if name == "" || name == "." || name == ".." {
err = ErrInvalidArgument
return
}
if newchild == nil {
+ debugPanicIfNotLocked(n, true)
delete(n.inodes, name)
} else if newchild != child {
+ debugPanicIfNotLocked(n, true)
n.inodes[name] = newchild
n.fileinfo.modTime = time.Now()
child = newchild
}
func (n *treenode) FileInfo() os.FileInfo {
- n.Lock()
- defer n.Unlock()
- n.fileinfo.size = int64(len(n.inodes))
- return n.fileinfo
+ n.RLock()
+ defer n.RUnlock()
+ fi := n.fileinfo
+ fi.size = int64(len(n.inodes))
+ return fi
}
func (n *treenode) Readdir() (fi []os.FileInfo, err error) {
+ // We need RLock to safely read n.inodes, but we must release
+ // it before calling FileInfo() on the child nodes. Otherwise,
+ // we risk deadlock when filter groups A and B match each
+ // other, concurrent Readdir() calls try to RLock them in
+ // opposite orders, and one cannot be RLocked a second time
+ // because a third caller is waiting for a write lock.
n.RLock()
- defer n.RUnlock()
- fi = make([]os.FileInfo, 0, len(n.inodes))
+ inodes := make([]inode, 0, len(n.inodes))
for _, inode := range n.inodes {
+ inodes = append(inodes, inode)
+ }
+ n.RUnlock()
+ fi = make([]os.FileInfo, 0, len(inodes))
+ for _, inode := range inodes {
fi = append(fi, inode.FileInfo())
}
return
}
func (n *treenode) MemorySize() (size int64) {
+ // To avoid making other callers wait while we count the
+ // entire filesystem size, we lock the node only long enough
+ // to copy the list of children. We accept that the resulting
+ // size will sometimes be misleading (e.g., we will
+ // double-count an item that moves from A to B after we check
+ // A's size but before we check B's size).
n.RLock()
- defer n.RUnlock()
+ debugPanicIfNotLocked(n, false)
+ todo := make([]inode, 0, len(n.inodes))
for _, inode := range n.inodes {
+ todo = append(todo, inode)
+ }
+ n.RUnlock()
+ for _, inode := range todo {
size += inode.MemorySize()
}
- return
+ return 64 + size
}
type fileSystem struct {
return nil, ErrSyncNotSupported
}
dirname, name := path.Split(name)
- parent, err := rlookup(fs.root, dirname)
+ ancestors := map[inode]bool{}
+ parent, err := rlookup(fs.root, dirname, ancestors)
if err != nil {
return nil, err
}
default:
return nil, fmt.Errorf("invalid flags 0x%x", flag)
}
- if !writable && parent.IsDir() {
+ if parent.IsDir() {
// A directory can be opened via "foo/", "foo/.", or
// "foo/..".
switch name {
case ".", "":
- return &filehandle{inode: parent}, nil
+ return &filehandle{inode: parent, readable: readable, writable: writable}, nil
case "..":
- return &filehandle{inode: parent.Parent()}, nil
+ return &filehandle{inode: parent.Parent(), readable: readable, writable: writable}, nil
}
}
createMode := flag&os.O_CREATE != 0
- if createMode {
- parent.Lock()
- defer parent.Unlock()
- } else {
- parent.RLock()
- defer parent.RUnlock()
- }
+ // We always need to take Lock() here, not just RLock(). Even
+ // if we know we won't be creating a file, parent might be a
+ // lookupnode, which sometimes populates its inodes map during
+ // a Child() call.
+ parent.Lock()
+ defer parent.Unlock()
n, err := parent.Child(name, nil)
if err != nil {
return nil, err
return nil, err
}
}
+ // If n and one of its parents/ancestors are [hardlinks to]
+ // the same node (e.g., a filter group that matches itself),
+ // open an "empty directory" node instead, so the inner
+ // hardlink appears empty. This is needed to ensure
+ // Open("a/b/c/x/x").Readdir() appears empty, matching the
+ // behavior of rlookup("a/b/c/x/x/z") => ErrNotExist.
+ if hl, ok := n.(*hardlink); (ok && ancestors[hl.inode]) || ancestors[n] {
+ n = &treenode{
+ fs: n.FS(),
+ parent: parent,
+ inodes: nil,
+ fileinfo: fileinfo{
+ name: name,
+ modTime: time.Now(),
+ mode: 0555 | os.ModeDir,
+ },
+ }
+ }
return &filehandle{
inode: n,
append: flag&os.O_APPEND != 0,
func (fs *fileSystem) Mkdir(name string, perm os.FileMode) error {
dirname, name := path.Split(name)
- n, err := rlookup(fs.root, dirname)
+ n, err := rlookup(fs.root, dirname, nil)
if err != nil {
return err
}
}
func (fs *fileSystem) Stat(name string) (os.FileInfo, error) {
- node, err := rlookup(fs.root, name)
+ node, err := rlookup(fs.root, name, nil)
if err != nil {
return nil, err
}
// supported. Locking inodes from different
// filesystems could deadlock, so we must error out
// now.
- return ErrInvalidArgument
+ return ErrInvalidOperation
}
// To ensure we can test reliably whether we're about to move
}
locked := map[sync.Locker]bool{}
for i := len(needLock) - 1; i >= 0; i-- {
- if n := needLock[i]; !locked[n] {
+ n := needLock[i]
+ if fs, ok := n.(interface{ rootnode() inode }); ok {
+ // Lock the fs's root dir directly, not
+ // through the fs. Otherwise our "locked" map
+ // would not reliably prevent double-locking
+ // the fs's root dir.
+ n = fs.rootnode()
+ }
+ if !locked[n] {
n.Lock()
defer n.Unlock()
locked[n] = true
if name == "" || name == "." || name == ".." {
return ErrInvalidArgument
}
- dir, err := rlookup(fs.root, dirname)
+ dir, err := rlookup(fs.root, dirname, nil)
if err != nil {
return err
}
// rlookup (recursive lookup) returns the inode for the file/directory
// with the given name (which may contain "/" separators). If no such
// file/directory exists, the returned node is nil.
-func rlookup(start inode, path string) (node inode, err error) {
+//
+// The visited map should be either nil or empty. If non-nil, all
+// nodes and hardlink targets visited by the given path will be added
+// to it.
+//
+// If a cycle is detected, the second occurrence of the offending node
+// will be replaced by an empty directory. For example, if "x" is a
+// filter group that matches itself, then rlookup("a/b/c/x") will
+// return the filter group, and rlookup("a/b/c/x/x") will return an
+// empty directory.
+func rlookup(start inode, path string, visited map[inode]bool) (node inode, err error) {
+ if visited == nil {
+ visited = map[inode]bool{}
+ }
node = start
+ // Clean up ./ and ../ and double-slashes, but (unlike
+ // filepath.Clean) retain a trailing slash, because looking up
+ // ".../regularfile/" should fail.
+ trailingSlash := strings.HasSuffix(path, "/")
+ path = filepath.Clean(path)
+ if trailingSlash && path != "/" {
+ path += "/"
+ }
for _, name := range strings.Split(path, "/") {
+ visited[node] = true
if node.IsDir() {
if name == "." || name == "" {
continue
}
}
node, err = func() (inode, error) {
- node.RLock()
- defer node.RUnlock()
+ node.Lock()
+ defer node.Unlock()
return node.Child(name, nil)
}()
if node == nil || err != nil {
break
}
+ checknode := node
+ if hardlinked, ok := checknode.(*hardlink); ok {
+ checknode = hardlinked.inode
+ }
+ if visited[checknode] {
+ node = &treenode{
+ fs: node.FS(),
+ parent: node.Parent(),
+ inodes: nil,
+ fileinfo: fileinfo{
+ name: name,
+ modTime: time.Now(),
+ mode: 0555 | os.ModeDir,
+ },
+ }
+ } else {
+ visited[checknode] = true
+ }
}
if node == nil && err == nil {
err = os.ErrNotExist
func permittedName(name string) bool {
return name != "" && name != "." && name != ".." && !strings.Contains(name, "/")
}
+
+// Snapshot returns a Subtree that's a copy of the given path. It
+// returns an error if the path is not inside a collection.
+func Snapshot(fs FileSystem, path string) (*Subtree, error) {
+ f, err := fs.OpenFile(path, os.O_RDONLY, 0)
+ if err != nil {
+ return nil, err
+ }
+ defer f.Close()
+ return f.Snapshot()
+}
+
+// Splice inserts newsubtree at the indicated target path.
+//
+// Splice returns an error if target is not inside a collection.
+//
+// Splice returns an error if target is the root of a collection and
+// newsubtree is a snapshot of a file.
+func Splice(fs FileSystem, target string, newsubtree *Subtree) error {
+ f, err := fs.OpenFile(target, os.O_WRONLY, 0)
+ if os.IsNotExist(err) {
+ f, err = fs.OpenFile(target, os.O_CREATE|os.O_WRONLY, 0700)
+ }
+ if err != nil {
+ return fmt.Errorf("open %s: %w", target, err)
+ }
+ defer f.Close()
+ return f.Splice(newsubtree)
+}