1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: Apache-2.0
21 ErrReadOnlyFile = errors.New("read-only file")
22 ErrNegativeOffset = errors.New("cannot seek to negative offset")
23 ErrFileExists = errors.New("file exists")
24 ErrInvalidOperation = errors.New("invalid operation")
25 ErrInvalidArgument = errors.New("invalid argument")
26 ErrDirectoryNotEmpty = errors.New("directory not empty")
27 ErrWriteOnlyMode = errors.New("file is O_WRONLY")
28 ErrSyncNotSupported = errors.New("O_SYNC flag is not supported")
29 ErrIsDirectory = errors.New("cannot rename file to overwrite existing directory")
30 ErrNotADirectory = errors.New("not a directory")
31 ErrPermission = os.ErrPermission
32 DebugLocksPanicMode = false
35 type syncer interface {
39 func debugPanicIfNotLocked(l sync.Locker, writing bool) {
40 if !DebugLocksPanicMode {
44 if rl, ok := l.(interface {
49 // Fail if we can grab the read lock during an
50 // operation that purportedly has write lock.
64 panic("bug: caller-must-have-lock func called, but nobody has lock")
68 // A File is an *os.File-like interface for reading and writing files
76 Readdir(int) ([]os.FileInfo, error)
77 Stat() (os.FileInfo, error)
80 // Create a snapshot of a file or directory tree, which can
81 // then be spliced onto a different path or a different
83 Snapshot() (*Subtree, error)
84 // Replace this file or directory with the given snapshot. It
85 // is an error to replace a directory with a file. If snapshot
86 // is (or might be) a directory, remove the directory, create
87 // a file with the same name, and splice the file.
88 Splice(snapshot *Subtree) error
91 // A Subtree is a detached part of a filesystem tree that can be
92 // spliced into a filesystem via (File)Splice().
97 // A FileSystem is an http.Filesystem plus Stat() and support for
98 // opening writable files. All methods are safe to call from multiple
100 type FileSystem interface {
106 // filesystem-wide lock: used by Rename() to prevent deadlock
107 // while locking multiple inodes.
110 // throttle for limiting concurrent background writers
113 // create a new node with nil parent.
114 newNode(name string, perm os.FileMode, modTime time.Time) (node inode, err error)
116 // analogous to os.Stat()
117 Stat(name string) (os.FileInfo, error)
119 // analogous to os.Create(): create/truncate a file and open it O_RDWR.
120 Create(name string) (File, error)
122 // Like os.OpenFile(): create or open a file or directory.
124 // If flag&os.O_EXCL==0, it opens an existing file or
125 // directory if one exists. If flag&os.O_CREATE!=0, it creates
126 // a new empty file or directory if one does not already
129 // When creating a new item, perm&os.ModeDir determines
130 // whether it is a file or a directory.
132 // A file can be opened multiple times and used concurrently
133 // from multiple goroutines. However, each File object should
134 // be used by only one goroutine at a time.
135 OpenFile(name string, flag int, perm os.FileMode) (File, error)
137 Mkdir(name string, perm os.FileMode) error
138 Remove(name string) error
139 RemoveAll(name string) error
140 Rename(oldname, newname string) error
142 // Write buffered data from memory to storage, returning when
143 // all updates have been saved to persistent storage.
146 // Write buffered data from memory to storage, but don't wait
147 // for all writes to finish before returning. If shortBlocks
148 // is true, flush everything; otherwise, if there's less than
149 // a full block of buffered data at the end of a stream, leave
150 // it buffered in memory in case more data can be appended. If
151 // path is "", flush all dirs/streams; otherwise, flush only
152 // the specified dir/stream.
153 Flush(path string, shortBlocks bool) error
155 // Estimate current memory usage.
159 type inode interface {
160 SetParent(parent inode, name string)
163 Read([]byte, filenodePtr) (int, filenodePtr, error)
164 Write([]byte, filenodePtr) (int, filenodePtr, error)
165 Truncate(int64) error
167 Readdir() ([]os.FileInfo, error)
169 FileInfo() os.FileInfo
170 // Create a snapshot of this node and its descendants.
171 Snapshot() (inode, error)
172 // Replace this node with a copy of the provided snapshot.
173 // Caller may provide the same snapshot to multiple Splice
174 // calls, but must not modify the the snapshot concurrently.
177 // Child() performs lookups and updates of named child nodes.
179 // (The term "child" here is used strictly. This means name is
180 // not "." or "..", and name does not contain "/".)
182 // If replace is non-nil, Child calls replace(x) where x is
183 // the current child inode with the given name. If possible,
184 // the child inode is replaced with the one returned by
187 // If replace(x) returns an inode (besides x or nil) that is
188 // subsequently returned by Child(), then Child()'s caller
189 // must ensure the new child's name and parent are set/updated
190 // to Child()'s name argument and its receiver respectively.
191 // This is not necessarily done before replace(x) returns, but
192 // it must be done before Child()'s caller releases the
195 // Nil represents "no child". replace(nil) signifies that no
196 // child with this name exists yet. If replace() returns nil,
197 // the existing child should be deleted if possible.
199 // An implementation of Child() is permitted to ignore
200 // replace() or its return value. For example, a regular file
201 // inode does not have children, so Child() always returns
204 // Child() returns the child, if any, with the given name: if
205 // a child was added or changed, the new child is returned.
207 // Caller must have lock (or rlock if replace is nil).
208 Child(name string, replace func(inode) (inode, error)) (inode, error)
216 type fileinfo struct {
223 // Name implements os.FileInfo.
224 func (fi fileinfo) Name() string {
228 // ModTime implements os.FileInfo.
229 func (fi fileinfo) ModTime() time.Time {
233 // Mode implements os.FileInfo.
234 func (fi fileinfo) Mode() os.FileMode {
238 // IsDir implements os.FileInfo.
239 func (fi fileinfo) IsDir() bool {
240 return fi.mode&os.ModeDir != 0
243 // Size implements os.FileInfo.
244 func (fi fileinfo) Size() int64 {
248 // Sys implements os.FileInfo.
249 func (fi fileinfo) Sys() interface{} {
253 type nullnode struct{}
255 func (*nullnode) Mkdir(string, os.FileMode) error {
256 return ErrInvalidOperation
259 func (*nullnode) Read([]byte, filenodePtr) (int, filenodePtr, error) {
260 return 0, filenodePtr{}, ErrInvalidOperation
263 func (*nullnode) Write([]byte, filenodePtr) (int, filenodePtr, error) {
264 return 0, filenodePtr{}, ErrInvalidOperation
267 func (*nullnode) Truncate(int64) error {
268 return ErrInvalidOperation
271 func (*nullnode) FileInfo() os.FileInfo {
275 func (*nullnode) IsDir() bool {
279 func (*nullnode) Readdir() ([]os.FileInfo, error) {
280 return nil, ErrInvalidOperation
283 func (*nullnode) Child(name string, replace func(inode) (inode, error)) (inode, error) {
284 return nil, ErrNotADirectory
287 func (*nullnode) MemorySize() int64 {
288 // Types that embed nullnode should report their own size, but
289 // if they don't, we at least report a non-zero size to ensure
290 // a large tree doesn't get reported as 0 bytes.
294 func (*nullnode) Snapshot() (inode, error) {
295 return nil, ErrInvalidOperation
298 func (*nullnode) Splice(inode) error {
299 return ErrInvalidOperation
302 type treenode struct {
305 inodes map[string]inode
311 func (n *treenode) FS() FileSystem {
315 func (n *treenode) SetParent(p inode, name string) {
319 n.fileinfo.name = name
322 func (n *treenode) Parent() inode {
328 func (n *treenode) IsDir() bool {
332 func (n *treenode) Child(name string, replace func(inode) (inode, error)) (child inode, err error) {
333 debugPanicIfNotLocked(n, false)
334 child = n.inodes[name]
335 if name == "" || name == "." || name == ".." {
336 err = ErrInvalidArgument
342 newchild, err := replace(child)
347 debugPanicIfNotLocked(n, true)
348 delete(n.inodes, name)
349 } else if newchild != child {
350 debugPanicIfNotLocked(n, true)
351 n.inodes[name] = newchild
352 n.fileinfo.modTime = time.Now()
358 func (n *treenode) Size() int64 {
359 return n.FileInfo().Size()
362 func (n *treenode) FileInfo() os.FileInfo {
365 n.fileinfo.size = int64(len(n.inodes))
369 func (n *treenode) Readdir() (fi []os.FileInfo, err error) {
372 fi = make([]os.FileInfo, 0, len(n.inodes))
373 for _, inode := range n.inodes {
374 fi = append(fi, inode.FileInfo())
379 func (n *treenode) Sync() error {
382 for _, inode := range n.inodes {
383 syncer, ok := inode.(syncer)
385 return ErrInvalidOperation
395 func (n *treenode) MemorySize() (size int64) {
398 debugPanicIfNotLocked(n, false)
399 for _, inode := range n.inodes {
400 size += inode.MemorySize()
405 type fileSystem struct {
412 func (fs *fileSystem) rootnode() inode {
416 func (fs *fileSystem) throttle() *throttle {
420 func (fs *fileSystem) locker() sync.Locker {
424 // OpenFile is analogous to os.OpenFile().
425 func (fs *fileSystem) OpenFile(name string, flag int, perm os.FileMode) (File, error) {
426 return fs.openFile(name, flag, perm)
429 func (fs *fileSystem) openFile(name string, flag int, perm os.FileMode) (*filehandle, error) {
430 if flag&os.O_SYNC != 0 {
431 return nil, ErrSyncNotSupported
433 dirname, name := path.Split(name)
434 parent, err := rlookup(fs.root, dirname)
438 var readable, writable bool
439 switch flag & (os.O_RDWR | os.O_RDONLY | os.O_WRONLY) {
448 return nil, fmt.Errorf("invalid flags 0x%x", flag)
450 if !writable && parent.IsDir() {
451 // A directory can be opened via "foo/", "foo/.", or
455 return &filehandle{inode: parent}, nil
457 return &filehandle{inode: parent.Parent()}, nil
460 createMode := flag&os.O_CREATE != 0
461 // We always need to take Lock() here, not just RLock(). Even
462 // if we know we won't be creating a file, parent might be a
463 // lookupnode, which sometimes populates its inodes map during
466 defer parent.Unlock()
467 n, err := parent.Child(name, nil)
472 return nil, os.ErrNotExist
474 n, err = parent.Child(name, func(inode) (repl inode, err error) {
475 repl, err = parent.FS().newNode(name, perm|0755, time.Now())
479 repl.SetParent(parent, name)
485 // Parent rejected new child, but returned no error
486 return nil, ErrInvalidArgument
488 } else if flag&os.O_EXCL != 0 {
489 return nil, ErrFileExists
490 } else if flag&os.O_TRUNC != 0 {
492 return nil, fmt.Errorf("invalid flag O_TRUNC in read-only mode")
493 } else if n.IsDir() {
494 return nil, fmt.Errorf("invalid flag O_TRUNC when opening directory")
495 } else if err := n.Truncate(0); err != nil {
501 append: flag&os.O_APPEND != 0,
507 func (fs *fileSystem) Open(name string) (http.File, error) {
508 return fs.OpenFile(name, os.O_RDONLY, 0)
511 func (fs *fileSystem) Create(name string) (File, error) {
512 return fs.OpenFile(name, os.O_CREATE|os.O_RDWR|os.O_TRUNC, 0)
515 func (fs *fileSystem) Mkdir(name string, perm os.FileMode) error {
516 dirname, name := path.Split(name)
517 n, err := rlookup(fs.root, dirname)
523 if child, err := n.Child(name, nil); err != nil {
525 } else if child != nil {
529 _, err = n.Child(name, func(inode) (repl inode, err error) {
530 repl, err = n.FS().newNode(name, perm|os.ModeDir, time.Now())
534 repl.SetParent(n, name)
540 func (fs *fileSystem) Stat(name string) (os.FileInfo, error) {
541 node, err := rlookup(fs.root, name)
545 return node.FileInfo(), nil
548 func (fs *fileSystem) Rename(oldname, newname string) error {
549 olddir, oldname := path.Split(oldname)
550 if oldname == "" || oldname == "." || oldname == ".." {
551 return ErrInvalidArgument
553 olddirf, err := fs.openFile(olddir+".", os.O_RDONLY, 0)
555 return fmt.Errorf("%q: %s", olddir, err)
557 defer olddirf.Close()
559 newdir, newname := path.Split(newname)
560 if newname == "." || newname == ".." {
561 return ErrInvalidArgument
562 } else if newname == "" {
563 // Rename("a/b", "c/") means Rename("a/b", "c/b")
566 newdirf, err := fs.openFile(newdir+".", os.O_RDONLY, 0)
568 return fmt.Errorf("%q: %s", newdir, err)
570 defer newdirf.Close()
572 // TODO: If the nearest common ancestor ("nca") of olddirf and
573 // newdirf is on a different filesystem than fs, we should
574 // call nca.FS().Rename() instead of proceeding. Until then
575 // it's awkward for filesystems to implement their own Rename
576 // methods effectively: the only one that runs is the one on
577 // the root FileSystem exposed to the caller (webdav, fuse,
580 // When acquiring locks on multiple inodes, avoid deadlock by
581 // locking the entire containing filesystem first.
582 cfs := olddirf.inode.FS()
584 defer cfs.locker().Unlock()
586 if cfs != newdirf.inode.FS() {
587 // Moving inodes across filesystems is not (yet)
588 // supported. Locking inodes from different
589 // filesystems could deadlock, so we must error out
591 return ErrInvalidArgument
594 // To ensure we can test reliably whether we're about to move
595 // a directory into itself, lock all potential common
596 // ancestors of olddir and newdir.
597 needLock := []sync.Locker{}
598 for _, node := range []inode{olddirf.inode, newdirf.inode} {
599 needLock = append(needLock, node)
600 for node.Parent() != node && node.Parent().FS() == node.FS() {
602 needLock = append(needLock, node)
605 locked := map[sync.Locker]bool{}
606 for i := len(needLock) - 1; i >= 0; i-- {
607 if n := needLock[i]; !locked[n] {
614 _, err = olddirf.inode.Child(oldname, func(oldinode inode) (inode, error) {
616 return oldinode, os.ErrNotExist
618 if locked[oldinode] {
619 // oldinode cannot become a descendant of itself.
620 return oldinode, ErrInvalidArgument
622 if oldinode.FS() != cfs && newdirf.inode != olddirf.inode {
623 // moving a mount point to a different parent
624 // is not (yet) supported.
625 return oldinode, ErrInvalidArgument
627 accepted, err := newdirf.inode.Child(newname, func(existing inode) (inode, error) {
628 if existing != nil && existing.IsDir() {
629 return existing, ErrIsDirectory
634 // Leave oldinode in olddir.
637 accepted.SetParent(newdirf.inode, newname)
643 func (fs *fileSystem) Remove(name string) error {
644 return fs.remove(strings.TrimRight(name, "/"), false)
647 func (fs *fileSystem) RemoveAll(name string) error {
648 err := fs.remove(strings.TrimRight(name, "/"), true)
649 if os.IsNotExist(err) {
650 // "If the path does not exist, RemoveAll returns
651 // nil." (see "os" pkg)
657 func (fs *fileSystem) remove(name string, recursive bool) error {
658 dirname, name := path.Split(name)
659 if name == "" || name == "." || name == ".." {
660 return ErrInvalidArgument
662 dir, err := rlookup(fs.root, dirname)
668 _, err = dir.Child(name, func(node inode) (inode, error) {
670 return nil, os.ErrNotExist
672 if !recursive && node.IsDir() && node.Size() > 0 {
673 return node, ErrDirectoryNotEmpty
680 func (fs *fileSystem) Sync() error {
681 if syncer, ok := fs.root.(syncer); ok {
684 return ErrInvalidOperation
687 func (fs *fileSystem) Flush(string, bool) error {
688 log.Printf("TODO: flush fileSystem")
689 return ErrInvalidOperation
692 func (fs *fileSystem) MemorySize() int64 {
693 return fs.root.MemorySize()
696 // rlookup (recursive lookup) returns the inode for the file/directory
697 // with the given name (which may contain "/" separators). If no such
698 // file/directory exists, the returned node is nil.
699 func rlookup(start inode, path string) (node inode, err error) {
701 for _, name := range strings.Split(path, "/") {
703 if name == "." || name == "" {
711 node, err = func() (inode, error) {
714 return node.Child(name, nil)
716 if node == nil || err != nil {
720 if node == nil && err == nil {
726 func permittedName(name string) bool {
727 return name != "" && name != "." && name != ".." && !strings.Contains(name, "/")
730 // Snapshot returns a Subtree that's a copy of the given path. It
731 // returns an error if the path is not inside a collection.
732 func Snapshot(fs FileSystem, path string) (*Subtree, error) {
733 f, err := fs.OpenFile(path, os.O_RDONLY, 0)
741 // Splice inserts newsubtree at the indicated target path.
743 // Splice returns an error if target is not inside a collection.
745 // Splice returns an error if target is an existing directory and
746 // newsubtree is a snapshot of a file.
747 func Splice(fs FileSystem, target string, newsubtree *Subtree) error {
748 f, err := fs.OpenFile(target, os.O_WRONLY, 0)
749 if os.IsNotExist(err) {
750 f, err = fs.OpenFile(target, os.O_CREATE|os.O_WRONLY, 0700)
756 return f.Splice(newsubtree)