// Copyright (C) The Arvados Authors. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

package arvados

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"path"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"
)

var (
	// maxBlockSize is the upper bound, in bytes, on the size of a
	// single in-memory segment and of a single chunk handled by
	// Write: 1<<26 bytes (64 MiB).
	maxBlockSize      = 1 << 26
	concurrentWriters = 4 // max goroutines writing to Keep in background and during flush()
)

// A CollectionFileSystem is a FileSystem that can be serialized as a
// manifest and stored as a collection.
type CollectionFileSystem interface {
	FileSystem

	// MarshalManifest flushes all file data to Keep and returns a
	// snapshot of the filesystem suitable for saving as
	// (Collection)ManifestText. Prefix (normally ".") is a top
	// level directory, effectively prepended to all paths in the
	// returned manifest.
	MarshalManifest(prefix string) (string, error)

	// Size returns the total number of data bytes in all files.
	Size() int64
}

// collectionFileSystem is the concrete type returned by
// (*Collection)FileSystem. It embeds the generic fileSystem and adds
// the collection-specific state needed to save changes and refresh
// block signatures.
type collectionFileSystem struct {
	fileSystem
	// uuid is copied from the source Collection's UUID; it may be
	// empty.
	uuid string
	// savedPDH always holds a string: the portable data hash of
	// the collection as initially loaded or as last saved by Sync.
	savedPDH atomic.Value
	// replicas is copied from the collection's ReplicationDesired
	// (left at zero when that is nil) and passed to BlockWrite.
	replicas int
	// storageClasses is copied from the collection's
	// StorageClassesDesired and passed to BlockWrite.
	storageClasses []string
	// guessSignatureTTL tracks a lower bound for the server's
	// configured BlobSigningTTL. The guess is initially zero, and
	// increases when we come across a signature with an expiry
	// time further in the future than the previous guess.
	//
	// When the guessed TTL is much smaller than the real TTL,
	// preemptive signature refresh is delayed or missed entirely,
	// which is OK.
	guessSignatureTTL time.Duration
	// holdCheckChanges is the time before which
	// checkChangesOnServer returns without doing anything.
	holdCheckChanges time.Time
	// lockCheckChanges is held while reading or updating
	// holdCheckChanges and while calling signatureTimeLeft.
	lockCheckChanges sync.Mutex
}

// FileSystem returns a CollectionFileSystem for the collection.
//
// The tree is built by loading c.ManifestText into a new root
// directory node. Every node's modification time is then set to
// c.ModifiedAt, or to the current time if c.ModifiedAt is zero. An
// error from loading the manifest is returned as-is, with a nil
// filesystem.
func (c *Collection) FileSystem(client apiClient, kc keepClient) (CollectionFileSystem, error) {
	mtime := c.ModifiedAt
	if mtime.IsZero() {
		mtime = time.Now()
	}

	cfs := &collectionFileSystem{
		uuid:           c.UUID,
		storageClasses: c.StorageClassesDesired,
	}
	cfs.fileSystem.fsBackend = keepBackend{apiClient: client, keepClient: kc}
	cfs.fileSystem.thr = newThrottle(concurrentWriters)
	cfs.savedPDH.Store(c.PortableDataHash)
	if c.ReplicationDesired != nil {
		cfs.replicas = *c.ReplicationDesired
	}

	rootInfo := fileinfo{
		name:    ".",
		mode:    os.ModeDir | 0755,
		modTime: mtime,
		sys:     c,
	}
	top := &dirnode{
		fs: cfs,
		treenode: treenode{
			fileinfo: rootInfo,
			inodes:   make(map[string]inode),
		},
	}
	// The root directory is its own parent.
	top.SetParent(top, ".")
	err := top.loadManifest(c.ManifestText)
	if err != nil {
		return nil, err
	}
	backdateTree(top, mtime)
	cfs.root = top
	return cfs, nil
}

// eachNode walks the tree rooted at n. It calls ffunc (if non-nil) on
// each filenode, and dfunc (if non-nil) on each dirnode before
// descending into that directory's children. Nodes of any other type
// are ignored.
//
// Caller must have lock (or guarantee no concurrent accesses
// somehow).
func eachNode(n inode, ffunc func(*filenode), dfunc func(*dirnode)) {
	if file, isFile := n.(*filenode); isFile {
		if ffunc != nil {
			ffunc(file)
		}
		return
	}
	dir, isDir := n.(*dirnode)
	if !isDir {
		return
	}
	if dfunc != nil {
		dfunc(dir)
	}
	for _, child := range dir.inodes {
		eachNode(child, ffunc, dfunc)
	}
}

// backdateTree sets the modification time of n and of every file and
// directory beneath it to modTime.
//
// Caller must have lock (or guarantee no concurrent accesses
// somehow).
func backdateTree(n inode, modTime time.Time) {
	stampFile := func(f *filenode) { f.fileinfo.modTime = modTime }
	stampDir := func(d *dirnode) { d.fileinfo.modTime = modTime }
	eachNode(n, stampFile, stampDir)
}

// signatureTimeLeft returns the approximate portion of signature TTL
// remaining (usually between 0 and 1, or negative if some signatures
// have expired) together with the TTL used to compute that portion.
//
// The TTL is the larger of fs.guessSignatureTTL and the time from now
// until the latest expiry seen in this scan; fs.guessSignatureTTL is
// raised when the scan finds a larger value. If no stored segment has
// a parseable signature expiry, the result is (1, 1).
func (fs *collectionFileSystem) signatureTimeLeft() (float64, time.Duration) {
	now := time.Now()
	// Start "earliest" far in the future so that any real expiry
	// time replaces it.
	earliest := now.Add(time.Hour * 24 * 7 * 365)
	var latest time.Time

	scanFile := func(fn *filenode) {
		fn.Lock()
		defer fn.Unlock()
		for _, s := range fn.segments {
			stored, isStored := s.(storedSegment)
			if !isStored {
				continue
			}
			exp, err := signatureExpiryTime(stored.locator)
			if err != nil {
				// No usable expiry on this locator.
				continue
			}
			if exp.Before(earliest) {
				earliest = exp
			}
			if exp.After(latest) {
				latest = exp
			}
		}
	}
	fs.fileSystem.root.RLock()
	eachNode(fs.root, scanFile, nil)
	fs.fileSystem.root.RUnlock()

	if latest.IsZero() {
		// No signatures == 100% of TTL remaining.
		return 1, 1
	}

	ttl := latest.Sub(now)
	fs.fileSystem.root.Lock()
	if fs.guessSignatureTTL < ttl {
		// ttl is closer to the real TTL than the previous
		// guess, so remember it.
		fs.guessSignatureTTL = ttl
	} else {
		// The previous best guess is at least as good; use it
		// to compute the portion remaining.
		ttl = fs.guessSignatureTTL
	}
	fs.fileSystem.root.Unlock()

	remaining := earliest.Sub(now).Seconds()
	return remaining / ttl.Seconds(), ttl
}

// manifestTokenRe matches one whitespace-delimited token of a
// manifest. It is compiled once at package initialization instead of
// on every updateSignatures call.
var manifestTokenRe = regexp.MustCompile(`\S+`)

// updateSignatures replaces the locators of stored segments with the
// corresponding locators found in newmanifest.
//
// Every token of newmanifest that matches mBlkRe is indexed by its
// hint-stripped form (stripAllHints). Then, with the root node and
// each filenode locked in turn, each storedSegment whose
// hint-stripped locator appears in that index gets the locator from
// newmanifest. Segments with no match are left untouched.
func (fs *collectionFileSystem) updateSignatures(newmanifest string) {
	newLoc := map[string]string{}
	for _, tok := range manifestTokenRe.FindAllString(newmanifest, -1) {
		if mBlkRe.MatchString(tok) {
			newLoc[stripAllHints(tok)] = tok
		}
	}
	fs.fileSystem.root.Lock()
	defer fs.fileSystem.root.Unlock()
	eachNode(fs.root, func(fn *filenode) {
		fn.Lock()
		defer fn.Unlock()
		for idx, seg := range fn.segments {
			seg, ok := seg.(storedSegment)
			if !ok {
				continue
			}
			loc, ok := newLoc[stripAllHints(seg.locator)]
			if !ok {
				continue
			}
			// storedSegment is held by value, so the
			// updated copy must be written back.
			seg.locator = loc
			fn.segments[idx] = seg
		}
	}, nil)
}

// newNode returns a new, unattached node belonging to fs: a dirnode
// (with an empty child map) if perm has the directory bit set,
// otherwise a filenode. The names "", "." and ".." are rejected with
// ErrInvalidArgument.
func (fs *collectionFileSystem) newNode(name string, perm os.FileMode, modTime time.Time) (node inode, err error) {
	switch name {
	case "", ".", "..":
		return nil, ErrInvalidArgument
	}
	info := fileinfo{
		name:    name,
		modTime: modTime,
	}
	if !perm.IsDir() {
		info.mode = perm &^ os.ModeDir
		return &filenode{fs: fs, fileinfo: info}, nil
	}
	info.mode = perm | os.ModeDir
	dir := &dirnode{
		fs: fs,
		treenode: treenode{
			fileinfo: info,
			inodes:   make(map[string]inode),
		},
	}
	return dir, nil
}

// Child delegates to the Child method of the node returned by
// fs.rootnode().
func (fs *collectionFileSystem) Child(name string, replace func(inode) (inode, error)) (inode, error) {
	return fs.rootnode().Child(name, replace)
}

// FS returns fs itself.
func (fs *collectionFileSystem) FS() FileSystem {
	return fs
}

// FileInfo returns the FileInfo of the node returned by
// fs.rootnode().
func (fs *collectionFileSystem) FileInfo() os.FileInfo {
	return fs.rootnode().FileInfo()
}

// IsDir always returns true.
func (fs *collectionFileSystem) IsDir() bool {
	return true
}

// Lock calls Lock on the node returned by fs.rootnode().
func (fs *collectionFileSystem) Lock() {
	fs.rootnode().Lock()
}

// Unlock calls Unlock on the node returned by fs.rootnode().
func (fs *collectionFileSystem) Unlock() {
	fs.rootnode().Unlock()
}

// RLock calls RLock on the node returned by fs.rootnode().
func (fs *collectionFileSystem) RLock() {
	fs.rootnode().RLock()
}

// RUnlock calls RUnlock on the node returned by fs.rootnode().
func (fs *collectionFileSystem) RUnlock() {
	fs.rootnode().RUnlock()
}

// Parent returns the parent of the node returned by fs.rootnode().
func (fs *collectionFileSystem) Parent() inode {
	return fs.rootnode().Parent()
}

// Read is not supported on the filesystem itself: it always returns
// zero bytes, the unchanged ptr, and ErrInvalidOperation.
func (fs *collectionFileSystem) Read(_ []byte, ptr filenodePtr) (int, filenodePtr, error) {
	return 0, ptr, ErrInvalidOperation
}

// Write is not supported on the filesystem itself: it always returns
// zero bytes, the unchanged ptr, and ErrInvalidOperation.
func (fs *collectionFileSystem) Write(_ []byte, ptr filenodePtr) (int, filenodePtr, error) {
	return 0, ptr, ErrInvalidOperation
}

// Readdir returns the result of Readdir on the node returned by
// fs.rootnode().
func (fs *collectionFileSystem) Readdir() ([]os.FileInfo, error) {
	return fs.rootnode().Readdir()
}

// SetParent calls SetParent(parent, name) on the node returned by
// fs.rootnode().
func (fs *collectionFileSystem) SetParent(parent inode, name string) {
	fs.rootnode().SetParent(parent, name)
}

// Truncate is not supported on the filesystem itself: it always
// returns ErrInvalidOperation.
func (fs *collectionFileSystem) Truncate(int64) error {
	return ErrInvalidOperation
}

// Check for and incorporate upstream changes -- unless that has
// already been done recently, in which case this func is a no-op.
//
// "Changes" here means fresh block signatures: if less than half of
// the (estimated) signature TTL remains, the collection is fetched
// from the API server and the locators it contains are applied via
// updateSignatures.
//
// Request errors are not reported: on failure the next ID is tried,
// and the returned error is always nil.
func (fs *collectionFileSystem) checkChangesOnServer() error {
	// With neither a UUID nor a PDH there is nothing to look up.
	if fs.uuid == "" && fs.savedPDH.Load() == "" {
		return nil
	}

	// First try UUID if any, then last known PDH. Stop if all
	// signatures are new enough.
	checkingAll := false
	for _, id := range []string{fs.uuid, fs.savedPDH.Load().(string)} {
		if id == "" {
			continue
		}

		fs.lockCheckChanges.Lock()
		// The hold-off period only applies before the first
		// request of this call; once a fetch has been attempted
		// (checkingAll), keep going regardless.
		if !checkingAll && fs.holdCheckChanges.After(time.Now()) {
			fs.lockCheckChanges.Unlock()
			return nil
		}
		remain, ttl := fs.signatureTimeLeft()
		if remain > 0.01 && !checkingAll {
			// Suppress further checks for 1% of the TTL.
			fs.holdCheckChanges = time.Now().Add(ttl / 100)
		}
		fs.lockCheckChanges.Unlock()

		if remain >= 0.5 {
			// At least half of the TTL is left: no refresh
			// needed.
			break
		}
		checkingAll = true
		var coll Collection
		err := fs.RequestAndDecode(&coll, "GET", "arvados/v1/collections/"+id, nil, map[string]interface{}{"select": []string{"portable_data_hash", "manifest_text"}})
		if err != nil {
			// Best effort: fall through to the next ID.
			continue
		}
		fs.updateSignatures(coll.ManifestText)
	}
	return nil
}

// refreshSignature returns locator, or a replacement for it carrying
// a newer signature. Assume caller has lock.
//
// If the locator's expiry time cannot be determined, or is more than
// a minute away, the locator is returned unchanged and
// checkChangesOnServer is started in a new goroutine. Otherwise the
// collection is fetched synchronously by UUID and by last saved PDH
// (failed requests are skipped), the first matching block token in
// the returned manifests is used as the new locator, and
// updateSignatures is started in a new goroutine to apply the fetched
// manifests to the other stored segments.
func (fs *collectionFileSystem) refreshSignature(locator string) string {
	expiry, err := signatureExpiryTime(locator)
	if err != nil || time.Until(expiry) > time.Minute {
		// Synchronous update is not needed. Start an
		// asynchronous update if needed.
		go fs.checkChangesOnServer()
		return locator
	}

	var fetched string
	ids := []string{fs.uuid, fs.savedPDH.Load().(string)}
	for _, id := range ids {
		if id == "" {
			continue
		}
		var coll Collection
		if err := fs.RequestAndDecode(&coll, "GET", "arvados/v1/collections/"+id, nil, map[string]interface{}{"select": []string{"portable_data_hash", "manifest_text"}}); err != nil {
			continue
		}
		fetched += coll.ManifestText
	}

	want := stripAllHints(locator)
	tokens := regexp.MustCompile(`\S+`).FindAllString(fetched, -1)
	for _, tok := range tokens {
		if mBlkRe.MatchString(tok) && stripAllHints(tok) == want {
			locator = tok
			break
		}
	}
	go fs.updateSignatures(fetched)
	return locator
}

// Sync saves local changes to the collection record on the API
// server.
//
// It first calls checkChangesOnServer. If fs has no UUID there is
// nothing to update, and Sync returns nil. Otherwise the filesystem
// is marshalled to a manifest; if that manifest's portable data hash
// equals the last saved one, there are no local changes and Sync
// returns nil without making a request. Otherwise the manifest is
// sent in a PUT request. On success the locators in the resulting
// manifest text are applied via updateSignatures and the returned
// portable data hash is recorded as the last saved one.
//
// The response is asked to include manifest_text only when less than
// half of the estimated signature TTL remains.
//
// Errors from marshalling and from the update request are wrapped
// with %w, so callers can inspect the underlying cause with
// errors.Is / errors.As; the message text is unchanged.
func (fs *collectionFileSystem) Sync() error {
	err := fs.checkChangesOnServer()
	if err != nil {
		return err
	}
	if fs.uuid == "" {
		return nil
	}
	txt, err := fs.MarshalManifest(".")
	if err != nil {
		return fmt.Errorf("sync failed: %w", err)
	}
	if PortableDataHash(txt) == fs.savedPDH.Load() {
		// No local changes since last save or initial load.
		return nil
	}
	coll := Collection{
		UUID:         fs.uuid,
		ManifestText: txt,
	}

	selectFields := []string{"uuid", "portable_data_hash"}
	fs.lockCheckChanges.Lock()
	remain, _ := fs.signatureTimeLeft()
	fs.lockCheckChanges.Unlock()
	if remain < 0.5 {
		// Signatures are getting old: ask for the manifest
		// back so updateSignatures (below) can use it.
		selectFields = append(selectFields, "manifest_text")
	}

	err = fs.RequestAndDecode(&coll, "PUT", "arvados/v1/collections/"+fs.uuid, nil, map[string]interface{}{
		"collection": map[string]string{
			"manifest_text": coll.ManifestText,
		},
		"select": selectFields,
	})
	if err != nil {
		return fmt.Errorf("sync failed: update %s: %w", fs.uuid, err)
	}
	fs.updateSignatures(coll.ManifestText)
	fs.savedPDH.Store(coll.PortableDataHash)
	return nil
}

// Flush starts writing in-memory data under the directory at path to
// persistent storage, using a non-sync flush (flushOpts.sync is
// false).
//
// If path is empty, every child of the root is passed to flush,
// subdirectories included. Otherwise only the files directly inside
// the named directory are passed. shortBlocks is handed to flush
// unchanged.
//
// It returns the lookup error if path cannot be resolved, or
// ErrNotADirectory if path is not a directory.
func (fs *collectionFileSystem) Flush(path string, shortBlocks bool) error {
	target, err := rlookup(fs.fileSystem.root, path)
	if err != nil {
		return err
	}
	dn, isDir := target.(*dirnode)
	if !isDir {
		return ErrNotADirectory
	}
	dn.Lock()
	defer dn.Unlock()

	names := dn.sortedNames()
	if path != "" {
		// Caller only wants to flush the specified dir,
		// non-recursively.  Keep files, drop everything else.
		var filesOnly []string
		for _, name := range names {
			_, isFile := dn.inodes[name].(*filenode)
			if isFile {
				filesOnly = append(filesOnly, name)
			}
		}
		names = filesOnly
	}

	// Lock the selected children in name order; on return,
	// release them in reverse order (before dn itself).
	var unlockers []func()
	defer func() {
		for i := len(unlockers) - 1; i >= 0; i-- {
			unlockers[i]()
		}
	}()
	for _, name := range names {
		child := dn.inodes[name]
		child.Lock()
		unlockers = append(unlockers, child.Unlock)
	}

	opts := flushOpts{sync: false, shortBlocks: shortBlocks}
	return dn.flush(context.TODO(), names, opts)
}

// MemorySize locks the root node and returns the result of the root
// dirnode's MemorySize.
func (fs *collectionFileSystem) MemorySize() int64 {
	fs.fileSystem.root.Lock()
	defer fs.fileSystem.root.Unlock()
	return fs.fileSystem.root.(*dirnode).MemorySize()
}

// MarshalManifest locks the root node and returns the manifest
// produced by the root dirnode's marshalManifest for the given
// prefix.
func (fs *collectionFileSystem) MarshalManifest(prefix string) (string, error) {
	fs.fileSystem.root.Lock()
	defer fs.fileSystem.root.Unlock()
	return fs.fileSystem.root.(*dirnode).marshalManifest(context.TODO(), prefix)
}

// Size returns the root dirnode's TreeSize. No lock is taken here.
func (fs *collectionFileSystem) Size() int64 {
	return fs.fileSystem.root.(*dirnode).TreeSize()
}

// Snapshot returns the result of Snapshot on the root node.
func (fs *collectionFileSystem) Snapshot() (inode, error) {
	return fs.fileSystem.root.Snapshot()
}

// Splice returns the result of Splice(r) on the root node.
func (fs *collectionFileSystem) Splice(r inode) error {
	return fs.fileSystem.root.Splice(r)
}

// filenodePtr is an offset into a file that is (usually) efficient to
// seek to. Specifically, if filenode.repacked==filenodePtr.repacked
// then
// filenode.segments[filenodePtr.segmentIdx][filenodePtr.segmentOff]
// corresponds to file offset filenodePtr.off. Otherwise, it is
// necessary to reexamine len(filenode.segments[0]) etc. to find the
// correct segment and offset.
type filenodePtr struct {
	// off is the byte offset from the start of the file.
	off int64
	// segmentIdx is an index into filenode.segments;
	// len(filenode.segments) means EOF.
	segmentIdx int
	// segmentOff is the byte offset within segment segmentIdx.
	segmentOff int
	// repacked is the value of filenode.repacked for which
	// segmentIdx and segmentOff were computed.
	repacked int64
}

// seek returns a ptr that is consistent with both startPtr.off and
// the current state of fn. The caller must already hold fn.RLock() or
// fn.Lock().
//
// If startPtr.off is negative, startPtr is returned unchanged.
//
// If startPtr is beyond EOF, ptr.segment* will indicate precisely
// EOF.
//
// After seeking:
//
//     ptr.segmentIdx == len(filenode.segments) // i.e., at EOF
//     ||
//     filenode.segments[ptr.segmentIdx].Len() > ptr.segmentOff
func (fn *filenode) seek(startPtr filenodePtr) (ptr filenodePtr) {
	ptr = startPtr
	switch {
	case ptr.off < 0:
		// meaningless anyway
		return
	case ptr.off >= fn.fileinfo.size:
		// At or beyond EOF.
		ptr.segmentIdx = len(fn.segments)
		ptr.segmentOff = 0
		ptr.repacked = fn.repacked
		return
	case ptr.repacked == fn.repacked:
		// segmentIdx and segmentOff accurately reflect
		// ptr.off, but might have fallen off the end of a
		// segment
		if ptr.segmentOff >= fn.segments[ptr.segmentIdx].Len() {
			ptr.segmentIdx++
			ptr.segmentOff = 0
		}
		return
	}
	// The segment list has changed since ptr was computed.
	// Recompute segmentIdx and segmentOff.  We have already
	// established fn.fileinfo.size > ptr.off >= 0, so we don't
	// have to deal with edge cases here.
	ptr.repacked = fn.repacked
	var off int64
	for ptr.segmentIdx, ptr.segmentOff = 0, 0; off < ptr.off; ptr.segmentIdx++ {
		// This would panic (index out of range) if
		// fn.fileinfo.size were larger than
		// sum(fn.segments[i].Len()) -- but that can't happen
		// because we have ensured fn.fileinfo.size is always
		// accurate.
		segLen := int64(fn.segments[ptr.segmentIdx].Len())
		if off+segLen > ptr.off {
			// Target offset is inside this segment.
			ptr.segmentOff = int(ptr.off - off)
			break
		}
		off += segLen
	}
	return
}

// filenode implements inode. It represents a regular file whose
// content is the concatenation of its segments.
type filenode struct {
	parent inode
	// fs is the filesystem this node belongs to; it is left unset
	// by Snapshot and assigned by Splice.
	fs       *collectionFileSystem
	fileinfo fileinfo
	// segments holds the file content in order.
	segments []segment
	// number of times `segments` has changed in a
	// way that might invalidate a filenodePtr
	repacked int64
	memsize  int64 // bytes in memSegments
	sync.RWMutex
	nullnode
}

// appendSegment adds e to the end of the file and increases the
// recorded file size by e.Len(). It does not touch fn.memsize or
// fn.repacked.
//
// caller must have lock
func (fn *filenode) appendSegment(e segment) {
	fn.segments = append(fn.segments, e)
	fn.fileinfo.size += int64(e.Len())
}

// SetParent sets the node's parent to p and its name to name, while
// holding the node's write lock.
func (fn *filenode) SetParent(p inode, name string) {
	fn.Lock()
	defer fn.Unlock()
	fn.parent = p
	fn.fileinfo.name = name
}

// Parent returns the node's parent, read under the node's read lock.
func (fn *filenode) Parent() inode {
	fn.RLock()
	defer fn.RUnlock()
	return fn.parent
}

// FS returns the filesystem the node belongs to (fn.fs).
func (fn *filenode) FS() FileSystem {
	return fn.fs
}

// Read copies data into p from the one segment that contains
// startPtr's offset, and returns the number of bytes read along with
// a ptr advanced past them. startPtr is not assumed to be up-to-date:
// it is passed through seek first. Caller must have RLock or Lock.
//
// A negative offset yields ErrNegativeOffset; an offset at or past
// the last segment yields io.EOF. If the segment is a storedSegment,
// its locator is first passed through fs.refreshSignature and the
// result is stored back into fn.segments. An io.EOF reported at the
// end of a segment other than the last one is suppressed.
func (fn *filenode) Read(p []byte, startPtr filenodePtr) (n int, ptr filenodePtr, err error) {
	ptr = fn.seek(startPtr)
	switch {
	case ptr.off < 0:
		err = ErrNegativeOffset
		return
	case ptr.segmentIdx >= len(fn.segments):
		err = io.EOF
		return
	}

	if stored, isStored := fn.segments[ptr.segmentIdx].(storedSegment); isStored {
		stored.locator = fn.fs.refreshSignature(stored.locator)
		fn.segments[ptr.segmentIdx] = stored
	}

	n, err = fn.segments[ptr.segmentIdx].ReadAt(p, int64(ptr.segmentOff))
	if n <= 0 {
		return
	}
	ptr.off += int64(n)
	ptr.segmentOff += n
	if ptr.segmentOff != fn.segments[ptr.segmentIdx].Len() {
		// Still inside the same segment.
		return
	}
	// Reached the end of this segment: point at the next one.
	ptr.segmentIdx++
	ptr.segmentOff = 0
	if err == io.EOF && ptr.segmentIdx < len(fn.segments) {
		// End of segment, not end of file.
		err = nil
	}
	return
}

// Size returns the file size recorded in fn.fileinfo, read under the
// node's read lock.
func (fn *filenode) Size() int64 {
	fn.RLock()
	defer fn.RUnlock()
	return fn.fileinfo.Size()
}

// FileInfo returns a copy of the node's fileinfo, read under the
// node's read lock.
func (fn *filenode) FileInfo() os.FileInfo {
	fn.RLock()
	defer fn.RUnlock()
	return fn.fileinfo
}

// Truncate acquires the node's write lock and calls truncate(size).
func (fn *filenode) Truncate(size int64) error {
	fn.Lock()
	defer fn.Unlock()
	return fn.truncate(size)
}

// truncate changes the file size to size, and always returns nil.
// Callers in this file (Truncate, Write) hold fn's write lock.
//
// If size equals the current size, nothing changes. Otherwise
// fn.repacked is incremented. When shrinking, segments beyond the new
// end are dropped and a partially kept final segment is cut down to
// the bytes that remain. When growing, the last memSegment is
// extended, or new memSegments are appended, such that no memSegment
// grows beyond maxBlockSize.
//
// fn.memsize is kept in step with the bytes held in memSegments.
func (fn *filenode) truncate(size int64) error {
	if size == fn.fileinfo.size {
		return nil
	}
	fn.repacked++
	if size < fn.fileinfo.size {
		ptr := fn.seek(filenodePtr{off: size})
		// Un-count every memSegment from the cut point onward;
		// the kept part of a partially retained memSegment is
		// re-counted below.
		for i := ptr.segmentIdx; i < len(fn.segments); i++ {
			if seg, ok := fn.segments[i].(*memSegment); ok {
				fn.memsize -= int64(seg.Len())
			}
		}
		if ptr.segmentOff == 0 {
			// Cut falls on a segment boundary.
			fn.segments = fn.segments[:ptr.segmentIdx]
		} else {
			// Cut falls inside segment ptr.segmentIdx: keep
			// its first ptr.segmentOff bytes.
			fn.segments = fn.segments[:ptr.segmentIdx+1]
			switch seg := fn.segments[ptr.segmentIdx].(type) {
			case *memSegment:
				seg.Truncate(ptr.segmentOff)
				fn.memsize += int64(seg.Len())
			default:
				fn.segments[ptr.segmentIdx] = seg.Slice(0, ptr.segmentOff)
			}
		}
		fn.fileinfo.size = size
		return nil
	}
	// Growing: each pass extends the file by at most what fits in
	// one memSegment.
	for size > fn.fileinfo.size {
		grow := size - fn.fileinfo.size
		var seg *memSegment
		var ok bool
		if len(fn.segments) == 0 {
			seg = &memSegment{}
			fn.segments = append(fn.segments, seg)
		} else if seg, ok = fn.segments[len(fn.segments)-1].(*memSegment); !ok || seg.Len() >= maxBlockSize {
			// Last segment is not a memSegment, or is already
			// full: start a new one.
			seg = &memSegment{}
			fn.segments = append(fn.segments, seg)
		}
		if maxgrow := int64(maxBlockSize - seg.Len()); maxgrow < grow {
			grow = maxgrow
		}
		seg.Truncate(seg.Len() + int(grow))
		fn.fileinfo.size += grow
		fn.memsize += grow
	}
	return nil
}

// Write writes data from p to the file, starting at startPtr,
// extending the file size if necessary. Caller must have Lock.
//
// If startPtr.off is beyond the current end of file, the file is
// first extended to startPtr.off with truncate. A negative offset
// yields ErrNegativeOffset.
//
// Data is written in chunks ("cando"), each at most maxBlockSize
// bytes and each landing entirely within one *memSegment. Before
// each chunk is copied, fn.segments is rearranged so that a writable
// segment exists at the target position. fn.repacked and ptr.repacked
// are incremented together whenever existing segment positions may
// have changed.
//
// It returns the number of bytes written and a ptr positioned just
// after them. The file's modTime is updated after each chunk.
func (fn *filenode) Write(p []byte, startPtr filenodePtr) (n int, ptr filenodePtr, err error) {
	if startPtr.off > fn.fileinfo.size {
		if err = fn.truncate(startPtr.off); err != nil {
			return 0, startPtr, err
		}
	}
	ptr = fn.seek(startPtr)
	if ptr.off < 0 {
		err = ErrNegativeOffset
		return
	}
	for len(p) > 0 && err == nil {
		cando := p
		if len(cando) > maxBlockSize {
			cando = cando[:maxBlockSize]
		}
		// Rearrange/grow fn.segments (and shrink cando if
		// needed) such that cando can be copied to
		// fn.segments[ptr.segmentIdx] at offset
		// ptr.segmentOff.
		cur := ptr.segmentIdx
		prev := ptr.segmentIdx - 1
		// curWritable: the segment at the write position exists
		// and is a *memSegment.
		var curWritable bool
		if cur < len(fn.segments) {
			_, curWritable = fn.segments[cur].(*memSegment)
		}
		// prevAppendable: the preceding segment is a
		// *memSegment with room left before maxBlockSize.
		var prevAppendable bool
		if prev >= 0 && fn.segments[prev].Len() < maxBlockSize {
			_, prevAppendable = fn.segments[prev].(*memSegment)
		}
		if ptr.segmentOff > 0 && !curWritable {
			// Split a non-writable block.
			if max := fn.segments[cur].Len() - ptr.segmentOff; max <= len(cando) {
				// Truncate cur, and insert a new
				// segment after it.
				cando = cando[:max]
				fn.segments = append(fn.segments, nil)
				copy(fn.segments[cur+1:], fn.segments[cur:])
			} else {
				// Split cur into two copies, truncate
				// the one on the left, shift the one
				// on the right, and insert a new
				// segment between them.
				fn.segments = append(fn.segments, nil, nil)
				copy(fn.segments[cur+2:], fn.segments[cur:])
				fn.segments[cur+2] = fn.segments[cur+2].Slice(ptr.segmentOff+len(cando), -1)
			}
			cur++
			prev++
			seg := &memSegment{}
			seg.Truncate(len(cando))
			fn.memsize += int64(len(cando))
			fn.segments[cur] = seg
			fn.segments[prev] = fn.segments[prev].Slice(0, ptr.segmentOff)
			ptr.segmentIdx++
			ptr.segmentOff = 0
			fn.repacked++
			ptr.repacked++
		} else if curWritable {
			// Overwrite in place; never write past the end of
			// the current segment in one chunk.
			if fit := int(fn.segments[cur].Len()) - ptr.segmentOff; fit < len(cando) {
				cando = cando[:fit]
			}
		} else {
			// Write position is at the start of a non-writable
			// segment, or at EOF.
			if prevAppendable {
				// Shrink cando if needed to fit in
				// prev segment.
				if cangrow := maxBlockSize - fn.segments[prev].Len(); cangrow < len(cando) {
					cando = cando[:cangrow]
				}
			}

			if cur == len(fn.segments) {
				// ptr is at EOF, filesize is changing.
				fn.fileinfo.size += int64(len(cando))
			} else if el := fn.segments[cur].Len(); el <= len(cando) {
				// cando is long enough that we won't
				// need cur any more. shrink cando to
				// be exactly as long as cur
				// (otherwise we'd accidentally shift
				// the effective position of all
				// segments after cur).
				cando = cando[:el]
				copy(fn.segments[cur:], fn.segments[cur+1:])
				fn.segments = fn.segments[:len(fn.segments)-1]
			} else {
				// shrink cur by the same #bytes we're growing prev
				fn.segments[cur] = fn.segments[cur].Slice(len(cando), -1)
			}

			if prevAppendable {
				// Grow prev.
				ptr.segmentIdx--
				ptr.segmentOff = fn.segments[prev].Len()
				fn.segments[prev].(*memSegment).Truncate(ptr.segmentOff + len(cando))
				fn.memsize += int64(len(cando))
				ptr.repacked++
				fn.repacked++
			} else {
				// Insert a segment between prev and
				// cur, and advance prev/cur.
				fn.segments = append(fn.segments, nil)
				// NOTE(review): cur is at most the
				// pre-append length, so after the append
				// above this condition looks always true
				// and the else branch unreachable --
				// confirm whether the test was meant to
				// use the pre-append length.
				if cur < len(fn.segments) {
					copy(fn.segments[cur+1:], fn.segments[cur:])
					ptr.repacked++
					fn.repacked++
				} else {
					// appending a new segment does
					// not invalidate any ptrs
				}
				seg := &memSegment{}
				seg.Truncate(len(cando))
				fn.memsize += int64(len(cando))
				fn.segments[cur] = seg
			}
		}

		// Finally we can copy bytes from cando to the current segment.
		fn.segments[ptr.segmentIdx].(*memSegment).WriteAt(cando, ptr.segmentOff)
		n += len(cando)
		p = p[len(cando):]

		ptr.off += int64(len(cando))
		ptr.segmentOff += len(cando)
		if ptr.segmentOff >= maxBlockSize {
			// The segment just written is full: start writing
			// full segments out in the background.
			fn.pruneMemSegments()
		}
		if fn.segments[ptr.segmentIdx].Len() == ptr.segmentOff {
			// At the end of the segment: point at the start of
			// the next one.
			ptr.segmentOff = 0
			ptr.segmentIdx++
		}

		fn.fileinfo.modTime = time.Now()
	}
	return
}

// Write some data out to persistent storage (via BlockWrite) to
// reduce memory use. Caller must have write lock.
//
// Only memSegments that are at least maxBlockSize long and do not
// already have a flush recorded in seg.flushing are written. Each
// write runs in its own goroutine; on success, if the segment is
// still in the same place and unchanged, it is replaced by a
// storedSegment that references the new block. Write errors are
// currently dropped (see TODO below).
func (fn *filenode) pruneMemSegments() {
	// TODO: share code with (*dirnode)flush()
	// TODO: pack/flush small blocks too, when fragmented
	for idx, seg := range fn.segments {
		seg, ok := seg.(*memSegment)
		if !ok || seg.Len() < maxBlockSize || seg.flushing != nil {
			continue
		}
		// Setting seg.flushing guarantees seg.buf will not be
		// modified in place: WriteAt and Truncate will
		// allocate a new buf instead, if necessary.
		idx, buf := idx, seg.buf
		done := make(chan struct{})
		seg.flushing = done
		// If lots of background writes are already in
		// progress, block here until one finishes, rather
		// than pile up an unlimited number of buffered writes
		// and network flush operations.
		fn.fs.throttle().Acquire()
		go func() {
			// Closing done is what unblocks waitPrune.
			defer close(done)
			resp, err := fn.FS().BlockWrite(context.Background(), BlockWriteOptions{
				Data:           buf,
				Replicas:       fn.fs.replicas,
				StorageClasses: fn.fs.storageClasses,
			})
			fn.fs.throttle().Release()
			fn.Lock()
			defer fn.Unlock()
			if seg.flushing != done {
				// A new seg.buf has been allocated.
				return
			}
			if err != nil {
				// TODO: stall (or return errors from)
				// subsequent writes until flushing
				// starts to succeed.
				return
			}
			if len(fn.segments) <= idx || fn.segments[idx] != seg || len(seg.buf) != len(buf) {
				// Segment has been dropped/moved/resized.
				return
			}
			fn.memsize -= int64(len(buf))
			fn.segments[idx] = storedSegment{
				kc:      fn.FS(),
				locator: resp.Locator,
				size:    len(buf),
				offset:  0,
				length:  len(buf),
			}
		}()
	}
}

// waitPrune blocks until every flush that was recorded on one of
// fn's memSegments at the time of the call has finished. The
// channels are collected under fn's write lock and waited on after
// the lock is released. Caller must NOT have lock.
func (fn *filenode) waitPrune() {
	var inflight []<-chan struct{}
	fn.Lock()
	for i := range fn.segments {
		ms, isMem := fn.segments[i].(*memSegment)
		if !isMem || ms.flushing == nil {
			continue
		}
		inflight = append(inflight, ms.flushing)
	}
	fn.Unlock()
	for _, finished := range inflight {
		<-finished
	}
}

// Snapshot returns a new filenode carrying a copy of fn's fileinfo
// and, for each of fn's segments, the result of slicing that segment
// over its full length. The new node has no parent and no fs. The
// returned error is always nil.
func (fn *filenode) Snapshot() (inode, error) {
	fn.RLock()
	defer fn.RUnlock()
	copied := make([]segment, len(fn.segments))
	for i, seg := range fn.segments {
		copied[i] = seg.Slice(0, seg.Len())
	}
	snap := &filenode{
		fileinfo: fn.fileinfo,
		segments: copied,
	}
	return snap, nil
}

// Splice replaces fn, in its parent directory, with a snapshot of
// repl.
//
// The snapshot is taken before any lock is acquired; then the parent
// and fn are write-locked (in that order) while the parent's entry
// for fn's name is swapped. The replacement then takes over fn's
// parent, name and filesystem. A snapshot that is neither a
// *dirnode nor a *filenode results in an error wrapping
// ErrInvalidArgument.
func (fn *filenode) Splice(repl inode) error {
	repl, err := repl.Snapshot()
	if err != nil {
		return err
	}

	fn.parent.Lock()
	defer fn.parent.Unlock()
	fn.Lock()
	defer fn.Unlock()

	swap := func(inode) (inode, error) {
		return repl, nil
	}
	if _, err = fn.parent.Child(fn.fileinfo.name, swap); err != nil {
		return err
	}

	switch node := repl.(type) {
	case *filenode:
		node.parent = fn.parent
		node.fileinfo.name = fn.fileinfo.name
		node.fs = fn.fs
	case *dirnode:
		node.parent = fn.parent
		node.fileinfo.name = fn.fileinfo.name
		node.setTreeFS(fn.fs)
	default:
		return fmt.Errorf("cannot splice snapshot containing %T: %w", node, ErrInvalidArgument)
	}
	return nil
}

// dirnode is a directory in a collectionFileSystem. The embedded
// treenode holds its fileinfo and its children (inodes, keyed by
// name).
type dirnode struct {
	// fs is the filesystem this directory belongs to.
	fs *collectionFileSystem
	treenode
}

// FS returns the filesystem the directory belongs to (dn.fs).
func (dn *dirnode) FS() FileSystem {
	return dn.fs
}

// Child returns the named child of dn, delegating to the embedded
// treenode -- except for the special name ".arvados#collection" in
// the filesystem's root directory.
//
// For that name, replace is ignored and a new getternode is
// returned. Its Getter marshals the filesystem's current manifest
// (MarshalManifest(".")) and UUID into a Collection, encodes it as
// JSON, and appends a newline.
func (dn *dirnode) Child(name string, replace func(inode) (inode, error)) (inode, error) {
	if dn != dn.fs.rootnode() || name != ".arvados#collection" {
		return dn.treenode.Child(name, replace)
	}

	render := func() ([]byte, error) {
		manifest, err := dn.fs.MarshalManifest(".")
		if err != nil {
			return nil, err
		}
		var coll Collection
		coll.ManifestText = manifest
		coll.UUID = dn.fs.uuid
		encoded, err := json.Marshal(&coll)
		if err != nil {
			return encoded, err
		}
		return append(encoded, '\n'), nil
	}
	special := &getternode{Getter: render}
	special.SetParent(dn, name)
	return special, nil
}

// fnSegmentRef identifies one segment of one file: fn.segments[idx].
type fnSegmentRef struct {
	fn  *filenode
	idx int
}

// commitBlock concatenates the data from the given filenode segments
// (which must be *memSegments), writes the data out to Keep as a
// single block, and replaces the filenodes' *memSegments with
// storedSegments that reference the relevant portions of the new
// block.
//
// bufsize is the total data size in refs. It is used to preallocate
// the correct amount of memory when len(refs)>1.
//
// If sync is false, commitBlock returns right away, after starting a
// goroutine to do the writes, reacquire the filenodes' locks, and
// swap out the *memSegments. Some filenodes' segments might get
// modified/rearranged in the meantime, in which case commitBlock
// won't replace them.
//
// If sync is true, commitBlock waits for the write and returns its
// error, if any. In async mode a write error is not returned.
//
// If ctx is already done on entry, its error is returned and nothing
// is written.
//
// Caller must have write lock.
func (dn *dirnode) commitBlock(ctx context.Context, refs []fnSegmentRef, bufsize int, sync bool) error {
	if len(refs) == 0 {
		return nil
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	// done is closed when this commit finishes (or is abandoned);
	// it is stored in each segment's flushing field.
	done := make(chan struct{})
	var block []byte
	segs := make([]*memSegment, 0, len(refs))
	offsets := make([]int, 0, len(refs)) // location of segment's data within block
	for _, ref := range refs {
		seg := ref.fn.segments[ref.idx].(*memSegment)
		if !sync && seg.flushingUnfinished() {
			// Let the other flushing goroutine finish. If
			// it fails, we'll try again next time.
			close(done)
			return nil
		}
		// In sync mode, we proceed regardless of
		// whether another flush is in progress: It
		// can't finish before we do, because we hold
		// fn's lock until we finish our own writes.
		seg.flushing = done
		offsets = append(offsets, len(block))
		if len(refs) == 1 {
			// Single segment: use its buffer directly
			// instead of copying.
			block = seg.buf
		} else if block == nil {
			block = append(make([]byte, 0, bufsize), seg.buf...)
		} else {
			block = append(block, seg.buf...)
		}
		segs = append(segs, seg)
	}
	blocksize := len(block)
	dn.fs.throttle().Acquire()
	// Buffered so the goroutine can report an error even when
	// nobody is receiving (async mode).
	errs := make(chan error, 1)
	go func() {
		defer close(done)
		defer close(errs)
		// NOTE(review): the write uses context.Background()
		// rather than ctx, so cancelling ctx does not appear
		// to interrupt a write in progress -- confirm this is
		// intended.
		resp, err := dn.fs.BlockWrite(context.Background(), BlockWriteOptions{
			Data:           block,
			Replicas:       dn.fs.replicas,
			StorageClasses: dn.fs.storageClasses,
		})
		dn.fs.throttle().Release()
		if err != nil {
			errs <- err
			return
		}
		for idx, ref := range refs {
			if !sync {
				ref.fn.Lock()
				// In async mode, fn's lock was
				// released while we were waiting for
				// BlockWrite(); lots of things might
				// have changed.
				if len(ref.fn.segments) <= ref.idx {
					// file segments have
					// rearranged or changed in
					// some way
					ref.fn.Unlock()
					continue
				} else if seg, ok := ref.fn.segments[ref.idx].(*memSegment); !ok || seg != segs[idx] {
					// segment has been replaced
					ref.fn.Unlock()
					continue
				} else if seg.flushing != done {
					// seg.buf has been replaced
					ref.fn.Unlock()
					continue
				}
			}
			data := ref.fn.segments[ref.idx].(*memSegment).buf
			ref.fn.segments[ref.idx] = storedSegment{
				kc:      dn.fs,
				locator: resp.Locator,
				size:    blocksize,
				offset:  offsets[idx],
				length:  len(data),
			}
			// atomic is needed here despite caller having
			// lock: caller might be running concurrent
			// commitBlock() goroutines using the same
			// lock, writing different segments from the
			// same file.
			atomic.AddInt64(&ref.fn.memsize, -int64(len(data)))
			if !sync {
				ref.fn.Unlock()
			}
		}
	}()
	if sync {
		// Yields the write error, or nil once errs is closed.
		return <-errs
	}
	return nil
}

// flushOpts controls the behavior of (*dirnode)flush.
type flushOpts struct {
	// sync is passed to commitBlock: if true, each block write
	// is waited for and its error is returned.
	sync bool
	// shortBlocks, if true, makes flush also commit the final
	// group of small segments that has not reached maxBlockSize.
	shortBlocks bool
}

// flush in-memory data and remote-cluster block references (for the
// children with the given names, which must be children of dn) to
// local-cluster persistent storage.
//
// Caller must have write lock on dn and the named children.
//
// If any children are dirs, they will be flushed recursively.
//
// memSegments larger than maxBlockSize/2 are committed as a block of
// their own. Smaller ones are accumulated, in order, and committed
// together before the accumulated size would exceed maxBlockSize.
// Whatever remains accumulated at the end is committed only if
// opts.shortBlocks is set.
func (dn *dirnode) flush(ctx context.Context, names []string, opts flushOpts) error {
	cg := newContextGroup(ctx)
	defer cg.Cancel()

	// goCommit runs commitBlock for refs in the context group.
	goCommit := func(refs []fnSegmentRef, bufsize int) {
		cg.Go(func() error {
			return dn.commitBlock(cg.Context(), refs, bufsize, opts.sync)
		})
	}

	// pending/pendingLen: small segments waiting to be packed
	// into one block, and their total size.
	var pending []fnSegmentRef
	var pendingLen int = 0
	// localLocator caches LocalLocator results by input locator.
	localLocator := map[string]string{}
	for _, name := range names {
		switch node := dn.inodes[name].(type) {
		case *dirnode:
			grandchildNames := node.sortedNames()
			// Lock the subdirectory's children for the
			// recursive flush; they stay locked until this
			// function returns.
			for _, grandchildName := range grandchildNames {
				grandchild := node.inodes[grandchildName]
				grandchild.Lock()
				defer grandchild.Unlock()
			}
			cg.Go(func() error { return node.flush(cg.Context(), grandchildNames, opts) })
		case *filenode:
			for idx, seg := range node.segments {
				switch seg := seg.(type) {
				case storedSegment:
					// Replace the locator with its local
					// equivalent.
					loc, ok := localLocator[seg.locator]
					if !ok {
						var err error
						loc, err = dn.fs.LocalLocator(seg.locator)
						if err != nil {
							return err
						}
						localLocator[seg.locator] = loc
					}
					seg.locator = loc
					node.segments[idx] = seg
				case *memSegment:
					if seg.Len() > maxBlockSize/2 {
						// Big enough to be a block by
						// itself.
						goCommit([]fnSegmentRef{{node, idx}}, seg.Len())
						continue
					}
					if pendingLen+seg.Len() > maxBlockSize {
						// Adding this segment would
						// overflow a block: commit what
						// has accumulated and start over.
						goCommit(pending, pendingLen)
						pending = nil
						pendingLen = 0
					}
					pending = append(pending, fnSegmentRef{node, idx})
					pendingLen += seg.Len()
				default:
					panic(fmt.Sprintf("can't sync segment type %T", seg))
				}
			}
		}
	}
	if opts.shortBlocks {
		goCommit(pending, pendingLen)
	}
	return cg.Wait()
}

// MemorySize returns an estimate of the memory used by the tree
// rooted at dn: 64 bytes for each directory, file and segment, plus
// the length of every memSegment.
//
// Each child is write-locked while it is examined and stays locked
// until this call returns.
//
// caller must have write lock.
func (dn *dirnode) MemorySize() (size int64) {
	for _, name := range dn.sortedNames() {
		child := dn.inodes[name]
		child.Lock()
		defer child.Unlock()

		if sub, isDir := child.(*dirnode); isDir {
			size += sub.MemorySize()
			continue
		}
		file, isFile := child.(*filenode)
		if !isFile {
			continue
		}
		size += 64
		for _, seg := range file.segments {
			if ms, isMem := seg.(*memSegment); isMem {
				size += int64(ms.Len())
			}
			size += 64
		}
	}
	return 64 + size
}

// sortedNames returns the names of dn's children in ascending
// (sort.Strings) order.
//
// caller must have write lock.
func (dn *dirnode) sortedNames() []string {
	names := make([]string, len(dn.inodes))
	next := 0
	for name := range dn.inodes {
		names[next] = name
		next++
	}
	sort.Strings(names)
	return names
}

// marshalManifest returns manifest text for dn and everything
// below it. prefix is used as the stream name for dn's own files,
// and prefix+"/"+name for each subdirectory.
//
// Files directly inside dn are flushed with flushOpts{sync: true,
// shortBlocks: true} and then written as one stream line.
// Subdirectories are marshalled recursively; their text is
// appended after dn's own line, in sorted name order. An empty
// directory yields "" when prefix is ".", and otherwise a stream
// containing a single zero-length file named `\056`.
//
// Every child is locked here and stays locked until
// marshalManifest returns. The subdirectories and dn's own stream
// are processed in goroutines started via a context group derived
// from ctx; the returned error is the result of cg.Wait().
//
// caller must have write lock.
func (dn *dirnode) marshalManifest(ctx context.Context, prefix string) (string, error) {
	cg := newContextGroup(ctx)
	defer cg.Cancel()

	if len(dn.inodes) == 0 {
		if prefix == "." {
			return "", nil
		}
		// Express the existence of an empty directory by
		// adding an empty file named `\056`, which (unlike
		// the more obvious spelling `.`) is accepted by the
		// API's manifest validator.
		return manifestEscape(prefix) + " d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\056\n", nil
	}

	names := dn.sortedNames()

	// Wait for children to finish any pending write operations
	// before locking them.
	for _, name := range names {
		node := dn.inodes[name]
		if fn, ok := node.(*filenode); ok {
			fn.waitPrune()
		}
	}

	// Lock every child (released when this func returns) and
	// sort the names into directories and files. Both lists
	// inherit the sorted order of names.
	var dirnames []string
	var filenames []string
	for _, name := range names {
		node := dn.inodes[name]
		node.Lock()
		defer node.Unlock()
		switch node := node.(type) {
		case *dirnode:
			dirnames = append(dirnames, name)
		case *filenode:
			filenames = append(filenames, name)
		default:
			panic(fmt.Sprintf("can't marshal inode type %T", node))
		}
	}

	// subdirs[i] receives the manifest text for dirnames[i]. Each
	// goroutine writes only its own slot, and the slots are read
	// only after cg.Wait().
	subdirs := make([]string, len(dirnames))
	rootdir := ""
	for i, name := range dirnames {
		// Per-iteration copies for the closure below.
		i, name := i, name
		cg.Go(func() error {
			txt, err := dn.inodes[name].(*dirnode).marshalManifest(cg.Context(), prefix+"/"+name)
			subdirs[i] = txt
			return err
		})
	}

	// Build the stream line for the files directly inside dn.
	cg.Go(func() error {
		// streamLen is the total size of the blocks listed so
		// far, i.e., the stream offset where the next new
		// block starts.
		var streamLen int64
		type filepart struct {
			name   string
			offset int64
			length int64
		}

		var fileparts []filepart
		var blocks []string
		if err := dn.flush(cg.Context(), filenames, flushOpts{sync: true, shortBlocks: true}); err != nil {
			return err
		}
		for _, name := range filenames {
			node := dn.inodes[name].(*filenode)
			if len(node.segments) == 0 {
				// Empty file: a 0:0 file token with no blocks.
				fileparts = append(fileparts, filepart{name: name})
				continue
			}
			for _, seg := range node.segments {
				switch seg := seg.(type) {
				case storedSegment:
					if len(blocks) > 0 && blocks[len(blocks)-1] == seg.locator {
						// Same block as the previous
						// segment: don't list it again.
						// Back streamLen up to the start
						// of that block so the offset
						// computed below is relative to
						// it.
						streamLen -= int64(seg.size)
					} else {
						blocks = append(blocks, seg.locator)
					}
					next := filepart{
						name:   name,
						offset: streamLen + int64(seg.offset),
						length: int64(seg.length),
					}
					// Extend the previous filepart instead
					// of adding a new one if it belongs to
					// the same file and ends exactly where
					// this one starts.
					if prev := len(fileparts) - 1; prev >= 0 &&
						fileparts[prev].name == name &&
						fileparts[prev].offset+fileparts[prev].length == next.offset {
						fileparts[prev].length += next.length
					} else {
						fileparts = append(fileparts, next)
					}
					streamLen += int64(seg.size)
				default:
					// This can't happen: we
					// haven't unlocked since
					// calling flush(sync=true).
					panic(fmt.Sprintf("can't marshal segment type %T", seg))
				}
			}
		}
		var filetokens []string
		for _, s := range fileparts {
			filetokens = append(filetokens, fmt.Sprintf("%d:%d:%s", s.offset, s.length, manifestEscape(s.name)))
		}
		if len(filetokens) == 0 {
			// No files directly in dn: no stream line for
			// dn itself.
			return nil
		} else if len(blocks) == 0 {
			// Only empty files: list the zero-length block
			// so the stream line still has a locator.
			blocks = []string{"d41d8cd98f00b204e9800998ecf8427e+0"}
		}
		rootdir = manifestEscape(prefix) + " " + strings.Join(blocks, " ") + " " + strings.Join(filetokens, " ") + "\n"
		return nil
	})
	err := cg.Wait()
	return rootdir + strings.Join(subdirs, ""), err
}

// loadManifest parses the manifest text txt and adds the files it
// describes to the tree rooted at dn.
//
// txt is a sequence of newline-terminated stream lines (an empty
// txt is accepted and adds nothing). Each line is a
// space-separated list of tokens: first the stream name, then one
// or more block locators (tokens containing no ':'), then one or
// more file segment tokens of the form offset:length:name. Stream
// and file names are passed through manifestUnescape and split on
// "/" to obtain path components.
//
// For each file segment token, the named file is created if
// needed (see createFileAndParents) and storedSegments covering
// the stream byte range [offset, offset+length) are appended to
// it.
//
// On the first malformed line, an error mentioning the 1-based
// line number is returned. Nothing is rolled back here: nodes
// added before the error was detected stay in the tree.
func (dn *dirnode) loadManifest(txt string) error {
	streams := bytes.Split([]byte(txt), []byte{'\n'})
	if len(streams[len(streams)-1]) != 0 {
		return fmt.Errorf("line %d: no trailing newline", len(streams))
	}
	// Drop the empty element that follows the final newline.
	streams = streams[:len(streams)-1]
	// segments holds the blocks listed on the current stream
	// line; it is truncated and reused for each line.
	segments := []storedSegment{}
	// To reduce allocs, we reuse a single "pathparts" slice
	// (pre-split on "/" separators) for the duration of this
	// func.
	var pathparts []string
	// To reduce allocs, we reuse a single "toks" slice of 3 byte
	// slices.
	var toks = make([][]byte, 3)
	// Similar to bytes.SplitN(token, []byte{c}, 3), but splits
	// into the toks slice rather than allocating a new one, and
	// returns the number of toks (1, 2, or 3).
	splitToToks := func(src []byte, c rune) int {
		c1 := bytes.IndexRune(src, c)
		if c1 < 0 {
			toks[0] = src
			return 1
		}
		toks[0], src = src[:c1], src[c1+1:]
		c2 := bytes.IndexRune(src, c)
		if c2 < 0 {
			toks[1] = src
			return 2
		}
		toks[1], toks[2] = src[:c2], src[c2+1:]
		return 3
	}
	for i, stream := range streams {
		lineno := i + 1
		var anyFileTokens bool
		// pos is the stream offset at which segments[segIdx]
		// starts. Both persist across the file tokens of one
		// line, so a token that starts at or after pos resumes
		// the scan instead of restarting at the first block.
		var pos int64
		var segIdx int
		segments = segments[:0]
		pathparts = nil
		// streamparts is the number of leading pathparts
		// entries that belong to the stream name.
		streamparts := 0
		for i, token := range bytes.Split(stream, []byte{' '}) {
			if i == 0 {
				pathparts = strings.Split(manifestUnescape(string(token)), "/")
				streamparts = len(pathparts)
				continue
			}
			if !bytes.ContainsRune(token, ':') {
				// Block locator. All locators must come
				// before the first file token.
				if anyFileTokens {
					return fmt.Errorf("line %d: bad file segment %q", lineno, token)
				}
				// The block size is the number between
				// the first and second '+' (or the end of
				// the token).
				if splitToToks(token, '+') < 2 {
					return fmt.Errorf("line %d: bad locator %q", lineno, token)
				}
				length, err := strconv.ParseInt(string(toks[1]), 10, 32)
				if err != nil || length < 0 {
					return fmt.Errorf("line %d: bad locator %q", lineno, token)
				}
				segments = append(segments, storedSegment{
					locator: string(token),
					size:    int(length),
					offset:  0,
					length:  int(length),
				})
				continue
			} else if len(segments) == 0 {
				// A token containing ':' appeared before
				// any locator.
				return fmt.Errorf("line %d: bad locator %q", lineno, token)
			}
			// File token: offset:length:name. Only the first
			// two ':' are separators; the rest of the token
			// is the name.
			if splitToToks(token, ':') != 3 {
				return fmt.Errorf("line %d: bad file segment %q", lineno, token)
			}
			anyFileTokens = true

			offset, err := strconv.ParseInt(string(toks[0]), 10, 64)
			if err != nil || offset < 0 {
				return fmt.Errorf("line %d: bad file segment %q", lineno, token)
			}
			length, err := strconv.ParseInt(string(toks[1]), 10, 64)
			if err != nil || length < 0 {
				return fmt.Errorf("line %d: bad file segment %q", lineno, token)
			}
			// Replace everything after the stream name in
			// pathparts with this file's path components.
			if !bytes.ContainsAny(toks[2], `\/`) {
				// optimization for a common case
				pathparts = append(pathparts[:streamparts], string(toks[2]))
			} else {
				pathparts = append(pathparts[:streamparts], strings.Split(manifestUnescape(string(toks[2])), "/")...)
			}
			fnode, err := dn.createFileAndParents(pathparts)
			if fnode == nil && err == nil && length == 0 {
				// Special case: an empty file used as
				// a marker to preserve an otherwise
				// empty directory in a manifest.
				continue
			}
			// NOTE(review): when fnode is nil because the
			// name is a "." marker but length != 0, err is
			// nil here, so the %s verb below formats a nil
			// error.
			if err != nil || (fnode == nil && length != 0) {
				return fmt.Errorf("line %d: cannot use name %q with length %d: %s", lineno, toks[2], length, err)
			}
			// Map the stream offset/range coordinates to
			// block/offset/range coordinates and add
			// corresponding storedSegments to the filenode
			if pos > offset {
				// Can't continue where we left off.
				// TODO: binary search instead of
				// rewinding all the way (but this
				// situation might be rare anyway)
				segIdx, pos = 0, 0
			}
			for ; segIdx < len(segments); segIdx++ {
				seg := segments[segIdx]
				// next is the stream offset just past
				// this block.
				next := pos + int64(seg.Len())
				if next <= offset || seg.Len() == 0 {
					// Block ends before the wanted
					// range starts, or is empty.
					pos = next
					continue
				}
				if pos >= offset+length {
					// Block starts at or after the end
					// of the wanted range.
					break
				}
				// blkOff/blkLen: the part of this block
				// that falls inside the wanted range.
				var blkOff int
				if pos < offset {
					blkOff = int(offset - pos)
				}
				blkLen := seg.Len() - blkOff
				if pos+int64(blkOff+blkLen) > offset+length {
					blkLen = int(offset + length - pos - int64(blkOff))
				}
				fnode.appendSegment(storedSegment{
					kc:      dn.fs,
					locator: seg.locator,
					size:    seg.size,
					offset:  blkOff,
					length:  blkLen,
				})
				if next > offset+length {
					// The wanted range ends inside
					// this block. Leave segIdx/pos
					// here so the next file token can
					// resume within the same block.
					break
				} else {
					pos = next
				}
			}
			if segIdx == len(segments) && pos < offset+length {
				// Ran out of blocks before reaching the
				// end of the wanted range.
				return fmt.Errorf("line %d: invalid segment in %d-byte stream: %q", lineno, pos, token)
			}
		}
		if !anyFileTokens {
			return fmt.Errorf("line %d: no file segments", lineno)
		} else if len(segments) == 0 {
			return fmt.Errorf("line %d: no locators", lineno)
		} else if streamparts == 0 {
			return fmt.Errorf("line %d: no stream name", lineno)
		}
	}
	return nil
}

// createFileAndParents returns the filenode at the path given by
// names (relative to dn), creating the file and any missing parent
// directories along the way.
//
// only safe to call from loadManifest -- no locking.
//
// Empty and "." components in the directory part of the path are
// skipped. A ".." component moves to the current node's parent,
// except at dn itself, where it fails with ErrInvalidArgument.
//
// If path is a "parent directory exists" marker (the last path
// component is "."), the returned values are both nil.
//
// Errors: ErrFileExists if a directory component names an existing
// non-directory; ErrIsDirectory if the last component names an
// existing directory; an "invalid file part" error if the last
// component is rejected by permittedName; and any error returned
// by newNode or Child.
//
// Newly added nodes have modtime==0. Caller is responsible for fixing
// them with backdateTree.
func (dn *dirnode) createFileAndParents(names []string) (fn *filenode, err error) {
	var node inode = dn
	basename := names[len(names)-1]
	// Walk (and create as needed) the directory components.
	for _, name := range names[:len(names)-1] {
		switch name {
		case "", ".":
			continue
		case "..":
			if node == dn {
				// can't be sure parent will be a *dirnode
				return nil, ErrInvalidArgument
			}
			node = node.Parent()
			continue
		}
		node.Lock()
		// Capture the Unlock method of the current node now:
		// node is reassigned to the child by the next
		// statement, and it is this node that must be
		// unlocked afterwards.
		unlock := node.Unlock
		node, err = node.Child(name, func(child inode) (inode, error) {
			if child == nil {
				// note modtime will be fixed later in backdateTree()
				child, err := node.FS().newNode(name, 0755|os.ModeDir, time.Time{})
				if err != nil {
					return nil, err
				}
				child.SetParent(node, name)
				return child, nil
			} else if !child.IsDir() {
				return child, ErrFileExists
			} else {
				return child, nil
			}
		})
		unlock()
		if err != nil {
			return
		}
	}
	if basename == "." {
		// Directory marker: the parents exist now, and there
		// is no file to return.
		return
	} else if !permittedName(basename) {
		err = fmt.Errorf("invalid file part %q in path %q", basename, names)
		return
	}
	node.Lock()
	defer node.Unlock()
	// Find or create the file itself. The callback sets the named
	// result fn; Child's own return value is not needed.
	_, err = node.Child(basename, func(child inode) (inode, error) {
		switch child := child.(type) {
		case nil:
			// Note this assigns the enclosing func's named
			// result err, not a new local variable.
			child, err = node.FS().newNode(basename, 0755, time.Time{})
			if err != nil {
				return nil, err
			}
			child.SetParent(node, basename)
			fn = child.(*filenode)
			return child, nil
		case *filenode:
			fn = child
			return child, nil
		case *dirnode:
			return child, ErrIsDirectory
		default:
			return child, ErrInvalidArgument
		}
	})
	return
}

// TreeSize returns the sum of Size() over every filenode in the
// tree rooted at dn. dn is read-locked for the duration of the
// call; subdirectories are handled by calling their own TreeSize.
func (dn *dirnode) TreeSize() (bytes int64) {
	dn.RLock()
	defer dn.RUnlock()
	for _, child := range dn.inodes {
		if file, ok := child.(*filenode); ok {
			bytes += file.Size()
		} else if sub, ok := child.(*dirnode); ok {
			bytes += sub.TreeSize()
		}
	}
	return bytes
}

// Snapshot returns the result of dn.snapshot() as an inode.
//
// If snapshot fails, the returned inode is an untyped nil, so
// callers can safely compare it with nil.
func (dn *dirnode) Snapshot() (inode, error) {
	snap, err := dn.snapshot()
	if err != nil {
		// Don't return snap here: a nil *dirnode stored in an
		// inode interface value compares != nil.
		return nil, err
	}
	return snap, nil
}

// snapshot returns a new dirnode that has a copy of dn's fileinfo
// and, for each of dn's children, the result of that child's
// Snapshot() with its parent set to the new dirnode. No other
// field of the new dirnode is set here. dn is read-locked for the
// duration of the call.
//
// If any child's Snapshot fails, that error is returned along with
// a nil dirnode.
func (dn *dirnode) snapshot() (*dirnode, error) {
	dn.RLock()
	defer dn.RUnlock()
	children := make(map[string]inode, len(dn.inodes))
	snap := &dirnode{treenode: treenode{inodes: children, fileinfo: dn.fileinfo}}
	for name, orig := range dn.inodes {
		dup, err := orig.Snapshot()
		if err != nil {
			return nil, err
		}
		children[name] = dup
		dup.SetParent(snap, name)
	}
	return snap, nil
}

// Splice replaces dn's content with a snapshot of repl.
//
// If the snapshot is a directory, dn stays in place and adopts the
// snapshot's children: they are re-parented to dn and the whole
// subtree is pointed at dn's filesystem.
//
// If the snapshot is a file, dn itself is replaced by that file in
// dn's parent directory. This fails with ErrInvalidOperation if
// the parent's entry for dn's name cannot be looked up, or is not
// dn itself (for example, when dn is the top level of a
// filesystem).
//
// Any other snapshot type fails with ErrInvalidArgument. An error
// from repl.Snapshot() is returned wrapped.
func (dn *dirnode) Splice(repl inode) error {
	repl, err := repl.Snapshot()
	if err != nil {
		return fmt.Errorf("cannot copy snapshot: %w", err)
	}
	switch repl := repl.(type) {
	default:
		return fmt.Errorf("cannot splice snapshot containing %T: %w", repl, ErrInvalidArgument)
	case *dirnode:
		dn.Lock()
		defer dn.Unlock()
		dn.inodes = repl.inodes
		// The snapshot's children still have the temporary
		// snapshot dirnode (repl) as their parent, and repl
		// is discarded when we return. Attach them to dn so
		// their parent links lead back into the live tree.
		for name, child := range dn.inodes {
			child.SetParent(dn, name)
		}
		dn.setTreeFS(dn.fs)
	case *filenode:
		dn.parent.Lock()
		defer dn.parent.Unlock()
		// Check that dn really is the parent's child entry
		// under dn's own name before replacing that entry.
		removing, err := dn.parent.Child(dn.fileinfo.name, nil)
		if err != nil {
			return fmt.Errorf("cannot use Splice to replace a top-level directory with a file: %w", ErrInvalidOperation)
		} else if removing != dn {
			// If ../thisdirname is not this dirnode, it
			// must be an inode that wraps a dirnode, like
			// a collectionFileSystem or deferrednode.
			if deferred, ok := removing.(*deferrednode); ok {
				// More useful to report the type of
				// the wrapped node rather than just
				// *deferrednode. (We know the real
				// inode is already loaded because dn
				// is inside it.)
				removing = deferred.realinode()
			}
			return fmt.Errorf("cannot use Splice to attach a file at top level of %T: %w", removing, ErrInvalidOperation)
		}
		dn.Lock()
		defer dn.Unlock()
		_, err = dn.parent.Child(dn.fileinfo.name, func(inode) (inode, error) { return repl, nil })
		if err != nil {
			return fmt.Errorf("error replacing filenode: dn.parent.Child(): %w", err)
		}
		// NOTE(review): only repl.fs is updated here; whether
		// repl's parent and name also need to be set to
		// dn.parent and dn.fileinfo.name depends on what
		// filenode.Snapshot leaves in them -- confirm.
		repl.fs = dn.fs
	}
	return nil
}

// setTreeFS sets the fs field of dn, and of every dirnode and
// filenode below it, to fs. Children of any other type are left
// alone. No locks are taken here.
func (dn *dirnode) setTreeFS(fs *collectionFileSystem) {
	dn.fs = fs
	for _, child := range dn.inodes {
		if sub, ok := child.(*dirnode); ok {
			sub.setTreeFS(fs)
		} else if file, ok := child.(*filenode); ok {
			file.fs = fs
		}
	}
}

// A segment is one piece of a file's data. It can be read at an
// arbitrary offset (io.ReaderAt), reports its length, and can
// produce sub-segments. The implementations in this file are
// *memSegment and storedSegment.
type segment interface {
	io.ReaderAt
	// Len returns the number of data bytes in the segment.
	Len() int
	// Return a new segment with a subsection of the data from this
	// one. length<0 means length=Len()-off.
	Slice(off int, length int) segment
}

// A memSegment is a segment whose data is held in memory, in buf.
type memSegment struct {
	// buf holds the segment's data; len(buf) is the segment's
	// length.
	buf []byte
	// If flushing is not nil and not ready/closed, then a) buf is
	// being shared by a pruneMemSegments goroutine, and must be
	// copied on write; and b) the flushing channel will close
	// when the goroutine finishes, whether it succeeds or not.
	flushing <-chan struct{}
}

// flushingUnfinished reports whether me.flushing is set and not yet
// ready to receive from (e.g., not yet closed). It never blocks.
// Once the channel is found to be ready, me.flushing is reset to
// nil.
func (me *memSegment) flushingUnfinished() bool {
	if me.flushing != nil {
		select {
		case <-me.flushing:
			// Done flushing: forget the channel.
			me.flushing = nil
		default:
			return true
		}
	}
	return false
}

// Len returns the number of bytes currently in the segment, i.e.,
// len(me.buf).
func (me *memSegment) Len() int {
	return len(me.buf)
}

// Slice returns a new memSegment holding its own copy of up to
// length bytes of me's data, starting at off. length<0 means
// everything from off to the end. The new segment's buffer is
// always length bytes long: if fewer than length bytes are
// available after off, the remainder is zero. The new segment has
// no flushing channel.
func (me *memSegment) Slice(off, length int) segment {
	if length < 0 {
		length = len(me.buf) - off
	}
	dup := &memSegment{buf: make([]byte, length)}
	copy(dup.buf, me.buf[off:])
	return dup
}

// Truncate sets the segment's length to n bytes.
//
// If n fits within the existing buffer's capacity (and, while
// me.flushing is set, does not exceed the current length), the
// existing buffer is resliced in place and any newly exposed bytes
// are zeroed. Otherwise a new buffer is allocated, the old data is
// copied into it, and me.flushing is cleared. The new buffer's
// capacity is 1024 multiplied by 4 as many times as needed to reach
// at least n.
func (me *memSegment) Truncate(n int) {
	mustRealloc := n > cap(me.buf) || (me.flushing != nil && n > len(me.buf))
	if !mustRealloc {
		// reclaim existing capacity, and zero reclaimed part
		prevLen := len(me.buf)
		me.buf = me.buf[:n]
		for i := prevLen; i < n; i++ {
			me.buf[i] = 0
		}
		return
	}
	capacity := 1024
	for capacity < n {
		capacity <<= 2
	}
	grown := make([]byte, n, capacity)
	copy(grown, me.buf)
	me.buf = grown
	me.flushing = nil
}

// WriteAt copies p into the segment's buffer starting at off. The
// write must fit within the segment's current length; otherwise
// WriteAt panics. If me.flushing is set, the buffer is replaced by
// a copy of itself before writing, and me.flushing is cleared.
func (me *memSegment) WriteAt(p []byte, off int) {
	if end := off + len(p); end > len(me.buf) {
		panic("overflowed segment")
	}
	if me.flushing != nil {
		// Copy on write: don't modify the buffer in place.
		me.buf = append([]byte(nil), me.buf...)
		me.flushing = nil
	}
	copy(me.buf[off:], p)
}

// ReadAt copies segment data starting at offset off into p and
// returns the number of bytes copied. It returns io.EOF if off is
// past the end of the segment, or if fewer than len(p) bytes were
// available.
func (me *memSegment) ReadAt(p []byte, off int64) (n int, err error) {
	if off > int64(len(me.buf)) {
		return 0, io.EOF
	}
	n = copy(p, me.buf[int(off):])
	if n < len(p) {
		return n, io.EOF
	}
	return n, nil
}

// A storedSegment is a segment whose data is a byte range within
// a stored block, identified by locator and read through kc.
type storedSegment struct {
	kc      fsBackend // backend whose ReadAt supplies the block data
	locator string    // locator of the stored block
	size    int       // size of stored block (also encoded in locator)
	offset  int       // position of segment within the stored block
	length  int       // bytes in this segment (offset + length <= size)
}

// Len returns the number of bytes in the segment (se.length), not
// the size of the underlying stored block.
func (se storedSegment) Len() int {
	return se.length
}

// Slice returns a storedSegment that refers to the same block but
// starts n bytes further in and is at most size bytes long. size<0
// means everything from n to the end of the segment. The receiver
// is a value, so the original segment is not modified.
func (se storedSegment) Slice(n, size int) segment {
	remaining := se.length - n
	if size >= 0 && size < remaining {
		remaining = size
	}
	se.offset, se.length = se.offset+n, remaining
	return se
}

// ReadAt reads segment data starting at offset off (relative to the
// start of the segment) into p by calling se.kc.ReadAt on the
// stored block. It returns io.EOF if off is past the end of the
// segment. If p is longer than the data remaining after off, only
// the remaining bytes are requested and, if the backend reports no
// error, io.EOF is returned along with the byte count.
func (se storedSegment) ReadAt(p []byte, off int64) (n int, err error) {
	if off > int64(se.length) {
		return 0, io.EOF
	}
	remaining := se.length - int(off)
	truncated := len(p) > remaining
	if truncated {
		p = p[:remaining]
	}
	n, err = se.kc.ReadAt(se.locator, p, int(off)+se.offset)
	if truncated && err == nil {
		err = io.EOF
	}
	return n, err
}

// canonicalName returns name in cleaned form, relative to ".": the
// result is "." for an empty or root path, and otherwise "./"
// followed by the cleaned path. name is treated as rooted, so
// leading "/" and ".." components cannot escape the root.
func canonicalName(name string) string {
	// path.Clean on a path starting with "/" always returns a
	// path starting with "/", so there are only two cases to
	// handle: the root itself, and everything else.
	name = path.Clean("/" + name)
	if name == "/" {
		return "."
	}
	return "." + name
}

// manifestEscapeSeq matches one escape sequence in manifest text: a
// backslash followed by either three octal digits or another
// backslash.
var manifestEscapeSeq = regexp.MustCompile(`\\([0-7]{3}|\\)`)

// manifestUnescapeFunc converts a single escape sequence matched by
// manifestEscapeSeq to the character it stands for: a double
// backslash becomes one backslash, and a backslash followed by
// octal digits becomes the byte with that value. A sequence whose
// value does not fit in one byte is returned unchanged.
func manifestUnescapeFunc(seq string) string {
	if seq == `\\` {
		return `\`
	}
	if code, err := strconv.ParseUint(seq[1:], 8, 8); err == nil {
		return string([]byte{byte(code)})
	}
	// Invalid escape sequence: can't unescape.
	return seq
}

// manifestUnescape returns s with every escape sequence matched by
// manifestEscapeSeq replaced by manifestUnescapeFunc's result.
func manifestUnescape(s string) string {
	unescaped := manifestEscapeSeq.ReplaceAllStringFunc(s, manifestUnescapeFunc)
	return unescaped
}

// manifestEscapedChar matches each single character that
// manifestEscape replaces: bytes 000 through 040 octal (control
// characters and space), colon, whitespace, and backslash.
var manifestEscapedChar = regexp.MustCompile(`[\000-\040:\s\\]`)

// manifestEscapeFunc returns the escape sequence for the first byte
// of seq: a backslash followed by that byte's value as three octal
// digits.
func manifestEscapeFunc(seq string) string {
	first := seq[0]
	return fmt.Sprintf("\\%03o", first)
}

// manifestEscape returns s with every character matched by
// manifestEscapedChar replaced by its octal escape sequence.
func manifestEscape(s string) string {
	escaped := manifestEscapedChar.ReplaceAllStringFunc(s, manifestEscapeFunc)
	return escaped
}