//
// SPDX-License-Identifier: AGPL-3.0
-package main
+package keepstore
import (
"context"
"fmt"
"io"
"math/big"
+ "sort"
"sync/atomic"
"time"
- "git.curoverse.com/arvados.git/sdk/go/arvados"
+ "git.arvados.org/arvados.git/sdk/go/arvados"
"github.com/sirupsen/logrus"
)
// Trash moves the block data from the underlying storage
// device to trash area. The block then stays in trash for
- // -trash-lifetime interval before it is actually deleted.
+ // BlobTrashLifetime before it is actually deleted.
//
// loc is as described in Get.
//
// If the timestamp for the given locator is newer than
- // BlobSignatureTTL, Trash must not trash the data.
+ // BlobSigningTTL, Trash must not trash the data.
//
// If a Trash operation overlaps with any Touch or Put
// operations on the same locator, the implementation must
// reliably or fail outright.
//
// Corollary: A successful Touch or Put guarantees a block
- // will not be trashed for at least BlobSignatureTTL
- // seconds.
+ // will not be trashed for at least BlobSigningTTL seconds.
Trash(loc string) error
// Untrash moves block from trash back into store
// secrets.
String() string
- // EmptyTrash looks for trashed blocks that exceeded TrashLifetime
- // and deletes them from the volume.
+ // EmptyTrash looks for trashed blocks that exceeded
+ // BlobTrashLifetime and deletes them from the volume.
EmptyTrash()
// Return a globally unique ID of the underlying storage
if err != nil {
return nil, fmt.Errorf("error initializing volume %s: %s", uuid, err)
}
- logger.Printf("started volume %s (%s), ReadOnly=%v", uuid, vol, cfgvol.ReadOnly)
+ logger.Printf("started volume %s (%s), ReadOnly=%v", uuid, vol, cfgvol.ReadOnly || va.ReadOnly)
sc := cfgvol.StorageClasses
if len(sc) == 0 {
vm.writables = append(vm.writables, mnt)
}
}
+ // pri(mnt): return highest priority of any storage class
+ // offered by mnt
+ pri := func(mnt *VolumeMount) int {
+ any, best := false, 0
+ for class := range mnt.KeepMount.StorageClasses {
+ if p := cluster.StorageClasses[class].Priority; !any || best < p {
+ best = p
+ any = true
+ }
+ }
+ return best
+ }
+ // less(a,b): sort first by highest priority of any offered
+ // storage class (highest->lowest), then by volume UUID
+ less := func(a, b *VolumeMount) bool {
+ if pa, pb := pri(a), pri(b); pa != pb {
+ return pa > pb
+ } else {
+ return a.KeepMount.UUID < b.KeepMount.UUID
+ }
+ }
+ sort.Slice(vm.readables, func(i, j int) bool {
+ return less(vm.readables[i], vm.readables[j])
+ })
+ sort.Slice(vm.writables, func(i, j int) bool {
+ return less(vm.writables[i], vm.writables[j])
+ })
+ sort.Slice(vm.mounts, func(i, j int) bool {
+ return less(vm.mounts[i], vm.mounts[j])
+ })
return vm, nil
}
func (vm *RRVolumeManager) Lookup(uuid string, needWrite bool) *VolumeMount {
if mnt, ok := vm.mountMap[uuid]; ok && (!needWrite || !mnt.ReadOnly) {
return mnt
- } else {
- return nil
}
+ return nil
}
// AllReadable returns an array of all readable volumes
return vm.readables
}
-// AllWritable returns an array of all writable volumes
+// AllWritable returns writable volumes, sorted by priority/uuid. Used
+// by CompareAndTouch to ensure higher-priority volumes are checked
+// first.
func (vm *RRVolumeManager) AllWritable() []*VolumeMount {
return vm.writables
}
-// NextWritable returns the next writable
-func (vm *RRVolumeManager) NextWritable() *VolumeMount {
+// NextWritable returns writable volumes, rotated by vm.counter so
+// each volume gets a turn to be first. Used by PutBlock to distribute
+// new data across available volumes.
+func (vm *RRVolumeManager) NextWritable() []*VolumeMount {
if len(vm.writables) == 0 {
return nil
}
- i := atomic.AddUint32(&vm.counter, 1)
- return vm.writables[i%uint32(len(vm.writables))]
+ offset := (int(atomic.AddUint32(&vm.counter, 1)) - 1) % len(vm.writables)
+ return append(append([]*VolumeMount(nil), vm.writables[offset:]...), vm.writables[:offset]...)
}
// VolumeStats returns an ioStats for the given volume.