X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/f04693da1811e670d4cbb981debeecf14d79137c..e7555eb1212db8627af6689b250aaf4bd38cefdb:/services/keepstore/volume.go

diff --git a/services/keepstore/volume.go b/services/keepstore/volume.go
index 8614355022..c3b8cd6283 100644
--- a/services/keepstore/volume.go
+++ b/services/keepstore/volume.go
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: AGPL-3.0
 
-package main
+package keepstore
 
 import (
 	"context"
@@ -10,10 +10,11 @@ import (
 	"fmt"
 	"io"
 	"math/big"
+	"sort"
 	"sync/atomic"
 	"time"
 
-	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"git.arvados.org/arvados.git/sdk/go/arvados"
 
 	"github.com/sirupsen/logrus"
 )
@@ -170,12 +171,12 @@ type Volume interface {
 
 	// Trash moves the block data from the underlying storage
 	// device to trash area. The block then stays in trash for
-	// -trash-lifetime interval before it is actually deleted.
+	// BlobTrashLifetime before it is actually deleted.
 	//
 	// loc is as described in Get.
 	//
 	// If the timestamp for the given locator is newer than
-	// BlobSignatureTTL, Trash must not trash the data.
+	// BlobSigningTTL, Trash must not trash the data.
 	//
 	// If a Trash operation overlaps with any Touch or Put
 	// operations on the same locator, the implementation must
@@ -196,8 +197,7 @@ type Volume interface {
 	// reliably or fail outright.
 	//
 	// Corollary: A successful Touch or Put guarantees a block
-	// will not be trashed for at least BlobSignatureTTL
-	// seconds.
+	// will not be trashed for at least BlobSigningTTL seconds.
 	Trash(loc string) error
 
 	// Untrash moves block from trash back into store
@@ -216,8 +216,8 @@ type Volume interface {
 	// secrets.
 	String() string
 
-	// EmptyTrash looks for trashed blocks that exceeded TrashLifetime
-	// and deletes them from the volume.
+	// EmptyTrash looks for trashed blocks that exceeded
+	// BlobTrashLifetime and deletes them from the volume.
 	EmptyTrash()
 
 	// Return a globally unique ID of the underlying storage
@@ -316,7 +316,7 @@ func makeRRVolumeManager(logger logrus.FieldLogger, cluster *arvados.Cluster, my
 		if err != nil {
 			return nil, fmt.Errorf("error initializing volume %s: %s", uuid, err)
 		}
-		logger.Printf("started volume %s (%s), ReadOnly=%v", uuid, vol, cfgvol.ReadOnly)
+		logger.Printf("started volume %s (%s), ReadOnly=%v", uuid, vol, cfgvol.ReadOnly || va.ReadOnly)
 
 		sc := cfgvol.StorageClasses
 		if len(sc) == 0 {
@@ -344,6 +344,36 @@ func makeRRVolumeManager(logger logrus.FieldLogger, cluster *arvados.Cluster, my
 			vm.writables = append(vm.writables, mnt)
 		}
 	}
+	// pri(mnt): return highest priority of any storage class
+	// offered by mnt
+	pri := func(mnt *VolumeMount) int {
+		any, best := false, 0
+		for class := range mnt.KeepMount.StorageClasses {
+			if p := cluster.StorageClasses[class].Priority; !any || best < p {
+				best = p
+				any = true
+			}
+		}
+		return best
+	}
+	// less(a,b): sort first by highest priority of any offered
+	// storage class (highest->lowest), then by volume UUID
+	less := func(a, b *VolumeMount) bool {
+		if pa, pb := pri(a), pri(b); pa != pb {
+			return pa > pb
+		} else {
+			return a.KeepMount.UUID < b.KeepMount.UUID
+		}
+	}
+	sort.Slice(vm.readables, func(i, j int) bool {
+		return less(vm.readables[i], vm.readables[j])
+	})
+	sort.Slice(vm.writables, func(i, j int) bool {
+		return less(vm.writables[i], vm.writables[j])
+	})
+	sort.Slice(vm.mounts, func(i, j int) bool {
+		return less(vm.mounts[i], vm.mounts[j])
+	})
 	return vm, nil
 }
 
@@ -354,9 +384,8 @@ func (vm *RRVolumeManager) Mounts() []*VolumeMount {
 func (vm *RRVolumeManager) Lookup(uuid string, needWrite bool) *VolumeMount {
 	if mnt, ok := vm.mountMap[uuid]; ok && (!needWrite || !mnt.ReadOnly) {
 		return mnt
-	} else {
-		return nil
 	}
+	return nil
 }
 
 // AllReadable returns an array of all readable volumes
@@ -364,18 +393,22 @@ func (vm *RRVolumeManager) AllReadable() []*VolumeMount {
 	return vm.readables
 }
 
-// AllWritable returns an array of all writable volumes
+// AllWritable returns writable volumes, sorted by priority/uuid. Used
+// by CompareAndTouch to ensure higher-priority volumes are checked
+// first.
func (vm *RRVolumeManager) AllWritable() []*VolumeMount {
 	return vm.writables
 }
 
-// NextWritable returns the next writable
-func (vm *RRVolumeManager) NextWritable() *VolumeMount {
+// NextWritable returns writable volumes, rotated by vm.counter so
+// each volume gets a turn to be first. Used by PutBlock to distribute
+// new data across available volumes.
+func (vm *RRVolumeManager) NextWritable() []*VolumeMount {
 	if len(vm.writables) == 0 {
 		return nil
 	}
-	i := atomic.AddUint32(&vm.counter, 1)
-	return vm.writables[i%uint32(len(vm.writables))]
+	offset := (int(atomic.AddUint32(&vm.counter, 1)) - 1) % len(vm.writables)
+	return append(append([]*VolumeMount(nil), vm.writables[offset:]...), vm.writables[:offset]...)
 }
 
 // VolumeStats returns an ioStats for the given volume.
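The revised Trash contract above hinges on one timing rule: a block whose timestamp is newer than BlobSigningTTL must not be trashed, which is what makes a successful Touch or Put protect a block for at least that long. A minimal sketch of that eligibility check follows; canTrash is a hypothetical helper for illustration, not a real keepstore function.

package main

import (
	"fmt"
	"time"
)

// canTrash is a hypothetical helper illustrating the contract: Trash
// must refuse any block whose timestamp is within blobSigningTTL of
// now, so Touch/Put (which update the timestamp) protect a block for
// at least that interval.
func canTrash(mtime time.Time, blobSigningTTL time.Duration, now time.Time) bool {
	return now.Sub(mtime) >= blobSigningTTL
}

func main() {
	ttl := 2 * time.Hour
	now := time.Now()
	fmt.Println(canTrash(now.Add(-1*time.Hour), ttl, now)) // false: recently touched, still protected
	fmt.Println(canTrash(now.Add(-3*time.Hour), ttl, now)) // true: older than the TTL, trashable
}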
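The pri/less closures added to makeRRVolumeManager order every mount list by the highest-priority storage class a mount offers, breaking ties by UUID. The sketch below reproduces that ordering with simplified stand-in types (mount, classPriority, and the placeholder UUIDs are all invented for the example); the real code reads priorities from cluster.StorageClasses and sorts []*VolumeMount.

package main

import (
	"fmt"
	"sort"
)

// mount is a stand-in for keepstore's VolumeMount; classPriority
// stands in for the Priority field in cluster.StorageClasses.
type mount struct {
	UUID           string
	StorageClasses map[string]bool
}

var classPriority = map[string]int{"hot": 10, "default": 0}

// pri mirrors the pri() closure in the diff: the highest priority of
// any class the mount offers (0 if it offers none).
func pri(m *mount) int {
	any, best := false, 0
	for class := range m.StorageClasses {
		if p := classPriority[class]; !any || best < p {
			best = p
			any = true
		}
	}
	return best
}

func main() {
	mounts := []*mount{
		{UUID: "vol-0002", StorageClasses: map[string]bool{"default": true}},
		{UUID: "vol-0003", StorageClasses: map[string]bool{"hot": true}},
		{UUID: "vol-0001", StorageClasses: map[string]bool{"default": true}},
	}
	// Same rule as less() in the diff: descending class priority,
	// then ascending UUID.
	sort.Slice(mounts, func(i, j int) bool {
		a, b := mounts[i], mounts[j]
		if pa, pb := pri(a), pri(b); pa != pb {
			return pa > pb
		}
		return a.UUID < b.UUID
	})
	for _, m := range mounts {
		fmt.Printf("%s priority=%d\n", m.UUID, pri(m))
	}
	// Prints the "hot" mount (vol-0003) first, then the two
	// "default" mounts in UUID order.
}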
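NextWritable's new signature is worth a second look: instead of a single mount it now returns the whole writable list, rotated one position further on each call, so a caller like PutBlock can fall back through every volume while concurrent writers start on different ones. Below is a self-contained sketch of the same rotation, with a string slice standing in for []*VolumeMount.

package main

import (
	"fmt"
	"sync/atomic"
)

// manager is a stand-in for RRVolumeManager, keeping only the two
// fields the rotation needs.
type manager struct {
	counter   uint32
	writables []string
}

// NextWritable returns a copy of vm.writables rotated by an atomic
// counter: each call starts one volume further along, and the copy
// never aliases the shared slice.
func (vm *manager) NextWritable() []string {
	if len(vm.writables) == 0 {
		return nil
	}
	offset := (int(atomic.AddUint32(&vm.counter, 1)) - 1) % len(vm.writables)
	return append(append([]string(nil), vm.writables[offset:]...), vm.writables[:offset]...)
}

func main() {
	vm := &manager{writables: []string{"vol-a", "vol-b", "vol-c"}}
	for i := 0; i < 4; i++ {
		fmt.Println(vm.NextWritable())
	}
	// [vol-a vol-b vol-c]
	// [vol-b vol-c vol-a]
	// [vol-c vol-a vol-b]
	// [vol-a vol-b vol-c]
}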