//
// SPDX-License-Identifier: AGPL-3.0
-package main
+package keepstore
import (
"container/list"
"strconv"
"strings"
"sync"
+ "sync/atomic"
"time"
+ "git.arvados.org/arvados.git/lib/cmd"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
"git.arvados.org/arvados.git/sdk/go/health"
)
func (rtr *router) handleGET(resp http.ResponseWriter, req *http.Request) {
- ctx, cancel := contextForResponse(context.TODO(), resp)
- defer cancel()
-
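+ // The request's context is cancelled by the net/http server
+ // when the client disconnects, so no additional cancellation
+ // plumbing is needed here.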
locator := req.URL.Path[1:]
if strings.Contains(locator, "+R") && !strings.Contains(locator, "+A") {
- rtr.remoteProxy.Get(ctx, resp, req, rtr.cluster, rtr.volmgr)
+ rtr.remoteProxy.Get(req.Context(), resp, req, rtr.cluster, rtr.volmgr)
return
}
// isn't here, we can return 404 now instead of waiting for a
// buffer.
- buf, err := getBufferWithContext(ctx, bufs, BlockSize)
+ buf, err := getBufferWithContext(req.Context(), bufs, BlockSize)
if err != nil {
http.Error(resp, err.Error(), http.StatusServiceUnavailable)
return
}
defer bufs.Put(buf)
- size, err := GetBlock(ctx, rtr.volmgr, mux.Vars(req)["hash"], buf, resp)
+ size, err := GetBlock(req.Context(), rtr.volmgr, mux.Vars(req)["hash"], buf, resp)
if err != nil {
code := http.StatusInternalServerError
if err, ok := err.(*KeepError); ok {
resp.Write(buf[:size])
}
-// Return a new context that gets cancelled by resp's CloseNotifier.
-func contextForResponse(parent context.Context, resp http.ResponseWriter) (context.Context, context.CancelFunc) {
- ctx, cancel := context.WithCancel(parent)
- if cn, ok := resp.(http.CloseNotifier); ok {
- go func(c <-chan bool) {
- select {
- case <-c:
- cancel()
- case <-ctx.Done():
- }
- }(cn.CloseNotify())
- }
- return ctx, cancel
-}
-
// Get a buffer from the pool -- but give up and return a non-nil
// error if ctx ends before we get a buffer.
func getBufferWithContext(ctx context.Context, bufs *bufferPool, bufSize int) ([]byte, error) {
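// A minimal sketch of the pattern (assuming bufs.Get(bufSize) blocks
// until a buffer is available): race the pool fetch against ctx, and
// make sure a late-arriving buffer goes back to the pool instead of
// leaking.
bufReady := make(chan []byte)
go func() {
bufReady <- bufs.Get(bufSize)
}()
select {
case buf := <-bufReady:
return buf, nil
case <-ctx.Done():
go func() {
// ctx ended first: collect the buffer anyway
// and return it to the pool.
bufs.Put(<-bufReady)
}()
return nil, ErrClientDisconnect
}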
}
func (rtr *router) handlePUT(resp http.ResponseWriter, req *http.Request) {
- ctx, cancel := contextForResponse(context.TODO(), resp)
- defer cancel()
-
hash := mux.Vars(req)["hash"]
// Detect as many error conditions as possible before reading
}
}
- buf, err := getBufferWithContext(ctx, bufs, int(req.ContentLength))
+ buf, err := getBufferWithContext(req.Context(), bufs, int(req.ContentLength))
if err != nil {
http.Error(resp, err.Error(), http.StatusServiceUnavailable)
return
return
}
- result, err := PutBlock(ctx, rtr.volmgr, buf, hash, wantStorageClasses)
+ result, err := PutBlock(req.Context(), rtr.volmgr, buf, hash, wantStorageClasses)
bufs.Put(buf)
if err != nil {
// populate the given NodeStatus struct with current values.
func (rtr *router) readNodeStatus(st *NodeStatus) {
- st.Version = version
+ st.Version = strings.SplitN(cmd.Version.String(), " ", 2)[0]
vols := rtr.volmgr.AllReadable()
if cap(st.Volumes) < len(vols) {
st.Volumes = make([]*volumeStatusEnt, len(vols))
// Otherwise, the response code is 200 OK, with a response body
// consisting of the JSON message
//
-// {"copies_deleted":d,"copies_failed":f}
+// {"copies_deleted":d,"copies_failed":f}
//
// where d and f are integers representing the number of blocks that
// were successfully and unsuccessfully deleted.
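// For example, if replicas on two volumes were trashed and none
// failed:
//
//	{"copies_deleted":2,"copies_failed":0}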
-//
func (rtr *router) handleDELETE(resp http.ResponseWriter, req *http.Request) {
hash := mux.Vars(req)["hash"]
Deleted int `json:"copies_deleted"`
Failed int `json:"copies_failed"`
}
- for _, vol := range rtr.volmgr.AllWritable() {
- if err := vol.Trash(hash); err == nil {
+ for _, vol := range rtr.volmgr.Mounts() {
+ if !vol.KeepMount.AllowTrash {
+ continue
+ } else if err := vol.Trash(hash); err == nil {
result.Deleted++
} else if os.IsNotExist(err) {
continue
//
// If the block found does not have the correct MD5 hash, returns
// DiskHashError.
-//
func GetBlock(ctx context.Context, volmgr *RRVolumeManager, hash string, buf []byte, resp http.ResponseWriter) (int, error) {
log := ctxlog.FromContext(ctx)
if filehash != hash {
// TODO: Try harder to tell a sysadmin about
// this.
- log.Error("checksum mismatch for block %s (actual %s) on %s", hash, filehash, vol)
+ log.Errorf("checksum mismatch for block %s (actual %s), size %d on %s", hash, filehash, size, vol)
errorToCaller = DiskHashError
continue
}
}
type putProgress struct {
+ classNeeded map[string]bool
classTodo map[string]bool
classDone map[string]int
mountUsed map[*VolumeMount]bool
totalReplication int
func (pr *putProgress) Add(mnt *VolumeMount) {
if pr.mountUsed[mnt] {
- logrus.Warnf("BUG? superfluous extra write to mount %s", mnt)
+ logrus.Warnf("BUG? superfluous extra write to mount %s", mnt.UUID)
return
}
pr.mountUsed[mnt] = true
}
}
+func (pr *putProgress) Sub(mnt *VolumeMount) {
+ if !pr.mountUsed[mnt] {
+ logrus.Warnf("BUG? Sub called with no prior matching Add: %s", mnt.UUID)
+ return
+ }
+ pr.mountUsed[mnt] = false
+ pr.totalReplication -= mnt.Replication
+ for class := range mnt.StorageClasses {
+ pr.classDone[class] -= mnt.Replication
+ if pr.classNeeded[class] {
+ pr.classTodo[class] = true
+ }
+ }
+}
+
func (pr *putProgress) Done() bool {
return len(pr.classTodo) == 0 && pr.totalReplication > 0
}
return false
}
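+
+// Copy returns a copy of pr, suitable for tracking a speculative
+// outcome while writes are in flight. classNeeded is shared rather
+// than copied because nothing modifies it after newPutProgress
+// builds it.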
-func newPutResult(classes []string) putProgress {
+func (pr *putProgress) Copy() *putProgress {
+ cp := putProgress{
+ classNeeded: pr.classNeeded,
+ classTodo: make(map[string]bool, len(pr.classTodo)),
+ classDone: make(map[string]int, len(pr.classDone)),
+ mountUsed: make(map[*VolumeMount]bool, len(pr.mountUsed)),
+ totalReplication: pr.totalReplication,
+ }
+ for k, v := range pr.classTodo {
+ cp.classTodo[k] = v
+ }
+ for k, v := range pr.classDone {
+ cp.classDone[k] = v
+ }
+ for k, v := range pr.mountUsed {
+ cp.mountUsed[k] = v
+ }
+ return &cp
+}
+
+func newPutProgress(classes []string) putProgress {
pr := putProgress{
- classTodo: make(map[string]bool, len(classes)),
- classDone: map[string]int{},
- mountUsed: map[*VolumeMount]bool{},
+ classNeeded: make(map[string]bool, len(classes)),
+ classTodo: make(map[string]bool, len(classes)),
+ classDone: map[string]int{},
+ mountUsed: map[*VolumeMount]bool{},
}
for _, c := range classes {
if c != "" {
+ pr.classNeeded[c] = true
pr.classTodo[c] = true
}
}
return pr
}
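+
+// For example (hypothetical mounts, each with Replication 1 and a
+// single storage class):
+//
+//	pr := newPutProgress([]string{"default", "archive"})
+//	pr.Add(mntDefault)  // one "default" replica written
+//	pr.Done()           // false: "archive" still needed
+//	pr.Add(mntArchive)  // one "archive" replica written
+//	pr.Done()           // true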
-// PutBlock Stores the BLOCK (identified by the content id HASH) in Keep.
+// PutBlock stores the given block on one or more volumes.
+//
+// The MD5 checksum of the block must match the given hash.
+//
+// The block is written to each writable volume (ordered by priority
+// and then UUID, see volume.go) until at least one replica has been
+// stored in each of the requested storage classes.
+//
+// The returned error, if any, is a KeepError with one of the
+// following codes:
+//
+// 500 Collision
//
-// PutBlock(ctx, block, hash)
-// Stores the BLOCK (identified by the content id HASH) in Keep.
+// A different block with the same hash already exists on this
+// Keep server.
//
-// The MD5 checksum of the block must be identical to the content id HASH.
-// If not, an error is returned.
+// 422 MD5Fail
//
-// PutBlock stores the BLOCK on the first Keep volume with free space.
-// A failure code is returned to the user only if all volumes fail.
+// The MD5 hash of the BLOCK does not match the argument HASH.
//
-// On success, PutBlock returns nil.
-// On failure, it returns a KeepError with one of the following codes:
+// 503 Full
//
-// 500 Collision
-// A different block with the same hash already exists on this
-// Keep server.
-// 422 MD5Fail
-// The MD5 hash of the BLOCK does not match the argument HASH.
-// 503 Full
-// There was not enough space left in any Keep volume to store
-// the object.
-// 500 Fail
-// The object could not be stored for some other reason (e.g.
-// all writes failed). The text of the error message should
-// provide as much detail as possible.
+// There was not enough space left in any Keep volume to store
+// the object.
//
+// 500 Fail
+//
+// The object could not be stored for some other reason (e.g.
+// all writes failed). The text of the error message should
+// provide as much detail as possible.
func PutBlock(ctx context.Context, volmgr *RRVolumeManager, block []byte, hash string, wantStorageClasses []string) (putProgress, error) {
log := ctxlog.FromContext(ctx)
return putProgress{}, RequestHashError
}
- result := newPutResult(wantStorageClasses)
+ result := newPutProgress(wantStorageClasses)
// If we already have this data, it's intact on disk, and we
// can update its timestamp, return success. If we have
// different data with the same hash, return failure.
- if err := CompareAndTouch(ctx, volmgr, hash, block, &result); err != nil {
+ if err := CompareAndTouch(ctx, volmgr, hash, block, &result); err != nil || result.Done() {
return result, err
}
if ctx.Err() != nil {
return result, ErrClientDisconnect
}
- // Choose a Keep volume to write to.
- // If this volume fails, try all of the volumes in order.
- if mnt := volmgr.NextWritable(); mnt == nil || !result.Want(mnt) {
- // fall through to "try all volumes" below
- } else if err := mnt.Put(ctx, hash, block); err != nil {
- log.WithError(err).Errorf("%s: Put(%s) failed", mnt.Volume, hash)
- } else {
- result.Add(mnt)
- if result.Done() {
- return result, nil
- }
- }
- if ctx.Err() != nil {
- return putProgress{}, ErrClientDisconnect
- }
-
- writables := volmgr.AllWritable()
+ writables := volmgr.NextWritable()
if len(writables) == 0 {
log.Error("no writable volumes")
- return putProgress{}, FullError
+ return result, FullError
}
- allFull := true
+ var wg sync.WaitGroup
+ var mtx sync.Mutex
+ cond := sync.Cond{L: &mtx}
+ // pending predicts what result will be if all pending writes
+ // succeed.
+ pending := result.Copy()
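+ // allFull is written by the writer goroutines after they
+ // release mtx, so it is stored in an atomic.Value rather than
+ // guarded by the mutex.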
+ var allFull atomic.Value
+ allFull.Store(true)
+
+ // We hold the lock for the duration of the "each volume" loop
+ // below, except when it is released during cond.Wait().
+ mtx.Lock()
+
for _, mnt := range writables {
+ // Wait until our decision to use this mount does not
+ // depend on the outcome of pending writes: if result still
+ // wants this mount but pending does not, the mount is
+ // needed only if some pending write fails, so we cannot
+ // decide until those writes settle.
+ for result.Want(mnt) && !pending.Want(mnt) {
+ cond.Wait()
+ }
if !result.Want(mnt) {
continue
}
- err := mnt.Put(ctx, hash, block)
- if ctx.Err() != nil {
- return result, ErrClientDisconnect
- }
- switch err {
- case nil:
- result.Add(mnt)
- if result.Done() {
- return result, nil
+ mnt := mnt // shadow the loop variable for capture by the goroutine below
+ pending.Add(mnt)
+ wg.Add(1)
+ go func() {
+ log.Debugf("PutBlock: start write to %s", mnt.UUID)
+ defer wg.Done()
+ err := mnt.Put(ctx, hash, block)
+
+ mtx.Lock()
+ if err != nil {
+ log.Debugf("PutBlock: write to %s failed", mnt.UUID)
+ pending.Sub(mnt)
+ } else {
+ log.Debugf("PutBlock: write to %s succeeded", mnt.UUID)
+ result.Add(mnt)
}
- continue
- case FullError:
- continue
- default:
- // The volume is not full but the
- // write did not succeed. Report the
- // error and continue trying.
- allFull = false
- log.WithError(err).Errorf("%s: Put(%s) failed", mnt.Volume, hash)
- }
+ cond.Broadcast()
+ mtx.Unlock()
+
+ if err != nil && err != FullError && ctx.Err() == nil {
+ // The volume is not full but the
+ // write did not succeed. Report the
+ // error and continue trying.
+ allFull.Store(false)
+ log.WithError(err).Errorf("%s: Put(%s) failed", mnt.Volume, hash)
+ }
+ }()
+ }
+ mtx.Unlock()
+ wg.Wait()
+ if ctx.Err() != nil {
+ return result, ErrClientDisconnect
+ }
+ if result.Done() {
+ return result, nil
}
if result.totalReplication > 0 {
// Some, but not all, of the storage classes were
// satisfied. This qualifies as success.
return result, nil
- } else if allFull {
+ } else if allFull.Load().(bool) {
log.Error("all volumes with qualifying storage classes are full")
return putProgress{}, FullError
} else {
// to tell which one is wanted if we have
// both, so there's no point writing it even
// on a different volume.)
- log.Error("collision in Compare(%s) on volume %s", hash, mnt.Volume)
+ log.Errorf("collision in Compare(%s) on volume %s", hash, mnt.Volume)
return CollisionError
} else if os.IsNotExist(err) {
// Block does not exist. This is the only
var validLocatorRe = regexp.MustCompile(`^[0-9a-f]{32}$`)
-// IsValidLocator returns true if the specified string is a valid Keep locator.
-// When Keep is extended to support hash types other than MD5,
-// this should be updated to cover those as well.
-//
+// IsValidLocator returns true if the specified string is a valid Keep
+// locator. When Keep is extended to support hash types other than
+// MD5, this should be updated to cover those as well.
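+//
+// For example:
+//
+//	IsValidLocator("d41d8cd98f00b204e9800998ecf8427e") // true: 32 hex digits
+//	IsValidLocator("abc")                              // false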
func IsValidLocator(loc string) bool {
return validLocatorRe.MatchString(loc)
}