X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/7990a7778f70c243b50ea878787ea83689f5b07e..5b863886118890cc81b728a3a606ea823c836f2b:/services/keepstore/handlers.go diff --git a/services/keepstore/handlers.go b/services/keepstore/handlers.go index b64294fda2..51dd73a513 100644 --- a/services/keepstore/handlers.go +++ b/services/keepstore/handlers.go @@ -1,158 +1,195 @@ -package main - -// REST handlers for Keep are implemented here. +// Copyright (C) The Arvados Authors. All rights reserved. // -// GetBlockHandler (GET /locator) -// PutBlockHandler (PUT /locator) -// IndexHandler (GET /index, GET /index/prefix) -// StatusHandler (GET /status.json) +// SPDX-License-Identifier: AGPL-3.0 + +package main import ( - "bytes" "container/list" + "context" "crypto/md5" "encoding/json" "fmt" - "github.com/gorilla/mux" "io" - "log" "net/http" "os" "regexp" "runtime" "strconv" "strings" - "syscall" + "sync" "time" + + "git.curoverse.com/arvados.git/sdk/go/arvados" + "git.curoverse.com/arvados.git/sdk/go/health" + "git.curoverse.com/arvados.git/sdk/go/httpserver" + "github.com/gorilla/mux" + "github.com/prometheus/client_golang/prometheus" ) -// MakeRESTRouter returns a new mux.Router that forwards all Keep -// requests to the appropriate handlers. -// -func MakeRESTRouter() *mux.Router { - rest := mux.NewRouter() +type router struct { + *mux.Router + limiter httpserver.RequestCounter + cluster *arvados.Cluster + remoteProxy remoteProxy + metrics *nodeMetrics +} + +// MakeRESTRouter returns a new router that forwards all Keep requests +// to the appropriate handlers. +func MakeRESTRouter(cluster *arvados.Cluster, reg *prometheus.Registry) http.Handler { + rtr := &router{ + Router: mux.NewRouter(), + cluster: cluster, + metrics: &nodeMetrics{reg: reg}, + } - rest.HandleFunc( - `/{hash:[0-9a-f]{32}}`, GetBlockHandler).Methods("GET", "HEAD") - rest.HandleFunc( + rtr.HandleFunc( + `/{hash:[0-9a-f]{32}}`, rtr.handleGET).Methods("GET", "HEAD") + rtr.HandleFunc( `/{hash:[0-9a-f]{32}}+{hints}`, - GetBlockHandler).Methods("GET", "HEAD") - - rest.HandleFunc(`/{hash:[0-9a-f]{32}}`, PutBlockHandler).Methods("PUT") - rest.HandleFunc(`/{hash:[0-9a-f]{32}}`, DeleteHandler).Methods("DELETE") - - // For IndexHandler we support: - // /index - returns all locators - // /index/{prefix} - returns all locators that begin with {prefix} - // {prefix} is a string of hexadecimal digits between 0 and 32 digits. - // If {prefix} is the empty string, return an index of all locators - // (so /index and /index/ behave identically) - // A client may supply a full 32-digit locator string, in which - // case the server will return an index with either zero or one - // entries. This usage allows a client to check whether a block is - // present, and its size and upload time, without retrieving the - // entire block. - // - rest.HandleFunc(`/index`, IndexHandler).Methods("GET", "HEAD") - rest.HandleFunc( - `/index/{prefix:[0-9a-f]{0,32}}`, IndexHandler).Methods("GET", "HEAD") - rest.HandleFunc(`/status.json`, StatusHandler).Methods("GET", "HEAD") - - // The PullHandler and TrashHandler process "PUT /pull" and "PUT - // /trash" requests from Data Manager. These requests instruct - // Keep to replicate or delete blocks; see - // https://arvados.org/projects/arvados/wiki/Keep_Design_Doc - // for more details. - // - // Each handler parses the JSON list of block management requests - // in the message body, and replaces any existing pull queue or - // trash queue with their contentes. 
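Editor's note on the queue semantics described in the comment above, which the new code keeps (see the /pull and /trash routes registered below): each PUT replaces the whole queue rather than appending to it. A hedged, self-contained sketch of that decode-and-replace pattern, with a trimmed PullRequest and a plain container/list standing in for keepstore's pullq:

```go
// Minimal sketch: parse a JSON array of pull requests and swap it in
// as the new work queue. PullRequest here is a trimmed stand-in, and
// a bare container/list replaces keepstore's pullq for illustration.
package main

import (
	"container/list"
	"encoding/json"
	"fmt"
	"strings"
)

type PullRequest struct {
	Locator string   `json:"locator"`
	Servers []string `json:"servers"`
}

func main() {
	body := `[{"locator":"acbd18db4cc2f85cedef654fccc4a4d8","servers":["http://keep0.example:25107"]}]`
	var prs []PullRequest
	if err := json.NewDecoder(strings.NewReader(body)).Decode(&prs); err != nil {
		fmt.Println("400 Bad Request:", err) // the handler answers with BadRequestError.HTTPCode
		return
	}
	plist := list.New()
	for _, pr := range prs {
		plist.PushBack(pr)
	}
	// The real handler hands plist to pullq.ReplaceQueue, discarding
	// any work queued by a previous PUT /pull.
	fmt.Println("queued", plist.Len(), "pull request(s)")
}
```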
- // - rest.HandleFunc(`/pull`, PullHandler).Methods("PUT") - rest.HandleFunc(`/trash`, TrashHandler).Methods("PUT") + rtr.handleGET).Methods("GET", "HEAD") + + rtr.HandleFunc(`/{hash:[0-9a-f]{32}}`, rtr.handlePUT).Methods("PUT") + rtr.HandleFunc(`/{hash:[0-9a-f]{32}}`, DeleteHandler).Methods("DELETE") + // List all blocks stored here. Privileged client only. + rtr.HandleFunc(`/index`, rtr.IndexHandler).Methods("GET", "HEAD") + // List blocks stored here whose hash has the given prefix. + // Privileged client only. + rtr.HandleFunc(`/index/{prefix:[0-9a-f]{0,32}}`, rtr.IndexHandler).Methods("GET", "HEAD") + + // Internals/debugging info (runtime.MemStats) + rtr.HandleFunc(`/debug.json`, rtr.DebugHandler).Methods("GET", "HEAD") + + // List volumes: path, device number, bytes used/avail. + rtr.HandleFunc(`/status.json`, rtr.StatusHandler).Methods("GET", "HEAD") + + // List mounts: UUID, readonly, tier, device ID, ... + rtr.HandleFunc(`/mounts`, rtr.MountsHandler).Methods("GET") + rtr.HandleFunc(`/mounts/{uuid}/blocks`, rtr.IndexHandler).Methods("GET") + rtr.HandleFunc(`/mounts/{uuid}/blocks/`, rtr.IndexHandler).Methods("GET") + + // Replace the current pull queue. + rtr.HandleFunc(`/pull`, PullHandler).Methods("PUT") + + // Replace the current trash queue. + rtr.HandleFunc(`/trash`, TrashHandler).Methods("PUT") + + // Untrash moves blocks from trash back into store + rtr.HandleFunc(`/untrash/{hash:[0-9a-f]{32}}`, UntrashHandler).Methods("PUT") + + rtr.Handle("/_health/{check}", &health.Handler{ + Token: theConfig.ManagementToken, + Prefix: "/_health/", + }).Methods("GET") // Any request which does not match any of these routes gets // 400 Bad Request. - rest.NotFoundHandler = http.HandlerFunc(BadRequestHandler) + rtr.NotFoundHandler = http.HandlerFunc(BadRequestHandler) + + rtr.limiter = httpserver.NewRequestLimiter(theConfig.MaxRequests, rtr) + rtr.metrics.setupBufferPoolMetrics(bufs) + rtr.metrics.setupWorkQueueMetrics(pullq, "pull") + rtr.metrics.setupWorkQueueMetrics(trashq, "trash") + rtr.metrics.setupRequestMetrics(rtr.limiter) - return rest + instrumented := httpserver.Instrument(rtr.metrics.reg, nil, + httpserver.AddRequestIDs(httpserver.LogRequests(nil, rtr.limiter))) + return instrumented.ServeAPI(theConfig.ManagementToken, instrumented) } +// BadRequestHandler is a HandleFunc to address bad requests. func BadRequestHandler(w http.ResponseWriter, r *http.Request) { http.Error(w, BadRequestError.Error(), BadRequestError.HTTPCode) } -func GetBlockHandler(resp http.ResponseWriter, req *http.Request) { - hash := mux.Vars(req)["hash"] +func (rtr *router) handleGET(resp http.ResponseWriter, req *http.Request) { + ctx, cancel := contextForResponse(context.TODO(), resp) + defer cancel() - hints := mux.Vars(req)["hints"] - - // Parse the locator string and hints from the request. - // TODO(twp): implement a Locator type. - var signature, timestamp string - if hints != "" { - signature_pat, _ := regexp.Compile("^A([[:xdigit:]]+)@([[:xdigit:]]{8})$") - for _, hint := range strings.Split(hints, "+") { - if match, _ := regexp.MatchString("^[[:digit:]]+$", hint); match { - // Server ignores size hints - } else if m := signature_pat.FindStringSubmatch(hint); m != nil { - signature = m[1] - timestamp = m[2] - } else if match, _ := regexp.MatchString("^[[:upper:]]", hint); match { - // Any unknown hint that starts with an uppercase letter is - // presumed to be valid and ignored, to permit forward compatibility. - } else { - // Unknown format; not a valid locator. 
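The hint-parsing loop being removed above is worth a standalone illustration, since the locator grammar it implements (hash+size+Asignature@timestamp, plus ignorable hints starting with an uppercase letter) is the same one the new VerifySignature path consumes. A runnable sketch mirroring the removed logic; it is not the production parser:

```go
// Parse the "+..."-separated hints of a Keep locator the way the old
// GetBlockHandler did: numeric size hints are ignored, "A<sig>@<ts>"
// carries the permission signature, unknown uppercase hints are
// tolerated for forward compatibility, anything else is an error.
package main

import (
	"fmt"
	"regexp"
	"strings"
)

var (
	sigHint  = regexp.MustCompile(`^A([[:xdigit:]]+)@([[:xdigit:]]{8})$`)
	sizeHint = regexp.MustCompile(`^[[:digit:]]+$`)
	upper    = regexp.MustCompile(`^[[:upper:]]`)
)

func parseHints(hints string) (signature, timestamp string, err error) {
	for _, hint := range strings.Split(hints, "+") {
		switch {
		case sizeHint.MatchString(hint):
			// Server ignores size hints.
		case sigHint.MatchString(hint):
			m := sigHint.FindStringSubmatch(hint)
			signature, timestamp = m[1], m[2]
		case upper.MatchString(hint):
			// Unknown uppercase hints are presumed valid and skipped.
		default:
			return "", "", fmt.Errorf("invalid hint %q", hint)
		}
	}
	return signature, timestamp, nil
}

func main() {
	sig, ts, err := parseHints("3+Acdef0123@5f000000")
	fmt.Println(sig, ts, err)
}
```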
- http.Error(resp, BadRequestError.Error(), BadRequestError.HTTPCode) - return - } - } + locator := req.URL.Path[1:] + if strings.Contains(locator, "+R") && !strings.Contains(locator, "+A") { + rtr.remoteProxy.Get(ctx, resp, req, rtr.cluster) + return } - // If permission checking is in effect, verify this - // request's permission signature. - if enforce_permissions { - if signature == "" || timestamp == "" { - http.Error(resp, PermissionError.Error(), PermissionError.HTTPCode) - return - } else if IsExpired(timestamp) { - http.Error(resp, ExpiredError.Error(), ExpiredError.HTTPCode) + if theConfig.RequireSignatures { + locator := req.URL.Path[1:] // strip leading slash + if err := VerifySignature(locator, GetAPIToken(req)); err != nil { + http.Error(resp, err.Error(), err.(*KeepError).HTTPCode) return - } else { - req_locator := req.URL.Path[1:] // strip leading slash - if !VerifySignature(req_locator, GetApiToken(req)) { - http.Error(resp, PermissionError.Error(), PermissionError.HTTPCode) - return - } } } - block, err := GetBlock(hash, false) + // TODO: Probe volumes to check whether the block _might_ + // exist. Some volumes/types could support a quick existence + // check without causing other operations to suffer. If all + // volumes support that, and assure us the block definitely + // isn't here, we can return 404 now instead of waiting for a + // buffer. - // Garbage collect after each GET. Fixes #2865. - // TODO(twp): review Keep memory usage and see if there's - // a better way to do this than blindly garbage collecting - // after every block. - defer runtime.GC() + buf, err := getBufferWithContext(ctx, bufs, BlockSize) + if err != nil { + http.Error(resp, err.Error(), http.StatusServiceUnavailable) + return + } + defer bufs.Put(buf) + size, err := GetBlock(ctx, mux.Vars(req)["hash"], buf, resp) if err != nil { - // This type assertion is safe because the only errors - // GetBlock can return are DiskHashError or NotFoundError. - http.Error(resp, err.Error(), err.(*KeepError).HTTPCode) + code := http.StatusInternalServerError + if err, ok := err.(*KeepError); ok { + code = err.HTTPCode + } + http.Error(resp, err.Error(), code) return } - resp.Header().Set("Content-Length", fmt.Sprintf("%d", len(block))) + resp.Header().Set("Content-Length", strconv.Itoa(size)) + resp.Header().Set("Content-Type", "application/octet-stream") + resp.Write(buf[:size]) +} - _, err = resp.Write(block) +// Return a new context that gets cancelled by resp's CloseNotifier. +func contextForResponse(parent context.Context, resp http.ResponseWriter) (context.Context, context.CancelFunc) { + ctx, cancel := context.WithCancel(parent) + if cn, ok := resp.(http.CloseNotifier); ok { + go func(c <-chan bool) { + select { + case <-c: + theConfig.debugLogf("cancel context") + cancel() + case <-ctx.Done(): + } + }(cn.CloseNotify()) + } + return ctx, cancel +} - return +// Get a buffer from the pool -- but give up and return a non-nil +// error if ctx ends before we get a buffer. +func getBufferWithContext(ctx context.Context, bufs *bufferPool, bufSize int) ([]byte, error) { + bufReady := make(chan []byte) + go func() { + bufReady <- bufs.Get(bufSize) + }() + select { + case buf := <-bufReady: + return buf, nil + case <-ctx.Done(): + go func() { + // Even if closeNotifier happened first, we + // need to keep waiting for our buf so we can + // return it to the pool. 
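+			// Without this drain, the goroutine running
+			// bufs.Get would block forever sending on the
+			// unbuffered bufReady channel, leaking both
+			// the goroutine and the buffer.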
+ bufs.Put(<-bufReady) + }() + return nil, ErrClientDisconnect + } } -func PutBlockHandler(resp http.ResponseWriter, req *http.Request) { - // Garbage collect after each PUT. Fixes #2865. - // See also GetBlockHandler. - defer runtime.GC() +func (rtr *router) handlePUT(resp http.ResponseWriter, req *http.Request) { + ctx, cancel := contextForResponse(context.TODO(), resp) + defer cancel() hash := mux.Vars(req)["hash"] @@ -165,7 +202,7 @@ func PutBlockHandler(resp http.ResponseWriter, req *http.Request) { return } - if req.ContentLength > BLOCKSIZE { + if req.ContentLength > BlockSize { http.Error(resp, TooLongError.Error(), TooLongError.HTTPCode) return } @@ -175,126 +212,190 @@ func PutBlockHandler(resp http.ResponseWriter, req *http.Request) { return } - buf := make([]byte, req.ContentLength) - nread, err := io.ReadFull(req.Body, buf) + buf, err := getBufferWithContext(ctx, bufs, int(req.ContentLength)) if err != nil { - http.Error(resp, err.Error(), 500) + http.Error(resp, err.Error(), http.StatusServiceUnavailable) return - } else if int64(nread) < req.ContentLength { - http.Error(resp, "request truncated", 500) + } + + _, err = io.ReadFull(req.Body, buf) + if err != nil { + http.Error(resp, err.Error(), 500) + bufs.Put(buf) return } - err = PutBlock(buf, hash) + replication, err := PutBlock(ctx, buf, hash) + bufs.Put(buf) + if err != nil { - ke := err.(*KeepError) - http.Error(resp, ke.Error(), ke.HTTPCode) + code := http.StatusInternalServerError + if err, ok := err.(*KeepError); ok { + code = err.HTTPCode + } + http.Error(resp, err.Error(), code) return } // Success; add a size hint, sign the locator if possible, and // return it to the client. - return_hash := fmt.Sprintf("%s+%d", hash, len(buf)) - api_token := GetApiToken(req) - if PermissionSecret != nil && api_token != "" { - expiry := time.Now().Add(permission_ttl) - return_hash = SignLocator(return_hash, api_token, expiry) - } - resp.Write([]byte(return_hash + "\n")) + returnHash := fmt.Sprintf("%s+%d", hash, req.ContentLength) + apiToken := GetAPIToken(req) + if theConfig.blobSigningKey != nil && apiToken != "" { + expiry := time.Now().Add(theConfig.BlobSignatureTTL.Duration()) + returnHash = SignLocator(returnHash, apiToken, expiry) + } + resp.Header().Set("X-Keep-Replicas-Stored", strconv.Itoa(replication)) + resp.Write([]byte(returnHash + "\n")) } -// IndexHandler -// A HandleFunc to address /index and /index/{prefix} requests. -// -func IndexHandler(resp http.ResponseWriter, req *http.Request) { - // Reject unauthorized requests. - if !IsDataManagerToken(GetApiToken(req)) { +// IndexHandler responds to "/index", "/index/{prefix}", and +// "/mounts/{uuid}/blocks" requests. 
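+// Entries are streamed straight from each volume by IndexTo; the
+// bare newline written at EOF (see the end of the handler) is the
+// client's only assurance that the index is complete rather than
+// truncated mid-stream.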
+func (rtr *router) IndexHandler(resp http.ResponseWriter, req *http.Request) { + if !IsSystemAuth(GetAPIToken(req)) { http.Error(resp, UnauthorizedError.Error(), UnauthorizedError.HTTPCode) return } prefix := mux.Vars(req)["prefix"] + if prefix == "" { + req.ParseForm() + prefix = req.Form.Get("prefix") + } - var index string - for _, vol := range KeepVM.AllReadable() { - index = index + vol.Index(prefix) + uuid := mux.Vars(req)["uuid"] + + var vols []Volume + if uuid == "" { + vols = KeepVM.AllReadable() + } else if v := KeepVM.Lookup(uuid, false); v == nil { + http.Error(resp, "mount not found", http.StatusNotFound) + return + } else { + vols = []Volume{v} + } + + for _, v := range vols { + if err := v.IndexTo(prefix, resp); err != nil { + // The only errors returned by IndexTo are + // write errors returned by resp.Write(), + // which probably means the client has + // disconnected and this error will never be + // reported to the client -- but it will + // appear in our own error log. + http.Error(resp, err.Error(), http.StatusInternalServerError) + return + } } - resp.Write([]byte(index)) + // An empty line at EOF is the only way the client can be + // assured the entire index was received. + resp.Write([]byte{'\n'}) } -// StatusHandler -// Responds to /status.json requests with the current node status, -// described in a JSON structure. -// -// The data given in a status.json response includes: -// volumes - a list of Keep volumes currently in use by this server -// each volume is an object with the following fields: -// * mount_point -// * device_num (an integer identifying the underlying filesystem) -// * bytes_free -// * bytes_used -// -type VolumeStatus struct { - MountPoint string `json:"mount_point"` - DeviceNum uint64 `json:"device_num"` - BytesFree uint64 `json:"bytes_free"` - BytesUsed uint64 `json:"bytes_used"` +// MountsHandler responds to "GET /mounts" requests. +func (rtr *router) MountsHandler(resp http.ResponseWriter, req *http.Request) { + err := json.NewEncoder(resp).Encode(KeepVM.Mounts()) + if err != nil { + http.Error(resp, err.Error(), http.StatusInternalServerError) + } +} + +// PoolStatus struct +type PoolStatus struct { + Alloc uint64 `json:"BytesAllocatedCumulative"` + Cap int `json:"BuffersMax"` + Len int `json:"BuffersInUse"` +} + +type volumeStatusEnt struct { + Label string + Status *VolumeStatus `json:",omitempty"` + VolumeStats *ioStats `json:",omitempty"` + InternalStats interface{} `json:",omitempty"` } +// NodeStatus struct type NodeStatus struct { - Volumes []*VolumeStatus `json:"volumes"` + Volumes []*volumeStatusEnt + BufferPool PoolStatus + PullQueue WorkQueueStatus + TrashQueue WorkQueueStatus + RequestsCurrent int + RequestsMax int + Version string } -func StatusHandler(resp http.ResponseWriter, req *http.Request) { - st := GetNodeStatus() - if jstat, err := json.Marshal(st); err == nil { +var st NodeStatus +var stLock sync.Mutex + +// DebugHandler addresses /debug.json requests. +func (rtr *router) DebugHandler(resp http.ResponseWriter, req *http.Request) { + type debugStats struct { + MemStats runtime.MemStats + } + var ds debugStats + runtime.ReadMemStats(&ds.MemStats) + err := json.NewEncoder(resp).Encode(&ds) + if err != nil { + http.Error(resp, err.Error(), 500) + } +} + +// StatusHandler addresses /status.json requests. 
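+// The package-level NodeStatus value above is reused under stLock,
+// so concurrent /status.json requests serialize one consistent
+// snapshot at a time instead of reallocating the Volumes slice for
+// every request.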
+func (rtr *router) StatusHandler(resp http.ResponseWriter, req *http.Request) { + stLock.Lock() + rtr.readNodeStatus(&st) + jstat, err := json.Marshal(&st) + stLock.Unlock() + if err == nil { resp.Write(jstat) } else { - log.Printf("json.Marshal: %s\n", err) - log.Printf("NodeStatus = %v\n", st) + log.Printf("json.Marshal: %s", err) + log.Printf("NodeStatus = %v", &st) http.Error(resp, err.Error(), 500) } } -// GetNodeStatus -// Returns a NodeStatus struct describing this Keep -// node's current status. -// -func GetNodeStatus() *NodeStatus { - st := new(NodeStatus) - - st.Volumes = make([]*VolumeStatus, len(KeepVM.AllReadable())) - for i, vol := range KeepVM.AllReadable() { - st.Volumes[i] = vol.Status() +// populate the given NodeStatus struct with current values. +func (rtr *router) readNodeStatus(st *NodeStatus) { + st.Version = version + vols := KeepVM.AllReadable() + if cap(st.Volumes) < len(vols) { + st.Volumes = make([]*volumeStatusEnt, len(vols)) + } + st.Volumes = st.Volumes[:0] + for _, vol := range vols { + var internalStats interface{} + if vol, ok := vol.(InternalStatser); ok { + internalStats = vol.InternalStats() + } + st.Volumes = append(st.Volumes, &volumeStatusEnt{ + Label: vol.String(), + Status: vol.Status(), + InternalStats: internalStats, + //VolumeStats: KeepVM.VolumeStats(vol), + }) + } + st.BufferPool.Alloc = bufs.Alloc() + st.BufferPool.Cap = bufs.Cap() + st.BufferPool.Len = bufs.Len() + st.PullQueue = getWorkQueueStatus(pullq) + st.TrashQueue = getWorkQueueStatus(trashq) + if rtr.limiter != nil { + st.RequestsCurrent = rtr.limiter.Current() + st.RequestsMax = rtr.limiter.Max() } - return st } -// GetVolumeStatus -// Returns a VolumeStatus describing the requested volume. -// -func GetVolumeStatus(volume string) *VolumeStatus { - var fs syscall.Statfs_t - var devnum uint64 - - if fi, err := os.Stat(volume); err == nil { - devnum = fi.Sys().(*syscall.Stat_t).Dev - } else { - log.Printf("GetVolumeStatus: os.Stat: %s\n", err) - return nil +// return a WorkQueueStatus for the given queue. If q is nil (which +// should never happen except in test suites), return a zero status +// value instead of crashing. +func getWorkQueueStatus(q *WorkQueue) WorkQueueStatus { + if q == nil { + // This should only happen during tests. + return WorkQueueStatus{} } - - err := syscall.Statfs(volume, &fs) - if err != nil { - log.Printf("GetVolumeStatus: statfs: %s\n", err) - return nil - } - // These calculations match the way df calculates disk usage: - // "free" space is measured by fs.Bavail, but "used" space - // uses fs.Blocks - fs.Bfree. - free := fs.Bavail * uint64(fs.Bsize) - used := (fs.Blocks - fs.Bfree) * uint64(fs.Bsize) - return &VolumeStatus{volume, devnum, free, used} + return q.Status() } // DeleteHandler processes DELETE requests. @@ -328,13 +429,13 @@ func DeleteHandler(resp http.ResponseWriter, req *http.Request) { hash := mux.Vars(req)["hash"] // Confirm that this user is an admin and has a token with unlimited scope. 
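Stepping outside the DELETE hunk for a moment: the GetVolumeStatus code removed above computed disk usage the way df(1) does, taking "free" from fs.Bavail (blocks still available to unprivileged users) but "used" from fs.Blocks - fs.Bfree, which also counts the blocks reserved for root. A standalone sketch of that calculation (Linux-only, like the original):

```go
// Reproduces the df-style math from the removed GetVolumeStatus.
package main

import (
	"fmt"
	"log"
	"syscall"
)

func main() {
	var fs syscall.Statfs_t
	if err := syscall.Statfs("/", &fs); err != nil {
		log.Fatalf("statfs: %s", err)
	}
	// "Free" space is what unprivileged users can still allocate...
	free := fs.Bavail * uint64(fs.Bsize)
	// ...while "used" counts everything not free to anyone,
	// including root's reserved blocks.
	used := (fs.Blocks - fs.Bfree) * uint64(fs.Bsize)
	fmt.Printf("free=%d bytes, used=%d bytes\n", free, used)
}
```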
- var tok = GetApiToken(req) + var tok = GetAPIToken(req) if tok == "" || !CanDelete(tok) { http.Error(resp, PermissionError.Error(), PermissionError.HTTPCode) return } - if never_delete { + if !theConfig.EnableDelete { http.Error(resp, MethodDisabledError.Error(), MethodDisabledError.HTTPCode) return } @@ -347,7 +448,7 @@ func DeleteHandler(resp http.ResponseWriter, req *http.Request) { Failed int `json:"copies_failed"` } for _, vol := range KeepVM.AllWritable() { - if err := vol.Delete(hash); err == nil { + if err := vol.Trash(hash); err == nil { result.Deleted++ } else if os.IsNotExist(err) { continue @@ -371,7 +472,7 @@ func DeleteHandler(resp http.ResponseWriter, req *http.Request) { if body, err := json.Marshal(result); err == nil { resp.Write(body) } else { - log.Printf("json.Marshal: %s (result = %v)\n", err, result) + log.Printf("json.Marshal: %s (result = %v)", err, result) http.Error(resp, err.Error(), 500) } } @@ -410,14 +511,19 @@ func DeleteHandler(resp http.ResponseWriter, req *http.Request) { If the JSON unmarshalling fails, return 400 Bad Request. */ +// PullRequest consists of a block locator and an ordered list of servers type PullRequest struct { Locator string `json:"locator"` Servers []string `json:"servers"` + + // Destination mount, or "" for "anywhere" + MountUUID string `json:"mount_uuid"` } +// PullHandler processes "PUT /pull" requests for the data manager. func PullHandler(resp http.ResponseWriter, req *http.Request) { // Reject unauthorized requests. - if !IsDataManagerToken(GetApiToken(req)) { + if !IsSystemAuth(GetAPIToken(req)) { http.Error(resp, UnauthorizedError.Error(), UnauthorizedError.HTTPCode) return } @@ -426,7 +532,7 @@ func PullHandler(resp http.ResponseWriter, req *http.Request) { var pr []PullRequest r := json.NewDecoder(req.Body) if err := r.Decode(&pr); err != nil { - http.Error(resp, BadRequestError.Error(), BadRequestError.HTTPCode) + http.Error(resp, err.Error(), BadRequestError.HTTPCode) return } @@ -444,14 +550,19 @@ func PullHandler(resp http.ResponseWriter, req *http.Request) { pullq.ReplaceQueue(plist) } +// TrashRequest consists of a block locator and its Mtime type TrashRequest struct { Locator string `json:"locator"` BlockMtime int64 `json:"block_mtime"` + + // Target mount, or "" for "everywhere" + MountUUID string `json:"mount_uuid"` } +// TrashHandler processes /trash requests. func TrashHandler(resp http.ResponseWriter, req *http.Request) { // Reject unauthorized requests. - if !IsDataManagerToken(GetApiToken(req)) { + if !IsSystemAuth(GetAPIToken(req)) { http.Error(resp, UnauthorizedError.Error(), UnauthorizedError.HTTPCode) return } @@ -460,7 +571,7 @@ func TrashHandler(resp http.ResponseWriter, req *http.Request) { var trash []TrashRequest r := json.NewDecoder(req.Body) if err := r.Decode(&trash); err != nil { - http.Error(resp, BadRequestError.Error(), BadRequestError.HTTPCode) + http.Error(resp, err.Error(), BadRequestError.HTTPCode) return } @@ -478,7 +589,53 @@ func TrashHandler(resp http.ResponseWriter, req *http.Request) { trashq.ReplaceQueue(tlist) } -// ============================== +// UntrashHandler processes "PUT /untrash/{hash:[0-9a-f]{32}}" requests for the data manager. +func UntrashHandler(resp http.ResponseWriter, req *http.Request) { + // Reject unauthorized requests. 
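+	// Like /pull and /trash, untrash is limited to the cluster's
+	// system auth token; any other caller gets UnauthorizedError.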
+ if !IsSystemAuth(GetAPIToken(req)) { + http.Error(resp, UnauthorizedError.Error(), UnauthorizedError.HTTPCode) + return + } + + hash := mux.Vars(req)["hash"] + + if len(KeepVM.AllWritable()) == 0 { + http.Error(resp, "No writable volumes", http.StatusNotFound) + return + } + + var untrashedOn, failedOn []string + var numNotFound int + for _, vol := range KeepVM.AllWritable() { + err := vol.Untrash(hash) + + if os.IsNotExist(err) { + numNotFound++ + } else if err != nil { + log.Printf("Error untrashing %v on volume %v", hash, vol.String()) + failedOn = append(failedOn, vol.String()) + } else { + log.Printf("Untrashed %v on volume %v", hash, vol.String()) + untrashedOn = append(untrashedOn, vol.String()) + } + } + + if numNotFound == len(KeepVM.AllWritable()) { + http.Error(resp, "Block not found on any of the writable volumes", http.StatusNotFound) + return + } + + if len(failedOn) == len(KeepVM.AllWritable()) { + http.Error(resp, "Failed to untrash on all writable volumes", http.StatusInternalServerError) + } else { + respBody := "Successfully untrashed on: " + strings.Join(untrashedOn, ",") + if len(failedOn) > 0 { + respBody += "; Failed to untrash on: " + strings.Join(failedOn, ",") + } + resp.Write([]byte(respBody)) + } +} + // GetBlock and PutBlock implement lower-level code for handling // blocks by rooting through volumes connected to the local machine. // Once the handler has determined that system policy permits the @@ -490,38 +647,25 @@ func TrashHandler(resp http.ResponseWriter, req *http.Request) { // block is stored on, so it should be responsible for figuring out // which volume to check for fetching blocks, storing blocks, etc. -// ============================== -// GetBlock fetches and returns the block identified by "hash". If -// the update_timestamp argument is true, GetBlock also updates the -// block's file modification time (for the sake of PutBlock, which -// must update the file's timestamp when the block already exists). -// -// On success, GetBlock returns a byte slice with the block data, and -// a nil error. +// GetBlock fetches the block identified by "hash" into the provided +// buf, and returns the data size. // // If the block cannot be found on any volume, returns NotFoundError. // // If the block found does not have the correct MD5 hash, returns // DiskHashError. // - -func GetBlock(hash string, update_timestamp bool) ([]byte, error) { +func GetBlock(ctx context.Context, hash string, buf []byte, resp http.ResponseWriter) (int, error) { // Attempt to read the requested hash from a keep volume. - error_to_caller := NotFoundError - - var vols []Volume - if update_timestamp { - // Pointless to find the block on an unwritable volume - // because Touch() will fail -- this is as good as - // "not found" for purposes of callers who need to - // update_timestamp. - vols = KeepVM.AllWritable() - } else { - vols = KeepVM.AllReadable() - } + errorToCaller := NotFoundError - for _, vol := range vols { - buf, err := vol.Get(hash) + for _, vol := range KeepVM.AllReadable() { + size, err := vol.Get(ctx, hash, buf) + select { + case <-ctx.Done(): + return 0, ErrClientDisconnect + default: + } if err != nil { // IsNotExist is an expected error and may be // ignored. All other errors are logged. In @@ -529,185 +673,218 @@ func GetBlock(hash string, update_timestamp bool) ([]byte, error) { // volumes. If all volumes report IsNotExist, // we return a NotFoundError. 
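A side note on the cancellation pattern used in the volume loop above (the select with an empty default after each vol.Get): it polls ctx.Done() without blocking, so a client disconnect aborts the scan between volumes instead of waiting out the remaining reads. A self-contained sketch with stand-in names:

```go
// Non-blocking context check between steps of a scan: the default
// case makes the select return immediately when ctx is still live.
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

var errClientDisconnect = errors.New("client disconnected") // stand-in for ErrClientDisconnect

func scan(ctx context.Context, volumes []string) error {
	for _, v := range volumes {
		// ... read from volume v here ...
		select {
		case <-ctx.Done():
			return errClientDisconnect
		default:
		}
		fmt.Println("checked", v)
	}
	return nil
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond)
	defer cancel()
	time.Sleep(2 * time.Millisecond) // let the deadline pass
	fmt.Println(scan(ctx, []string{"vol0", "vol1"}))
}
```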
if !os.IsNotExist(err) { - log.Printf("GetBlock: reading %s: %s\n", hash, err) + log.Printf("%s: Get(%s): %s", vol, hash, err) + } + // If some volume returns a transient error, return it to the caller + // instead of "Not found" so it can retry. + if err == VolumeBusyError { + errorToCaller = err.(*KeepError) } continue } // Check the file checksum. // - filehash := fmt.Sprintf("%x", md5.Sum(buf)) + filehash := fmt.Sprintf("%x", md5.Sum(buf[:size])) if filehash != hash { // TODO: Try harder to tell a sysadmin about // this. - log.Printf("%s: checksum mismatch for request %s (actual %s)\n", + log.Printf("%s: checksum mismatch for request %s (actual %s)", vol, hash, filehash) - error_to_caller = DiskHashError + errorToCaller = DiskHashError continue } - if error_to_caller == DiskHashError { + if errorToCaller == DiskHashError { log.Printf("%s: checksum mismatch for request %s but a good copy was found on another volume and returned", vol, hash) } - if update_timestamp { - if err := vol.Touch(hash); err != nil { - error_to_caller = GenericError - log.Printf("%s: Touch %s failed: %s", - vol, hash, error_to_caller) - continue - } - } - return buf, nil + return size, nil } - return nil, error_to_caller + return 0, errorToCaller } -/* PutBlock(block, hash) - Stores the BLOCK (identified by the content id HASH) in Keep. - - The MD5 checksum of the block must be identical to the content id HASH. - If not, an error is returned. - - PutBlock stores the BLOCK on the first Keep volume with free space. - A failure code is returned to the user only if all volumes fail. - - On success, PutBlock returns nil. - On failure, it returns a KeepError with one of the following codes: - - 500 Collision - A different block with the same hash already exists on this - Keep server. - 422 MD5Fail - The MD5 hash of the BLOCK does not match the argument HASH. - 503 Full - There was not enough space left in any Keep volume to store - the object. - 500 Fail - The object could not be stored for some other reason (e.g. - all writes failed). The text of the error message should - provide as much detail as possible. -*/ - -func PutBlock(block []byte, hash string) error { +// PutBlock Stores the BLOCK (identified by the content id HASH) in Keep. +// +// PutBlock(ctx, block, hash) +// Stores the BLOCK (identified by the content id HASH) in Keep. +// +// The MD5 checksum of the block must be identical to the content id HASH. +// If not, an error is returned. +// +// PutBlock stores the BLOCK on the first Keep volume with free space. +// A failure code is returned to the user only if all volumes fail. +// +// On success, PutBlock returns nil. +// On failure, it returns a KeepError with one of the following codes: +// +// 500 Collision +// A different block with the same hash already exists on this +// Keep server. +// 422 MD5Fail +// The MD5 hash of the BLOCK does not match the argument HASH. +// 503 Full +// There was not enough space left in any Keep volume to store +// the object. +// 500 Fail +// The object could not be stored for some other reason (e.g. +// all writes failed). The text of the error message should +// provide as much detail as possible. +// +func PutBlock(ctx context.Context, block []byte, hash string) (int, error) { // Check that BLOCK's checksum matches HASH. 
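The checksum comparison that follows is a one-liner worth seeing in isolation: md5.Sum returns a [16]byte, and the %x verb renders it as the 32-digit lowercase hex string used in Keep locators. A standalone example, not keepstore code:

```go
package main

import (
	"crypto/md5"
	"fmt"
)

func main() {
	block := []byte("foo")
	hash := "acbd18db4cc2f85cedef654fccc4a4d8" // md5("foo")
	// Same construction PutBlock uses to verify the request hash.
	blockhash := fmt.Sprintf("%x", md5.Sum(block))
	fmt.Println("match:", blockhash == hash) // true
}
```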
blockhash := fmt.Sprintf("%x", md5.Sum(block)) if blockhash != hash { log.Printf("%s: MD5 checksum %s did not match request", hash, blockhash) - return RequestHashError - } - - // If we already have a block on disk under this identifier, return - // success (but check for MD5 collisions). While fetching the block, - // update its timestamp. - // The only errors that GetBlock can return are DiskHashError and NotFoundError. - // In either case, we want to write our new (good) block to disk, - // so there is nothing special to do if err != nil. - // - if oldblock, err := GetBlock(hash, true); err == nil { - if bytes.Compare(block, oldblock) == 0 { - // The block already exists; return success. - return nil - } else { - return CollisionError - } + return 0, RequestHashError + } + + // If we already have this data, it's intact on disk, and we + // can update its timestamp, return success. If we have + // different data with the same hash, return failure. + if n, err := CompareAndTouch(ctx, hash, block); err == nil || err == CollisionError { + return n, err + } else if ctx.Err() != nil { + return 0, ErrClientDisconnect } // Choose a Keep volume to write to. // If this volume fails, try all of the volumes in order. if vol := KeepVM.NextWritable(); vol != nil { - if err := vol.Put(hash, block); err == nil { - return nil // success! + if err := vol.Put(ctx, hash, block); err == nil { + return vol.Replication(), nil // success! + } + if ctx.Err() != nil { + return 0, ErrClientDisconnect } } writables := KeepVM.AllWritable() if len(writables) == 0 { log.Print("No writable volumes.") - return FullError + return 0, FullError } allFull := true for _, vol := range writables { - err := vol.Put(hash, block) + err := vol.Put(ctx, hash, block) + if ctx.Err() != nil { + return 0, ErrClientDisconnect + } if err == nil { - return nil // success! + return vol.Replication(), nil // success! } if err != FullError { // The volume is not full but the // write did not succeed. Report the // error and continue trying. allFull = false - log.Printf("%s: Write(%s): %s\n", vol, hash, err) + log.Printf("%s: Write(%s): %s", vol, hash, err) } } if allFull { log.Print("All volumes are full.") - return FullError - } else { - // Already logged the non-full errors. - return GenericError + return 0, FullError } + // Already logged the non-full errors. + return 0, GenericError } -// IsValidLocator -// Return true if the specified string is a valid Keep locator. -// When Keep is extended to support hash types other than MD5, -// this should be updated to cover those as well. +// CompareAndTouch returns the current replication level if one of the +// volumes already has the given content and it successfully updates +// the relevant block's modification time in order to protect it from +// premature garbage collection. Otherwise, it returns a non-nil +// error. +func CompareAndTouch(ctx context.Context, hash string, buf []byte) (int, error) { + var bestErr error = NotFoundError + for _, vol := range KeepVM.AllWritable() { + err := vol.Compare(ctx, hash, buf) + if ctx.Err() != nil { + return 0, ctx.Err() + } else if err == CollisionError { + // Stop if we have a block with same hash but + // different content. (It will be impossible + // to tell which one is wanted if we have + // both, so there's no point writing it even + // on a different volume.) + log.Printf("%s: Compare(%s): %s", vol, hash, err) + return 0, err + } else if os.IsNotExist(err) { + // Block does not exist. 
This is the only + // "normal" error: we don't log anything. + continue + } else if err != nil { + // Couldn't open file, data is corrupt on + // disk, etc.: log this abnormal condition, + // and try the next volume. + log.Printf("%s: Compare(%s): %s", vol, hash, err) + continue + } + if err := vol.Touch(hash); err != nil { + log.Printf("%s: Touch %s failed: %s", vol, hash, err) + bestErr = err + continue + } + // Compare and Touch both worked --> done. + return vol.Replication(), nil + } + return 0, bestErr +} + +var validLocatorRe = regexp.MustCompile(`^[0-9a-f]{32}$`) + +// IsValidLocator returns true if the specified string is a valid Keep locator. +// When Keep is extended to support hash types other than MD5, +// this should be updated to cover those as well. // func IsValidLocator(loc string) bool { - match, err := regexp.MatchString(`^[0-9a-f]{32}$`, loc) - if err == nil { - return match - } - log.Printf("IsValidLocator: %s\n", err) - return false + return validLocatorRe.MatchString(loc) } -// GetApiToken returns the OAuth2 token from the Authorization +var authRe = regexp.MustCompile(`^(OAuth2|Bearer)\s+(.*)`) + +// GetAPIToken returns the OAuth2 token from the Authorization // header of a HTTP request, or an empty string if no matching // token is found. -func GetApiToken(req *http.Request) string { +func GetAPIToken(req *http.Request) string { if auth, ok := req.Header["Authorization"]; ok { - if pat, err := regexp.Compile(`^OAuth2\s+(.*)`); err != nil { - log.Println(err) - } else if match := pat.FindStringSubmatch(auth[0]); match != nil { - return match[1] + if match := authRe.FindStringSubmatch(auth[0]); match != nil { + return match[2] } } return "" } // IsExpired returns true if the given Unix timestamp (expressed as a -// hexadecimal string) is in the past, or if timestamp_hex cannot be +// hexadecimal string) is in the past, or if timestampHex cannot be // parsed as a hexadecimal string. -func IsExpired(timestamp_hex string) bool { - ts, err := strconv.ParseInt(timestamp_hex, 16, 0) +func IsExpired(timestampHex string) bool { + ts, err := strconv.ParseInt(timestampHex, 16, 0) if err != nil { - log.Printf("IsExpired: %s\n", err) + log.Printf("IsExpired: %s", err) return true } return time.Unix(ts, 0).Before(time.Now()) } -// CanDelete returns true if the user identified by api_token is +// CanDelete returns true if the user identified by apiToken is // allowed to delete blocks. -func CanDelete(api_token string) bool { - if api_token == "" { +func CanDelete(apiToken string) bool { + if apiToken == "" { return false } // Blocks may be deleted only when Keep has been configured with a // data manager. - if IsDataManagerToken(api_token) { + if IsSystemAuth(apiToken) { return true } - // TODO(twp): look up api_token with the API server + // TODO(twp): look up apiToken with the API server // return true if is_admin is true and if the token // has unlimited scope return false } -// IsDataManagerToken returns true if api_token represents the data -// manager's token. -func IsDataManagerToken(api_token string) bool { - return data_manager_token != "" && api_token == data_manager_token +// IsSystemAuth returns true if the given token is allowed to perform +// system level actions like deleting data. +func IsSystemAuth(token string) bool { + return token != "" && token == theConfig.systemAuthToken }
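To close, a self-contained sketch combining two of the helpers above: GetAPIToken's Authorization-header parsing (now accepting both OAuth2 and Bearer schemes, where the old code compiled an OAuth2-only regexp on every call) and IsExpired's hexadecimal Unix-timestamp check used by signed locators. Lower-case names keep the example standalone:

```go
package main

import (
	"fmt"
	"net/http"
	"regexp"
	"strconv"
	"time"
)

// Compiled once at startup, as in the new code above.
var authRe = regexp.MustCompile(`^(OAuth2|Bearer)\s+(.*)`)

func getAPIToken(req *http.Request) string {
	if auth, ok := req.Header["Authorization"]; ok {
		if m := authRe.FindStringSubmatch(auth[0]); m != nil {
			return m[2] // the token follows the scheme
		}
	}
	return ""
}

func isExpired(timestampHex string) bool {
	ts, err := strconv.ParseInt(timestampHex, 16, 0)
	if err != nil {
		return true // unparseable timestamps are treated as expired
	}
	return time.Unix(ts, 0).Before(time.Now())
}

func main() {
	req, _ := http.NewRequest("GET", "http://keep.example/acbd18db4cc2f85cedef654fccc4a4d8", nil)
	req.Header.Set("Authorization", "Bearer exampletoken")
	fmt.Println("token:", getAPIToken(req))
	fmt.Println("expired(epoch)?", isExpired("00000000")) // 1970, long past: true
}
```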