X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/68c8b2056d6c72f0229cf29f999a8e0a1e4231f9..143a5f355100b837daa428231df0370b525a1f9f:/services/keepstore/handlers.go diff --git a/services/keepstore/handlers.go b/services/keepstore/handlers.go index c7559a1bee..3dcc6e1a2d 100644 --- a/services/keepstore/handlers.go +++ b/services/keepstore/handlers.go @@ -8,8 +8,6 @@ package main // StatusHandler (GET /status.json) import ( - "bufio" - "bytes" "container/list" "crypto/md5" "encoding/json" @@ -22,8 +20,7 @@ import ( "regexp" "runtime" "strconv" - "strings" - "syscall" + "sync" "time" ) @@ -41,35 +38,19 @@ func MakeRESTRouter() *mux.Router { rest.HandleFunc(`/{hash:[0-9a-f]{32}}`, PutBlockHandler).Methods("PUT") rest.HandleFunc(`/{hash:[0-9a-f]{32}}`, DeleteHandler).Methods("DELETE") - - // For IndexHandler we support: - // /index - returns all locators - // /index/{prefix} - returns all locators that begin with {prefix} - // {prefix} is a string of hexadecimal digits between 0 and 32 digits. - // If {prefix} is the empty string, return an index of all locators - // (so /index and /index/ behave identically) - // A client may supply a full 32-digit locator string, in which - // case the server will return an index with either zero or one - // entries. This usage allows a client to check whether a block is - // present, and its size and upload time, without retrieving the - // entire block. - // + // List all blocks stored here. Privileged client only. rest.HandleFunc(`/index`, IndexHandler).Methods("GET", "HEAD") - rest.HandleFunc( - `/index/{prefix:[0-9a-f]{0,32}}`, IndexHandler).Methods("GET", "HEAD") + // List blocks stored here whose hash has the given prefix. + // Privileged client only. + rest.HandleFunc(`/index/{prefix:[0-9a-f]{0,32}}`, IndexHandler).Methods("GET", "HEAD") + + // List volumes: path, device number, bytes used/avail. rest.HandleFunc(`/status.json`, StatusHandler).Methods("GET", "HEAD") - // The PullHandler and TrashHandler process "PUT /pull" and "PUT - // /trash" requests from Data Manager. These requests instruct - // Keep to replicate or delete blocks; see - // https://arvados.org/projects/arvados/wiki/Keep_Design_Doc - // for more details. - // - // Each handler parses the JSON list of block management requests - // in the message body, and replaces any existing pull queue or - // trash queue with their contentes. - // + // Replace the current pull queue. rest.HandleFunc(`/pull`, PullHandler).Methods("PUT") + + // Replace the current trash queue. rest.HandleFunc(`/trash`, TrashHandler).Methods("PUT") // Any request which does not match any of these routes gets @@ -83,143 +64,77 @@ func BadRequestHandler(w http.ResponseWriter, r *http.Request) { http.Error(w, BadRequestError.Error(), BadRequestError.HTTPCode) } -// FindKeepVolumes scans all mounted volumes on the system for Keep -// volumes, and returns a list of matching paths. -// -// A device is assumed to be a Keep volume if it is a normal or tmpfs -// volume and has a "/keep" directory directly underneath the mount -// point. -// -func FindKeepVolumes() []string { - vols := make([]string, 0) - - if f, err := os.Open(PROC_MOUNTS); err != nil { - log.Fatalf("opening %s: %s\n", PROC_MOUNTS, err) - } else { - scanner := bufio.NewScanner(f) - for scanner.Scan() { - args := strings.Fields(scanner.Text()) - dev, mount := args[0], args[1] - if mount != "/" && - (dev == "tmpfs" || strings.HasPrefix(dev, "/dev/")) { - keep := mount + "/keep" - if st, err := os.Stat(keep); err == nil && st.IsDir() { - vols = append(vols, keep) - } - } - } - if err := scanner.Err(); err != nil { - log.Fatal(err) - } - } - return vols -} - func GetBlockHandler(resp http.ResponseWriter, req *http.Request) { - hash := mux.Vars(req)["hash"] - - hints := mux.Vars(req)["hints"] - - // Parse the locator string and hints from the request. - // TODO(twp): implement a Locator type. - var signature, timestamp string - if hints != "" { - signature_pat, _ := regexp.Compile("^A([[:xdigit:]]+)@([[:xdigit:]]{8})$") - for _, hint := range strings.Split(hints, "+") { - if match, _ := regexp.MatchString("^[[:digit:]]+$", hint); match { - // Server ignores size hints - } else if m := signature_pat.FindStringSubmatch(hint); m != nil { - signature = m[1] - timestamp = m[2] - } else if match, _ := regexp.MatchString("^[[:upper:]]", hint); match { - // Any unknown hint that starts with an uppercase letter is - // presumed to be valid and ignored, to permit forward compatibility. - } else { - // Unknown format; not a valid locator. - http.Error(resp, BadRequestError.Error(), BadRequestError.HTTPCode) - return - } - } - } - - // If permission checking is in effect, verify this - // request's permission signature. if enforce_permissions { - if signature == "" || timestamp == "" { - http.Error(resp, PermissionError.Error(), PermissionError.HTTPCode) - return - } else if IsExpired(timestamp) { - http.Error(resp, ExpiredError.Error(), ExpiredError.HTTPCode) + locator := req.URL.Path[1:] // strip leading slash + if err := VerifySignature(locator, GetApiToken(req)); err != nil { + http.Error(resp, err.Error(), err.(*KeepError).HTTPCode) return - } else { - req_locator := req.URL.Path[1:] // strip leading slash - if !VerifySignature(req_locator, GetApiToken(req)) { - http.Error(resp, PermissionError.Error(), PermissionError.HTTPCode) - return - } } } - block, err := GetBlock(hash, false) - - // Garbage collect after each GET. Fixes #2865. - // TODO(twp): review Keep memory usage and see if there's - // a better way to do this than blindly garbage collecting - // after every block. - defer runtime.GC() - + block, err := GetBlock(mux.Vars(req)["hash"]) if err != nil { // This type assertion is safe because the only errors // GetBlock can return are DiskHashError or NotFoundError. http.Error(resp, err.Error(), err.(*KeepError).HTTPCode) return } + defer bufs.Put(block) - resp.Header().Set("Content-Length", fmt.Sprintf("%d", len(block))) - - _, err = resp.Write(block) - - return + resp.Header().Set("Content-Length", strconv.Itoa(len(block))) + resp.Header().Set("Content-Type", "application/octet-stream") + resp.Write(block) } func PutBlockHandler(resp http.ResponseWriter, req *http.Request) { - // Garbage collect after each PUT. Fixes #2865. - // See also GetBlockHandler. - defer runtime.GC() - hash := mux.Vars(req)["hash"] - // Read the block data to be stored. - // If the request exceeds BLOCKSIZE bytes, issue a HTTP 500 error. - // + // Detect as many error conditions as possible before reading + // the body: avoid transmitting data that will not end up + // being written anyway. + + if req.ContentLength == -1 { + http.Error(resp, SizeRequiredError.Error(), SizeRequiredError.HTTPCode) + return + } + if req.ContentLength > BLOCKSIZE { http.Error(resp, TooLongError.Error(), TooLongError.HTTPCode) return } - buf := make([]byte, req.ContentLength) - nread, err := io.ReadFull(req.Body, buf) + if len(KeepVM.AllWritable()) == 0 { + http.Error(resp, FullError.Error(), FullError.HTTPCode) + return + } + + buf := bufs.Get(int(req.ContentLength)) + _, err := io.ReadFull(req.Body, buf) if err != nil { http.Error(resp, err.Error(), 500) - } else if int64(nread) < req.ContentLength { - http.Error(resp, "request truncated", 500) - } else { - if err := PutBlock(buf, hash); err == nil { - // Success; add a size hint, sign the locator if - // possible, and return it to the client. - return_hash := fmt.Sprintf("%s+%d", hash, len(buf)) - api_token := GetApiToken(req) - if PermissionSecret != nil && api_token != "" { - expiry := time.Now().Add(permission_ttl) - return_hash = SignLocator(return_hash, api_token, expiry) - } - resp.Write([]byte(return_hash + "\n")) - } else { - ke := err.(*KeepError) - http.Error(resp, ke.Error(), ke.HTTPCode) - } + bufs.Put(buf) + return } - return + + err = PutBlock(buf, hash) + bufs.Put(buf) + + if err != nil { + ke := err.(*KeepError) + http.Error(resp, ke.Error(), ke.HTTPCode) + return + } + + // Success; add a size hint, sign the locator if possible, and + // return it to the client. + return_hash := fmt.Sprintf("%s+%d", hash, req.ContentLength) + api_token := GetApiToken(req) + if PermissionSecret != nil && api_token != "" { + expiry := time.Now().Add(blob_signature_ttl) + return_hash = SignLocator(return_hash, api_token, expiry) + } + resp.Write([]byte(return_hash + "\n")) } // IndexHandler @@ -234,11 +149,21 @@ func IndexHandler(resp http.ResponseWriter, req *http.Request) { prefix := mux.Vars(req)["prefix"] - var index string - for _, vol := range KeepVM.Volumes() { - index = index + vol.Index(prefix) + for _, vol := range KeepVM.AllReadable() { + if err := vol.IndexTo(prefix, resp); err != nil { + // The only errors returned by IndexTo are + // write errors returned by resp.Write(), + // which probably means the client has + // disconnected and this error will never be + // reported to the client -- but it will + // appear in our own error log. + http.Error(resp, err.Error(), http.StatusInternalServerError) + return + } } - resp.Write([]byte(index)) + // An empty line at EOF is the only way the client can be + // assured the entire index was received. + resp.Write([]byte{'\n'}) } // StatusHandler @@ -252,68 +177,67 @@ func IndexHandler(resp http.ResponseWriter, req *http.Request) { // * device_num (an integer identifying the underlying filesystem) // * bytes_free // * bytes_used -// -type VolumeStatus struct { - MountPoint string `json:"mount_point"` - DeviceNum uint64 `json:"device_num"` - BytesFree uint64 `json:"bytes_free"` - BytesUsed uint64 `json:"bytes_used"` + +type PoolStatus struct { + Alloc uint64 `json:"BytesAllocated"` + Cap int `json:"BuffersMax"` + Len int `json:"BuffersInUse"` } type NodeStatus struct { - Volumes []*VolumeStatus `json:"volumes"` + Volumes []*VolumeStatus `json:"volumes"` + BufferPool PoolStatus + PullQueue WorkQueueStatus + TrashQueue WorkQueueStatus + Memory runtime.MemStats } +var st NodeStatus +var stLock sync.Mutex + func StatusHandler(resp http.ResponseWriter, req *http.Request) { - st := GetNodeStatus() - if jstat, err := json.Marshal(st); err == nil { + stLock.Lock() + readNodeStatus(&st) + jstat, err := json.Marshal(&st) + stLock.Unlock() + if err == nil { resp.Write(jstat) } else { - log.Printf("json.Marshal: %s\n", err) - log.Printf("NodeStatus = %v\n", st) + log.Printf("json.Marshal: %s", err) + log.Printf("NodeStatus = %v", &st) http.Error(resp, err.Error(), 500) } } -// GetNodeStatus -// Returns a NodeStatus struct describing this Keep -// node's current status. -// -func GetNodeStatus() *NodeStatus { - st := new(NodeStatus) - - st.Volumes = make([]*VolumeStatus, len(KeepVM.Volumes())) - for i, vol := range KeepVM.Volumes() { - st.Volumes[i] = vol.Status() +// populate the given NodeStatus struct with current values. +func readNodeStatus(st *NodeStatus) { + vols := KeepVM.AllReadable() + if cap(st.Volumes) < len(vols) { + st.Volumes = make([]*VolumeStatus, len(vols)) } - return st -} - -// GetVolumeStatus -// Returns a VolumeStatus describing the requested volume. -// -func GetVolumeStatus(volume string) *VolumeStatus { - var fs syscall.Statfs_t - var devnum uint64 - - if fi, err := os.Stat(volume); err == nil { - devnum = fi.Sys().(*syscall.Stat_t).Dev - } else { - log.Printf("GetVolumeStatus: os.Stat: %s\n", err) - return nil + st.Volumes = st.Volumes[:0] + for _, vol := range vols { + if s := vol.Status(); s != nil { + st.Volumes = append(st.Volumes, s) + } } + st.BufferPool.Alloc = bufs.Alloc() + st.BufferPool.Cap = bufs.Cap() + st.BufferPool.Len = bufs.Len() + st.PullQueue = getWorkQueueStatus(pullq) + st.TrashQueue = getWorkQueueStatus(trashq) + runtime.ReadMemStats(&st.Memory) +} - err := syscall.Statfs(volume, &fs) - if err != nil { - log.Printf("GetVolumeStatus: statfs: %s\n", err) - return nil +// return a WorkQueueStatus for the given queue. If q is nil (which +// should never happen except in test suites), return a zero status +// value instead of crashing. +func getWorkQueueStatus(q *WorkQueue) WorkQueueStatus { + if q == nil { + // This should only happen during tests. + return WorkQueueStatus{} } - // These calculations match the way df calculates disk usage: - // "free" space is measured by fs.Bavail, but "used" space - // uses fs.Blocks - fs.Bfree. - free := fs.Bavail * uint64(fs.Bsize) - used := (fs.Blocks - fs.Bfree) * uint64(fs.Bsize) - return &VolumeStatus{volume, devnum, free, used} + return q.Status() } // DeleteHandler processes DELETE requests. @@ -358,14 +282,14 @@ func DeleteHandler(resp http.ResponseWriter, req *http.Request) { return } - // Delete copies of this block from all available volumes. Report - // how many blocks were successfully and unsuccessfully - // deleted. + // Delete copies of this block from all available volumes. + // Report how many blocks were successfully deleted, and how + // many were found on writable volumes but not deleted. var result struct { Deleted int `json:"copies_deleted"` Failed int `json:"copies_failed"` } - for _, vol := range KeepVM.Volumes() { + for _, vol := range KeepVM.AllWritable() { if err := vol.Delete(hash); err == nil { result.Deleted++ } else if os.IsNotExist(err) { @@ -390,7 +314,7 @@ func DeleteHandler(resp http.ResponseWriter, req *http.Request) { if body, err := json.Marshal(result); err == nil { resp.Write(body) } else { - log.Printf("json.Marshal: %s (result = %v)\n", err, result) + log.Printf("json.Marshal: %s (result = %v)", err, result) http.Error(resp, err.Error(), 500) } } @@ -445,7 +369,7 @@ func PullHandler(resp http.ResponseWriter, req *http.Request) { var pr []PullRequest r := json.NewDecoder(req.Body) if err := r.Decode(&pr); err != nil { - http.Error(resp, BadRequestError.Error(), BadRequestError.HTTPCode) + http.Error(resp, err.Error(), BadRequestError.HTTPCode) return } @@ -460,10 +384,6 @@ func PullHandler(resp http.ResponseWriter, req *http.Request) { for _, p := range pr { plist.PushBack(p) } - - if pullq == nil { - pullq = NewWorkQueue() - } pullq.ReplaceQueue(plist) } @@ -483,7 +403,7 @@ func TrashHandler(resp http.ResponseWriter, req *http.Request) { var trash []TrashRequest r := json.NewDecoder(req.Body) if err := r.Decode(&trash); err != nil { - http.Error(resp, BadRequestError.Error(), BadRequestError.HTTPCode) + http.Error(resp, err.Error(), BadRequestError.HTTPCode) return } @@ -498,10 +418,6 @@ func TrashHandler(resp http.ResponseWriter, req *http.Request) { for _, t := range trash { tlist.PushBack(t) } - - if trashq == nil { - trashq = NewWorkQueue() - } trashq.ReplaceQueue(tlist) } @@ -518,10 +434,7 @@ func TrashHandler(resp http.ResponseWriter, req *http.Request) { // which volume to check for fetching blocks, storing blocks, etc. // ============================== -// GetBlock fetches and returns the block identified by "hash". If -// the update_timestamp argument is true, GetBlock also updates the -// block's file modification time (for the sake of PutBlock, which -// must update the file's timestamp when the block already exists). +// GetBlock fetches and returns the block identified by "hash". // // On success, GetBlock returns a byte slice with the block data, and // a nil error. @@ -532,56 +445,40 @@ func TrashHandler(resp http.ResponseWriter, req *http.Request) { // DiskHashError. // -func GetBlock(hash string, update_timestamp bool) ([]byte, error) { +func GetBlock(hash string) ([]byte, error) { // Attempt to read the requested hash from a keep volume. error_to_caller := NotFoundError - for _, vol := range KeepVM.Volumes() { - if buf, err := vol.Get(hash); err != nil { - // IsNotExist is an expected error and may be ignored. - // (If all volumes report IsNotExist, we return a NotFoundError) - // All other errors should be logged but we continue trying to - // read. - switch { - case os.IsNotExist(err): - continue - default: - log.Printf("GetBlock: reading %s: %s\n", hash, err) - } - } else { - // Double check the file checksum. - // - filehash := fmt.Sprintf("%x", md5.Sum(buf)) - if filehash != hash { - // TODO(twp): this condition probably represents a bad disk and - // should raise major alarm bells for an administrator: e.g. - // they should be sent directly to an event manager at high - // priority or logged as urgent problems. - // - log.Printf("%s: checksum mismatch for request %s (actual %s)\n", - vol, hash, filehash) - error_to_caller = DiskHashError - } else { - // Success! - if error_to_caller != NotFoundError { - log.Printf("%s: checksum mismatch for request %s but a good copy was found on another volume and returned\n", - vol, hash) - } - // Update the timestamp if the caller requested. - // If we could not update the timestamp, continue looking on - // other volumes. - if update_timestamp { - if vol.Touch(hash) != nil { - continue - } - } - return buf, nil + for _, vol := range KeepVM.AllReadable() { + buf, err := vol.Get(hash) + if err != nil { + // IsNotExist is an expected error and may be + // ignored. All other errors are logged. In + // any case we continue trying to read other + // volumes. If all volumes report IsNotExist, + // we return a NotFoundError. + if !os.IsNotExist(err) { + log.Printf("%s: Get(%s): %s", vol, hash, err) } + continue } - } - - if error_to_caller != NotFoundError { - log.Printf("%s: checksum mismatch, no good copy found\n", hash) + // Check the file checksum. + // + filehash := fmt.Sprintf("%x", md5.Sum(buf)) + if filehash != hash { + // TODO: Try harder to tell a sysadmin about + // this. + log.Printf("%s: checksum mismatch for request %s (actual %s)", + vol, hash, filehash) + error_to_caller = DiskHashError + bufs.Put(buf) + continue + } + if error_to_caller == DiskHashError { + log.Printf("%s: checksum mismatch for request %s but a good copy was found on another volume and returned", + vol, hash) + } + return buf, nil } return nil, error_to_caller } @@ -620,74 +517,107 @@ func PutBlock(block []byte, hash string) error { return RequestHashError } - // If we already have a block on disk under this identifier, return - // success (but check for MD5 collisions). While fetching the block, - // update its timestamp. - // The only errors that GetBlock can return are DiskHashError and NotFoundError. - // In either case, we want to write our new (good) block to disk, - // so there is nothing special to do if err != nil. - // - if oldblock, err := GetBlock(hash, true); err == nil { - if bytes.Compare(block, oldblock) == 0 { - // The block already exists; return success. - return nil - } else { - return CollisionError - } + // If we already have this data, it's intact on disk, and we + // can update its timestamp, return success. If we have + // different data with the same hash, return failure. + if err := CompareAndTouch(hash, block); err == nil || err == CollisionError { + return err } // Choose a Keep volume to write to. // If this volume fails, try all of the volumes in order. - vol := KeepVM.Choose() - if err := vol.Put(hash, block); err == nil { - return nil // success! - } else { - allFull := true - for _, vol := range KeepVM.Volumes() { - err := vol.Put(hash, block) - if err == nil { - return nil // success! - } - if err != FullError { - // The volume is not full but the write did not succeed. - // Report the error and continue trying. - allFull = false - log.Printf("%s: Write(%s): %s\n", vol, hash, err) - } + if vol := KeepVM.NextWritable(); vol != nil { + if err := vol.Put(hash, block); err == nil { + return nil // success! } + } - if allFull { - log.Printf("all Keep volumes full") - return FullError - } else { - log.Printf("all Keep volumes failed") - return GenericError + writables := KeepVM.AllWritable() + if len(writables) == 0 { + log.Print("No writable volumes.") + return FullError + } + + allFull := true + for _, vol := range writables { + err := vol.Put(hash, block) + if err == nil { + return nil // success! + } + if err != FullError { + // The volume is not full but the + // write did not succeed. Report the + // error and continue trying. + allFull = false + log.Printf("%s: Write(%s): %s", vol, hash, err) + } + } + + if allFull { + log.Print("All volumes are full.") + return FullError + } else { + // Already logged the non-full errors. + return GenericError + } +} + +// CompareAndTouch returns nil if one of the volumes already has the +// given content and it successfully updates the relevant block's +// modification time in order to protect it from premature garbage +// collection. +func CompareAndTouch(hash string, buf []byte) error { + var bestErr error = NotFoundError + for _, vol := range KeepVM.AllWritable() { + if err := vol.Compare(hash, buf); err == CollisionError { + // Stop if we have a block with same hash but + // different content. (It will be impossible + // to tell which one is wanted if we have + // both, so there's no point writing it even + // on a different volume.) + log.Printf("%s: Compare(%s): %s", vol, hash, err) + return err + } else if os.IsNotExist(err) { + // Block does not exist. This is the only + // "normal" error: we don't log anything. + continue + } else if err != nil { + // Couldn't open file, data is corrupt on + // disk, etc.: log this abnormal condition, + // and try the next volume. + log.Printf("%s: Compare(%s): %s", vol, hash, err) + continue + } + if err := vol.Touch(hash); err != nil { + log.Printf("%s: Touch %s failed: %s", vol, hash, err) + bestErr = err + continue } + // Compare and Touch both worked --> done. + return nil } + return bestErr } +var validLocatorRe = regexp.MustCompile(`^[0-9a-f]{32}$`) + // IsValidLocator // Return true if the specified string is a valid Keep locator. // When Keep is extended to support hash types other than MD5, // this should be updated to cover those as well. // func IsValidLocator(loc string) bool { - match, err := regexp.MatchString(`^[0-9a-f]{32}$`, loc) - if err == nil { - return match - } - log.Printf("IsValidLocator: %s\n", err) - return false + return validLocatorRe.MatchString(loc) } +var authRe = regexp.MustCompile(`^OAuth2\s+(.*)`) + // GetApiToken returns the OAuth2 token from the Authorization // header of a HTTP request, or an empty string if no matching // token is found. func GetApiToken(req *http.Request) string { if auth, ok := req.Header["Authorization"]; ok { - if pat, err := regexp.Compile(`^OAuth2\s+(.*)`); err != nil { - log.Println(err) - } else if match := pat.FindStringSubmatch(auth[0]); match != nil { + if match := authRe.FindStringSubmatch(auth[0]); match != nil { return match[1] } } @@ -700,7 +630,7 @@ func GetApiToken(req *http.Request) string { func IsExpired(timestamp_hex string) bool { ts, err := strconv.ParseInt(timestamp_hex, 16, 0) if err != nil { - log.Printf("IsExpired: %s\n", err) + log.Printf("IsExpired: %s", err) return true } return time.Unix(ts, 0).Before(time.Now())