From 0ab78942d824ad28b69d695538a80c8800f9a4c2 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Sun, 21 Apr 2024 18:35:27 -0400 Subject: [PATCH] 21696: Comment optimized PortableDataHash func. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- sdk/go/arvados/collection.go | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/sdk/go/arvados/collection.go b/sdk/go/arvados/collection.go index b157c4c8a8..1e9616c428 100644 --- a/sdk/go/arvados/collection.go +++ b/sdk/go/arvados/collection.go @@ -107,27 +107,49 @@ type CollectionList struct { // PortableDataHash computes the portable data hash of the given // manifest. func PortableDataHash(mt string) string { + // To calculate the PDH, we write the manifest to an md5 hash + // func, except we skip the "extra" part of block tokens that + // look like "abcdef0123456789abcdef0123456789+12345+extra". + // + // This code is simplified by the facts that (A) all block + // tokens -- even the first and last in a stream -- are + // preceded and followed by a space character; and (B) all + // non-block tokens either start with '.' or contain ':'. + // + // A regexp-based approach (like the one this replaced) would + // be more readable, but very slow. h := md5.New() size := 0 todo := []byte(mt) for len(todo) > 0 { + // sp is the end of the current token (note that if + // the current token is the last file token in a + // stream, we'll also include the \n and the dirname + // token on the next line, which is perfectly fine for + // our purposes). sp := bytes.IndexByte(todo, ' ') if sp < 0 { + // Last token of the manifest, which is never + // a block token. n, _ := h.Write(todo) size += n break } - if sp >= 34 && todo[32] == '+' && bytes.IndexByte(todo[:32], ':') == -1 { + if sp >= 34 && todo[32] == '+' && bytes.IndexByte(todo[:32], ':') == -1 && todo[0] != '.' { + // todo[:sp] is a block token. sizeend := bytes.IndexByte(todo[33:sp], '+') if sizeend < 0 { + // "hash+size" sizeend = sp } else { + // "hash+size+extra" sizeend += 33 } n, _ := h.Write(todo[:sizeend]) h.Write([]byte{' '}) size += n + 1 } else { + // todo[:sp] is not a block token. n, _ := h.Write(todo[:sp+1]) size += n } -- 2.39.5