X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/1e3eacb0ca6f2228f50f13514c7577a149a707e6..0ab78942d824ad28b69d695538a80c8800f9a4c2:/sdk/go/arvados/collection.go

diff --git a/sdk/go/arvados/collection.go b/sdk/go/arvados/collection.go
index 030665d77f..1e9616c428 100644
--- a/sdk/go/arvados/collection.go
+++ b/sdk/go/arvados/collection.go
@@ -5,7 +5,7 @@
 package arvados
 
 import (
-	"bufio"
+	"bytes"
 	"crypto/md5"
 	"fmt"
 	"regexp"
@@ -15,6 +15,11 @@ import (
 	"git.arvados.org/arvados.git/sdk/go/blockdigest"
 )
 
+var (
+	UUIDMatch = regexp.MustCompile(`^[a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15}$`).MatchString // reports whether s is exactly 5-5-15 chars of [a-z0-9] joined by '-'
+	PDHMatch  = regexp.MustCompile(`^[0-9a-f]{32}\+\d+$`).MatchString                    // reports whether s is exactly 32 lowercase hex digits, '+', then one or more digits
+)
+
 // Collection is an arvados#collection resource.
 type Collection struct {
 	UUID                      string                 `json:"uuid"`
@@ -53,37 +58,43 @@ func (c Collection) resourceName() string {
 
 // SizedDigests returns the hash+size part of each data block
 // referenced by the collection.
+// Zero-length blocks are not included. An error is returned if the
+// manifest is missing or a stream line has fewer than 3 tokens.
 func (c *Collection) SizedDigests() ([]SizedDigest, error) {
-	manifestText := c.ManifestText
-	if manifestText == "" {
-		manifestText = c.UnsignedManifestText
+	manifestText := []byte(c.ManifestText)
+	if len(manifestText) == 0 {
+		manifestText = []byte(c.UnsignedManifestText) // fall back to the unsigned manifest text
 	}
-	if manifestText == "" && c.PortableDataHash != "d41d8cd98f00b204e9800998ecf8427e+0" {
+	if len(manifestText) == 0 && c.PortableDataHash != "d41d8cd98f00b204e9800998ecf8427e+0" {
 		// TODO: Check more subtle forms of corruption, too
 		return nil, fmt.Errorf("manifest is missing")
 	}
-	var sds []SizedDigest
-	scanner := bufio.NewScanner(strings.NewReader(manifestText))
-	scanner.Buffer(make([]byte, 1048576), len(manifestText))
-	for scanner.Scan() {
-		line := scanner.Text()
-		tokens := strings.Split(line, " ")
+	sds := make([]SizedDigest, 0, len(manifestText)/40) // preallocate; /40 is presumably a rough bytes-per-locator estimate (TODO confirm)
+	for _, line := range bytes.Split(manifestText, []byte{'\n'}) {
+		if len(line) == 0 { // bytes.Split yields an empty element after a trailing newline
+			continue
+		}
+		tokens := bytes.Split(line, []byte{' '})
 		if len(tokens) < 3 {
 			return nil, fmt.Errorf("Invalid stream (<3 tokens): %q", line)
 		}
 		for _, token := range tokens[1:] {
-			if !blockdigest.LocatorPattern.MatchString(token) {
+			if !blockdigest.LocatorPattern.Match(token) {
 				// FIXME: ensure it's a file token
 				break
 			}
+			if bytes.HasPrefix(token, []byte("d41d8cd98f00b204e9800998ecf8427e+0")) {
+				// Exclude the "empty block" placeholder, with or without hints.
+				continue
+			}
 			// FIXME: shouldn't assume 32 char hash
-			if i := strings.IndexRune(token[33:], '+'); i >= 0 {
+			if i := bytes.IndexRune(token[33:], '+'); i >= 0 { // a '+' after the size starts the hints; cut the token there
 				token = token[:33+i]
 			}
-			sds = append(sds, SizedDigest(token))
+			sds = append(sds, SizedDigest(string(token)))
 		}
 	}
-	return sds, scanner.Err()
+	return sds, nil
 }
 
 type CollectionList struct {
@@ -93,27 +104,78 @@ type CollectionList struct {
 	Limit          int          `json:"limit"`
 }
 
-var (
-	blkRe = regexp.MustCompile(`^ [0-9a-f]{32}\+\d+`)
-	tokRe = regexp.MustCompile(` ?[^ ]*`)
-)
-
 // PortableDataHash computes the portable data hash of the given
 // manifest.
 func PortableDataHash(mt string) string {
+	// The PDH is the md5 of the manifest, skipping the "extra" part
+	// of block tokens like "abcdef0123456789abcdef0123456789+12345+extra",
+	// followed by "+" and the number of bytes that were hashed.
+	//
+	// This code is simplified by the facts that (A) all block
+	// tokens -- even the first and last in a stream -- are
+	// preceded and followed by a space character; and (B) all
+	// non-block tokens either start with '.' or contain ':'.
+	//
+	// A regexp-based approach (like the one this replaced) would
+	// be more readable, but very slow.
 	h := md5.New()
 	size := 0
-	_ = tokRe.ReplaceAllFunc([]byte(mt), func(tok []byte) []byte {
-		if m := blkRe.Find(tok); m != nil {
-			// write hash+size, ignore remaining block hints
-			tok = m
+	todo := []byte(mt) // unprocessed remainder of the manifest
+	for len(todo) > 0 {
+		// sp is the end of the current token (note that if
+		// the current token is the last file token in a
+		// stream, we'll also include the \n and the dirname
+		// token on the next line, which is perfectly fine for
+		// our purposes).
+		sp := bytes.IndexByte(todo, ' ')
+		if sp < 0 {
+			// Last token of the manifest, which is never
+			// a block token.
+			n, _ := h.Write(todo) // hash.Hash.Write never returns an error
+			size += n
+			break
 		}
-		n, err := h.Write(tok)
-		if err != nil {
-			panic(err)
+		if sp >= 34 && todo[32] == '+' && bytes.IndexByte(todo[:32], ':') == -1 && todo[0] != '.' {
+			// todo[:sp] is a block token: 34+ bytes, '+' at offset 32, no ':' before it, no leading '.'.
+			sizeend := bytes.IndexByte(todo[33:sp], '+') // a second '+', if any, marks the end of the size
+			if sizeend < 0 {
+				// "hash+size"
+				sizeend = sp
+			} else {
+				// "hash+size+extra"
+				sizeend += 33
+			}
+			n, _ := h.Write(todo[:sizeend])
+			h.Write([]byte{' '}) // the separator that followed the token
+			size += n + 1
+		} else {
+			// todo[:sp] is not a block token; hash it together with its trailing space.
+			n, _ := h.Write(todo[:sp+1])
+			size += n
 		}
-		size += n
-		return nil
-	})
+		todo = todo[sp+1:] // continue after the separating space
+	}
 	return fmt.Sprintf("%x+%d", h.Sum(nil), size)
 }
+
+// CollectionIDFromDNSName returns the UUID or PDH named by the first DNS
+// label of s, where a PDH is written with '-' in place of '+'; otherwise "".
+func CollectionIDFromDNSName(s string) string {
+	// Strip domain: keep only the part before the first '.'.
+	if i := strings.IndexRune(s, '.'); i >= 0 {
+		s = s[:i]
+	}
+	// Names like {uuid}--collections.example.com serve the same
+	// purpose as {uuid}.collections.example.com but can reduce
+	// cost/effort of using [additional] wildcard certificates.
+	if i := strings.Index(s, "--"); i >= 0 {
+		s = s[:i]
+	}
+	if UUIDMatch(s) {
+		return s
+	}
+	if pdh := strings.Replace(s, "-", "+", 1); PDHMatch(pdh) { // only the first '-' is mapped back to '+'
+		return pdh
+	}
+	return ""
+}