package arvados
import (
- "bufio"
+ "bytes"
+ "crypto/md5"
"fmt"
+ "regexp"
"strings"
"time"
- "git.curoverse.com/arvados.git/sdk/go/blockdigest"
+ "git.arvados.org/arvados.git/sdk/go/blockdigest"
+)
+
+var (
+ UUIDMatch = regexp.MustCompile(`^[a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15}$`).MatchString // UUIDMatch reports whether a string consists of three dash-separated groups of 5, 5 and 15 characters from [a-z0-9].
+ PDHMatch = regexp.MustCompile(`^[0-9a-f]{32}\+\d+$`).MatchString // PDHMatch reports whether a string is 32 lowercase hex digits followed by "+" and one or more decimal digits.
)
// Collection is an arvados#collection resource.
ManifestText string `json:"manifest_text"`
UnsignedManifestText string `json:"unsigned_manifest_text"`
Name string `json:"name"`
- CreatedAt *time.Time `json:"created_at"`
- ModifiedAt *time.Time `json:"modified_at"`
+ CreatedAt time.Time `json:"created_at"`
+ ModifiedAt time.Time `json:"modified_at"`
+ ModifiedByClientUUID string `json:"modified_by_client_uuid"`
+ ModifiedByUserUUID string `json:"modified_by_user_uuid"`
PortableDataHash string `json:"portable_data_hash"`
ReplicationConfirmed *int `json:"replication_confirmed"`
ReplicationConfirmedAt *time.Time `json:"replication_confirmed_at"`
DeleteAt *time.Time `json:"delete_at"`
IsTrashed bool `json:"is_trashed"`
Properties map[string]interface{} `json:"properties"`
+ WritableBy []string `json:"writable_by,omitempty"`
+ FileCount int `json:"file_count"`
+ FileSizeTotal int64 `json:"file_size_total"`
+ Version int `json:"version"`
+ PreserveVersion bool `json:"preserve_version"`
+ CurrentVersionUUID string `json:"current_version_uuid"`
+ Description string `json:"description"`
}
func (c Collection) resourceName() string {
// SizedDigests returns the hash+size part of each data block
// referenced by the collection.
+//
+// Zero-length blocks are not included. An error is returned if no manifest text is available (unless the portable data hash is the empty-manifest value) or if a non-empty manifest line has fewer than 3 space-separated tokens.
func (c *Collection) SizedDigests() ([]SizedDigest, error) {
- manifestText := c.ManifestText
- if manifestText == "" {
- manifestText = c.UnsignedManifestText
+ manifestText := []byte(c.ManifestText) // prefer ManifestText; fall back to UnsignedManifestText below
+ if len(manifestText) == 0 {
+ manifestText = []byte(c.UnsignedManifestText)
}
- if manifestText == "" && c.PortableDataHash != "d41d8cd98f00b204e9800998ecf8427e+0" {
+ if len(manifestText) == 0 && c.PortableDataHash != "d41d8cd98f00b204e9800998ecf8427e+0" { // an empty manifest is accepted only when the PDH is this "+0" (zero-size) value
// TODO: Check more subtle forms of corruption, too
return nil, fmt.Errorf("manifest is missing")
}
- var sds []SizedDigest
- scanner := bufio.NewScanner(strings.NewReader(manifestText))
- scanner.Buffer(make([]byte, 1048576), len(manifestText))
- for scanner.Scan() {
- line := scanner.Text()
- tokens := strings.Split(line, " ")
+ sds := make([]SizedDigest, 0, len(manifestText)/40) // capacity guess: one digest per 40 bytes of manifest
+ for _, line := range bytes.Split(manifestText, []byte{'\n'}) { // each non-empty line is treated as one stream
+ if len(line) == 0 { // bytes.Split yields an empty final element when the text ends with '\n'
+ continue
+ }
+ tokens := bytes.Split(line, []byte{' '})
if len(tokens) < 3 {
return nil, fmt.Errorf("Invalid stream (<3 tokens): %q", line)
}
for _, token := range tokens[1:] { // tokens[0] is skipped (presumably the stream name -- confirm against the manifest format)
- if !blockdigest.LocatorPattern.MatchString(token) {
+ if !blockdigest.LocatorPattern.Match(token) {
// FIXME: ensure it's a file token
break // the first token that is not a block locator ends the scan of this line
}
+ if bytes.HasPrefix(token, []byte("d41d8cd98f00b204e9800998ecf8427e+0")) {
+ // Exclude "empty block" placeholder
+ continue
+ }
// FIXME: shouldn't assume 32 char hash
- if i := strings.IndexRune(token[33:], '+'); i >= 0 {
+ if i := bytes.IndexRune(token[33:], '+'); i >= 0 { // look for a second '+' after the one at offset 32
token = token[:33+i] // keep only "hash+size", dropping everything from the second '+' on
}
- sds = append(sds, SizedDigest(token))
+ sds = append(sds, SizedDigest(string(token)))
}
}
- return sds, scanner.Err()
+ return sds, nil
}
type CollectionList struct {
Offset int `json:"offset"`
Limit int `json:"limit"`
}
+
+// PortableDataHash computes the portable data hash of the given
+// manifest. The result is formatted as "<md5 hex>+<size>", where the md5 is taken over the manifest with everything after "hash+size" removed from each block token, and size is the number of bytes that were hashed.
+func PortableDataHash(mt string) string {
+ // To calculate the PDH, we write the manifest to an md5 hash
+ // func, except we skip the "extra" part of block tokens that
+ // look like "abcdef0123456789abcdef0123456789+12345+extra".
+ //
+ // This code is simplified by the facts that (A) all block
+ // tokens -- even the first and last in a stream -- are
+ // preceded and followed by a space character; and (B) all
+ // non-block tokens either start with '.' or contain ':'.
+ //
+ // A regexp-based approach (like the one this replaced) would
+ // be more readable, but very slow.
+ h := md5.New()
+ size := 0 // running count of bytes written to h; becomes the "+<size>" suffix
+ todo := []byte(mt) // unprocessed remainder of the manifest
+ for len(todo) > 0 {
+ // sp is the end of the current token (note that if
+ // the current token is the last file token in a
+ // stream, we'll also include the \n and the dirname
+ // token on the next line, which is perfectly fine for
+ // our purposes).
+ sp := bytes.IndexByte(todo, ' ')
+ if sp < 0 {
+ // Last token of the manifest, which is never
+ // a block token.
+ n, _ := h.Write(todo) // hash.Hash documents that Write never returns an error
+ size += n
+ break
+ }
+ if sp >= 34 && todo[32] == '+' && bytes.IndexByte(todo[:32], ':') == -1 && todo[0] != '.' { // >=34 bytes leaves room for a 32-byte hash, the '+' at offset 32, and at least one size byte
+ // todo[:sp] is a block token.
+ sizeend := bytes.IndexByte(todo[33:sp], '+') // offset of a second '+' relative to index 33, or -1 if there is none
+ if sizeend < 0 {
+ // "hash+size"
+ sizeend = sp
+ } else {
+ // "hash+size+extra"
+ sizeend += 33
+ }
+ n, _ := h.Write(todo[:sizeend])
+ h.Write([]byte{' '}) // write back the separating space that followed the token
+ size += n + 1
+ } else {
+ // todo[:sp] is not a block token.
+ n, _ := h.Write(todo[:sp+1]) // copied verbatim, including the trailing space
+ size += n
+ }
+ todo = todo[sp+1:]
+ }
+ return fmt.Sprintf("%x+%d", h.Sum(nil), size)
+}
+
+// CollectionIDFromDNSName returns a UUID or PDH if s begins with a
+// UUID or URL-encoded PDH; otherwise "". Here an encoded PDH is one whose "+" is written as "-"; the returned PDH has the "+" restored.
+func CollectionIDFromDNSName(s string) string {
+ // Strip domain.
+ if i := strings.IndexRune(s, '.'); i >= 0 {
+ s = s[:i] // keep only the part before the first '.'
+ }
+ // Names like {uuid}--collections.example.com serve the same
+ // purpose as {uuid}.collections.example.com but can reduce
+ // cost/effort of using [additional] wildcard certificates.
+ if i := strings.Index(s, "--"); i >= 0 {
+ s = s[:i] // keep only the part before the first "--"
+ }
+ if UUIDMatch(s) {
+ return s
+ }
+ if pdh := strings.Replace(s, "-", "+", 1); PDHMatch(pdh) { // only the first "-" is replaced, turning "hash-size" into "hash+size"
+ return pdh
+ }
+ return "" // neither a UUID nor a PDH
+}