package manifest
import (
+ "errors"
"fmt"
"git.curoverse.com/arvados.git/sdk/go/blockdigest"
"log"
"strings"
)
+var ErrInvalidToken = errors.New("Invalid token") // ErrInvalidToken is the error parseFileToken sets when a file token does not split into three ":"-separated parts.
+
var LocatorPattern = regexp.MustCompile( // LocatorPattern matches a whole string made of 32 hex digits, "+", decimal digits, then any number of "+Hint" suffixes.
"^[0-9a-fA-F]{32}\\+[0-9]+(\\+[A-Z][A-Za-z0-9@_-]+)*$") // each hint is an uppercase letter followed by one or more of A-Z, a-z, 0-9, "@", "_" or "-"
Hints []string
}
+type DataSegment struct { // DataSegment embeds a BlockLocator and adds a locator string and a stream offset.
+ BlockLocator // embedded, so its fields are promoted onto DataSegment
+ Locator string // NOTE(review): presumably the unparsed locator text of the same block — confirm against callers
+ StreamOffset uint64 // NOTE(review): presumably the position of the block within its stream — confirm against callers
+}
+
+// FileSegment is a portion of a file that is contained within a
+// single block. A file whose data spans several blocks is described by one FileSegment per block.
+type FileSegment struct {
+ Locator string // locator of the block that holds this segment, as listed in the stream
+ // Offset, measured from the start of this block, at which this data segment begins.
+ Offset int
+ Len int // length of this data segment, in the same units as Offset
+}
+
// ManifestStream represents a single line from a manifest.
type ManifestStream struct {
StreamName string // first token of the line, as stored by parseManifestStream
Blocks []string // the leading tokens after the stream name (see parseManifestStream for where the split falls)
- Files []string
+ FileTokens []string // the remaining tokens; parseFileToken reads each one as "position:length:name"
+}
+
+var escapeSeq = regexp.MustCompile(`\\([0-9]{3}|\\)`) // escapeSeq matches a backslash followed by either three decimal digits or a second backslash.
+
+func unescapeSeq(seq string) string { // unescapeSeq converts one escape sequence, as matched by escapeSeq, into the text it stands for.
+ if seq == `\\` { // an escaped backslash becomes a single backslash
+ return `\`
+ }
+ i, err := strconv.ParseUint(seq[1:], 8, 8) // drop the leading backslash and read the rest as a base-8 number limited to 8 bits
+ if err != nil {
+ // Invalid escape sequence (non-octal digits, or a value that does not fit in 8 bits): can't unescape, so hand it back unchanged.
+ return seq
+ }
+ return string([]byte{byte(i)}) // a one-byte string holding the decoded value
+}
+
+func UnescapeName(s string) string { // UnescapeName returns s with every match of escapeSeq replaced by the result of unescapeSeq.
+ return escapeSeq.ReplaceAllStringFunc(s, unescapeSeq) // text that is not part of a match is copied through untouched
}
func ParseBlockLocator(s string) (b BlockLocator, err error) {
return
}
+func parseFileToken(tok string) (segPos, segLen uint64, name string, err error) { // parseFileToken splits a "position:length:name" token into its numeric position, numeric length and unescaped name.
+ parts := strings.SplitN(tok, ":", 3) // at most three parts, so any further ":" stays inside the name
+ if len(parts) != 3 {
+ err = ErrInvalidToken // fewer than two ":" separators
+ return
+ }
+ segPos, err = strconv.ParseUint(parts[0], 10, 64) // position: unsigned decimal, up to 64 bits
+ if err != nil {
+ return // the strconv error is returned as is
+ }
+ segLen, err = strconv.ParseUint(parts[1], 10, 64) // length: unsigned decimal, up to 64 bits
+ if err != nil {
+ return // segPos is already set at this point, but err is non-nil
+ }
+ name = UnescapeName(parts[2]) // decode backslash escape sequences in the name
+ return
+}
+
+func (s *ManifestStream) FileSegmentIterByName(filepath string) <-chan *FileSegment { // FileSegmentIterByName returns a channel that delivers the file segments sendFileSegmentIterByName finds for filepath in this stream.
+ ch := make(chan *FileSegment) // unbuffered: each send waits until the caller receives it
+ go func() { // producer goroutine; it ends once every segment has been received
+ s.sendFileSegmentIterByName(filepath, ch) // may send a nil entry if a block locator cannot be parsed
+ close(ch) // closing tells the receiver that there are no more segments
+ }()
+ return ch
+}
+
+func (s *ManifestStream) sendFileSegmentIterByName(filepath string, ch chan<- *FileSegment) { // sendFileSegmentIterByName sends to ch one FileSegment for each block that holds data of the named file; it never closes ch.
+ blockLens := make([]int, 0, len(s.Blocks)) // block sizes parsed so far; filled in lazily and reused for later file tokens
+ // This is what streamName+"/"+fileName will look like (so filepath is given without the leading "./"):
+ target := "./" + filepath
+ for _, fTok := range s.FileTokens {
+ wantPos, wantLen, name, err := parseFileToken(fTok) // position and length of this token's data within the stream, plus its file name
+ if err != nil {
+ // Skip (!) invalid file tokens. No error is reported to the receiver for them.
+ continue
+ }
+ if s.StreamName+"/"+name != target { // token belongs to a different file
+ continue
+ }
+ if wantLen == 0 {
+ ch <- &FileSegment{Locator: "d41d8cd98f00b204e9800998ecf8427e+0", Offset: 0, Len: 0} // zero-length token: send an empty segment with a fixed "+0" locator instead of searching the blocks
+ continue
+ }
+ // Linear search for blocks containing data for this
+ // file. The wanted range is [wantPos, wantPos+wantLen).
+ var blockPos uint64 = 0 // position of block in stream
+ for i, loc := range s.Blocks {
+ if blockPos >= wantPos+wantLen { // this block starts at or after the end of the wanted range: nothing more for this token
+ break
+ }
+ if len(blockLens) <= i { // size of block i has not been parsed yet
+ blockLens = blockLens[:i+1] // extend by one element within the capacity reserved above
+ b, err := ParseBlockLocator(loc)
+ if err != nil {
+ // Unparseable locator -> unusable
+ // stream. A nil is sent as the signal and nothing further is sent for this stream.
+ ch <- nil
+ return
+ }
+ blockLens[i] = b.Size
+ }
+ blockLen := uint64(blockLens[i])
+ if blockPos+blockLen <= wantPos { // block ends before the wanted range begins: skip over it
+ blockPos += blockLen
+ continue
+ }
+ fseg := FileSegment{ // start with the whole block, then trim below
+ Locator: loc,
+ Offset: 0,
+ Len: blockLens[i],
+ }
+ if blockPos < wantPos { // wanted range begins partway into this block: trim the front
+ fseg.Offset = int(wantPos - blockPos)
+ fseg.Len -= fseg.Offset
+ }
+ if blockPos+blockLen > wantPos+wantLen { // wanted range ends before this block does: trim the back
+ fseg.Len = int(wantPos+wantLen-blockPos) - fseg.Offset
+ }
+ ch <- &fseg
+ blockPos += blockLen // advance to the start of the next block
+ }
+ }
+}
+
func parseManifestStream(s string) (m ManifestStream) { // parseManifestStream splits one manifest line on single spaces into a stream name, block tokens and file tokens.
tokens := strings.Split(s, " ")
- m.StreamName = tokens[0]
+ m.StreamName = UnescapeName(tokens[0]) // the first token is the stream name; decode its backslash escape sequences
tokens = tokens[1:]
var i int
for i = range tokens { // NOTE(review): if no token fails the locator test, i stops at the last index, so the final token goes to the file tokens rather than Blocks — confirm this is intended
- if !LocatorPattern.MatchString(tokens[i]) {
+ if !blockdigest.IsBlockLocator(tokens[i]) { // first token that is not a block locator marks the start of the file tokens
break
}
}
m.Blocks = tokens[:i] // everything before index i
- m.Files = tokens[i:]
+ m.FileTokens = tokens[i:] // index i and everything after it, left in raw (still escaped) form
return
}
return ch
}
+func (m *Manifest) FileSegmentIterByName(filepath string) <-chan *FileSegment { // FileSegmentIterByName returns a channel that delivers the segments of filepath found in any stream of the manifest.
+ ch := make(chan *FileSegment) // unbuffered: each send waits until the caller receives it
+ go func() { // producer goroutine; it ends once all streams are processed and every segment has been received
+ for stream := range m.StreamIter() {
+ if !strings.HasPrefix("./"+filepath, stream.StreamName+"/") { // the stream's name is not a leading directory of the wanted path, so it cannot hold the file
+ continue
+ }
+ stream.sendFileSegmentIterByName(filepath, ch) // does the exact name match; may send a nil entry for a stream with an unparseable locator
+ }
+ close(ch) // closing tells the receiver that there are no more segments
+ }()
+ return ch
+}
+
// Blocks may appear multiple times within the same manifest if they
// are used by multiple files. In that case this Iterator will output
// the same block multiple times.
-func (m *Manifest) BlockIterWithDuplicates() <-chan BlockLocator {
- blockChannel := make(chan BlockLocator)
+func (m *Manifest) BlockIterWithDuplicates() <-chan blockdigest.BlockLocator {
+ blockChannel := make(chan blockdigest.BlockLocator)
go func(streamChannel <-chan ManifestStream) {
for m := range streamChannel {
for _, block := range m.Blocks {
- if b, err := ParseBlockLocator(block); err == nil {
+ if b, err := blockdigest.ParseBlockLocator(block); err == nil {
blockChannel <- b
} else {
log.Printf("ERROR: Failed to parse block: %v", err)