1 /* Deals with parsing Manifest Text. */
3 // Inspired by the Manifest class in arvados/sdk/ruby/lib/arvados/keep.rb
10 "git.curoverse.com/arvados.git/sdk/go/blockdigest"
// ErrInvalidToken is the sentinel error that parseFileToken assigns to its
// err result for a file token it cannot accept.
17 var ErrInvalidToken = errors.New("Invalid token")
// LocatorPattern matches a complete block locator string: 32 hexadecimal
// digits, a "+", a decimal size, and then zero or more "+hint" suffixes.
// Each hint starts with an uppercase ASCII letter followed by one or more
// characters from the set [A-Za-z0-9@_-].
19 var LocatorPattern = regexp.MustCompile(
20 "^[0-9a-fA-F]{32}\\+[0-9]+(\\+[A-Z][A-Za-z0-9@_-]+)*$")
// Manifest is the receiver type of StreamIter and FileSegmentIterByName,
// which iterate over the streams and the file segments of a manifest.
22 type Manifest struct {
// BlockLocator is the parsed form of a block locator string, as filled in
// by ParseBlockLocator.
26 type BlockLocator struct {
// Digest is the value ParseBlockLocator obtains by passing the part of the
// locator before the first "+" to blockdigest.FromString.
27 Digest blockdigest.BlockDigest
// DataSegment: NOTE(review): presumably describes a segment of stream data
// held in a single block — confirm against the struct's fields and callers.
32 type DataSegment struct {
38 // FileSegment is a portion of a file that is contained within a
// single block. Locator names that block; Offset and Len select the range
// of the block that belongs to the file.
40 type FileSegment struct {
42 // Offset (within this block) of this data segment
47 // Represents a single line from a manifest.
//
// ManifestStream is what parseManifestStream builds from one line: the
// unescaped stream name (StreamName), the leading run of block locator
// tokens (Blocks), and the remaining tokens (FileTokens).
48 type ManifestStream struct {
// escapeSeq matches the escape sequences that can occur in a manifest name:
// a backslash followed either by exactly three decimal digits (read as an
// octal byte value) or by a second backslash.
var escapeSeq = regexp.MustCompile(`\\([0-9]{3}|\\)`)

// unescapeSeq converts one escape sequence, as matched by escapeSeq, into
// the text it stands for.
//
// A doubled backslash becomes a single backslash. A backslash followed by
// three octal digits becomes the single byte with that value. Anything that
// cannot be decoded (non-octal digits such as "\089", or a value above
// "\377" that does not fit in one byte) is returned unchanged.
func unescapeSeq(seq string) string {
	// Too short to be an escape sequence; also keeps seq[1:] in range.
	if len(seq) < 2 {
		return seq
	}
	// The regexp's second alternative: the remainder is a backslash, not
	// octal digits, so it must be handled before ParseUint.
	if seq == `\\` {
		return `\`
	}
	// Base 8, bitSize 8: rejects both non-octal digits and values > 0377.
	i, err := strconv.ParseUint(seq[1:], 8, 8)
	if err != nil {
		// Invalid escape sequence: can't unescape.
		return seq
	}
	// Build the string from a byte slice so that values >= 0x80 yield that
	// single raw byte rather than a UTF-8 encoded rune.
	return string([]byte{byte(i)})
}

// UnescapeName returns s with every escape sequence matched by escapeSeq
// replaced by its unescaped form (see unescapeSeq). Text that is not part
// of an escape sequence is copied through unchanged.
func UnescapeName(s string) string {
	return escapeSeq.ReplaceAllStringFunc(s, unescapeSeq)
}
// ParseBlockLocator parses the block locator string s into b.
//
// s is first checked against LocatorPattern; when it does not match, err is
// set to an error that quotes s and the pattern. Otherwise s is split on
// "+": the first token is converted with blockdigest.FromString and stored
// in b.Digest, and the second is parsed as a base-10 integer and stored in
// b.Size.
72 func ParseBlockLocator(s string) (b BlockLocator, err error) {
73 if !LocatorPattern.MatchString(s) {
74 err = fmt.Errorf("String \"%s\" does not match BlockLocator pattern "+
77 LocatorPattern.String())
// tokens[0] is the digest, tokens[1] the size, any further tokens are hints.
79 tokens := strings.Split(s, "+")
81 var blockDigest blockdigest.BlockDigest
82 // We expect both of the following to succeed since LocatorPattern
83 // restricts the strings appropriately.
84 blockDigest, err = blockdigest.FromString(tokens[0])
// bitSize 0 makes ParseInt reject values that do not fit in an int.
88 blockSize, err = strconv.ParseInt(tokens[1], 10, 0)
92 b.Digest = blockDigest
93 b.Size = int(blockSize)
99 func parseFileToken(tok string) (segPos, segLen uint64, name string, err error) {
100 parts := strings.SplitN(tok, ":", 3)
102 err = ErrInvalidToken
105 segPos, err = strconv.ParseUint(parts[0], 10, 64)
109 segLen, err = strconv.ParseUint(parts[1], 10, 64)
113 name = UnescapeName(parts[2])
// FileSegmentIterByName yields, on a channel, the FileSegments that make up
// the file at filepath within this stream. The work is done by
// sendFileSegmentIterByName, so filepath is matched the same way: "./" +
// filepath must equal StreamName + "/" + the file token's name.
117 func (s *ManifestStream) FileSegmentIterByName(filepath string) <-chan *FileSegment {
// Unbuffered: each segment is handed over only when a receiver is ready.
118 ch := make(chan *FileSegment)
120 s.sendFileSegmentIterByName(filepath, ch)
// sendFileSegmentIterByName sends to ch the FileSegments of the file named
// filepath in this stream.
//
// Every file token is parsed with parseFileToken into a wanted byte range
// (wantPos, wantLen) within the stream and a file name. For a token whose
// StreamName+"/"+name equals "./"+filepath, the stream's blocks are walked
// in order while tracking each block's starting position, and a FileSegment
// is trimmed to the part of the block that overlaps the wanted range.
126 func (s *ManifestStream) sendFileSegmentIterByName(filepath string, ch chan<- *FileSegment) {
// Block sizes parsed so far, indexed like s.Blocks. It starts empty and is
// extended on demand, so a locator is not re-parsed for later file tokens.
127 blockLens := make([]int, 0, len(s.Blocks))
128 // This is what streamName+"/"+fileName will look like:
129 target := "./" + filepath
130 for _, fTok := range s.FileTokens {
131 wantPos, wantLen, name, err := parseFileToken(fTok)
133 // Skip (!) invalid file tokens.
136 if s.StreamName+"/"+name != target {
// d41d8cd98f00b204e9800998ecf8427e is the MD5 digest of empty input, so
// this segment refers to the zero-length block. Presumably sent for a
// zero-length file token — confirm against the guarding condition.
140 ch <- &FileSegment{Locator: "d41d8cd98f00b204e9800998ecf8427e+0", Offset: 0, Len: 0}
143 // Linear search for blocks containing data for this
145 var blockPos uint64 = 0 // position of block in stream
146 for i, loc := range s.Blocks {
// This block starts at or beyond the end of the wanted range.
147 if blockPos >= wantPos+wantLen {
// Size of block i is not cached yet: extend the cache (capacity was
// reserved up front) and parse the locator to learn its size.
150 if len(blockLens) <= i {
151 blockLens = blockLens[:i+1]
152 b, err := ParseBlockLocator(loc)
154 // Unparseable locator -> unusable
159 blockLens[i] = b.Size
161 blockLen := uint64(blockLens[i])
// This block ends at or before the start of the wanted range.
162 if blockPos+blockLen <= wantPos {
// The wanted range begins partway into this block: skip the leading bytes
// and shorten the segment by the same amount.
171 if blockPos < wantPos {
172 fseg.Offset = int(wantPos - blockPos)
173 fseg.Len -= fseg.Offset
// The wanted range ends before this block does: keep only the bytes from
// Offset up to the end of the wanted range.
175 if blockPos+blockLen > wantPos+wantLen {
176 fseg.Len = int(wantPos+wantLen-blockPos) - fseg.Offset
// parseManifestStream parses one manifest line s into a ManifestStream.
//
// The line is split on single spaces. The first token, unescaped with
// UnescapeName, becomes StreamName. The tokens are then scanned with
// blockdigest.IsBlockLocator to find the index i at which the block
// locators end; tokens before i become Blocks and tokens from i onward
// become FileTokens.
184 func parseManifestStream(s string) (m ManifestStream) {
185 tokens := strings.Split(s, " ")
186 m.StreamName = UnescapeName(tokens[0])
// i is assigned (not declared) by the range clause, so it is still in
// scope after the loop and marks where the scan stopped.
189 for i = range tokens {
190 if !blockdigest.IsBlockLocator(tokens[i]) {
194 m.Blocks = tokens[:i]
195 m.FileTokens = tokens[i:]
// StreamIter yields, on a channel, one ManifestStream for each non-blank
// line of the manifest text. Lines are parsed by a goroutine one at a time,
// as the receiver consumes them.
199 func (m *Manifest) StreamIter() <-chan ManifestStream {
// Unbuffered, so the goroutine parses the next line only after the
// previous stream has been received.
200 ch := make(chan ManifestStream)
201 go func(input string) {
202 // This slice holds the current line and the remainder of the
203 // manifest. We parse one line at a time, to save effort if we
204 // only need the first few lines.
205 lines := []string{"", input}
// Split the remainder at the first newline: lines[0] is the next line and
// lines[1], when present, is everything after it.
207 lines = strings.SplitN(lines[1], "\n", 2)
208 if len(lines[0]) > 0 {
209 // Only parse non-blank lines
210 ch <- parseManifestStream(lines[0])
// FileSegmentIterByName yields, on a channel, the FileSegments of the file
// at filepath, gathered from the manifest's streams (as produced by
// StreamIter) via sendFileSegmentIterByName.
221 func (m *Manifest) FileSegmentIterByName(filepath string) <-chan *FileSegment {
222 ch := make(chan *FileSegment)
224 for stream := range m.StreamIter() {
// Pre-filter on the stream name: the test is whether "./"+filepath begins
// with the stream's name followed by "/". Presumably streams failing it
// are skipped — confirm against the branch body.
225 if !strings.HasPrefix("./"+filepath, stream.StreamName+"/") {
228 stream.sendFileSegmentIterByName(filepath, ch)
235 // Blocks may appear multiple times within the same manifest if they
236 // are used by multiple files. In that case this Iterator will output
237 // the same block multiple times.
238 func (m *Manifest) BlockIterWithDuplicates() <-chan blockdigest.BlockLocator {
239 blockChannel := make(chan blockdigest.BlockLocator)
240 go func(streamChannel <-chan ManifestStream) {
241 for m := range streamChannel {
242 for _, block := range m.Blocks {
243 if b, err := blockdigest.ParseBlockLocator(block); err == nil {
246 log.Printf("ERROR: Failed to parse block: %v", err)