sdk/go/manifest/manifest.go

   1 /* Deals with parsing Manifest Text. */
   2
   3 // Inspired by the Manifest class in arvados/sdk/ruby/lib/arvados/keep.rb
   4
   5 package manifest
   6
   7 import (
   8         "errors"
   9         "fmt"
  10         "git.curoverse.com/arvados.git/sdk/go/blockdigest"
  11         "path"
  12         "regexp"
  13         "sort"
  14         "strconv"
  15         "strings"
  16 )
  17
  18 var ErrInvalidToken = errors.New("Invalid token")
  19
  20 type Manifest struct {
  21         Text string
  22         Err  error
  23 }
  24
  25 type BlockLocator struct {
  26         Digest blockdigest.BlockDigest
  27         Size   int
  28         Hints  []string
  29 }
  30
  31 // FileSegment is a portion of a file that is contained within a
  32 // single block.
  33 type FileSegment struct {
  34         Locator string
  35         // Offset (within this block) of this data segment
  36         Offset int
  37         Len    int
  38 }
  39
  40 // FileStreamSegment is a portion of a file described as a segment of a stream.
  41 type FileStreamSegment struct {
  42         SegPos uint64
  43         SegLen uint64
  44         Name   string
  45 }
  46
  47 // Represents a single line from a manifest.
  48 type ManifestStream struct {
  49         StreamName         string
  50         Blocks             []string
  51         blockOffsets       []uint64
  52         FileStreamSegments []FileStreamSegment
  53         Err                error
  54 }
  55
  56 // Array of segments referencing file content
  57 type segmentedFile []FileSegment
  58
  59 // Map of files to list of file segments referencing file content
  60 type segmentedStream map[string]segmentedFile
  61
  62 // Map of streams
  63 type segmentedManifest map[string]segmentedStream
  64
  65 var escapeSeq = regexp.MustCompile(`\\([0-9]{3}|\\)`)
  66
  67 func unescapeSeq(seq string) string {
  68         if seq == `\\` {
  69                 return `\`
  70         }
  71         i, err := strconv.ParseUint(seq[1:], 8, 8)
  72         if err != nil {
  73                 // Invalid escape sequence: can't unescape.
  74                 return seq
  75         }
  76         return string([]byte{byte(i)})
  77 }
  78
  79 func EscapeName(s string) string {
  80         raw := []byte(s)
  81         escaped := make([]byte, 0, len(s))
  82         for _, c := range raw {
  83                 if c <= 32 {
  84                         oct := fmt.Sprintf("\\%03o", c)
  85                         escaped = append(escaped, []byte(oct)...)
  86                 } else {
  87                         escaped = append(escaped, c)
  88                 }
  89         }
  90         return string(escaped)
  91 }
  92
  93 func UnescapeName(s string) string {
  94         return escapeSeq.ReplaceAllStringFunc(s, unescapeSeq)
  95 }
  96
  97 func ParseBlockLocator(s string) (b BlockLocator, err error) {
  98         if !blockdigest.LocatorPattern.MatchString(s) {
  99                 err = fmt.Errorf("String \"%s\" does not match BlockLocator pattern "+
 100                         "\"%s\".",
 101                         s,
 102                         blockdigest.LocatorPattern.String())
 103         } else {
 104                 tokens := strings.Split(s, "+")
 105                 var blockSize int64
 106                 var blockDigest blockdigest.BlockDigest
 107                 // We expect both of the following to succeed since LocatorPattern
 108                 // restricts the strings appropriately.
 109                 blockDigest, err = blockdigest.FromString(tokens[0])
 110                 if err != nil {
 111                         return
 112                 }
 113                 blockSize, err = strconv.ParseInt(tokens[1], 10, 0)
 114                 if err != nil {
 115                         return
 116                 }
 117                 b.Digest = blockDigest
 118                 b.Size = int(blockSize)
 119                 b.Hints = tokens[2:]
 120         }
 121         return
 122 }
 123
 124 func parseFileStreamSegment(tok string) (ft FileStreamSegment, err error) {
 125         parts := strings.SplitN(tok, ":", 3)
 126         if len(parts) != 3 {
 127                 err = ErrInvalidToken
 128                 return
 129         }
 130         ft.SegPos, err = strconv.ParseUint(parts[0], 10, 64)
 131         if err != nil {
 132                 return
 133         }
 134         ft.SegLen, err = strconv.ParseUint(parts[1], 10, 64)
 135         if err != nil {
 136                 return
 137         }
 138         ft.Name = UnescapeName(parts[2])
 139         return
 140 }
 141
 142 func (s *ManifestStream) FileSegmentIterByName(filepath string) <-chan *FileSegment {
 143         ch := make(chan *FileSegment, 64)
 144         go func() {
 145                 s.sendFileSegmentIterByName(filepath, ch)
 146                 close(ch)
 147         }()
 148         return ch
 149 }
 150
 151 func firstBlock(offsets []uint64, range_start uint64) int {
 152         // range_start/block_start is the inclusive lower bound
 153         // range_end/block_end is the exclusive upper bound
 154
 155         hi := len(offsets) - 1
 156         var lo int
 157         i := ((hi + lo) / 2)
 158         block_start := offsets[i]
 159         block_end := offsets[i+1]
 160
 161         // perform a binary search for the first block
 162         // assumes that all of the blocks are contiguous, so range_start is guaranteed
 163         // to either fall into the range of a block or be outside the block range entirely
 164         for !(range_start >= block_start && range_start < block_end) {
 165                 fmt.Println(i, block_start, block_end)
 166                 if lo == i {
 167                         // must be out of range, fail
 168                         return -1
 169                 }
 170                 if range_start > block_start {
 171                         lo = i
 172                 } else {
 173                         hi = i
 174                 }
 175                 i = ((hi + lo) / 2)
 176                 block_start = offsets[i]
 177                 block_end = offsets[i+1]
 178         }
 179         return i
 180 }
 181
 182 func (s *ManifestStream) sendFileSegmentIterByName(filepath string, ch chan<- *FileSegment) {
 183         // This is what streamName+"/"+fileName will look like:
 184         target := fixStreamName(filepath)
 185         for _, fTok := range s.FileStreamSegments {
 186                 wantPos := fTok.SegPos
 187                 wantLen := fTok.SegLen
 188                 name := fTok.Name
 189
 190                 if s.StreamName+"/"+name != target {
 191                         continue
 192                 }
 193                 if wantLen == 0 {
 194                         ch <- &FileSegment{Locator: "d41d8cd98f00b204e9800998ecf8427e+0", Offset: 0, Len: 0}
 195                         continue
 196                 }
 197
 198                 // Binary search to determine first block in the stream
 199                 i := firstBlock(s.blockOffsets, wantPos)
 200                 if i == -1 {
 201                         // Shouldn't happen, file segments are checked in parseManifestStream
 202                         panic(fmt.Sprintf("File segment %v extends past end of stream", fTok))
 203                 }
 204                 for ; i < len(s.Blocks); i++ {
 205                         blockPos := s.blockOffsets[i]
 206                         blockEnd := s.blockOffsets[i+1]
 207                         if blockEnd <= wantPos {
 208                                 // Shouldn't happen, FirstBlock() should start
 209                                 // us on the right block, so if this triggers
 210                                 // that means there is a bug.
 211                                 panic(fmt.Sprintf("Block end %v comes before start of file segment %v", blockEnd, wantPos))
 212                         }
 213                         if blockPos >= wantPos+wantLen {
 214                                 // current block comes after current file span
 215                                 break
 216                         }
 217
 218                         fseg := FileSegment{
 219                                 Locator: s.Blocks[i],
 220                                 Offset:  0,
 221                                 Len:     int(blockEnd - blockPos),
 222                         }
 223                         if blockPos < wantPos {
 224                                 fseg.Offset = int(wantPos - blockPos)
 225                                 fseg.Len -= fseg.Offset
 226                         }
 227                         if blockEnd > wantPos+wantLen {
 228                                 fseg.Len = int(wantPos+wantLen-blockPos) - fseg.Offset
 229                         }
 230                         ch <- &fseg
 231                 }
 232         }
 233 }
 234
 235 func parseManifestStream(s string) (m ManifestStream) {
 236         tokens := strings.Split(s, " ")
 237
 238         m.StreamName = UnescapeName(tokens[0])
 239         if m.StreamName != "." && !strings.HasPrefix(m.StreamName, "./") {
 240                 m.Err = fmt.Errorf("Invalid stream name: %s", m.StreamName)
 241                 return
 242         }
 243
 244         tokens = tokens[1:]
 245         var i int
 246         for i = 0; i < len(tokens); i++ {
 247                 if !blockdigest.IsBlockLocator(tokens[i]) {
 248                         break
 249                 }
 250         }
 251         m.Blocks = tokens[:i]
 252         fileTokens := tokens[i:]
 253
 254         if len(m.Blocks) == 0 {
 255                 m.Err = fmt.Errorf("No block locators found")
 256                 return
 257         }
 258
 259         m.blockOffsets = make([]uint64, len(m.Blocks)+1)
 260         var streamoffset uint64
 261         for i, b := range m.Blocks {
 262                 bl, err := ParseBlockLocator(b)
 263                 if err != nil {
 264                         m.Err = err
 265                         return
 266                 }
 267                 m.blockOffsets[i] = streamoffset
 268                 streamoffset += uint64(bl.Size)
 269         }
 270         m.blockOffsets[len(m.Blocks)] = streamoffset
 271
 272         if len(fileTokens) == 0 {
 273                 m.Err = fmt.Errorf("No file tokens found")
 274                 return
 275         }
 276
 277         for _, ft := range fileTokens {
 278                 pft, err := parseFileStreamSegment(ft)
 279                 if err != nil {
 280                         m.Err = fmt.Errorf("Invalid file token: %s", ft)
 281                         break
 282                 }
 283                 if pft.SegPos+pft.SegLen > streamoffset {
 284                         m.Err = fmt.Errorf("File segment %s extends past end of stream %d", ft, streamoffset)
 285                         break
 286                 }
 287                 m.FileStreamSegments = append(m.FileStreamSegments, pft)
 288         }
 289
 290         return
 291 }
 292
 293 func fixStreamName(sn string) string {
 294         sn = path.Clean(sn)
 295         if strings.HasPrefix(sn, "/") {
 296                 sn = "." + sn
 297         } else if sn != "." {
 298                 sn = "./" + sn
 299         }
 300         return sn
 301 }
 302
 303 func splitPath(srcpath string) (streamname, filename string) {
 304         pathIdx := strings.LastIndex(srcpath, "/")
 305         if pathIdx >= 0 {
 306                 streamname = srcpath[0:pathIdx]
 307                 filename = srcpath[pathIdx+1:]
 308         } else {
 309                 streamname = srcpath
 310                 filename = ""
 311         }
 312         return
 313 }
 314
 315 func (m *Manifest) segment() (*segmentedManifest, error) {
 316         files := make(segmentedManifest)
 317
 318         for stream := range m.StreamIter() {
 319                 if stream.Err != nil {
 320                         // Stream has an error
 321                         return nil, stream.Err
 322                 }
 323                 currentStreamfiles := make(map[string]bool)
 324                 for _, f := range stream.FileStreamSegments {
 325                         sn := stream.StreamName
 326                         if strings.HasSuffix(sn, "/") {
 327                                 sn = sn[0 : len(sn)-1]
 328                         }
 329                         path := sn + "/" + f.Name
 330                         streamname, filename := splitPath(path)
 331                         if files[streamname] == nil {
 332                                 files[streamname] = make(segmentedStream)
 333                         }
 334                         if !currentStreamfiles[path] {
 335                                 segs := files[streamname][filename]
 336                                 for seg := range stream.FileSegmentIterByName(path) {
 337                                         if seg.Len > 0 {
 338                                                 segs = append(segs, *seg)
 339                                         }
 340                                 }
 341                                 files[streamname][filename] = segs
 342                                 currentStreamfiles[path] = true
 343                         }
 344                 }
 345         }
 346
 347         return &files, nil
 348 }
 349
 350 func (stream segmentedStream) normalizedText(name string) string {
 351         var sortedfiles []string
 352         for k, _ := range stream {
 353                 sortedfiles = append(sortedfiles, k)
 354         }
 355         sort.Strings(sortedfiles)
 356
 357         stream_tokens := []string{EscapeName(name)}
 358
 359         blocks := make(map[blockdigest.BlockDigest]int64)
 360         var streamoffset int64
 361
 362         // Go through each file and add each referenced block exactly once.
 363         for _, streamfile := range sortedfiles {
 364                 for _, segment := range stream[streamfile] {
 365                         b, _ := ParseBlockLocator(segment.Locator)
 366                         if _, ok := blocks[b.Digest]; !ok {
 367                                 stream_tokens = append(stream_tokens, segment.Locator)
 368                                 blocks[b.Digest] = streamoffset
 369                                 streamoffset += int64(b.Size)
 370                         }
 371                 }
 372         }
 373
 374         if len(stream_tokens) == 1 {
 375                 stream_tokens = append(stream_tokens, "d41d8cd98f00b204e9800998ecf8427e+0")
 376         }
 377
 378         for _, streamfile := range sortedfiles {
 379                 // Add in file segments
 380                 span_start := int64(-1)
 381                 span_end := int64(0)
 382                 fout := EscapeName(streamfile)
 383                 for _, segment := range stream[streamfile] {
 384                         // Collapse adjacent segments
 385                         b, _ := ParseBlockLocator(segment.Locator)
 386                         streamoffset = blocks[b.Digest] + int64(segment.Offset)
 387                         if span_start == -1 {
 388                                 span_start = streamoffset
 389                                 span_end = streamoffset + int64(segment.Len)
 390                         } else {
 391                                 if streamoffset == span_end {
 392                                         span_end += int64(segment.Len)
 393                                 } else {
 394                                         stream_tokens = append(stream_tokens, fmt.Sprintf("%d:%d:%s", span_start, span_end-span_start, fout))
 395                                         span_start = streamoffset
 396                                         span_end = streamoffset + int64(segment.Len)
 397                                 }
 398                         }
 399                 }
 400
 401                 if span_start != -1 {
 402                         stream_tokens = append(stream_tokens, fmt.Sprintf("%d:%d:%s", span_start, span_end-span_start, fout))
 403                 }
 404
 405                 if len(stream[streamfile]) == 0 {
 406                         stream_tokens = append(stream_tokens, fmt.Sprintf("0:0:%s", fout))
 407                 }
 408         }
 409
 410         return strings.Join(stream_tokens, " ") + "\n"
 411 }
 412
 413 func (m segmentedManifest) manifestTextForPath(srcpath, relocate string) string {
 414         srcpath = fixStreamName(srcpath)
 415
 416         var suffix string
 417         if strings.HasSuffix(relocate, "/") {
 418                 suffix = "/"
 419         }
 420         relocate = fixStreamName(relocate) + suffix
 421
 422         streamname, filename := splitPath(srcpath)
 423
 424         if stream, ok := m[streamname]; ok {
 425                 // check if it refers to a single file in a stream
 426                 filesegs, okfile := stream[filename]
 427                 if okfile {
 428                         newstream := make(segmentedStream)
 429                         relocate_stream, relocate_filename := splitPath(relocate)
 430                         if relocate_filename == "" {
 431                                 relocate_filename = filename
 432                         }
 433                         newstream[relocate_filename] = filesegs
 434                         return newstream.normalizedText(relocate_stream)
 435                 }
 436         }
 437
 438         // Going to extract multiple streams
 439         prefix := srcpath + "/"
 440
 441         if strings.HasSuffix(relocate, "/") {
 442                 relocate = relocate[0 : len(relocate)-1]
 443         }
 444
 445         var sortedstreams []string
 446         for k, _ := range m {
 447                 sortedstreams = append(sortedstreams, k)
 448         }
 449         sort.Strings(sortedstreams)
 450
 451         manifest := ""
 452         for _, k := range sortedstreams {
 453                 if strings.HasPrefix(k, prefix) || k == srcpath {
 454                         manifest += m[k].normalizedText(relocate + k[len(srcpath):])
 455                 }
 456         }
 457         return manifest
 458 }
 459
 460 // Extract extracts some or all of the manifest and returns the extracted
 461 // portion as a normalized manifest.  This is a swiss army knife function that
 462 // can be several ways:
 463 //
 464 // If 'srcpath' and 'relocate' are '.' it simply returns an equivalent manifest
 465 // in normalized form.
 466 //
 467 //   Extract(".", ".")  // return entire normalized manfest text
 468 //
 469 // If 'srcpath' points to a single file, it will return manifest text for just that file.
 470 // The value of "relocate" is can be used to rename the file or set the file stream.
 471 //
 472 //   Extract("./foo", ".")          // extract file "foo" and put it in stream "."
 473 //   Extract("./foo", "./bar")      // extract file "foo", rename it to "bar" in stream "."
 474 //   Extract("./foo", "./bar/")     // extract file "foo", rename it to "./bar/foo"
 475 //   Extract("./foo", "./bar/baz")  // extract file "foo", rename it to "./bar/baz")
 476 //
 477 // Otherwise it will return the manifest text for all streams with the prefix in "srcpath" and place
 478 // them under the path in "relocate".
 479 //
 480 //   Extract("./stream", ".")      // extract "./stream" to "." and "./stream/subdir" to "./subdir")
 481 //   Extract("./stream", "./bar")  // extract "./stream" to "./bar" and "./stream/subdir" to "./bar/subdir")
 482 func (m Manifest) Extract(srcpath, relocate string) (ret Manifest) {
 483         segmented, err := m.segment()
 484         if err != nil {
 485                 ret.Err = err
 486                 return
 487         }
 488         ret.Text = segmented.manifestTextForPath(srcpath, relocate)
 489         return
 490 }
 491
 492 func (m *Manifest) StreamIter() <-chan ManifestStream {
 493         ch := make(chan ManifestStream)
 494         go func(input string) {
 495                 // This slice holds the current line and the remainder of the
 496                 // manifest.  We parse one line at a time, to save effort if we
 497                 // only need the first few lines.
 498                 lines := []string{"", input}
 499                 for {
 500                         lines = strings.SplitN(lines[1], "\n", 2)
 501                         if len(lines[0]) > 0 {
 502                                 // Only parse non-blank lines
 503                                 ch <- parseManifestStream(lines[0])
 504                         }
 505                         if len(lines) == 1 {
 506                                 break
 507                         }
 508                 }
 509                 close(ch)
 510         }(m.Text)
 511         return ch
 512 }
 513
 514 func (m *Manifest) FileSegmentIterByName(filepath string) <-chan *FileSegment {
 515         ch := make(chan *FileSegment, 64)
 516         filepath = fixStreamName(filepath)
 517         go func() {
 518                 for stream := range m.StreamIter() {
 519                         if !strings.HasPrefix(filepath, stream.StreamName+"/") {
 520                                 continue
 521                         }
 522                         stream.sendFileSegmentIterByName(filepath, ch)
 523                 }
 524                 close(ch)
 525         }()
 526         return ch
 527 }
 528
 529 // Blocks may appear multiple times within the same manifest if they
 530 // are used by multiple files. In that case this Iterator will output
 531 // the same block multiple times.
 532 //
 533 // In order to detect parse errors, caller must check m.Err after the returned channel closes.
 534 func (m *Manifest) BlockIterWithDuplicates() <-chan blockdigest.BlockLocator {
 535         blockChannel := make(chan blockdigest.BlockLocator)
 536         go func(streamChannel <-chan ManifestStream) {
 537                 for ms := range streamChannel {
 538                         if ms.Err != nil {
 539                                 m.Err = ms.Err
 540                                 continue
 541                         }
 542                         for _, block := range ms.Blocks {
 543                                 if b, err := blockdigest.ParseBlockLocator(block); err == nil {
 544                                         blockChannel <- b
 545                                 } else {
 546                                         m.Err = err
 547                                 }
 548                         }
 549                 }
 550                 close(blockChannel)
 551         }(m.StreamIter())
 552         return blockChannel
 553 }