"errors"
"fmt"
"git.curoverse.com/arvados.git/sdk/go/blockdigest"
+ "path"
"regexp"
"sort"
"strconv"
}
// Array of segments referencing file content
-type SegmentedFile []FileSegment
+type segmentedFile []FileSegment
// Map of files to list of file segments referencing file content
-type SegmentedStream map[string]SegmentedFile
+type segmentedStream map[string]segmentedFile
// Map of streams
-type SegmentedManifest map[string]SegmentedStream
+type segmentedManifest map[string]segmentedStream
var escapeSeq = regexp.MustCompile(`\\([0-9]{3}|\\)`)
}
+// EscapeName returns s with every byte whose value is 32 (space) or lower
+// replaced by a backslash followed by that byte's three-digit octal value
+// (for example, a space becomes \040). All other bytes are copied unchanged.
+//
+// NOTE(review): a literal backslash in s is copied through as-is, while
+// escapeSeq also matches a doubled backslash; confirm that names containing
+// backslashes round-trip through UnescapeName.
func EscapeName(s string) string {
- return strings.Replace(s, " ", `\040`, -1)
+ raw := []byte(s)
+ escaped := make([]byte, 0, len(s))
+ for _, c := range raw {
+ if c <= 32 {
+ oct := fmt.Sprintf("\\%03o", c)
+ escaped = append(escaped, []byte(oct)...)
+ } else {
+ escaped = append(escaped, c)
+ }
+ }
+ return string(escaped)
}
func UnescapeName(s string) string {
return ch
}
-func FirstBlock(offsets []uint64, range_start uint64) int {
+func firstBlock(offsets []uint64, range_start uint64) int {
// range_start/block_start is the inclusive lower bound
// range_end/block_end is the exclusive upper bound
func (s *ManifestStream) sendFileSegmentIterByName(filepath string, ch chan<- *FileSegment) {
// This is what streamName+"/"+fileName will look like:
- target := filepath
- if !strings.HasPrefix(target, "./") {
- target = "./" + target
- }
+ target := fixStreamName(filepath)
for _, fTok := range s.FileStreamSegments {
wantPos := fTok.SegPos
wantLen := fTok.SegLen
}
// Binary search to determine first block in the stream
- i := FirstBlock(s.BlockOffsets, wantPos)
+ i := firstBlock(s.BlockOffsets, wantPos)
if i == -1 {
- // error
- break
+ // Shouldn't happen, file segments are checked in parseManifestStream
+ panic(fmt.Sprintf("File segment %v extends past end of stream", fTok))
}
for i < len(s.Blocks) {
blockPos := s.BlockOffsets[i]
blockEnd := s.BlockOffsets[i+1]
if blockEnd <= wantPos {
- // current block comes before current file span
- // (shouldn't happen, FirstBlock() should start us
- // on the right block)
- break
+ // Shouldn't happen, firstBlock() should start
+ // us on the right block, so if this triggers
+ // that means there is a bug.
+ panic(fmt.Sprintf("Block end %v comes before start of file segment %v", blockEnd, wantPos))
}
if blockPos >= wantPos+wantLen {
// current block comes after current file span
m.Err = fmt.Errorf("Invalid file token: %s", ft)
break
}
+ if pft.SegPos+pft.SegLen > streamoffset {
+ m.Err = fmt.Errorf("File segment %s extends past end of stream %d", ft, streamoffset)
+ break
+ }
m.FileStreamSegments = append(m.FileStreamSegments, pft)
}
return
}
-func SplitPath(path string) (streamname, filename string) {
- pathIdx := strings.LastIndex(path, "/")
+// fixStreamName rewrites sn into a "."-rooted path. It runs path.Clean on sn,
+// then prepends "." when the cleaned result starts with "/", or "./" when the
+// result is anything other than "."; a cleaned result of "." is returned as is.
+// For example "", "." and "./" all become ".", while "foo/", "/foo" and
+// "./foo" all become "./foo".
+//
+// NOTE(review): path.Clean("/") is "/", so an input of "/" yields "./" rather
+// than "."; confirm whether callers can pass a bare root.
+func fixStreamName(sn string) string {
+ sn = path.Clean(sn)
+ if strings.HasPrefix(sn, "/") {
+ sn = "." + sn
+ } else if sn != "." {
+ sn = "./" + sn
+ }
+ return sn
+}
+
+// splitPath splits srcpath at its last "/". The part before the slash is
+// returned as streamname and the part after it as filename (empty when
+// srcpath ends in "/"). If srcpath contains no "/", the whole string is
+// returned as streamname and filename is "".
+func splitPath(srcpath string) (streamname, filename string) {
+ pathIdx := strings.LastIndex(srcpath, "/")
if pathIdx >= 0 {
- streamname = path[0:pathIdx]
- filename = path[pathIdx+1:]
+ streamname = srcpath[0:pathIdx]
+ filename = srcpath[pathIdx+1:]
} else {
- streamname = path
+ streamname = srcpath
filename = ""
}
return
}
-func (m *Manifest) SegmentManifest() *SegmentedManifest {
- files := make(SegmentedManifest)
+func (m *Manifest) segment() *segmentedManifest {
+ files := make(segmentedManifest)
for stream := range m.StreamIter() {
+ if stream.Err != nil {
+ // Skip streams with errors
+ continue
+ }
currentStreamfiles := make(map[string]bool)
for _, f := range stream.FileStreamSegments {
sn := stream.StreamName
- if sn != "." && !strings.HasPrefix(sn, "./") {
- sn = "./" + sn
- }
if strings.HasSuffix(sn, "/") {
sn = sn[0 : len(sn)-1]
}
path := sn + "/" + f.Name
- streamname, filename := SplitPath(path)
+ streamname, filename := splitPath(path)
if files[streamname] == nil {
- files[streamname] = make(SegmentedStream)
+ files[streamname] = make(segmentedStream)
}
if !currentStreamfiles[path] {
segs := files[streamname][filename]
for seg := range stream.FileSegmentIterByName(path) {
- segs = append(segs, *seg)
+ if seg.Len > 0 {
+ segs = append(segs, *seg)
+ }
}
files[streamname][filename] = segs
currentStreamfiles[path] = true
return &files
}
-func (stream *SegmentedStream) NormalizeStream(name string) string {
+func (stream segmentedStream) normalizedText(name string) string {
var sortedfiles []string
- for k, _ := range *stream {
+ for k, _ := range stream {
sortedfiles = append(sortedfiles, k)
}
sort.Strings(sortedfiles)
// Go through each file and add each referenced block exactly once.
for _, streamfile := range sortedfiles {
- for _, segment := range (*stream)[streamfile] {
+ for _, segment := range stream[streamfile] {
if _, ok := blocks[segment.Locator]; !ok {
stream_tokens = append(stream_tokens, segment.Locator)
blocks[segment.Locator] = streamoffset
span_start := int64(-1)
span_end := int64(0)
fout := EscapeName(streamfile)
- for _, segment := range (*stream)[streamfile] {
+ for _, segment := range stream[streamfile] {
// Collapse adjacent segments
streamoffset = blocks[segment.Locator] + int64(segment.Offset)
if span_start == -1 {
stream_tokens = append(stream_tokens, fmt.Sprintf("%d:%d:%s", span_start, span_end-span_start, fout))
}
- if len((*stream)[streamfile]) == 0 {
+ if len(stream[streamfile]) == 0 {
stream_tokens = append(stream_tokens, fmt.Sprintf("0:0:%s", fout))
}
}
return strings.Join(stream_tokens, " ") + "\n"
}
-func (m *Manifest) NormalizeManifest() string {
- segments := m.SegmentManifest()
+// manifestTextForPath returns normalized manifest text for the part of m
+// selected by srcpath, renamed according to relocate.
+//
+// If srcpath (after fixStreamName) names a file that exists in one of m's
+// streams, only that file is emitted; this check takes precedence over the
+// stream-prefix match below. Otherwise every stream whose name equals srcpath
+// or starts with srcpath+"/" is emitted, in sorted order, with the srcpath
+// prefix replaced by relocate. The result is "" when nothing matches.
+func (m segmentedManifest) manifestTextForPath(srcpath, relocate string) string {
+ srcpath = fixStreamName(srcpath)
+
+ // fixStreamName cleans the path, which drops a trailing "/". Remember it
+ // and put it back so splitPath(relocate) below yields an empty filename,
+ // meaning "keep the source file name".
+ var suffix string
+ if strings.HasSuffix(relocate, "/") {
+ suffix = "/"
+ }
+ relocate = fixStreamName(relocate) + suffix
+
+ streamname, filename := splitPath(srcpath)
+
+ if stream, ok := m[streamname]; ok {
+ // check if it refers to a single file in a stream
+ filesegs, okfile := stream[filename]
+ if okfile {
+ newstream := make(segmentedStream)
+ relocate_stream, relocate_filename := splitPath(relocate)
+ if relocate_filename == "" {
+ relocate_filename = filename
+ }
+ newstream[relocate_filename] = filesegs
+ return newstream.normalizedText(relocate_stream)
+ }
+ }
+
+ // Going to extract multiple streams
+ prefix := srcpath + "/"
+
+ // The trailing slash only matters for the single-file case above.
+ if strings.HasSuffix(relocate, "/") {
+ relocate = relocate[0 : len(relocate)-1]
+ }
var sortedstreams []string
- for k, _ := range *segments {
+ for k, _ := range m {
sortedstreams = append(sortedstreams, k)
}
sort.Strings(sortedstreams)
- var manifest string
+ manifest := ""
for _, k := range sortedstreams {
- stream := (*segments)[k]
- manifest += stream.NormalizeStream(k)
+ if strings.HasPrefix(k, prefix) || k == srcpath {
+ // k[len(srcpath):] is "" when k == srcpath, otherwise the
+ // "/subdir..." remainder that is appended to relocate.
+ manifest += m[k].normalizedText(relocate + k[len(srcpath):])
+ }
}
return manifest
}
-func (m *SegmentedManifest) ManifestForPath(path, relocate string) string {
- if path == "" {
- path = "."
- }
- if relocate == "" {
- relocate = "."
- }
-
- streamname, filename := SplitPath(path)
- var relocate_stream, relocate_filename string
- relocate_stream, relocate_filename = SplitPath(relocate)
-
- if stream, ok := (*m)[path]; ok {
- // refers to a single stream
- return stream.NormalizeStream(relocate)
- } else if stream, ok := (*m)[streamname]; ok {
- // refers to a single file in a stream
- newstream := make(SegmentedStream)
- if relocate_filename == "" {
- relocate_filename = filename
- }
- newstream[relocate_filename] = stream[filename]
- return newstream.NormalizeStream(relocate_stream)
- } else {
- // refers to multiple streams
- manifest := ""
- prefix := path
- if !strings.HasSuffix(prefix, "/") {
- prefix += "/"
- }
- if !strings.HasSuffix(relocate, "/") {
- relocate += "/"
- }
-
- var sortedstreams []string
- for k, _ := range *m {
- sortedstreams = append(sortedstreams, k)
- }
- sort.Strings(sortedstreams)
-
- for _, k := range sortedstreams {
- if strings.HasPrefix(k, prefix) {
- v := (*m)[k]
- manifest += v.NormalizeStream(relocate + k[len(prefix):])
- }
- }
- return manifest
- }
+// ManifestTextForPath extracts some or all of the manifest and returns
+// normalized manifest text. This is a swiss army knife function that can be
+// used a couple of different ways:
+//
+// If 'srcpath' points to a single file, it will return manifest text for just that file.
+// The value of "relocate" can be used to rename the file or set the file stream.
+//
+// ManifestTextForPath("./foo", ".") (extract file "foo" and put it in stream ".")
+// ManifestTextForPath("./foo", "./bar") (extract file "foo", rename it to "bar" in stream ".")
+// ManifestTextForPath("./foo", "./bar/") (extract file "foo", rename it to "./bar/foo")
+// ManifestTextForPath("./foo", "./bar/baz") (extract file "foo", rename it to "./bar/baz")
+//
+// Otherwise it will return the manifest text for all streams with the prefix in "srcpath" and place
+// them under the path in "relocate".
+//
+// ManifestTextForPath(".", ".") (return entire normalized manifest text)
+// ManifestTextForPath("./stream", ".") (extract "./stream" to "." and "./stream/subdir" to "./subdir")
+// ManifestTextForPath("./stream", "./bar") (extract "./stream" to "./bar" and "./stream/subdir" to "./bar/subdir")
+func (m *Manifest) ManifestTextForPath(srcpath, relocate string) string {
+ return m.segment().manifestTextForPath(srcpath, relocate)
}
-func (m *Manifest) ManifestForPath(path, relocate string) string {
- return m.SegmentManifest().ManifestForPath(path, relocate)
+// NormalizedText returns the manifest text in normalized form.
+// It is shorthand for ManifestTextForPath(".", ".").
+func (m *Manifest) NormalizedText() string {
+ return m.ManifestTextForPath(".", ".")
}
func (m *Manifest) StreamIter() <-chan ManifestStream {
func (m *Manifest) FileSegmentIterByName(filepath string) <-chan *FileSegment {
ch := make(chan *FileSegment, 64)
- if !strings.HasPrefix(filepath, "./") {
- filepath = "./" + filepath
- }
+ filepath = fixStreamName(filepath)
go func() {
for stream := range m.StreamIter() {
if !strings.HasPrefix(filepath, stream.StreamName+"/") {
firstStream,
ManifestStream{StreamName: ".",
Blocks: []string{"b746e3d2104645f2f64cd3cc69dd895d+15693477+E2866e643690156651c03d876e638e674dcd79475@5441920c"},
- FileStreamSegments: []FileStreamSegment{{0, 15893477, "chr10_band0_s0_e3000000.fj"}}})
+ FileStreamSegments: []FileStreamSegment{{0, 15693477, "chr10_band0_s0_e3000000.fj"}}})
received, ok := <-streamIter
if ok {
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt
`}
- expectEqual(t, m1.NormalizeManifest(),
+ expectEqual(t, m1.NormalizedText(),
`. 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:127:md5sum.txt
`)
m2 := Manifest{Text: `. 204e43b8a1185621ca55a94839582e6f+67108864 b9677abbac956bd3e86b1deb28dfac03+67108864 fc15aff2a762b13f521baf042140acec+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:227212247:var-GS000016015-ASM.tsv.bz2
`}
- expectEqual(t, m2.NormalizeManifest(), m2.Text)
+ expectEqual(t, m2.NormalizedText(), m2.Text)
m3 := Manifest{Text: `. 5348b82a029fd9e971a811ce1f71360b+43 3:40:md5sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt
`}
- expectEqual(t, m3.NormalizeManifest(), `. 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 3:124:md5sum.txt
+ expectEqual(t, m3.NormalizedText(), `. 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 3:124:md5sum.txt
`)
m4 := Manifest{Text: `. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
./foo 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar
`}
- expectEqual(t, m4.NormalizeManifest(),
+ expectEqual(t, m4.NormalizedText(),
`./foo 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
`)
- expectEqual(t, m4.ManifestForPath("./foo", "."), ". 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar\n")
- expectEqual(t, m4.ManifestForPath("./foo", "./baz"), "./baz 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar\n")
- expectEqual(t, m4.ManifestForPath("./foo/bar", "."), ". 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar\n")
- expectEqual(t, m4.ManifestForPath("./foo/bar", "./baz"), ". 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:baz 67108864:3:baz\n")
- expectEqual(t, m4.ManifestForPath("./foo/bar", "./quux/"), "./quux 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar\n")
- expectEqual(t, m4.ManifestForPath(".", "."), `./foo 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar
+ expectEqual(t, m4.ManifestTextForPath("./foo", "."), ". 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar\n")
+ expectEqual(t, m4.ManifestTextForPath("./foo", "./baz"), "./baz 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar\n")
+ expectEqual(t, m4.ManifestTextForPath("./foo/bar", "."), ". 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar\n")
+ expectEqual(t, m4.ManifestTextForPath("./foo/bar", "./baz"), ". 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:baz 67108864:3:baz\n")
+ expectEqual(t, m4.ManifestTextForPath("./foo/bar", "./quux/"), "./quux 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar\n")
+ expectEqual(t, m4.ManifestTextForPath("./foo/bar", "./quux/baz"), "./quux 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:baz 67108864:3:baz\n")
+ expectEqual(t, m4.ManifestTextForPath(".", "."), `./foo 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
`)
- expectEqual(t, m4.ManifestForPath(".", "./zip"), `./zip/foo 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar
+ expectEqual(t, m4.ManifestTextForPath(".", "./zip"), `./zip/foo 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar
./zip/zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
`)
+ expectEqual(t, m4.ManifestTextForPath("foo/.//bar/../../zzz/", "/waz/"), `./waz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
+`)
+
m5 := Manifest{Text: `. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
./foo 204e43b8a1185621ca55a94839582e6f+67108864 3:3:bar
`}
- expectEqual(t, m5.NormalizeManifest(),
+ expectEqual(t, m5.NormalizedText(),
`./foo 204e43b8a1185621ca55a94839582e6f+67108864 0:6:bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
`)
m8 := Manifest{Text: `./a\040b\040c 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:hello\040world.txt
`}
- expectEqual(t, m8.NormalizeManifest(), m8.Text)
+ expectEqual(t, m8.NormalizedText(), m8.Text)
m9 := Manifest{Text: ". acbd18db4cc2f85cedef654fccc4a4d8+40 0:10:one 20:10:two 10:10:one 30:10:two\n"}
- expectEqual(t, m9.ManifestForPath("", ""), ". acbd18db4cc2f85cedef654fccc4a4d8+40 0:20:one 20:20:two\n")
+ expectEqual(t, m9.ManifestTextForPath("", ""), ". acbd18db4cc2f85cedef654fccc4a4d8+40 0:20:one 20:20:two\n")
m10 := Manifest{Text: ". acbd18db4cc2f85cedef654fccc4a4d8+40 0:10:one 20:10:two 10:10:one 30:10:two\n"}
- expectEqual(t, m10.ManifestForPath("./two", "./three"), ". acbd18db4cc2f85cedef654fccc4a4d8+40 20:20:three\n")
+ expectEqual(t, m10.ManifestTextForPath("./two", "./three"), ". acbd18db4cc2f85cedef654fccc4a4d8+40 20:20:three\n")
m11 := Manifest{Text: arvadostest.PathologicalManifest}
- expectEqual(t, m11.NormalizeManifest(), `. acbd18db4cc2f85cedef654fccc4a4d8+3 37b51d194a7513e45b56f6524f2d51f2+3 73feffa4b7f6bb68e44cf984c85f6e88+3+Z+K@xyzzy d41d8cd98f00b204e9800998ecf8427e+0 0:1:f 1:4:ooba 5:1:r 5:4:rbaz 9:0:zero@0 9:0:zero@1 9:0:zero@4 9:0:zero@9
-./foo acbd18db4cc2f85cedef654fccc4a4d8+3 d41d8cd98f00b204e9800998ecf8427e+0 0:3:foo 0:3:foo 3:0:zero
+ expectEqual(t, m11.NormalizedText(), `. acbd18db4cc2f85cedef654fccc4a4d8+3 37b51d194a7513e45b56f6524f2d51f2+3 73feffa4b7f6bb68e44cf984c85f6e88+3+Z+K@xyzzy 0:1:f 1:4:ooba 5:1:r 5:4:rbaz 0:0:zero@0 0:0:zero@1 0:0:zero@4 0:0:zero@9
+./foo acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo 0:3:foo 0:0:zero
./foo\040bar acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:baz 0:3:baz\040waz
./overlapReverse acbd18db4cc2f85cedef654fccc4a4d8+3 2:1:o 2:1:ofoo 0:3:ofoo 1:2:oo
-./segmented acbd18db4cc2f85cedef654fccc4a4d8+3 37b51d194a7513e45b56f6524f2d51f2+3 d41d8cd98f00b204e9800998ecf8427e+0 0:1:frob 5:1:frob 1:1:frob 6:0:frob 3:1:frob 1:2:oof 0:1:oof
+./segmented acbd18db4cc2f85cedef654fccc4a4d8+3 37b51d194a7513e45b56f6524f2d51f2+3 0:1:frob 5:1:frob 1:1:frob 3:1:frob 1:2:oof 0:1:oof
+`)
+
+ m12 := Manifest{Text: `./foo 204e43b8a1185621ca55a94839582e6f+67108864 0:3:bar
+./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
+./foo/baz 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar
+`}
+
+ expectEqual(t, m12.ManifestTextForPath("./foo", "."), `. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:bar
+./baz 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar
+`)
+ expectEqual(t, m12.ManifestTextForPath("./foo", "./blub"), `./blub 204e43b8a1185621ca55a94839582e6f+67108864 0:3:bar
+./blub/baz 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar
+`)
+ expectEqual(t, m12.ManifestTextForPath("./foo", "./blub/"), `./blub 204e43b8a1185621ca55a94839582e6f+67108864 0:3:bar
+./blub/baz 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar
+`)
+ expectEqual(t, m12.ManifestTextForPath("./foo/", "./blub/"), `./blub 204e43b8a1185621ca55a94839582e6f+67108864 0:3:bar
+./blub/baz 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar
`)
}