annotations, err := ioutil.ReadFile(npydir + "/hgvs.chr2.annotations.csv")
c.Assert(err, check.IsNil)
- c.Check(string(annotations), check.Equals, `0,chr2:g.470_472del
-1,chr2:g.471G>A
-2,chr2:g.472G>A
+ c.Check(string(annotations), check.Equals, `0,chr2:g.1_3delinsAAA
+1,chr2:g.125_127delinsAAA
+2,chr2:g.241_245delinsAAAAA
+3,chr2:g.291C>A
+4,chr2:g.470_472del
+5,chr2:g.471G>A
+6,chr2:g.472G>A
`)
}
}
}
+func (s *sliceSuite) TestSpanningTile(c *check.C) {
+ tmpdir := c.MkDir()
+ err := os.Mkdir(tmpdir+"/lib1", 0777)
+ c.Assert(err, check.IsNil)
+ err = os.Mkdir(tmpdir+"/lib2", 0777)
+ c.Assert(err, check.IsNil)
+ cwd, err := os.Getwd()
+ c.Assert(err, check.IsNil)
+ err = os.Symlink(cwd+"/testdata/pipeline1", tmpdir+"/pipeline1")
+ c.Assert(err, check.IsNil)
+ err = os.Symlink(cwd+"/testdata/pipeline1", tmpdir+"/pipeline1dup")
+ c.Assert(err, check.IsNil)
+
+ err = ioutil.WriteFile(tmpdir+"/chr1-12-100.bed", []byte("chr1\t12\t100\ttest.1\n"), 0644)
+ c.Check(err, check.IsNil)
+
+ c.Log("=== import testdata/ref ===")
+ exited := (&importer{}).RunCommand("import", []string{
+ "-local=true",
+ "-tag-library", "testdata/tags",
+ "-output-tiles",
+ "-save-incomplete-tiles",
+ "-o", tmpdir + "/lib1/library1.gob",
+ "testdata/ref.fasta",
+ }, nil, os.Stderr, os.Stderr)
+ c.Assert(exited, check.Equals, 0)
+
+ c.Log("=== import testdata/spanningtile ===")
+ exited = (&importer{}).RunCommand("import", []string{
+ "-local=true",
+ "-tag-library", "testdata/tags",
+ "-output-tiles",
+ "-o", tmpdir + "/lib2/library2.gob",
+ cwd + "/testdata/spanningtile",
+ }, nil, os.Stderr, os.Stderr)
+ c.Assert(exited, check.Equals, 0)
+
+ slicedir := c.MkDir()
+
+ c.Log("=== slice ===")
+ exited = (&slicecmd{}).RunCommand("slice", []string{
+ "-local=true",
+ "-output-dir=" + slicedir,
+ "-tags-per-file=2",
+ tmpdir + "/lib1",
+ tmpdir + "/lib2",
+ }, nil, os.Stderr, os.Stderr)
+ c.Check(exited, check.Equals, 0)
+ out, _ := exec.Command("find", slicedir, "-ls").CombinedOutput()
+ c.Logf("%s", out)
+
+ c.Log("=== dump ===")
+ {
+ dumpdir := c.MkDir()
+ exited = (&dump{}).RunCommand("dump", []string{
+ "-local=true",
+ "-tags=5,6",
+ "-input-dir=" + slicedir,
+ "-output-dir=" + dumpdir,
+ }, nil, os.Stderr, os.Stderr)
+ c.Check(exited, check.Equals, 0)
+ out, _ := exec.Command("find", dumpdir, "-ls").CombinedOutput()
+ c.Logf("%s", out)
+ dumped, err := ioutil.ReadFile(dumpdir + "/variants.csv")
+ c.Assert(err, check.IsNil)
+ c.Logf("%s", dumped)
+ // spanning tile for tag 5 with A>G snp in tag 6
+ c.Check("\n"+string(dumped), check.Matches, `(?ms).*\n5,1,0,chr2,225,.*AAAACTGATCCGAAAAAAATACAA.*`)
+ c.Check("\n"+string(dumped), check.Matches, `(?ms).*\n6,1,1,chr2,349,AAAACTGATCCAAAAAAAATACAA.*`)
+ }
+
+ c.Log("=== slice-numpy ===")
+ {
+ npydir := c.MkDir()
+ exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
+ "-local=true",
+ "-input-dir=" + slicedir,
+ "-output-dir=" + npydir,
+ }, nil, os.Stderr, os.Stderr)
+ c.Check(exited, check.Equals, 0)
+ out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
+ c.Logf("%s", out)
+
+ f, err := os.Open(npydir + "/matrix.0000.npy")
+ c.Assert(err, check.IsNil)
+ defer f.Close()
+ npy, err := gonpy.NewReader(f)
+ c.Assert(err, check.IsNil)
+ c.Check(npy.Shape, check.DeepEquals, []int{1, 4})
+ variants, err := npy.GetInt16()
+ c.Check(variants, check.DeepEquals, []int16{-1, -1, 1, 1})
+
+ annotations, err := ioutil.ReadFile(npydir + "/matrix.0000.annotations.csv")
+ c.Assert(err, check.IsNil)
+ c.Logf("%s", annotations)
+
+ f, err = os.Open(npydir + "/matrix.0002.npy")
+ c.Assert(err, check.IsNil)
+ defer f.Close()
+ npy, err = gonpy.NewReader(f)
+ c.Assert(err, check.IsNil)
+ c.Check(npy.Shape, check.DeepEquals, []int{1, 4})
+ variants, err = npy.GetInt16()
+ c.Check(variants, check.DeepEquals, []int16{1, 1, 1, 2})
+
+ annotations, err = ioutil.ReadFile(npydir + "/matrix.0002.annotations.csv")
+ c.Assert(err, check.IsNil)
+ c.Logf("%s", annotations)
+ for _, s := range []string{
+ "chr2:g\\.360A>G",
+ } {
+ c.Check(string(annotations), check.Matches, "(?ms).*"+s+".*")
+ }
+ }
+
+ c.Log("=== slice-numpy + regions ===")
+ {
+ npydir := c.MkDir()
+ exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
+ "-local=true",
+ "-regions=" + tmpdir + "/chr1-12-100.bed",
+ "-input-dir=" + slicedir,
+ "-output-dir=" + npydir,
+ "-chunked-hgvs-matrix=true",
+ }, nil, os.Stderr, os.Stderr)
+ c.Check(exited, check.Equals, 0)
+ out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
+ c.Logf("%s", out)
+
+ f, err := os.Open(npydir + "/matrix.0000.npy")
+ c.Assert(err, check.IsNil)
+ defer f.Close()
+ npy, err := gonpy.NewReader(f)
+ c.Assert(err, check.IsNil)
+ c.Check(npy.Shape, check.DeepEquals, []int{1, 2})
+ variants, err := npy.GetInt16()
+ c.Check(variants, check.DeepEquals, []int16{-1, -1})
+
+ annotations, err := ioutil.ReadFile(npydir + "/matrix.0000.annotations.csv")
+ c.Assert(err, check.IsNil)
+ c.Check(string(annotations), check.Equals, "0,0,1,=,chr1,0,,,\n")
+
+ for _, fnm := range []string{
+ npydir + "/matrix.0001.annotations.csv",
+ npydir + "/matrix.0002.annotations.csv",
+ } {
+ annotations, err := ioutil.ReadFile(fnm)
+ c.Assert(err, check.IsNil)
+ c.Check(string(annotations), check.Equals, "", check.Commentf(fnm))
+ }
+ }
+
+ err = ioutil.WriteFile(tmpdir+"/chr1and2-100-200.bed", []byte("chr1\t100\t200\ttest.1\nchr2\t100\t200\ttest.2\n"), 0644)
+ c.Check(err, check.IsNil)
+
+ c.Log("=== slice-numpy + regions + merge ===")
+ {
+ npydir := c.MkDir()
+ exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
+ "-local=true",
+ "-regions=" + tmpdir + "/chr1and2-100-200.bed",
+ "-input-dir=" + slicedir,
+ "-output-dir=" + npydir,
+ "-merge-output=true",
+ "-single-hgvs-matrix=true",
+ }, nil, os.Stderr, os.Stderr)
+ c.Check(exited, check.Equals, 0)
+ out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
+ c.Logf("%s", out)
+
+ f, err := os.Open(npydir + "/matrix.npy")
+ c.Assert(err, check.IsNil)
+ defer f.Close()
+ npy, err := gonpy.NewReader(f)
+ c.Assert(err, check.IsNil)
+ c.Check(npy.Shape, check.DeepEquals, []int{1, 4})
+ variants, err := npy.GetInt16()
+ if c.Check(err, check.IsNil) {
+ c.Check(variants, check.DeepEquals, []int16{
+ -1, -1, 1, 1,
+ })
+ }
+
+ annotations, err := ioutil.ReadFile(npydir + "/matrix.annotations.csv")
+ c.Assert(err, check.IsNil)
+ c.Check(string(annotations), check.Equals, "")
+ }
+
+ c.Log("=== slice-numpy + chunked hgvs matrix ===")
+ {
+ err = ioutil.WriteFile(tmpdir+"/casecontrol.tsv", []byte(`SampleID CC
+spanningtile/input1 1
+`), 0600)
+ c.Assert(err, check.IsNil)
+ npydir := c.MkDir()
+ exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
+ "-local=true",
+ "-chunked-hgvs-matrix=true",
+ "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv",
+ "-chi2-case-control-column=CC",
+ "-chi2-p-value=1",
+ "-min-coverage=0.75",
+ "-input-dir=" + slicedir,
+ "-output-dir=" + npydir,
+ }, nil, os.Stderr, os.Stderr)
+ c.Check(exited, check.Equals, 0)
+ out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
+ c.Logf("%s", out)
+
+ annotations, err := ioutil.ReadFile(npydir + "/hgvs.chr2.annotations.csv")
+ c.Assert(err, check.IsNil)
+ c.Check(string(annotations), check.Equals, `0,chr2:g.360A>G
+`)
+ }
+
+ c.Log("=== slice-numpy + onehotChunked ===")
+ {
+ err = ioutil.WriteFile(tmpdir+"/casecontrol.tsv", []byte(`SampleID CC
+spanningtile/input1 1
+`), 0600)
+ c.Assert(err, check.IsNil)
+ npydir := c.MkDir()
+ exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
+ "-local=true",
+ "-chunked-onehot=true",
+ "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv",
+ "-chi2-case-control-column=CC",
+ "-chi2-p-value=1",
+ "-min-coverage=0.75",
+ "-input-dir=" + slicedir,
+ "-output-dir=" + npydir,
+ }, nil, os.Stderr, os.Stderr)
+ c.Check(exited, check.Equals, 0)
+ out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
+ c.Logf("%s", out)
+
+ f, err := os.Open(npydir + "/onehot.0002.npy")
+ c.Assert(err, check.IsNil)
+ defer f.Close()
+ npy, err := gonpy.NewReader(f)
+ c.Assert(err, check.IsNil)
+ c.Check(npy.Shape, check.DeepEquals, []int{1, 2})
+ onehot, err := npy.GetInt8()
+ if c.Check(err, check.IsNil) {
+ for r := 0; r < npy.Shape[0]; r++ {
+ c.Logf("%v", onehot[r*npy.Shape[1]:(r+1)*npy.Shape[1]])
+ }
+ c.Check(onehot, check.DeepEquals, []int8{
+ 0, 1, // input1
+ })
+ }
+ }
+
+ c.Log("=== slice-numpy + onehotSingle ===")
+ {
+ err = ioutil.WriteFile(tmpdir+"/casecontrol.tsv", []byte(`SampleID CC
+spanningtile/input1 1
+`), 0600)
+ c.Assert(err, check.IsNil)
+ npydir := c.MkDir()
+ exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
+ "-local=true",
+ "-single-onehot=true",
+ "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv",
+ "-chi2-case-control-column=CC",
+ "-chi2-p-value=1",
+ "-min-coverage=0.75",
+ "-input-dir=" + slicedir,
+ "-output-dir=" + npydir,
+ "-debug-tag=1",
+ }, nil, os.Stderr, os.Stderr)
+ c.Check(exited, check.Equals, 0)
+ out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
+ c.Logf("%s", out)
+
+ f, err := os.Open(npydir + "/onehot.npy")
+ c.Assert(err, check.IsNil)
+ defer f.Close()
+ npy, err := gonpy.NewReader(f)
+ c.Assert(err, check.IsNil)
+ c.Check(npy.Shape, check.DeepEquals, []int{2, 1})
+ onehot, err := npy.GetUint32()
+ if c.Check(err, check.IsNil) {
+ for r := 0; r < npy.Shape[0]; r++ {
+ c.Logf("%v", onehot[r*npy.Shape[1]:(r+1)*npy.Shape[1]])
+ }
+ c.Check(onehot, check.DeepEquals, []uint32{
+ 0, 1,
+ })
+ }
+
+ f, err = os.Open(npydir + "/onehot-columns.npy")
+ c.Assert(err, check.IsNil)
+ defer f.Close()
+ npy, err = gonpy.NewReader(f)
+ c.Assert(err, check.IsNil)
+ c.Check(npy.Shape, check.DeepEquals, []int{5, 2})
+ onehotcols, err := npy.GetInt32()
+ if c.Check(err, check.IsNil) {
+ for r := 0; r < npy.Shape[0]; r++ {
+ c.Logf("%v", onehotcols[r*npy.Shape[1]:(r+1)*npy.Shape[1]])
+ }
+ c.Check(onehotcols, check.DeepEquals, []int32{
+ 5, 5,
+ 2, 2,
+ 1, 0,
+ 1000000, 1000000,
+ 0, 0,
+ })
+ }
+ }
+}
+
func (s *sliceSuite) Test_tv2homhet(c *check.C) {
cmd := &sliceNumpy{
cgnames: []string{"sample1", "sample2", "sample3", "sample4"},
"golang.org/x/crypto/blake2b"
)
+const annotationMaxTileSpan = 100
+
type sliceNumpy struct {
filter filter
threads int
if err != nil {
return 1
}
+ if len(cmd.cgnames) == 0 {
+ err = fmt.Errorf("fatal: 0 cases, 0 controls, nothing to do")
+ return 1
+ }
cmd.minCoverage = int(math.Ceil(cmd.filter.MinCoverage * float64(len(cmd.cgnames))))
{
seqname string // chr1
pos int // distance from start of chromosome to starttag
tiledata []byte // acgtggcaa...
+ excluded bool // true if excluded by regions file
+ nexttag tagID // tagID of following tile (-1 for last tag of chromosome)
}
isdup := map[tagID]bool{}
reftile := map[tagID]*reftileinfo{}
for seqname, cseq := range refseq {
pos := 0
+ lastreftag := tagID(-1)
for _, libref := range cseq {
if cmd.filter.MaxTag >= 0 && libref.Tag > tagID(cmd.filter.MaxTag) {
continue
variant: libref.Variant,
tiledata: tiledata,
pos: pos,
+ nexttag: -1,
}
+ if lastreftag >= 0 {
+ reftile[lastreftag].nexttag = libref.Tag
+ }
+ lastreftag = libref.Tag
}
pos += len(tiledata) - taglen
}
}
log.Printf("before applying mask, len(reftile) == %d", len(reftile))
log.Printf("deleting reftile entries for regions outside %d intervals", mask.Len())
- for tag, rt := range reftile {
+ for _, rt := range reftile {
if !mask.Check(strings.TrimPrefix(rt.seqname, "chr"), rt.pos, rt.pos+len(rt.tiledata)) {
- delete(reftile, tag)
+ rt.excluded = true
}
}
log.Printf("after applying mask, len(reftile) == %d", len(reftile))
if tv.Ref {
continue
}
+ // Skip tile with no
+ // corresponding ref tile, if
+ // mask is in play (we can't
+ // determine coordinates for
+ // these)
if mask != nil && reftile[tv.Tag] == nil {
- // Don't waste
- // time/memory on
- // masked-out tiles.
+ continue
+ }
+ // Skip tile whose
+ // corresponding ref tile is
+ // outside target regions --
+ // unless it's a potential
+ // spanning tile.
+ if mask != nil && reftile[tv.Tag].excluded &&
+ (int(tv.Tag+1) >= len(tagset) ||
+ (bytes.HasSuffix(tv.Sequence, tagset[tv.Tag+1]) && reftile[tv.Tag+1] != nil && !reftile[tv.Tag+1].excluded)) {
continue
}
if tv.Tag == cmd.debugTag {
for tag := tagstart; tag < tagend; tag++ {
rt := reftile[tag]
if rt == nil && mask != nil {
- // Excluded by specified regions
+ // With no ref tile, we don't
+ // have coordinates to say
+ // this is in the desired
+ // regions -- so it's not.
+ // TODO: handle ref spanning
+ // tile case.
+ continue
+ }
+ if rt != nil && rt.excluded {
+ // TODO: don't skip yet --
+ // first check for spanning
+ // tile variants that
+ // intersect non-excluded ref
+ // tiles.
continue
}
if cmd.filter.MaxTag >= 0 && tag > tagID(cmd.filter.MaxTag) {
if rt == nil {
// Reference does not use any
// variant of this tile
+ //
+ // TODO: diff against the
+ // relevant portion of the
+ // ref's spanning tile
outcol++
continue
}
} else {
done[v] = true
}
- if len(tv.Sequence) < taglen || !bytes.HasSuffix(rt.tiledata, tv.Sequence[len(tv.Sequence)-taglen:]) {
+ if len(tv.Sequence) < taglen {
+ continue
+ }
+ // if reftilestr doesn't end
+ // in the same tag as tv,
+ // extend reftilestr with
+ // following ref tiles until
+ // it does (up to an arbitrary
+ // sanity-check limit)
+ reftilestr := reftilestr
+ endtagstr := strings.ToUpper(string(tv.Sequence[len(tv.Sequence)-taglen:]))
+ for i, rt := 0, rt; i < annotationMaxTileSpan && !strings.HasSuffix(reftilestr, endtagstr) && rt.nexttag >= 0; i++ {
+ rt = reftile[rt.nexttag]
+ reftilestr += strings.ToUpper(string(rt.tiledata[taglen:]))
+ }
+ if mask != nil && !mask.Check(strings.TrimPrefix(rt.seqname, "chr"), rt.pos, rt.pos+len(reftilestr)) {
+ continue
+ }
+ if !strings.HasSuffix(reftilestr, endtagstr) {
fmt.Fprintf(annow, "%d,%d,%d,,%s,%d,,,\n", tag, outcol, v, rt.seqname, rt.pos)
continue
}
- if lendiff := len(rt.tiledata) - len(tv.Sequence); lendiff < -1000 || lendiff > 1000 {
+ if lendiff := len(reftilestr) - len(tv.Sequence); lendiff < -1000 || lendiff > 1000 {
fmt.Fprintf(annow, "%d,%d,%d,,%s,%d,,,\n", tag, outcol, v, rt.seqname, rt.pos)
continue
}
if cmd.filter.MaxTag >= 0 && tag > tagID(cmd.filter.MaxTag) {
break
}
- if mask != nil && reftile[tag] == nil {
+ if rt := reftile[tag]; rt == nil || rt.excluded {
continue
}
if v == 0 {