Fix reference assembly.
authorTom Clegg <tom@tomclegg.ca>
Fri, 17 Sep 2021 00:33:38 +0000 (20:33 -0400)
committerTom Clegg <tom@tomclegg.ca>
Fri, 17 Sep 2021 00:33:38 +0000 (20:33 -0400)
refs #17966

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

slice_test.go
slicenumpy.go

index 0d1fe3a4c6a92d3115c059ccff07cf7fbee5b4b1..e59821480ecc19023ad35888d013b1aed5fa9aca 100644 (file)
@@ -9,6 +9,7 @@ import (
        "os"
        "os/exec"
 
+       "github.com/kshedden/gonpy"
        "gopkg.in/check.v1"
 )
 
@@ -22,6 +23,7 @@ func (s *sliceSuite) TestImportAndSlice(c *check.C) {
        err := ioutil.WriteFile(tmpdir+"/chr1-12-100.bed", []byte("chr1\t12\t100\ttest.1\n"), 0644)
        c.Check(err, check.IsNil)
 
+       c.Log("=== import testdata/ref ===")
        exited := (&importer{}).RunCommand("import", []string{
                "-local=true",
                "-tag-library", "testdata/tags",
@@ -32,16 +34,17 @@ func (s *sliceSuite) TestImportAndSlice(c *check.C) {
        }, nil, os.Stderr, os.Stderr)
        c.Assert(exited, check.Equals, 0)
 
+       c.Log("=== import testdata/pipeline1 ===")
        exited = (&importer{}).RunCommand("import", []string{
                "-local=true",
                "-tag-library", "testdata/tags",
                "-output-tiles",
-               // "-save-incomplete-tiles",
                "-o", tmpdir + "/library2.gob",
                "testdata/pipeline1",
        }, nil, os.Stderr, os.Stderr)
        c.Assert(exited, check.Equals, 0)
 
+       c.Log("=== merge ===")
        exited = (&merger{}).RunCommand("merge", []string{
                "-local=true",
                "-o", tmpdir + "/library.gob",
@@ -51,14 +54,50 @@ func (s *sliceSuite) TestImportAndSlice(c *check.C) {
        c.Assert(exited, check.Equals, 0)
 
        input := tmpdir + "/library.gob"
+       slicedir := c.MkDir()
 
+       c.Log("=== slice ===")
        exited = (&slicecmd{}).RunCommand("slice", []string{
                "-local=true",
                "-input-dir=" + input,
-               "-output-dir=" + tmpdir,
+               "-output-dir=" + slicedir,
                "-tags-per-file=2",
        }, nil, os.Stderr, os.Stderr)
        c.Check(exited, check.Equals, 0)
-       out, _ := exec.Command("find", tmpdir, "-ls").CombinedOutput()
+       out, _ := exec.Command("find", slicedir, "-ls").CombinedOutput()
        c.Logf("%s", out)
+
+       c.Log("=== slice-numpy ===")
+       npydir := c.MkDir()
+       exited = (&sliceNumpy{}).RunCommand("slice-numpy", []string{
+               "-local=true",
+               "-input-dir=" + slicedir,
+               "-output-dir=" + npydir,
+       }, nil, os.Stderr, os.Stderr)
+       c.Check(exited, check.Equals, 0)
+       out, _ = exec.Command("find", npydir, "-ls").CombinedOutput()
+       c.Logf("%s", out)
+
+       f, err := os.Open(npydir + "/matrix.0000.npy")
+       c.Assert(err, check.IsNil)
+       defer f.Close()
+       npy, err := gonpy.NewReader(f)
+       c.Assert(err, check.IsNil)
+       c.Check(npy.Shape, check.DeepEquals, []int{2, 4})
+       variants, err := npy.GetInt16()
+       if c.Check(variants, check.HasLen, 8) {
+               c.Check(variants[4:], check.DeepEquals, []int16{-1, -1, 1, 1}) // TODO: check first row too, when stable
+       }
+
+       annotations, err := ioutil.ReadFile(npydir + "/matrix.0000.annotations.csv")
+       c.Assert(err, check.IsNil)
+       c.Logf("%s", annotations)
+       for _, s := range []string{
+               "chr1:g.161A>T",
+               "chr1:g.178A>T",
+               "chr1:g.1_3delinsGGC",
+               "chr1:g.222_224del",
+       } {
+               c.Check(string(annotations), check.Matches, "(?ms).*"+s+".*")
+       }
 }
index ba80151f3ac22679f70b3d7a59232df3a7ec2c38..dad23420e87802cf156804fc2ffbe19c5847abbd 100644 (file)
@@ -164,6 +164,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
 
        log.Info("building list of reference tiles to load") // TODO: more efficient if we had saved all ref tiles in slice0
        type reftileinfo struct {
+               variant  tileVariantID
                seqname  string // chr1
                pos      int    // distance from start of chr1 to start of tile
                tiledata []byte // acgtggcaa...
@@ -171,7 +172,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
        reftile := map[tagID]*reftileinfo{}
        for seqname, cseq := range refseq {
                for _, libref := range cseq {
-                       reftile[libref.Tag] = &reftileinfo{seqname: seqname}
+                       reftile[libref.Tag] = &reftileinfo{seqname: seqname, variant: libref.Variant}
                }
        }
        log.Info("loading reference tiles from all slices")
@@ -187,7 +188,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                        defer f.Close()
                        return DecodeLibrary(f, strings.HasSuffix(infile, ".gz"), func(ent *LibraryEntry) error {
                                for _, tv := range ent.TileVariants {
-                                       if dst, ok := reftile[tv.Tag]; ok {
+                                       if dst, ok := reftile[tv.Tag]; ok && dst.variant == tv.Variant {
                                                dst.tiledata = tv.Sequence
                                        }
                                }
@@ -342,18 +343,3 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
        }
        return 0
 }
-
-func (*sliceNumpy) writeLibRefs(fnm string, tilelib *tileLibrary, librefs []tileLibRef) error {
-       f, err := os.OpenFile(fnm, os.O_CREATE|os.O_WRONLY, 0666)
-       if err != nil {
-               return err
-       }
-       defer f.Close()
-       for i, libref := range librefs {
-               _, err = fmt.Fprintf(f, "%d,%d,%d\n", i, libref.Tag, libref.Variant)
-               if err != nil {
-                       return err
-               }
-       }
-       return f.Close()
-}