From 176a9ff1a681d7bf252e0c445c67b361dbe2d80d Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Thu, 16 Sep 2021 20:33:38 -0400 Subject: [PATCH] Fix reference assembly. refs #17966 Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- slice_test.go | 45 ++++++++++++++++++++++++++++++++++++++++++--- slicenumpy.go | 20 +++----------------- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/slice_test.go b/slice_test.go index 0d1fe3a4c6..e59821480e 100644 --- a/slice_test.go +++ b/slice_test.go @@ -9,6 +9,7 @@ import ( "os" "os/exec" + "github.com/kshedden/gonpy" "gopkg.in/check.v1" ) @@ -22,6 +23,7 @@ func (s *sliceSuite) TestImportAndSlice(c *check.C) { err := ioutil.WriteFile(tmpdir+"/chr1-12-100.bed", []byte("chr1\t12\t100\ttest.1\n"), 0644) c.Check(err, check.IsNil) + c.Log("=== import testdata/ref ===") exited := (&importer{}).RunCommand("import", []string{ "-local=true", "-tag-library", "testdata/tags", @@ -32,16 +34,17 @@ func (s *sliceSuite) TestImportAndSlice(c *check.C) { }, nil, os.Stderr, os.Stderr) c.Assert(exited, check.Equals, 0) + c.Log("=== import testdata/pipeline1 ===") exited = (&importer{}).RunCommand("import", []string{ "-local=true", "-tag-library", "testdata/tags", "-output-tiles", - // "-save-incomplete-tiles", "-o", tmpdir + "/library2.gob", "testdata/pipeline1", }, nil, os.Stderr, os.Stderr) c.Assert(exited, check.Equals, 0) + c.Log("=== merge ===") exited = (&merger{}).RunCommand("merge", []string{ "-local=true", "-o", tmpdir + "/library.gob", @@ -51,14 +54,50 @@ func (s *sliceSuite) TestImportAndSlice(c *check.C) { c.Assert(exited, check.Equals, 0) input := tmpdir + "/library.gob" + slicedir := c.MkDir() + c.Log("=== slice ===") exited = (&slicecmd{}).RunCommand("slice", []string{ "-local=true", "-input-dir=" + input, - "-output-dir=" + tmpdir, + "-output-dir=" + slicedir, "-tags-per-file=2", }, nil, os.Stderr, os.Stderr) c.Check(exited, check.Equals, 0) - out, _ := exec.Command("find", tmpdir, "-ls").CombinedOutput() + out, _ := exec.Command("find", slicedir, "-ls").CombinedOutput() c.Logf("%s", out) + + c.Log("=== slice-numpy ===") + npydir := c.MkDir() + exited = (&sliceNumpy{}).RunCommand("slice-numpy", []string{ + "-local=true", + "-input-dir=" + slicedir, + "-output-dir=" + npydir, + }, nil, os.Stderr, os.Stderr) + c.Check(exited, check.Equals, 0) + out, _ = exec.Command("find", npydir, "-ls").CombinedOutput() + c.Logf("%s", out) + + f, err := os.Open(npydir + "/matrix.0000.npy") + c.Assert(err, check.IsNil) + defer f.Close() + npy, err := gonpy.NewReader(f) + c.Assert(err, check.IsNil) + c.Check(npy.Shape, check.DeepEquals, []int{2, 4}) + variants, err := npy.GetInt16() + if c.Check(variants, check.HasLen, 8) { + c.Check(variants[4:], check.DeepEquals, []int16{-1, -1, 1, 1}) // TODO: check first row too, when stable + } + + annotations, err := ioutil.ReadFile(npydir + "/matrix.0000.annotations.csv") + c.Assert(err, check.IsNil) + c.Logf("%s", annotations) + for _, s := range []string{ + "chr1:g.161A>T", + "chr1:g.178A>T", + "chr1:g.1_3delinsGGC", + "chr1:g.222_224del", + } { + c.Check(string(annotations), check.Matches, "(?ms).*"+s+".*") + } } diff --git a/slicenumpy.go b/slicenumpy.go index ba80151f3a..dad23420e8 100644 --- a/slicenumpy.go +++ b/slicenumpy.go @@ -164,6 +164,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s log.Info("building list of reference tiles to load") // TODO: more efficient if we had saved all ref tiles in slice0 type reftileinfo struct { + variant tileVariantID seqname string // chr1 pos int // distance from start of chr1 to start of tile tiledata []byte // acgtggcaa... @@ -171,7 +172,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s reftile := map[tagID]*reftileinfo{} for seqname, cseq := range refseq { for _, libref := range cseq { - reftile[libref.Tag] = &reftileinfo{seqname: seqname} + reftile[libref.Tag] = &reftileinfo{seqname: seqname, variant: libref.Variant} } } log.Info("loading reference tiles from all slices") @@ -187,7 +188,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s defer f.Close() return DecodeLibrary(f, strings.HasSuffix(infile, ".gz"), func(ent *LibraryEntry) error { for _, tv := range ent.TileVariants { - if dst, ok := reftile[tv.Tag]; ok { + if dst, ok := reftile[tv.Tag]; ok && dst.variant == tv.Variant { dst.tiledata = tv.Sequence } } @@ -342,18 +343,3 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s } return 0 } - -func (*sliceNumpy) writeLibRefs(fnm string, tilelib *tileLibrary, librefs []tileLibRef) error { - f, err := os.OpenFile(fnm, os.O_CREATE|os.O_WRONLY, 0666) - if err != nil { - return err - } - defer f.Close() - for i, libref := range librefs { - _, err = fmt.Fprintf(f, "%d,%d,%d\n", i, libref.Tag, libref.Variant) - if err != nil { - return err - } - } - return f.Close() -} -- 2.30.2