Export hgvs one-hot numpy: -1 for missing / low quality tiles.
authorTom Clegg <tom@tomclegg.ca>
Wed, 11 Aug 2021 21:31:23 +0000 (17:31 -0400)
committerTom Clegg <tom@tomclegg.ca>
Wed, 11 Aug 2021 21:31:23 +0000 (17:31 -0400)
refs #17562

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

export.go
export_test.go
pipeline_test.go

index 9546926c015fe98d02357b17fb4f27e54e2a1951..7f6d84cef3ea34a9759fac6856dfd4975736c162 100644 (file)
--- a/export.go
+++ b/export.go
@@ -47,7 +47,7 @@ type outputFormat interface {
 
 var outputFormats = map[string]func() outputFormat{
        "hgvs-numpy": func() outputFormat {
-               return &formatHGVSNumpy{alleles: map[string][][]bool{}}
+               return &formatHGVSNumpy{alleles: map[string][][]int8{}}
        },
        "hgvs-onehot": func() outputFormat { return formatHGVSOneHot{} },
        "hgvs":        func() outputFormat { return formatHGVS{} },
@@ -400,15 +400,14 @@ func eachVariant(bedw io.Writer, taglen int, seqname string, reftiles []tileLibR
                tagcoverage := 0 // number of times the start tag was found in genomes -- max is len(cgs)*2
                for cgidx, cg := range cgs {
                        for phase := 0; phase < 2; phase++ {
-                               if len(cg.Variants) <= int(libref.Tag)*2+phase {
-                                       continue
+                               var variant tileVariantID
+                               if i := int(libref.Tag)*2 + phase; len(cg.Variants) > i {
+                                       variant = cg.Variants[i]
                                }
-                               variant := cg.Variants[int(libref.Tag)*2+phase]
-                               if variant == 0 {
-                                       continue
+                               if variant > 0 {
+                                       tagcoverage++
                                }
-                               tagcoverage++
-                               if variant == libref.Variant {
+                               if variant == libref.Variant || variant == 0 {
                                        continue
                                }
                                glibref := tileLibRef{Tag: libref.Tag, Variant: variant}
@@ -481,9 +480,23 @@ func eachVariant(bedw io.Writer, taglen int, seqname string, reftiles []tileLibR
                for i, pos := range flushpos {
                        varslice := variantAt[pos]
                        delete(variantAt, pos)
+                       // Check for uninitialized (zero-value)
+                       // elements in varslice
                        for i := range varslice {
-                               if varslice[i].Position == 0 {
-                                       varslice[i].Position = pos
+                               if varslice[i].Position != 0 {
+                                       // Not a zero-value element
+                                       continue
+                               }
+                               // Set the position so
+                               // varslice[*].Position are all equal
+                               varslice[i].Position = pos
+                               // This could be either =ref or a
+                               // missing/low-quality tile. Figure
+                               // out which.
+                               v := cgs[i/2].Variants[int(libref.Tag)*2+i%2]
+                               if v < 1 || len(tilelib.TileVariantSequence(tileLibRef{Tag: libref.Tag, Variant: v})) == 0 {
+                                       // Missing/low-quality tile.
+                                       varslice[i].New = "-" // fasta "gap of indeterminate length"
                                }
                        }
                        flushvariants[i] = varslice
@@ -526,6 +539,11 @@ func bucketVarsliceByRef(varslice []tvVariant) map[string]map[string]int {
        byref := map[string]map[string]int{}
        for _, v := range varslice {
                if v.Ref == "" && v.New == "" {
+                       // =ref
+                       continue
+               }
+               if v.New == "-" {
+                       // no-call
                        continue
                }
                alts := byref[v.Ref]
@@ -639,6 +657,13 @@ func (formatHGVS) Print(out io.Writer, seqname string, varslice []tvVariant) err
                        out.Write([]byte{'\t'})
                }
                var1, var2 := varslice[i*2], varslice[i*2+1]
+               if var1.New == "-" || var2.New == "-" {
+                       _, err := out.Write([]byte{'N'})
+                       if err != nil {
+                               return err
+                       }
+                       continue
+               }
                if var1.Variant == var2.Variant {
                        if var1.Ref == var1.New {
                                _, err := out.Write([]byte{'.'})
@@ -685,6 +710,9 @@ func (formatHGVSOneHot) Print(out io.Writer, seqname string, varslice []tvVarian
        sort.Slice(sorted, func(a, b int) bool { return hgvs.Less(sorted[a], sorted[b]) })
 
        for _, v := range sorted {
+               if v.New == "-" {
+                       continue
+               }
                fmt.Fprintf(out, "%s.%s", seqname, v.String())
                for i := 0; i < len(varslice); i += 2 {
                        if varslice[i].Variant == v || varslice[i+1].Variant == v {
@@ -704,7 +732,7 @@ func (formatHGVSOneHot) Print(out io.Writer, seqname string, varslice []tvVarian
 type formatHGVSNumpy struct {
        sync.Mutex
        writelock sync.Mutex
-       alleles   map[string][][]bool // alleles[seqname][variantidx][genomeidx*2+phase]
+       alleles   map[string][][]int8 // alleles[seqname][variantidx][genomeidx*2+phase]
 }
 
 func (*formatHGVSNumpy) MaxGoroutines() int                            { return 4 }
@@ -723,18 +751,20 @@ func (f *formatHGVSNumpy) Print(outw io.Writer, seqname string, varslice []tvVar
        seqalleles := f.alleles[seqname]
        f.Unlock()
 
-       // append a row to seqvariants and seqalleles for each unique
-       // non-ref variant in varslice.
+       // append a row to seqalleles for each unique non-ref variant
+       // in varslice.
        var previous hgvs.Variant
        for _, v := range sorted {
-               if previous == v || v.Ref == v.New {
+               if previous == v || v.Ref == v.New || v.New == "-" {
                        continue
                }
                previous = v
-               newrow := make([]bool, len(varslice))
+               newrow := make([]int8, len(varslice))
                for i, allele := range varslice {
                        if allele.Variant == v {
-                               newrow[i] = true
+                               newrow[i] = 1
+                       } else if allele.Variant.New == "-" {
+                               newrow[i] = -1
                        }
                }
                seqalleles = append(seqalleles, newrow)
@@ -752,8 +782,8 @@ func (f *formatHGVSNumpy) Print(outw io.Writer, seqname string, varslice []tvVar
 func (f *formatHGVSNumpy) Finish(outdir string, _ io.Writer, seqname string) error {
        // Write seqname's data to a .npy matrix with one row per
        // genome and 2 columns per variant.
-       seqalleles := f.alleles[seqname]
        f.Lock()
+       seqalleles := f.alleles[seqname]
        delete(f.alleles, seqname)
        f.Unlock()
        if len(seqalleles) == 0 {
@@ -767,10 +797,14 @@ func (f *formatHGVSNumpy) Finish(outdir string, _ io.Writer, seqname string) err
        for varidx, alleles := range seqalleles {
                for g := 0; g < len(alleles)/2; g++ {
                        aa, ab := alleles[g*2], alleles[g*2+1]
-                       if aa && ab {
+                       if aa < 0 || ab < 0 {
+                               // no-call
+                               out[g*cols+varidx*2] = -1
+                               out[g*cols+varidx*2+1] = -1
+                       } else if aa > 0 && ab > 0 {
                                // hom
                                out[g*cols+varidx*2] = 1
-                       } else if aa || ab {
+                       } else if aa > 0 || ab > 0 {
                                // het
                                out[g*cols+varidx*2+1] = 1
                        }
index 2cc0fbf36616f86a53bb653f602c7bd219b34ae5..d87b3ae51c3d9dbac364a28499921ac07b72f66a 100644 (file)
@@ -5,7 +5,6 @@
 package lightning
 
 import (
-       "bytes"
        "io/ioutil"
        "os"
        "os/exec"
@@ -24,19 +23,44 @@ func (s *exportSuite) TestFastaToHGVS(c *check.C) {
        err := ioutil.WriteFile(tmpdir+"/chr1-12-100.bed", []byte("chr1\t12\t100\ttest.1\n"), 0644)
        c.Check(err, check.IsNil)
 
-       var buffer bytes.Buffer
-       exited := (&importer{}).RunCommand("import", []string{"-local=true", "-tag-library", "testdata/tags", "-output-tiles", "-save-incomplete-tiles", "testdata/pipeline1", "testdata/ref.fasta"}, &bytes.Buffer{}, &buffer, os.Stderr)
+       exited := (&importer{}).RunCommand("import", []string{
+               "-local=true",
+               "-tag-library", "testdata/tags",
+               "-output-tiles",
+               "-save-incomplete-tiles",
+               "-o", tmpdir + "/library1.gob",
+               "testdata/ref.fasta",
+       }, nil, os.Stderr, os.Stderr)
+       c.Assert(exited, check.Equals, 0)
+
+       exited = (&importer{}).RunCommand("import", []string{
+               "-local=true",
+               "-tag-library", "testdata/tags",
+               "-output-tiles",
+               // "-save-incomplete-tiles",
+               "-o", tmpdir + "/library2.gob",
+               "testdata/pipeline1",
+       }, nil, os.Stderr, os.Stderr)
        c.Assert(exited, check.Equals, 0)
-       ioutil.WriteFile(tmpdir+"/library.gob", buffer.Bytes(), 0644)
+
+       exited = (&merger{}).RunCommand("merge", []string{
+               "-local=true",
+               "-o", tmpdir + "/library.gob",
+               tmpdir + "/library1.gob",
+               tmpdir + "/library2.gob",
+       }, nil, os.Stderr, os.Stderr)
+       c.Assert(exited, check.Equals, 0)
+
+       input := tmpdir + "/library.gob"
 
        exited = (&exporter{}).RunCommand("export", []string{
                "-local=true",
-               "-input-dir=" + tmpdir,
+               "-input-dir=" + input,
                "-output-dir=" + tmpdir,
                "-output-format=hgvs-onehot",
                "-output-labels=" + tmpdir + "/labels.csv",
                "-ref=testdata/ref.fasta",
-       }, &buffer, os.Stderr, os.Stderr)
+       }, nil, os.Stderr, os.Stderr)
        c.Check(exited, check.Equals, 0)
        output, err := ioutil.ReadFile(tmpdir + "/out.chr1.tsv")
        if !c.Check(err, check.IsNil) {
@@ -68,11 +92,11 @@ chr2.471_472delinsAA        1       0
 
        exited = (&exporter{}).RunCommand("export", []string{
                "-local=true",
-               "-input-dir=" + tmpdir,
+               "-input-dir=" + input,
                "-output-dir=" + tmpdir,
                "-output-format=pvcf",
                "-ref=testdata/ref.fasta",
-       }, &buffer, os.Stderr, os.Stderr)
+       }, os.Stderr, os.Stderr, os.Stderr)
        c.Check(exited, check.Equals, 0)
        output, err = ioutil.ReadFile(tmpdir + "/out.chr1.vcf")
        c.Check(err, check.IsNil)
@@ -102,11 +126,11 @@ chr2      471     .       GG      AA      .       .       .       GT      0/1     0/0
 
        exited = (&exporter{}).RunCommand("export", []string{
                "-local=true",
-               "-input-dir=" + tmpdir,
+               "-input-dir=" + input,
                "-output-dir=" + tmpdir,
                "-output-format=vcf",
                "-ref=testdata/ref.fasta",
-       }, &buffer, os.Stderr, os.Stderr)
+       }, nil, os.Stderr, os.Stderr)
        c.Check(exited, check.Equals, 0)
        output, err = ioutil.ReadFile(tmpdir + "/out.chr1.vcf")
        c.Check(err, check.IsNil)
@@ -136,11 +160,11 @@ chr2      471     .       GG      AA      .       .       AC=1
        outdir := c.MkDir()
        exited = (&exporter{}).RunCommand("export", []string{
                "-local=true",
-               "-input-dir=" + tmpdir,
+               "-input-dir=" + input,
                "-output-dir=" + outdir,
                "-output-format=hgvs-numpy",
                "-ref=testdata/ref.fasta",
-       }, &buffer, os.Stderr, os.Stderr)
+       }, nil, os.Stderr, os.Stderr)
        c.Check(exited, check.Equals, 0)
 
        f, err := os.Open(outdir + "/matrix.chr1.npy")
@@ -153,7 +177,7 @@ chr2        471     .       GG      AA      .       .       AC=1
        c.Check(variants, check.HasLen, 6*2*2) // 6 variants * 2 alleles * 2 genomes
        c.Check(variants, check.DeepEquals, []int8{
                1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, // input1.1.fasta
-               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // input2.1.fasta
+               -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, // input2.1.fasta
        })
 
        f, err = os.Open(outdir + "/matrix.chr2.npy")
index e8c137212300788ab9a43cf4bbc33ca1c3f855c4..c208c645cc1b00b010fa0d3a5542ea5b6874b3cb 100644 (file)
@@ -95,11 +95,11 @@ func (s *pipelineSuite) TestImportMerge(c *check.C) {
        c.Check(code, check.Equals, 0)
        hgvsout, err := ioutil.ReadFile(tmpdir + "/out.tsv")
        c.Check(err, check.IsNil)
-       c.Check(sortLines(string(hgvsout)), check.Equals, sortLines(`chr1:g.1_3delinsGGC        .
-chr1:g.[41_42delinsAA];[41=]   .
-chr1:g.[161=];[161A>T] .
-chr1:g.[178=];[178A>T] .
-chr1:g.222_224del      .
+       c.Check(sortLines(string(hgvsout)), check.Equals, sortLines(`chr1:g.1_3delinsGGC        N
+chr1:g.[41_42delinsAA];[41=]   N
+chr1:g.[161=];[161A>T] N
+chr1:g.[178=];[178A>T] N
+chr1:g.222_224del      N
 chr1:g.[302=];[302_305delinsAAAA]      .
 .      chr2:g.[1=];[1_3delinsAAA]
 .      chr2:g.125_127delinsAAA