Add -debug-tag flag.
authorTom Clegg <tom@curii.com>
Fri, 18 Feb 2022 20:11:17 +0000 (15:11 -0500)
committerTom Clegg <tom@curii.com>
Fri, 18 Feb 2022 20:11:17 +0000 (15:11 -0500)
refs #18581

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

slice_test.go
slicenumpy.go

index 37a7a9e866ebe9dbf36b40ceda2409761758f4b7..74e0e34519948b55ec03d658b284411273eef55f 100644 (file)
@@ -215,7 +215,12 @@ func (s *sliceSuite) TestImportAndSlice(c *check.C) {
                c.Check(npy.Shape, check.DeepEquals, []int{4, 4})
                variants, err := npy.GetInt16()
                if c.Check(err, check.IsNil) {
-                       c.Check(variants, check.DeepEquals, []int16{2, 1, 3, 1, -1, -1, 4, 2, 2, 1, 3, 1, -1, -1, 4, 2})
+                       c.Check(variants, check.DeepEquals, []int16{
+                               2, 1, 3, 1,
+                               -1, -1, 4, 2,
+                               2, 1, 3, 1,
+                               -1, -1, 4, 2,
+                       })
                }
 
                annotations, err := ioutil.ReadFile(npydir + "/matrix.annotations.csv")
@@ -325,6 +330,7 @@ pipeline1dup/input2 0
                        "-min-coverage=0.75",
                        "-input-dir=" + slicedir,
                        "-output-dir=" + npydir,
+                       "-debug-tag=1",
                }, nil, os.Stderr, os.Stderr)
                c.Check(exited, check.Equals, 0)
                out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
@@ -348,3 +354,51 @@ pipeline1dup/input2        0
                }
        }
 }
+
+func (s *sliceSuite) Test_tv2homhet(c *check.C) {
+       cmd := &sliceNumpy{
+               cgnames:         []string{"sample1", "sample2", "sample3", "sample4"},
+               chi2Cases:       []bool{false, true, true, false},
+               chi2PValue:      .5,
+               includeVariant1: true,
+       }
+       cgs := map[string]CompactGenome{
+               "sample1": CompactGenome{
+                       Variants: []tileVariantID{0, 0, 1, 1}, // hom tv=1
+               },
+               "sample2": CompactGenome{
+                       Variants: []tileVariantID{0, 0, 5, 5}, // hom tv=2
+               },
+               "sample3": CompactGenome{
+                       Variants: []tileVariantID{0, 0, 5, 1}, // het tv=1, tv=2
+               },
+               "sample4": CompactGenome{
+                       Variants: []tileVariantID{0, 0, 9, 9}, // hom tv=3
+               },
+       }
+       maxv := tileVariantID(3)
+       remap := []tileVariantID{0, 1, 0, 0, 0, 2, 0, 0, 0, 3}
+       chunkstarttag := tagID(10)
+       for tag := tagID(10); tag < 12; tag++ {
+               c.Logf("=== tag %d", tag)
+               chunk, xref := cmd.tv2homhet(cgs, maxv, remap, tag, chunkstarttag)
+               c.Logf("chunk len=%d", len(chunk))
+               for _, x := range chunk {
+                       c.Logf("%+v", x)
+               }
+               c.Logf("xref len=%d", len(xref))
+               for _, x := range xref {
+                       c.Logf("%+v", x)
+               }
+               out := onehotcols2int8(chunk)
+               c.Logf("onehotcols2int8(chunk) len=%d", len(out))
+               for i := 0; i < len(out); i += len(chunk) {
+                       c.Logf("%+v", out[i:i+len(chunk)])
+               }
+               coords := onehotChunk2Indirect(chunk)
+               c.Logf("onehotChunk2Indirect(chunk) len=%d", len(coords))
+               for _, x := range coords {
+                       c.Logf("%+v", x)
+               }
+       }
+}
index 08cb2c0114d5b130443434663268ade4c3b51207..0cc9896b59a35dd6645c48617413a9d59a882e26 100644 (file)
@@ -28,6 +28,7 @@ import (
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "github.com/arvados/lightning/hgvs"
        "github.com/kshedden/gonpy"
+       "github.com/sirupsen/logrus"
        log "github.com/sirupsen/logrus"
        "golang.org/x/crypto/blake2b"
 )
@@ -42,6 +43,7 @@ type sliceNumpy struct {
        minCoverage           int
        cgnames               []string
        includeVariant1       bool
+       debugTag              tagID
 }
 
 func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
@@ -67,6 +69,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
        hgvsChunked := flags.Bool("chunked-hgvs-matrix", false, "also generate hgvs-based matrix per chromosome")
        onehotSingle := flags.Bool("single-onehot", false, "generate one-hot tile-based matrix")
        onehotChunked := flags.Bool("chunked-onehot", false, "generate one-hot tile-based matrix per input chunk")
+       debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
        flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads")
        flags.StringVar(&cmd.chi2CaseControlFile, "chi2-case-control-file", "", "tsv file or directory indicating cases and controls for Χ² test (if directory, all .tsv files will be read)")
        flags.StringVar(&cmd.chi2CaseControlColumn, "chi2-case-control-column", "", "name of case/control column in case-control files for Χ² test (value must be 0 for control, 1 for case)")
@@ -92,6 +95,8 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                return 2
        }
 
+       cmd.debugTag = tagID(*debugTag)
+
        if !*runlocal {
                runner := arvadosContainerRunner{
                        Name:        "lightning slice-numpy",
@@ -123,6 +128,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                        "-chi2-case-control-column=" + cmd.chi2CaseControlColumn,
                        "-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue),
                        "-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1),
+                       "-debug-tag=" + fmt.Sprintf("%d", cmd.debugTag),
                }
                runner.Args = append(runner.Args, cmd.filter.Args()...)
                var output string
@@ -380,6 +386,9 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                                // masked-out tiles.
                                                continue
                                        }
+                                       if tv.Tag == cmd.debugTag {
+                                               log.Printf("infile %d %s tag %d variant %d hash %x", infileIdx, infile, tv.Tag, tv.Variant, tv.Blake2b[:3])
+                                       }
                                        variants := seq[tv.Tag]
                                        if len(variants) == 0 {
                                                variants = make([]TileVariant, 100)
@@ -427,12 +436,15 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                                count[blake2b.Sum256(rt.tiledata)] = 0
                                        }
 
-                                       for _, cg := range cgs {
+                                       for cgname, cg := range cgs {
                                                idx := int(tag-tagstart) * 2
                                                for allele := 0; allele < 2; allele++ {
                                                        v := cg.Variants[idx+allele]
                                                        if v > 0 && len(variants[v].Sequence) > 0 {
                                                                count[variants[v].Blake2b]++
+                                                               if tag == cmd.debugTag {
+                                                                       log.Printf("tag %d cg %s allele %d tv %d hash %x count is now %d", tag, cgname, allele, v, variants[v].Blake2b[:3], count[variants[v].Blake2b])
+                                                               }
                                                        }
                                                }
                                        }
@@ -458,6 +470,11 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                        for i, h := range hash {
                                                rank[h] = tileVariantID(i + 1)
                                        }
+                                       if tag == cmd.debugTag {
+                                               for h, r := range rank {
+                                                       log.Printf("tag %d rank(%x) = %v", tag, h[:3], r)
+                                               }
+                                       }
                                        // remap[v] will be the new
                                        // variant number for original
                                        // variant number v.
@@ -465,9 +482,16 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                        for i, tv := range variants {
                                                remap[i] = rank[tv.Blake2b]
                                        }
+                                       if tag == cmd.debugTag {
+                                               log.Printf("tag %d remap %+v", tag, remap)
+                                       }
                                        variantRemap[tag-tagstart] = remap
                                        if rt != nil {
-                                               rt.variant = rank[blake2b.Sum256(rt.tiledata)]
+                                               refrank := rank[blake2b.Sum256(rt.tiledata)]
+                                               if tag == cmd.debugTag {
+                                                       log.Printf("tag %d reftile variant %d => %d", tag, rt.variant, refrank)
+                                               }
+                                               rt.variant = refrank
                                        }
                                }()
                        }
@@ -501,7 +525,18 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                        }
                                }
                                if *onehotChunked || *onehotSingle {
+                                       if tag == cmd.debugTag {
+                                               log.WithFields(logrus.Fields{
+                                                       "cgs[2].Variants[tag*2:(tag+1)*2]": cgs[cmd.cgnames[2]].Variants[(tag-tagstart)*2 : (tag-tagstart+1)*2],
+                                               }).Info("before tv2homhet")
+                                       }
                                        onehot, xrefs := cmd.tv2homhet(cgs, maxv, remap, tag, tagstart)
+                                       if tag == cmd.debugTag {
+                                               log.WithFields(logrus.Fields{
+                                                       "onehot": onehot,
+                                                       "xrefs":  xrefs,
+                                               }).Info("tv2homhet()")
+                                       }
                                        onehotChunk = append(onehotChunk, onehot...)
                                        onehotXref = append(onehotXref, xrefs...)
                                }
@@ -611,7 +646,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                // transpose onehotChunk[col][row] to numpy[row*ncols+col]
                                rows := len(cmd.cgnames)
                                cols := len(onehotChunk)
-                               log.Infof("%04d: preparing onehot numpy (rows=%d, cols=%d, mem=%d)", infileIdx, len(cmd.cgnames), len(onehotChunk), len(cmd.cgnames)*len(onehotChunk))
+                               log.Infof("%04d: preparing onehot numpy (rows=%d, cols=%d, mem=%d)", infileIdx, rows, cols, rows*cols)
                                throttleNumpyMem.Acquire()
                                out := onehotcols2int8(onehotChunk)
                                fnm := fmt.Sprintf("%s/onehot.%04d.npy", *outputDir, infileIdx)
@@ -1200,6 +1235,19 @@ const onehotXrefSize = unsafe.Sizeof(onehotXref{})
 //
 // Return nil if no tile variant passes Χ² filter.
 func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantID, remap []tileVariantID, tag, chunkstarttag tagID) ([][]int8, []onehotXref) {
+       if tag == cmd.debugTag {
+               tv := make([]tileVariantID, len(cmd.cgnames)*2)
+               for i, name := range cmd.cgnames {
+                       copy(tv[i*2:(i+1)*2], cgs[name].Variants[(tag-chunkstarttag)*2:])
+               }
+               log.WithFields(logrus.Fields{
+                       "cgs[i].Variants[tag*2+j]": tv,
+                       "maxv":                     maxv,
+                       "remap":                    remap,
+                       "tag":                      tag,
+                       "chunkstarttag":            chunkstarttag,
+               }).Info("tv2homhet()")
+       }
        if maxv < 1 || (maxv < 2 && !cmd.includeVariant1) {
                // everyone has the most common variant (of the variants we don't drop)
                return nil, nil