From: Tom Clegg Date: Fri, 18 Feb 2022 20:11:17 +0000 (-0500) Subject: Add -debug-tag flag. X-Git-Url: https://git.arvados.org/lightning.git/commitdiff_plain/353fe94dce384fdfbc78c3b4d9516ae21144b706 Add -debug-tag flag. refs #18581 Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- diff --git a/slice_test.go b/slice_test.go index 37a7a9e866..74e0e34519 100644 --- a/slice_test.go +++ b/slice_test.go @@ -215,7 +215,12 @@ func (s *sliceSuite) TestImportAndSlice(c *check.C) { c.Check(npy.Shape, check.DeepEquals, []int{4, 4}) variants, err := npy.GetInt16() if c.Check(err, check.IsNil) { - c.Check(variants, check.DeepEquals, []int16{2, 1, 3, 1, -1, -1, 4, 2, 2, 1, 3, 1, -1, -1, 4, 2}) + c.Check(variants, check.DeepEquals, []int16{ + 2, 1, 3, 1, + -1, -1, 4, 2, + 2, 1, 3, 1, + -1, -1, 4, 2, + }) } annotations, err := ioutil.ReadFile(npydir + "/matrix.annotations.csv") @@ -325,6 +330,7 @@ pipeline1dup/input2 0 "-min-coverage=0.75", "-input-dir=" + slicedir, "-output-dir=" + npydir, + "-debug-tag=1", }, nil, os.Stderr, os.Stderr) c.Check(exited, check.Equals, 0) out, _ := exec.Command("find", npydir, "-ls").CombinedOutput() @@ -348,3 +354,51 @@ pipeline1dup/input2 0 } } } + +func (s *sliceSuite) Test_tv2homhet(c *check.C) { + cmd := &sliceNumpy{ + cgnames: []string{"sample1", "sample2", "sample3", "sample4"}, + chi2Cases: []bool{false, true, true, false}, + chi2PValue: .5, + includeVariant1: true, + } + cgs := map[string]CompactGenome{ + "sample1": CompactGenome{ + Variants: []tileVariantID{0, 0, 1, 1}, // hom tv=1 + }, + "sample2": CompactGenome{ + Variants: []tileVariantID{0, 0, 5, 5}, // hom tv=2 + }, + "sample3": CompactGenome{ + Variants: []tileVariantID{0, 0, 5, 1}, // het tv=1, tv=2 + }, + "sample4": CompactGenome{ + Variants: []tileVariantID{0, 0, 9, 9}, // hom tv=3 + }, + } + maxv := tileVariantID(3) + remap := []tileVariantID{0, 1, 0, 0, 0, 2, 0, 0, 0, 3} + chunkstarttag := tagID(10) + for tag := tagID(10); tag < 12; tag++ { + c.Logf("=== tag %d", tag) + chunk, xref := cmd.tv2homhet(cgs, maxv, remap, tag, chunkstarttag) + c.Logf("chunk len=%d", len(chunk)) + for _, x := range chunk { + c.Logf("%+v", x) + } + c.Logf("xref len=%d", len(xref)) + for _, x := range xref { + c.Logf("%+v", x) + } + out := onehotcols2int8(chunk) + c.Logf("onehotcols2int8(chunk) len=%d", len(out)) + for i := 0; i < len(out); i += len(chunk) { + c.Logf("%+v", out[i:i+len(chunk)]) + } + coords := onehotChunk2Indirect(chunk) + c.Logf("onehotChunk2Indirect(chunk) len=%d", len(coords)) + for _, x := range coords { + c.Logf("%+v", x) + } + } +} diff --git a/slicenumpy.go b/slicenumpy.go index 08cb2c0114..0cc9896b59 100644 --- a/slicenumpy.go +++ b/slicenumpy.go @@ -28,6 +28,7 @@ import ( "git.arvados.org/arvados.git/sdk/go/arvados" "github.com/arvados/lightning/hgvs" "github.com/kshedden/gonpy" + "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus" "golang.org/x/crypto/blake2b" ) @@ -42,6 +43,7 @@ type sliceNumpy struct { minCoverage int cgnames []string includeVariant1 bool + debugTag tagID } func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { @@ -67,6 +69,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s hgvsChunked := flags.Bool("chunked-hgvs-matrix", false, "also generate hgvs-based matrix per chromosome") onehotSingle := flags.Bool("single-onehot", false, "generate one-hot tile-based matrix") onehotChunked := flags.Bool("chunked-onehot", false, "generate one-hot tile-based matrix per input chunk") + debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag") flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads") flags.StringVar(&cmd.chi2CaseControlFile, "chi2-case-control-file", "", "tsv file or directory indicating cases and controls for Χ² test (if directory, all .tsv files will be read)") flags.StringVar(&cmd.chi2CaseControlColumn, "chi2-case-control-column", "", "name of case/control column in case-control files for Χ² test (value must be 0 for control, 1 for case)") @@ -92,6 +95,8 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s return 2 } + cmd.debugTag = tagID(*debugTag) + if !*runlocal { runner := arvadosContainerRunner{ Name: "lightning slice-numpy", @@ -123,6 +128,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s "-chi2-case-control-column=" + cmd.chi2CaseControlColumn, "-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue), "-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1), + "-debug-tag=" + fmt.Sprintf("%d", cmd.debugTag), } runner.Args = append(runner.Args, cmd.filter.Args()...) var output string @@ -380,6 +386,9 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s // masked-out tiles. continue } + if tv.Tag == cmd.debugTag { + log.Printf("infile %d %s tag %d variant %d hash %x", infileIdx, infile, tv.Tag, tv.Variant, tv.Blake2b[:3]) + } variants := seq[tv.Tag] if len(variants) == 0 { variants = make([]TileVariant, 100) @@ -427,12 +436,15 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s count[blake2b.Sum256(rt.tiledata)] = 0 } - for _, cg := range cgs { + for cgname, cg := range cgs { idx := int(tag-tagstart) * 2 for allele := 0; allele < 2; allele++ { v := cg.Variants[idx+allele] if v > 0 && len(variants[v].Sequence) > 0 { count[variants[v].Blake2b]++ + if tag == cmd.debugTag { + log.Printf("tag %d cg %s allele %d tv %d hash %x count is now %d", tag, cgname, allele, v, variants[v].Blake2b[:3], count[variants[v].Blake2b]) + } } } } @@ -458,6 +470,11 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s for i, h := range hash { rank[h] = tileVariantID(i + 1) } + if tag == cmd.debugTag { + for h, r := range rank { + log.Printf("tag %d rank(%x) = %v", tag, h[:3], r) + } + } // remap[v] will be the new // variant number for original // variant number v. @@ -465,9 +482,16 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s for i, tv := range variants { remap[i] = rank[tv.Blake2b] } + if tag == cmd.debugTag { + log.Printf("tag %d remap %+v", tag, remap) + } variantRemap[tag-tagstart] = remap if rt != nil { - rt.variant = rank[blake2b.Sum256(rt.tiledata)] + refrank := rank[blake2b.Sum256(rt.tiledata)] + if tag == cmd.debugTag { + log.Printf("tag %d reftile variant %d => %d", tag, rt.variant, refrank) + } + rt.variant = refrank } }() } @@ -501,7 +525,18 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s } } if *onehotChunked || *onehotSingle { + if tag == cmd.debugTag { + log.WithFields(logrus.Fields{ + "cgs[2].Variants[tag*2:(tag+1)*2]": cgs[cmd.cgnames[2]].Variants[(tag-tagstart)*2 : (tag-tagstart+1)*2], + }).Info("before tv2homhet") + } onehot, xrefs := cmd.tv2homhet(cgs, maxv, remap, tag, tagstart) + if tag == cmd.debugTag { + log.WithFields(logrus.Fields{ + "onehot": onehot, + "xrefs": xrefs, + }).Info("tv2homhet()") + } onehotChunk = append(onehotChunk, onehot...) onehotXref = append(onehotXref, xrefs...) } @@ -611,7 +646,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s // transpose onehotChunk[col][row] to numpy[row*ncols+col] rows := len(cmd.cgnames) cols := len(onehotChunk) - log.Infof("%04d: preparing onehot numpy (rows=%d, cols=%d, mem=%d)", infileIdx, len(cmd.cgnames), len(onehotChunk), len(cmd.cgnames)*len(onehotChunk)) + log.Infof("%04d: preparing onehot numpy (rows=%d, cols=%d, mem=%d)", infileIdx, rows, cols, rows*cols) throttleNumpyMem.Acquire() out := onehotcols2int8(onehotChunk) fnm := fmt.Sprintf("%s/onehot.%04d.npy", *outputDir, infileIdx) @@ -1200,6 +1235,19 @@ const onehotXrefSize = unsafe.Sizeof(onehotXref{}) // // Return nil if no tile variant passes Χ² filter. func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantID, remap []tileVariantID, tag, chunkstarttag tagID) ([][]int8, []onehotXref) { + if tag == cmd.debugTag { + tv := make([]tileVariantID, len(cmd.cgnames)*2) + for i, name := range cmd.cgnames { + copy(tv[i*2:(i+1)*2], cgs[name].Variants[(tag-chunkstarttag)*2:]) + } + log.WithFields(logrus.Fields{ + "cgs[i].Variants[tag*2+j]": tv, + "maxv": maxv, + "remap": remap, + "tag": tag, + "chunkstarttag": chunkstarttag, + }).Info("tv2homhet()") + } if maxv < 1 || (maxv < 2 && !cmd.includeVariant1) { // everyone has the most common variant (of the variants we don't drop) return nil, nil