c.Check(npy.Shape, check.DeepEquals, []int{4, 4})
variants, err := npy.GetInt16()
if c.Check(err, check.IsNil) {
- c.Check(variants, check.DeepEquals, []int16{2, 1, 3, 1, -1, -1, 4, 2, 2, 1, 3, 1, -1, -1, 4, 2})
+ c.Check(variants, check.DeepEquals, []int16{
+ 2, 1, 3, 1,
+ -1, -1, 4, 2,
+ 2, 1, 3, 1,
+ -1, -1, 4, 2,
+ })
}
annotations, err := ioutil.ReadFile(npydir + "/matrix.annotations.csv")
"-min-coverage=0.75",
"-input-dir=" + slicedir,
"-output-dir=" + npydir,
+ "-debug-tag=1",
}, nil, os.Stderr, os.Stderr)
c.Check(exited, check.Equals, 0)
out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
}
}
}
+
+func (s *sliceSuite) Test_tv2homhet(c *check.C) {
+ cmd := &sliceNumpy{
+ cgnames: []string{"sample1", "sample2", "sample3", "sample4"},
+ chi2Cases: []bool{false, true, true, false},
+ chi2PValue: .5,
+ includeVariant1: true,
+ }
+ cgs := map[string]CompactGenome{
+ "sample1": CompactGenome{
+ Variants: []tileVariantID{0, 0, 1, 1}, // hom tv=1
+ },
+ "sample2": CompactGenome{
+ Variants: []tileVariantID{0, 0, 5, 5}, // hom tv=2
+ },
+ "sample3": CompactGenome{
+ Variants: []tileVariantID{0, 0, 5, 1}, // het tv=1, tv=2
+ },
+ "sample4": CompactGenome{
+ Variants: []tileVariantID{0, 0, 9, 9}, // hom tv=3
+ },
+ }
+ maxv := tileVariantID(3)
+ remap := []tileVariantID{0, 1, 0, 0, 0, 2, 0, 0, 0, 3}
+ chunkstarttag := tagID(10)
+ for tag := tagID(10); tag < 12; tag++ {
+ c.Logf("=== tag %d", tag)
+ chunk, xref := cmd.tv2homhet(cgs, maxv, remap, tag, chunkstarttag)
+ c.Logf("chunk len=%d", len(chunk))
+ for _, x := range chunk {
+ c.Logf("%+v", x)
+ }
+ c.Logf("xref len=%d", len(xref))
+ for _, x := range xref {
+ c.Logf("%+v", x)
+ }
+ out := onehotcols2int8(chunk)
+ c.Logf("onehotcols2int8(chunk) len=%d", len(out))
+ for i := 0; i < len(out); i += len(chunk) {
+ c.Logf("%+v", out[i:i+len(chunk)])
+ }
+ coords := onehotChunk2Indirect(chunk)
+ c.Logf("onehotChunk2Indirect(chunk) len=%d", len(coords))
+ for _, x := range coords {
+ c.Logf("%+v", x)
+ }
+ }
+}
"git.arvados.org/arvados.git/sdk/go/arvados"
"github.com/arvados/lightning/hgvs"
"github.com/kshedden/gonpy"
+ "github.com/sirupsen/logrus"
log "github.com/sirupsen/logrus"
"golang.org/x/crypto/blake2b"
)
minCoverage int
cgnames []string
includeVariant1 bool
+ debugTag tagID
}
func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
hgvsChunked := flags.Bool("chunked-hgvs-matrix", false, "also generate hgvs-based matrix per chromosome")
onehotSingle := flags.Bool("single-onehot", false, "generate one-hot tile-based matrix")
onehotChunked := flags.Bool("chunked-onehot", false, "generate one-hot tile-based matrix per input chunk")
+ debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads")
flags.StringVar(&cmd.chi2CaseControlFile, "chi2-case-control-file", "", "tsv file or directory indicating cases and controls for Χ² test (if directory, all .tsv files will be read)")
flags.StringVar(&cmd.chi2CaseControlColumn, "chi2-case-control-column", "", "name of case/control column in case-control files for Χ² test (value must be 0 for control, 1 for case)")
return 2
}
+ cmd.debugTag = tagID(*debugTag)
+
if !*runlocal {
runner := arvadosContainerRunner{
Name: "lightning slice-numpy",
"-chi2-case-control-column=" + cmd.chi2CaseControlColumn,
"-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue),
"-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1),
+ "-debug-tag=" + fmt.Sprintf("%d", cmd.debugTag),
}
runner.Args = append(runner.Args, cmd.filter.Args()...)
var output string
// masked-out tiles.
continue
}
+ if tv.Tag == cmd.debugTag {
+ log.Printf("infile %d %s tag %d variant %d hash %x", infileIdx, infile, tv.Tag, tv.Variant, tv.Blake2b[:3])
+ }
variants := seq[tv.Tag]
if len(variants) == 0 {
variants = make([]TileVariant, 100)
count[blake2b.Sum256(rt.tiledata)] = 0
}
- for _, cg := range cgs {
+ for cgname, cg := range cgs {
idx := int(tag-tagstart) * 2
for allele := 0; allele < 2; allele++ {
v := cg.Variants[idx+allele]
if v > 0 && len(variants[v].Sequence) > 0 {
count[variants[v].Blake2b]++
+ if tag == cmd.debugTag {
+ log.Printf("tag %d cg %s allele %d tv %d hash %x count is now %d", tag, cgname, allele, v, variants[v].Blake2b[:3], count[variants[v].Blake2b])
+ }
}
}
}
for i, h := range hash {
rank[h] = tileVariantID(i + 1)
}
+ if tag == cmd.debugTag {
+ for h, r := range rank {
+ log.Printf("tag %d rank(%x) = %v", tag, h[:3], r)
+ }
+ }
// remap[v] will be the new
// variant number for original
// variant number v.
for i, tv := range variants {
remap[i] = rank[tv.Blake2b]
}
+ if tag == cmd.debugTag {
+ log.Printf("tag %d remap %+v", tag, remap)
+ }
variantRemap[tag-tagstart] = remap
if rt != nil {
- rt.variant = rank[blake2b.Sum256(rt.tiledata)]
+ refrank := rank[blake2b.Sum256(rt.tiledata)]
+ if tag == cmd.debugTag {
+ log.Printf("tag %d reftile variant %d => %d", tag, rt.variant, refrank)
+ }
+ rt.variant = refrank
}
}()
}
}
}
if *onehotChunked || *onehotSingle {
+ if tag == cmd.debugTag {
+ log.WithFields(logrus.Fields{
+ "cgs[2].Variants[tag*2:(tag+1)*2]": cgs[cmd.cgnames[2]].Variants[(tag-tagstart)*2 : (tag-tagstart+1)*2],
+ }).Info("before tv2homhet")
+ }
onehot, xrefs := cmd.tv2homhet(cgs, maxv, remap, tag, tagstart)
+ if tag == cmd.debugTag {
+ log.WithFields(logrus.Fields{
+ "onehot": onehot,
+ "xrefs": xrefs,
+ }).Info("tv2homhet()")
+ }
onehotChunk = append(onehotChunk, onehot...)
onehotXref = append(onehotXref, xrefs...)
}
// transpose onehotChunk[col][row] to numpy[row*ncols+col]
rows := len(cmd.cgnames)
cols := len(onehotChunk)
- log.Infof("%04d: preparing onehot numpy (rows=%d, cols=%d, mem=%d)", infileIdx, len(cmd.cgnames), len(onehotChunk), len(cmd.cgnames)*len(onehotChunk))
+ log.Infof("%04d: preparing onehot numpy (rows=%d, cols=%d, mem=%d)", infileIdx, rows, cols, rows*cols)
throttleNumpyMem.Acquire()
out := onehotcols2int8(onehotChunk)
fnm := fmt.Sprintf("%s/onehot.%04d.npy", *outputDir, infileIdx)
//
// Return nil if no tile variant passes Χ² filter.
func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantID, remap []tileVariantID, tag, chunkstarttag tagID) ([][]int8, []onehotXref) {
+ if tag == cmd.debugTag {
+ tv := make([]tileVariantID, len(cmd.cgnames)*2)
+ for i, name := range cmd.cgnames {
+ copy(tv[i*2:(i+1)*2], cgs[name].Variants[(tag-chunkstarttag)*2:])
+ }
+ log.WithFields(logrus.Fields{
+ "cgs[i].Variants[tag*2+j]": tv,
+ "maxv": maxv,
+ "remap": remap,
+ "tag": tag,
+ "chunkstarttag": chunkstarttag,
+ }).Info("tv2homhet()")
+ }
if maxv < 1 || (maxv < 2 && !cmd.includeVariant1) {
// everyone has the most common variant (of the variants we don't drop)
return nil, nil