X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/b9a352b3c94fbefd189bf72694ef52fe561f0515..56bfceeaedc235866704decc588f56c67edd4605:/slicenumpy.go diff --git a/slicenumpy.go b/slicenumpy.go index 91413824fb..974f200639 100644 --- a/slicenumpy.go +++ b/slicenumpy.go @@ -41,6 +41,7 @@ type sliceNumpy struct { chi2PValue float64 minCoverage int cgnames []string + includeVariant1 bool } func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { @@ -70,6 +71,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s flags.StringVar(&cmd.chi2CaseControlFile, "chi2-case-control-file", "", "tsv file or directory indicating cases and controls for Χ² test (if directory, all .tsv files will be read)") flags.StringVar(&cmd.chi2CaseControlColumn, "chi2-case-control-column", "", "name of case/control column in case-control files for Χ² test (value must be 0 for control, 1 for case)") flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test and omit columns with p-value above this threshold") + flags.BoolVar(&cmd.includeVariant1, "include-variant-1", false, "include most common variant when building one-hot matrix") cmd.filter.Flags(flags) err = flags.Parse(args) if err == flag.ErrHelp { @@ -155,10 +157,10 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s } cmd.cgnames = nil - taglen := -1 + var tagset [][]byte DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error { if len(ent.TagSet) > 0 { - taglen = len(ent.TagSet[0]) + tagset = ent.TagSet } for _, cseq := range ent.CompactSequences { if cseq.Name == *ref || *ref == "" { @@ -185,10 +187,18 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s err = fmt.Errorf("%s: reference sequence not found", infiles[0]) return 1 } - if taglen < 0 { + if len(tagset) == 0 { err = fmt.Errorf("tagset not found") return 1 } + + taglib := &tagLibrary{} + err = taglib.setTags(tagset) + if err != nil { + return 1 + } + taglen := taglib.TagLen() + if len(cmd.cgnames) == 0 { err = fmt.Errorf("no genomes found matching regexp %q", cmd.filter.MatchGenome) return 1 @@ -239,11 +249,28 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s for seqname, cseq := range refseq { pos := 0 for _, libref := range cseq { + if libref.Tag > tagID(cmd.filter.MaxTag) { + continue + } tiledata := reftiledata[libref] if len(tiledata) == 0 { err = fmt.Errorf("missing tiledata for tag %d variant %d in %s in ref", libref.Tag, libref.Variant, seqname) return 1 } + foundthistag := false + taglib.FindAll(tiledata[:len(tiledata)-1], func(tagid tagID, offset, _ int) { + if !foundthistag && tagid == libref.Tag { + foundthistag = true + return + } + if dupref, ok := reftile[tagid]; ok { + log.Printf("dropping reference tile %+v from %s @ %d, tag not unique, also found inside %+v from %s @ %d", tileLibRef{Tag: tagid, Variant: dupref.variant}, dupref.seqname, dupref.pos, libref, seqname, pos+offset+1) + delete(reftile, tagid) + } else { + log.Printf("found tag %d at offset %d inside tile variant %+v on %s @ %d", tagid, offset, libref, seqname, pos+offset+1) + } + isdup[tagid] = true + }) if isdup[libref.Tag] { log.Printf("dropping reference tile %+v from %s @ %d, tag not unique", libref, seqname, pos) } else if reftile[libref.Tag] != nil { @@ -455,16 +482,12 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s annow := bufio.NewWriterSize(annof, 1<<20) outcol := 0 for tag := tagstart; tag < tagend; tag++ { - rt, ok := reftile[tag] - if !ok { - if mask == nil { - outcol++ - } - // Excluded by specified - // regions, or reference does - // not use any variant of this - // tile. (TODO: log this? - // mention it in annotations?) + rt := reftile[tag] + if rt == nil && mask != nil { + // Excluded by specified regions + continue + } + if tag > tagID(cmd.filter.MaxTag) { continue } remap := variantRemap[tag-tagstart] @@ -479,6 +502,12 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s onehotChunk = append(onehotChunk, onehot...) onehotXref = append(onehotXref, xrefs...) } + if rt == nil { + // Reference does not use any + // variant of this tile + outcol++ + continue + } fmt.Fprintf(annow, "%d,%d,%d,=,%s,%d,,,\n", tag, outcol, rt.variant, rt.seqname, rt.pos) variants := seq[tag] reftilestr := strings.ToUpper(string(rt.tiledata)) @@ -1194,21 +1223,24 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI } var onehot [][]int8 var xref []onehotXref - for homcol := 4; homcol < len(obs); homcol += 2 { - p := [2]float64{ - pvalue(obs[homcol], cmd.chi2Cases), - pvalue(obs[homcol+1], cmd.chi2Cases), - } - if cmd.chi2PValue < 1 && !(p[0] < cmd.chi2PValue || p[1] < cmd.chi2PValue) { + for homcol := 2; homcol < len(obs); homcol += 2 { + // homcol 0,1 correspond to tile variant 0, i.e., + // no-call; homcol 2,3 correspond to the most common + // variant; so we (normally) start at homcol 4. + if homcol < 4 && !cmd.includeVariant1 { continue } for het := 0; het < 2; het++ { + p := pvalue(obs[homcol+het], cmd.chi2Cases) + if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) { + continue + } onehot = append(onehot, bool2int8(obs[homcol+het])) xref = append(xref, onehotXref{ tag: tag, variant: tileVariantID(homcol / 2), het: het == 1, - pvalue: p[het], + pvalue: p, }) } }