chi2PValue float64
minCoverage int
cgnames []string
+ includeVariant1 bool
}
func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
flags.StringVar(&cmd.chi2CaseControlFile, "chi2-case-control-file", "", "tsv file or directory indicating cases and controls for Χ² test (if directory, all .tsv files will be read)")
flags.StringVar(&cmd.chi2CaseControlColumn, "chi2-case-control-column", "", "name of case/control column in case-control files for Χ² test (value must be 0 for control, 1 for case)")
flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test and omit columns with p-value above this threshold")
+ flags.BoolVar(&cmd.includeVariant1, "include-variant-1", false, "include most common variant when building one-hot matrix")
cmd.filter.Flags(flags)
err = flags.Parse(args)
if err == flag.ErrHelp {
}
cmd.cgnames = nil
- taglen := -1
+ var tagset [][]byte
DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error {
if len(ent.TagSet) > 0 {
- taglen = len(ent.TagSet[0])
+ tagset = ent.TagSet
}
for _, cseq := range ent.CompactSequences {
if cseq.Name == *ref || *ref == "" {
err = fmt.Errorf("%s: reference sequence not found", infiles[0])
return 1
}
- if taglen < 0 {
+ if len(tagset) == 0 {
err = fmt.Errorf("tagset not found")
return 1
}
+
+ taglib := &tagLibrary{}
+ err = taglib.setTags(tagset)
+ if err != nil {
+ return 1
+ }
+ taglen := taglib.TagLen()
+
if len(cmd.cgnames) == 0 {
err = fmt.Errorf("no genomes found matching regexp %q", cmd.filter.MatchGenome)
return 1
for seqname, cseq := range refseq {
pos := 0
for _, libref := range cseq {
+ if libref.Tag > tagID(cmd.filter.MaxTag) {
+ continue
+ }
tiledata := reftiledata[libref]
if len(tiledata) == 0 {
err = fmt.Errorf("missing tiledata for tag %d variant %d in %s in ref", libref.Tag, libref.Variant, seqname)
return 1
}
+ foundthistag := false
+ taglib.FindAll(tiledata[:len(tiledata)-1], func(tagid tagID, offset, _ int) {
+ if !foundthistag && tagid == libref.Tag {
+ foundthistag = true
+ return
+ }
+ if dupref, ok := reftile[tagid]; ok {
+ log.Printf("dropping reference tile %+v from %s @ %d, tag not unique, also found inside %+v from %s @ %d", tileLibRef{Tag: tagid, Variant: dupref.variant}, dupref.seqname, dupref.pos, libref, seqname, pos+offset+1)
+ delete(reftile, tagid)
+ } else {
+ log.Printf("found tag %d at offset %d inside tile variant %+v on %s @ %d", tagid, offset, libref, seqname, pos+offset+1)
+ }
+ isdup[tagid] = true
+ })
if isdup[libref.Tag] {
log.Printf("dropping reference tile %+v from %s @ %d, tag not unique", libref, seqname, pos)
} else if reftile[libref.Tag] != nil {
annow := bufio.NewWriterSize(annof, 1<<20)
outcol := 0
for tag := tagstart; tag < tagend; tag++ {
- rt, ok := reftile[tag]
- if !ok {
- if mask == nil {
- outcol++
- }
- // Excluded by specified
- // regions, or reference does
- // not use any variant of this
- // tile. (TODO: log this?
- // mention it in annotations?)
+ rt := reftile[tag]
+ if rt == nil && mask != nil {
+ // Excluded by specified regions
+ continue
+ }
+ if tag > tagID(cmd.filter.MaxTag) {
continue
}
remap := variantRemap[tag-tagstart]
onehotChunk = append(onehotChunk, onehot...)
onehotXref = append(onehotXref, xrefs...)
}
+ if rt == nil {
+ // Reference does not use any
+ // variant of this tile
+ outcol++
+ continue
+ }
fmt.Fprintf(annow, "%d,%d,%d,=,%s,%d,,,\n", tag, outcol, rt.variant, rt.seqname, rt.pos)
variants := seq[tag]
reftilestr := strings.ToUpper(string(rt.tiledata))
}
var onehot [][]int8
var xref []onehotXref
- for homcol := 4; homcol < len(obs); homcol += 2 {
- p := [2]float64{
- pvalue(obs[homcol], cmd.chi2Cases),
- pvalue(obs[homcol+1], cmd.chi2Cases),
- }
- if cmd.chi2PValue < 1 && !(p[0] < cmd.chi2PValue || p[1] < cmd.chi2PValue) {
+ for homcol := 2; homcol < len(obs); homcol += 2 {
+ // homcol 0,1 correspond to tile variant 0, i.e.,
+ // no-call; homcol 2,3 correspond to the most common
+ // variant; so we (normally) start at homcol 4.
+ if homcol < 4 && !cmd.includeVariant1 {
continue
}
for het := 0; het < 2; het++ {
+ p := pvalue(obs[homcol+het], cmd.chi2Cases)
+ if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) {
+ continue
+ }
onehot = append(onehot, bool2int8(obs[homcol+het]))
xref = append(xref, onehotXref{
tag: tag,
variant: tileVariantID(homcol / 2),
het: het == 1,
- pvalue: p[het],
+ pvalue: p,
})
}
}