+ return nil
+}
+
+type formatHGVSNumpy struct {
+ sync.Mutex
+ writelock sync.Mutex
+ alleles map[string][][]int8 // alleles[seqname][variantidx][genomeidx*2+phase]
+ cases []bool
+ maxPValue float64
+}
+
+func (*formatHGVSNumpy) MaxGoroutines() int { return 4 }
+func (*formatHGVSNumpy) Filename() string { return "annotations.csv" }
+func (*formatHGVSNumpy) PadLeft() bool { return false }
+func (f *formatHGVSNumpy) Head(out io.Writer, cgs []CompactGenome, cases []bool, p float64) error {
+ f.cases = cases
+ f.maxPValue = p
+ return nil
+}
+func (f *formatHGVSNumpy) Print(outw io.Writer, seqname string, varslice []tvVariant) error {
+ // sort variants to ensure output is deterministic
+ sorted := make([]hgvs.Variant, 0, len(varslice))
+ for _, v := range varslice {
+ sorted = append(sorted, v.Variant)
+ }
+ sort.Slice(sorted, func(a, b int) bool { return hgvs.Less(sorted[a], sorted[b]) })
+
+ f.Lock()
+ seqalleles := f.alleles[seqname]
+ f.Unlock()
+
+ chi2x := make([]bool, 0, len(varslice))
+ chi2y := make([]bool, 0, len(varslice))
+
+ // append a row to seqalleles for each unique non-ref variant
+ // in varslice.
+ var previous hgvs.Variant
+ for _, v := range sorted {
+ if previous == v || v.Ref == v.New || v.New == "-" {
+ continue
+ }
+ previous = v
+ chi2x, chi2y := chi2x, chi2y
+ newrow := make([]int8, len(varslice))
+ for i, allele := range varslice {
+ if allele.Variant == v {
+ newrow[i] = 1
+ chi2x = append(chi2x, true)
+ chi2y = append(chi2y, f.cases[i/2])
+ } else if allele.Variant.New == "-" {
+ newrow[i] = -1
+ } else {
+ chi2x = append(chi2x, false)
+ chi2y = append(chi2y, f.cases[i/2])
+ }
+ }
+ if f.maxPValue < 1 && pvalue(chi2x, chi2y) > f.maxPValue {
+ continue
+ }
+ seqalleles = append(seqalleles, newrow)
+ _, err := fmt.Fprintf(outw, "%d,%q\n", len(seqalleles)-1, seqname+"."+v.String())
+ if err != nil {
+ return err
+ }
+ }
+
+ f.Lock()
+ f.alleles[seqname] = seqalleles
+ f.Unlock()
+ return nil
+}
+func (f *formatHGVSNumpy) Finish(outdir string, _ io.Writer, seqname string) error {
+ // Write seqname's data to a .npy matrix with one row per
+ // genome and 2 columns per variant.
+ f.Lock()
+ seqalleles := f.alleles[seqname]
+ delete(f.alleles, seqname)
+ f.Unlock()
+ if len(seqalleles) == 0 {
+ return nil
+ }
+ out := make([]int8, len(seqalleles)*len(seqalleles[0]))
+ rows := len(seqalleles[0]) / 2
+ cols := len(seqalleles) * 2
+ // copy seqalleles[varidx][genome*2+phase] to
+ // out[genome*nvars*2 + varidx*2 + phase]
+ for varidx, alleles := range seqalleles {
+ for g := 0; g < len(alleles)/2; g++ {
+ aa, ab := alleles[g*2], alleles[g*2+1]
+ if aa < 0 || ab < 0 {
+ // no-call
+ out[g*cols+varidx*2] = -1
+ out[g*cols+varidx*2+1] = -1
+ } else if aa > 0 && ab > 0 {
+ // hom
+ out[g*cols+varidx*2] = 1
+ } else if aa > 0 || ab > 0 {
+ // het
+ out[g*cols+varidx*2+1] = 1
+ }
+ }
+ }
+ outf, err := os.OpenFile(outdir+"/matrix."+seqname+".npy", os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0777)
+ if err != nil {
+ return err
+ }
+ defer outf.Close()
+ bufw := bufio.NewWriter(outf)
+ npw, err := gonpy.NewWriter(nopCloser{bufw})
+ if err != nil {
+ return err
+ }
+ log.WithFields(logrus.Fields{
+ "seqname": seqname,
+ "rows": rows,
+ "cols": cols,
+ }).Info("writing numpy")
+ npw.Shape = []int{rows, cols}
+ f.writelock.Lock() // serialize because WriteInt8 uses lots of memory
+ npw.WriteInt8(out)
+ f.writelock.Unlock()
+ err = bufw.Flush()
+ if err != nil {
+ return err
+ }
+ err = outf.Close()
+ if err != nil {
+ return err
+ }
+ return nil