type outputFormat interface {
Filename() string
PadLeft() bool
- Head(out io.Writer, cgs []CompactGenome) error
+ Head(out io.Writer, cgs []CompactGenome, cases []bool, p float64) error
Print(out io.Writer, seqname string, varslice []tvVariant) error
Finish(outdir string, out io.Writer, seqname string) error
MaxGoroutines() int
var outputFormats = map[string]func() outputFormat{
"hgvs-numpy": func() outputFormat {
- return &formatHGVSNumpy{alleles: map[string][][]bool{}}
+ return &formatHGVSNumpy{alleles: map[string][][]int8{}}
},
"hgvs-onehot": func() outputFormat { return formatHGVSOneHot{} },
"hgvs": func() outputFormat { return formatHGVS{} },
outputPerChrom bool
compress bool
maxTileSize int
+ filter filter
+ maxPValue float64
+ cases []bool
}
func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
priority := flags.Int("priority", 500, "container request priority")
refname := flags.String("ref", "", "reference genome `name`")
inputDir := flags.String("input-dir", ".", "input `directory`")
+ cases := flags.String("cases", "", "file indicating which genomes are positive cases (for computing p-values)")
+ flags.Float64Var(&cmd.maxPValue, "p-value", 1, "do chi square test and omit columns with p-value above this threshold")
outputDir := flags.String("output-dir", ".", "output `directory`")
outputFormatStr := flags.String("output-format", "hgvs", "output `format`: hgvs, pvcf, or vcf")
outputBed := flags.String("output-bed", "", "also output bed `file`")
flags.BoolVar(&cmd.compress, "z", false, "write gzip-compressed output files")
labelsFilename := flags.String("output-labels", "", "also output genome labels csv `file`")
flags.IntVar(&cmd.maxTileSize, "max-tile-size", 50000, "don't try to make annotations for tiles bigger than given `size`")
+ cmd.filter.Flags(flags)
err = flags.Parse(args)
if err == flag.ErrHelp {
err = nil
return 0
} else if err != nil {
return 2
- }
- if flag.NArg() > 0 {
- err = fmt.Errorf("extra unparsed command line arguments: %q", flag.Args())
+ } else if flags.NArg() > 0 {
+ err = fmt.Errorf("errant command line arguments after parsed flags: %v", flags.Args())
return 2
}
Priority: *priority,
APIAccess: true,
}
- err = runner.TranslatePaths(inputDir)
+ err = runner.TranslatePaths(inputDir, cases)
if err != nil {
return 1
}
"-pprof", ":6000",
"-pprof-dir", "/mnt/output",
"-ref", *refname,
+ "-cases", *cases,
+ "-p-value", fmt.Sprintf("%f", cmd.maxPValue),
"-output-format", *outputFormatStr,
"-output-bed", *outputBed,
"-output-labels", "/mnt/output/labels.csv",
"-output-dir", "/mnt/output",
"-z=" + fmt.Sprintf("%v", cmd.compress),
}
+ runner.Args = append(runner.Args, cmd.filter.Args()...)
var output string
output, err = runner.Run()
if err != nil {
retainTileSequences: true,
compactGenomes: map[string][]tileVariantID{},
}
- err = tilelib.LoadDir(context.Background(), *inputDir, nil)
+ err = tilelib.LoadDir(context.Background(), *inputDir)
if err != nil {
return 1
}
return 1
}
+ log.Infof("filtering: %+v", cmd.filter)
+ cmd.filter.Apply(tilelib)
+
names := cgnames(tilelib)
for _, name := range names {
cgs = append(cgs, CompactGenome{Name: name, Variants: tilelib.compactGenomes[name]})
}
}
+ cmd.cases = make([]bool, len(names))
+ if *cases != "" {
+ log.Infof("reading cases file: %s", *cases)
+ var f io.ReadCloser
+ f, err = open(*cases)
+ if err != nil {
+ return 1
+ }
+ defer f.Close()
+ var buf []byte
+ buf, err = io.ReadAll(f)
+ if err != nil {
+ return 1
+ }
+ for _, pattern := range bytes.Split(buf, []byte("\n")) {
+ if len(pattern) == 0 {
+ continue
+ }
+ pattern := string(pattern)
+ idx := -1
+ for i, name := range names {
+ if !strings.Contains(name, pattern) {
+ continue
+ } else if idx >= 0 {
+ err = fmt.Errorf("pattern %q in cases file matches multiple genome IDs: %q, %q", pattern, names[idx], name)
+ return 1
+ } else {
+ idx = i
+ }
+ }
+ if idx < 0 {
+ log.Warnf("pattern %q in cases file does not match any genome IDs", pattern)
+ continue
+ }
+ cmd.cases[idx] = true
+ }
+ }
+
var bedout io.Writer
var bedfile *os.File
var bedbufw *bufio.Writer
defer z.Close()
outw[i] = z
}
- err = cmd.outputFormat.Head(outw[i], cgs)
+ err = cmd.outputFormat.Head(outw[i], cgs, cmd.cases, cmd.maxPValue)
if err != nil {
return err
}
defer z.Close()
out = z
}
- cmd.outputFormat.Head(out, cgs)
+ cmd.outputFormat.Head(out, cgs, cmd.cases, cmd.maxPValue)
merge(out, outw, "output")
}
if bedout != nil {
tagcoverage := 0 // number of times the start tag was found in genomes -- max is len(cgs)*2
for cgidx, cg := range cgs {
for phase := 0; phase < 2; phase++ {
- if len(cg.Variants) <= int(libref.Tag)*2+phase {
- continue
+ var variant tileVariantID
+ if i := int(libref.Tag)*2 + phase; len(cg.Variants) > i {
+ variant = cg.Variants[i]
}
- variant := cg.Variants[int(libref.Tag)*2+phase]
- if variant == 0 {
- continue
+ if variant > 0 {
+ tagcoverage++
}
- tagcoverage++
- if variant == libref.Variant {
+ if variant == libref.Variant || variant == 0 {
continue
}
glibref := tileLibRef{Tag: libref.Tag, Variant: variant}
for i, pos := range flushpos {
varslice := variantAt[pos]
delete(variantAt, pos)
+ // Check for uninitialized (zero-value)
+ // elements in varslice
for i := range varslice {
- if varslice[i].Position == 0 {
- varslice[i].Position = pos
+ if varslice[i].Position != 0 {
+ // Not a zero-value element
+ continue
+ }
+ // Set the position so
+ // varslice[*].Position are all equal
+ varslice[i].Position = pos
+ // This could be either =ref or a
+ // missing/low-quality tile. Figure
+ // out which.
+ vidx := int(libref.Tag)*2 + i%2
+ if vidx >= len(cgs[i/2].Variants) {
+ // Missing tile.
+ varslice[i].New = "-"
+ continue
+ }
+ v := cgs[i/2].Variants[vidx]
+ if v < 1 || len(tilelib.TileVariantSequence(tileLibRef{Tag: libref.Tag, Variant: v})) == 0 {
+ // Missing/low-quality tile.
+ varslice[i].New = "-" // fasta "gap of indeterminate length"
}
}
flushvariants[i] = varslice
byref := map[string]map[string]int{}
for _, v := range varslice {
if v.Ref == "" && v.New == "" {
+ // =ref
+ continue
+ }
+ if v.New == "-" {
+ // no-call
continue
}
alts := byref[v.Ref]
func (formatVCF) Filename() string { return "out.vcf" }
func (formatVCF) PadLeft() bool { return true }
func (formatVCF) Finish(string, io.Writer, string) error { return nil }
-func (formatVCF) Head(out io.Writer, cgs []CompactGenome) error {
+func (formatVCF) Head(out io.Writer, cgs []CompactGenome, cases []bool, p float64) error {
_, err := fmt.Fprint(out, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
return err
}
func (formatPVCF) Filename() string { return "out.vcf" }
func (formatPVCF) PadLeft() bool { return true }
func (formatPVCF) Finish(string, io.Writer, string) error { return nil }
-func (formatPVCF) Head(out io.Writer, cgs []CompactGenome) error {
+func (formatPVCF) Head(out io.Writer, cgs []CompactGenome, cases []bool, p float64) error {
fmt.Fprintln(out, `##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">`)
fmt.Fprintf(out, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT")
for _, cg := range cgs {
type formatHGVS struct{}
-func (formatHGVS) MaxGoroutines() int { return 0 }
-func (formatHGVS) Filename() string { return "out.tsv" }
-func (formatHGVS) PadLeft() bool { return false }
-func (formatHGVS) Head(out io.Writer, cgs []CompactGenome) error { return nil }
-func (formatHGVS) Finish(string, io.Writer, string) error { return nil }
+func (formatHGVS) MaxGoroutines() int { return 0 }
+func (formatHGVS) Filename() string { return "out.tsv" }
+func (formatHGVS) PadLeft() bool { return false }
+func (formatHGVS) Head(out io.Writer, cgs []CompactGenome, cases []bool, p float64) error { return nil }
+func (formatHGVS) Finish(string, io.Writer, string) error { return nil }
func (formatHGVS) Print(out io.Writer, seqname string, varslice []tvVariant) error {
for i := 0; i < len(varslice)/2; i++ {
if i > 0 {
out.Write([]byte{'\t'})
}
var1, var2 := varslice[i*2], varslice[i*2+1]
+ if var1.New == "-" || var2.New == "-" {
+ _, err := out.Write([]byte{'N'})
+ if err != nil {
+ return err
+ }
+ continue
+ }
if var1.Variant == var2.Variant {
if var1.Ref == var1.New {
_, err := out.Write([]byte{'.'})
type formatHGVSOneHot struct{}
-func (formatHGVSOneHot) MaxGoroutines() int { return 0 }
-func (formatHGVSOneHot) Filename() string { return "out.tsv" }
-func (formatHGVSOneHot) PadLeft() bool { return false }
-func (formatHGVSOneHot) Head(out io.Writer, cgs []CompactGenome) error { return nil }
-func (formatHGVSOneHot) Finish(string, io.Writer, string) error { return nil }
+func (formatHGVSOneHot) MaxGoroutines() int { return 0 }
+func (formatHGVSOneHot) Filename() string { return "out.tsv" }
+func (formatHGVSOneHot) PadLeft() bool { return false }
+func (formatHGVSOneHot) Head(out io.Writer, cgs []CompactGenome, cases []bool, p float64) error {
+ return nil
+}
+func (formatHGVSOneHot) Finish(string, io.Writer, string) error { return nil }
func (formatHGVSOneHot) Print(out io.Writer, seqname string, varslice []tvVariant) error {
vars := map[hgvs.Variant]bool{}
for _, v := range varslice {
sort.Slice(sorted, func(a, b int) bool { return hgvs.Less(sorted[a], sorted[b]) })
for _, v := range sorted {
+ if v.New == "-" {
+ continue
+ }
fmt.Fprintf(out, "%s.%s", seqname, v.String())
for i := 0; i < len(varslice); i += 2 {
if varslice[i].Variant == v || varslice[i+1].Variant == v {
type formatHGVSNumpy struct {
sync.Mutex
writelock sync.Mutex
- alleles map[string][][]bool // alleles[seqname][variantidx][genomeidx*2+phase]
+ alleles map[string][][]int8 // alleles[seqname][variantidx][genomeidx*2+phase]
+ cases []bool
+ maxPValue float64
}
-func (*formatHGVSNumpy) MaxGoroutines() int { return 4 }
-func (*formatHGVSNumpy) Filename() string { return "annotations.csv" }
-func (*formatHGVSNumpy) PadLeft() bool { return false }
-func (*formatHGVSNumpy) Head(out io.Writer, cgs []CompactGenome) error { return nil }
+func (*formatHGVSNumpy) MaxGoroutines() int { return 4 }
+func (*formatHGVSNumpy) Filename() string { return "annotations.csv" }
+func (*formatHGVSNumpy) PadLeft() bool { return false }
+func (f *formatHGVSNumpy) Head(out io.Writer, cgs []CompactGenome, cases []bool, p float64) error {
+ f.cases = cases
+ f.maxPValue = p
+ return nil
+}
func (f *formatHGVSNumpy) Print(outw io.Writer, seqname string, varslice []tvVariant) error {
// sort variants to ensure output is deterministic
sorted := make([]hgvs.Variant, 0, len(varslice))
seqalleles := f.alleles[seqname]
f.Unlock()
- // append a row to seqvariants and seqalleles for each unique
- // non-ref variant in varslice.
+ chi2x := make([]bool, 0, len(varslice))
+ chi2y := make([]bool, 0, len(varslice))
+
+ // append a row to seqalleles for each unique non-ref variant
+ // in varslice.
var previous hgvs.Variant
for _, v := range sorted {
- if previous == v || v.Ref == v.New {
+ if previous == v || v.Ref == v.New || v.New == "-" {
continue
}
previous = v
- newrow := make([]bool, len(varslice))
+ chi2x, chi2y := chi2x, chi2y
+ newrow := make([]int8, len(varslice))
for i, allele := range varslice {
if allele.Variant == v {
- newrow[i] = true
+ newrow[i] = 1
+ chi2x = append(chi2x, true)
+ chi2y = append(chi2y, f.cases[i/2])
+ } else if allele.Variant.New == "-" {
+ newrow[i] = -1
+ } else {
+ chi2x = append(chi2x, false)
+ chi2y = append(chi2y, f.cases[i/2])
}
}
+ if f.maxPValue < 1 && pvalue(chi2x, chi2y) > f.maxPValue {
+ continue
+ }
seqalleles = append(seqalleles, newrow)
_, err := fmt.Fprintf(outw, "%d,%q\n", len(seqalleles)-1, seqname+"."+v.String())
if err != nil {
func (f *formatHGVSNumpy) Finish(outdir string, _ io.Writer, seqname string) error {
// Write seqname's data to a .npy matrix with one row per
// genome and 2 columns per variant.
- seqalleles := f.alleles[seqname]
f.Lock()
+ seqalleles := f.alleles[seqname]
delete(f.alleles, seqname)
f.Unlock()
if len(seqalleles) == 0 {
for varidx, alleles := range seqalleles {
for g := 0; g < len(alleles)/2; g++ {
aa, ab := alleles[g*2], alleles[g*2+1]
- if aa && ab {
+ if aa < 0 || ab < 0 {
+ // no-call
+ out[g*cols+varidx*2] = -1
+ out[g*cols+varidx*2+1] = -1
+ } else if aa > 0 && ab > 0 {
// hom
out[g*cols+varidx*2] = 1
- } else if aa || ab {
+ } else if aa > 0 || ab > 0 {
// het
out[g*cols+varidx*2+1] = 1
}