X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/83983adc7d02cb2b460eede7341313d54379c8c3..76fbc75a359348e2a91546a70b8d2c738865cce2:/exportnumpy.go diff --git a/exportnumpy.go b/exportnumpy.go index d67ac6fee7..eba0234bfe 100644 --- a/exportnumpy.go +++ b/exportnumpy.go @@ -12,6 +12,7 @@ import ( _ "net/http/pprof" "os" "sort" + "strings" "git.arvados.org/arvados.git/sdk/go/arvados" "github.com/kshedden/gonpy" @@ -37,8 +38,9 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader, priority := flags.Int("priority", 500, "container request priority") inputFilename := flags.String("i", "-", "input `file`") outputFilename := flags.String("o", "-", "output `file`") - annotationsFilename := flags.String("output-annotations", "", "output `file` for tile variant annotations tsv") - librefsFilename := flags.String("output-onehot2tilevar", "", "when using -one-hot, create tsv `file` mapping column# to tag# and variant#") + annotationsFilename := flags.String("output-annotations", "", "output `file` for tile variant annotations csv") + librefsFilename := flags.String("output-onehot2tilevar", "", "when using -one-hot, create csv `file` mapping column# to tag# and variant#") + labelsFilename := flags.String("output-labels", "", "output `file` for genome labels csv") onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot") cmd.filter.Flags(flags) err = flags.Parse(args) @@ -76,8 +78,9 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader, fmt.Sprintf("-one-hot=%v", *onehot), "-i", *inputFilename, "-o", "/mnt/output/matrix.npy", - "-output-annotations", "/mnt/output/annotations.tsv", - "-output-onehot2tilevar", "/mnt/output/onehot2tilevar.tsv", + "-output-annotations", "/mnt/output/annotations.csv", + "-output-onehot2tilevar", "/mnt/output/onehot2tilevar.csv", + "-output-labels", "/mnt/output/labels.csv", "-max-variants", fmt.Sprintf("%d", cmd.filter.MaxVariants), "-min-coverage", fmt.Sprintf("%f", cmd.filter.MinCoverage), "-max-tag", fmt.Sprintf("%d", cmd.filter.MaxTag), @@ -106,7 +109,7 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader, retainTileSequences: true, compactGenomes: map[string][]tileVariantID{}, } - err = tilelib.LoadGob(context.Background(), input, nil) + err = tilelib.LoadGob(context.Background(), input, strings.HasSuffix(*inputFilename, ".gz"), nil) if err != nil { return 1 } @@ -139,7 +142,31 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader, } log.Info("building numpy array") - out, rows, cols := cgs2array(tilelib.compactGenomes) + out, rows, cols, names := cgs2array(tilelib) + + if *labelsFilename != "" { + log.Infof("writing labels to %s", *labelsFilename) + var f *os.File + f, err = os.OpenFile(*labelsFilename, os.O_CREATE|os.O_WRONLY, 0777) + if err != nil { + return 1 + } + defer f.Close() + for i, name := range names { + _, err = fmt.Fprintf(f, "%d,%q\n", i, trimFilenameForLabel(name)) + if err != nil { + err = fmt.Errorf("write %s: %w", *labelsFilename, err) + return 1 + } + } + err = f.Close() + if err != nil { + err = fmt.Errorf("close %s: %w", *labelsFilename, err) + return 1 + } + } + + log.Info("writing numpy file") var output io.WriteCloser if *outputFilename == "-" { output = nopCloser{stdout} @@ -169,7 +196,7 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader, } log.Info("writing numpy") npw.Shape = []int{rows, cols} - npw.WriteUint16(out) + npw.WriteInt16(out) err = bufw.Flush() if err != nil { return 1 @@ -188,7 +215,7 @@ func (*exportNumpy) writeLibRefs(fnm string, tilelib *tileLibrary, librefs []til } defer f.Close() for i, libref := range librefs { - _, err = fmt.Fprintf(f, "%d\t%d\t%d\n", i, libref.Tag, libref.Variant) + _, err = fmt.Fprintf(f, "%d,%d,%d\n", i, libref.Tag, libref.Variant) if err != nil { return err } @@ -196,31 +223,51 @@ func (*exportNumpy) writeLibRefs(fnm string, tilelib *tileLibrary, librefs []til return f.Close() } -func cgs2array(cgs map[string][]tileVariantID) (data []uint16, rows, cols int) { - var cgnames []string - for name := range cgs { +func cgs2array(tilelib *tileLibrary) (data []int16, rows, cols int, cgnames []string) { + for name := range tilelib.compactGenomes { cgnames = append(cgnames, name) } sort.Strings(cgnames) - rows = len(cgs) - for _, cg := range cgs { + rows = len(tilelib.compactGenomes) + for _, cg := range tilelib.compactGenomes { if cols < len(cg) { cols = len(cg) } } - data = make([]uint16, rows*cols) + + // flag low-quality tile variants so we can change to -1 below + lowqual := make([]map[tileVariantID]bool, cols/2) + for tag, variants := range tilelib.variant { + lq := lowqual[tag] + for varidx, hash := range variants { + if len(tilelib.seq[hash]) == 0 { + if lq == nil { + lq = map[tileVariantID]bool{} + lowqual[tag] = lq + } + lq[tileVariantID(varidx+1)] = true + } + } + } + + data = make([]int16, rows*cols) for row, name := range cgnames { - for i, v := range cgs[name] { - data[row*cols+i] = uint16(v) + for i, v := range tilelib.compactGenomes[name] { + if v > 0 && lowqual[i/2][v] { + data[row*cols+i] = -1 + } else { + data[row*cols+i] = int16(v) + } } } + return } -func recodeOnehot(in []uint16, incols int) (out []uint16, librefs []tileLibRef, outcols int) { +func recodeOnehot(in []int16, incols int) (out []int16, librefs []tileLibRef, outcols int) { rows := len(in) / incols - maxvalue := make([]uint16, incols) + maxvalue := make([]int16, incols) for row := 0; row < rows; row++ { for col := 0; col < incols; col++ { if v := in[row*incols+col]; maxvalue[col] < v { @@ -242,7 +289,7 @@ func recodeOnehot(in []uint16, incols int) (out []uint16, librefs []tileLibRef, } log.Printf("recodeOnehot: dropped %d input cols with zero maxvalue", dropped) - out = make([]uint16, rows*outcols) + out = make([]int16, rows*outcols) for inidx, row := 0, 0; row < rows; row++ { outrow := out[row*outcols:] for col := 0; col < incols; col++ { @@ -260,3 +307,17 @@ type nopCloser struct { } func (nopCloser) Close() error { return nil } + +func trimFilenameForLabel(s string) string { + if i := strings.LastIndex(s, "/"); i >= 0 { + s = s[i+1:] + } + s = strings.TrimSuffix(s, ".gz") + s = strings.TrimSuffix(s, ".fa") + s = strings.TrimSuffix(s, ".fasta") + s = strings.TrimSuffix(s, ".1") + s = strings.TrimSuffix(s, ".2") + s = strings.TrimSuffix(s, ".gz") + s = strings.TrimSuffix(s, ".vcf") + return s +}