Move command line tool to subdir.
[lightning.git] / exportnumpy.go
index d67ac6fee73d1842015b55afa36c0b92b64c77c5..ec6c83b886831983f2a45225bf97c7271d027fc2 100644 (file)
@@ -1,4 +1,4 @@
-package main
+package lightning
 
 import (
        "bufio"
@@ -11,10 +11,13 @@ import (
        "net/http"
        _ "net/http/pprof"
        "os"
+       "path"
        "sort"
+       "strings"
 
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "github.com/kshedden/gonpy"
+       "github.com/sirupsen/logrus"
        log "github.com/sirupsen/logrus"
 )
 
@@ -37,8 +40,9 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
        priority := flags.Int("priority", 500, "container request priority")
        inputFilename := flags.String("i", "-", "input `file`")
        outputFilename := flags.String("o", "-", "output `file`")
-       annotationsFilename := flags.String("output-annotations", "", "output `file` for tile variant annotations tsv")
-       librefsFilename := flags.String("output-onehot2tilevar", "", "when using -one-hot, create tsv `file` mapping column# to tag# and variant#")
+       annotationsFilename := flags.String("output-annotations", "", "output `file` for tile variant annotations csv")
+       librefsFilename := flags.String("output-onehot2tilevar", "", "when using -one-hot, create csv `file` mapping column# to tag# and variant#")
+       labelsFilename := flags.String("output-labels", "", "output `file` for genome labels csv")
        onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot")
        cmd.filter.Flags(flags)
        err = flags.Parse(args)
@@ -64,9 +68,11 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
                        Name:        "lightning export-numpy",
                        Client:      arvados.NewClientFromEnv(),
                        ProjectUUID: *projectUUID,
-                       RAM:         128000000000,
+                       RAM:         450000000000,
                        VCPUs:       32,
                        Priority:    *priority,
+                       KeepCache:   1,
+                       APIAccess:   true,
                }
                err = runner.TranslatePaths(inputFilename)
                if err != nil {
@@ -76,8 +82,9 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
                        fmt.Sprintf("-one-hot=%v", *onehot),
                        "-i", *inputFilename,
                        "-o", "/mnt/output/matrix.npy",
-                       "-output-annotations", "/mnt/output/annotations.tsv",
-                       "-output-onehot2tilevar", "/mnt/output/onehot2tilevar.tsv",
+                       "-output-annotations", "/mnt/output/annotations.csv",
+                       "-output-onehot2tilevar", "/mnt/output/onehot2tilevar.csv",
+                       "-output-labels", "/mnt/output/labels.csv",
                        "-max-variants", fmt.Sprintf("%d", cmd.filter.MaxVariants),
                        "-min-coverage", fmt.Sprintf("%f", cmd.filter.MinCoverage),
                        "-max-tag", fmt.Sprintf("%d", cmd.filter.MaxTag),
@@ -95,18 +102,19 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
        if *inputFilename == "-" {
                input = ioutil.NopCloser(stdin)
        } else {
-               input, err = os.Open(*inputFilename)
+               input, err = open(*inputFilename)
                if err != nil {
                        return 1
                }
                defer input.Close()
        }
+       input = ioutil.NopCloser(bufio.NewReaderSize(input, 8*1024*1024))
        tilelib := &tileLibrary{
                retainNoCalls:       true,
                retainTileSequences: true,
                compactGenomes:      map[string][]tileVariantID{},
        }
-       err = tilelib.LoadGob(context.Background(), input, nil)
+       err = tilelib.LoadGob(context.Background(), input, strings.HasSuffix(*inputFilename, ".gz"), nil)
        if err != nil {
                return 1
        }
@@ -139,7 +147,32 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
        }
 
        log.Info("building numpy array")
-       out, rows, cols := cgs2array(tilelib.compactGenomes)
+       out, rows, cols, names := cgs2array(tilelib)
+
+       if *labelsFilename != "" {
+               log.Infof("writing labels to %s", *labelsFilename)
+               var f *os.File
+               f, err = os.OpenFile(*labelsFilename, os.O_CREATE|os.O_WRONLY, 0777)
+               if err != nil {
+                       return 1
+               }
+               defer f.Close()
+               _, outBasename := path.Split(*outputFilename)
+               for i, name := range names {
+                       _, err = fmt.Fprintf(f, "%d,%q,%q\n", i, trimFilenameForLabel(name), outBasename)
+                       if err != nil {
+                               err = fmt.Errorf("write %s: %w", *labelsFilename, err)
+                               return 1
+                       }
+               }
+               err = f.Close()
+               if err != nil {
+                       err = fmt.Errorf("close %s: %w", *labelsFilename, err)
+                       return 1
+               }
+       }
+
+       log.Info("writing numpy file")
        var output io.WriteCloser
        if *outputFilename == "-" {
                output = nopCloser{stdout}
@@ -167,9 +200,12 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
                        }
                }
        }
-       log.Info("writing numpy")
+       log.WithFields(logrus.Fields{
+               "rows": rows,
+               "cols": cols,
+       }).Info("writing numpy")
        npw.Shape = []int{rows, cols}
-       npw.WriteUint16(out)
+       npw.WriteInt16(out)
        err = bufw.Flush()
        if err != nil {
                return 1
@@ -188,7 +224,7 @@ func (*exportNumpy) writeLibRefs(fnm string, tilelib *tileLibrary, librefs []til
        }
        defer f.Close()
        for i, libref := range librefs {
-               _, err = fmt.Fprintf(f, "%d\t%d\t%d\n", i, libref.Tag, libref.Variant)
+               _, err = fmt.Fprintf(f, "%d,%d,%d\n", i, libref.Tag, libref.Variant)
                if err != nil {
                        return err
                }
@@ -196,31 +232,53 @@ func (*exportNumpy) writeLibRefs(fnm string, tilelib *tileLibrary, librefs []til
        return f.Close()
 }
 
-func cgs2array(cgs map[string][]tileVariantID) (data []uint16, rows, cols int) {
-       var cgnames []string
-       for name := range cgs {
+func cgs2array(tilelib *tileLibrary) (data []int16, rows, cols int, cgnames []string) {
+       for name := range tilelib.compactGenomes {
                cgnames = append(cgnames, name)
        }
-       sort.Strings(cgnames)
+       sort.Slice(cgnames, func(i, j int) bool {
+               return trimFilenameForLabel(cgnames[i]) < trimFilenameForLabel(cgnames[j])
+       })
 
-       rows = len(cgs)
-       for _, cg := range cgs {
+       rows = len(tilelib.compactGenomes)
+       for _, cg := range tilelib.compactGenomes {
                if cols < len(cg) {
                        cols = len(cg)
                }
        }
-       data = make([]uint16, rows*cols)
+
+       // flag low-quality tile variants so we can change to -1 below
+       lowqual := make([]map[tileVariantID]bool, cols/2)
+       for tag, variants := range tilelib.variant {
+               lq := lowqual[tag]
+               for varidx, hash := range variants {
+                       if len(tilelib.seq[hash]) == 0 {
+                               if lq == nil {
+                                       lq = map[tileVariantID]bool{}
+                                       lowqual[tag] = lq
+                               }
+                               lq[tileVariantID(varidx+1)] = true
+                       }
+               }
+       }
+
+       data = make([]int16, rows*cols)
        for row, name := range cgnames {
-               for i, v := range cgs[name] {
-                       data[row*cols+i] = uint16(v)
+               for i, v := range tilelib.compactGenomes[name] {
+                       if v > 0 && lowqual[i/2][v] {
+                               data[row*cols+i] = -1
+                       } else {
+                               data[row*cols+i] = int16(v)
+                       }
                }
        }
+
        return
 }
 
-func recodeOnehot(in []uint16, incols int) (out []uint16, librefs []tileLibRef, outcols int) {
+func recodeOnehot(in []int16, incols int) (out []int16, librefs []tileLibRef, outcols int) {
        rows := len(in) / incols
-       maxvalue := make([]uint16, incols)
+       maxvalue := make([]int16, incols)
        for row := 0; row < rows; row++ {
                for col := 0; col < incols; col++ {
                        if v := in[row*incols+col]; maxvalue[col] < v {
@@ -242,7 +300,7 @@ func recodeOnehot(in []uint16, incols int) (out []uint16, librefs []tileLibRef,
        }
        log.Printf("recodeOnehot: dropped %d input cols with zero maxvalue", dropped)
 
-       out = make([]uint16, rows*outcols)
+       out = make([]int16, rows*outcols)
        for inidx, row := 0, 0; row < rows; row++ {
                outrow := out[row*outcols:]
                for col := 0; col < incols; col++ {
@@ -260,3 +318,17 @@ type nopCloser struct {
 }
 
 func (nopCloser) Close() error { return nil }
+
+func trimFilenameForLabel(s string) string {
+       if i := strings.LastIndex(s, "/"); i >= 0 {
+               s = s[i+1:]
+       }
+       s = strings.TrimSuffix(s, ".gz")
+       s = strings.TrimSuffix(s, ".fa")
+       s = strings.TrimSuffix(s, ".fasta")
+       s = strings.TrimSuffix(s, ".1")
+       s = strings.TrimSuffix(s, ".2")
+       s = strings.TrimSuffix(s, ".gz")
+       s = strings.TrimSuffix(s, ".vcf")
+       return s
+}