Log dimensions.
[lightning.git] / exportnumpy.go
index d11b7431d5a6bc73dc4711035dfdda9bf992d6fb..fd198116b1512c01f6fb51db67bc82354edd40d5 100644 (file)
@@ -2,19 +2,22 @@ package main
 
 import (
        "bufio"
+       "errors"
        "flag"
        "fmt"
        "io"
-       "log"
+       "io/ioutil"
        "net/http"
        _ "net/http/pprof"
+       "os"
+       "sort"
 
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "github.com/kshedden/gonpy"
+       log "github.com/sirupsen/logrus"
 )
 
-type exportNumpy struct {
-       output io.Writer
-}
+// exportNumpy is the command type whose RunCommand method reads compact
+// genomes and writes them out as a uint16 numpy array. It carries no
+// state: input and output are chosen by RunCommand's flags and arguments.
+type exportNumpy struct{}
 
 func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
        var err error
@@ -26,6 +29,12 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
        flags := flag.NewFlagSet("", flag.ContinueOnError)
        flags.SetOutput(stderr)
        pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
+       runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
+       projectUUID := flags.String("project", "", "project `UUID` for output data")
+       priority := flags.Int("priority", 500, "container request priority")
+       inputFilename := flags.String("i", "-", "input `file`")
+       outputFilename := flags.String("o", "-", "output `file`")
+       onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot")
        err = flags.Parse(args)
        if err == flag.ErrHelp {
                err = nil
@@ -33,7 +42,6 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
        } else if err != nil {
                return 2
        }
-       cmd.output = stdout
 
        if *pprof != "" {
                go func() {
@@ -41,37 +49,129 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
                }()
        }
 
-       cgs, err := ReadCompactGenomes(stdin)
+       if !*runlocal {
+               if *outputFilename != "-" {
+                       err = errors.New("cannot specify output file in container mode: not implemented")
+                       return 1
+               }
+               runner := arvadosContainerRunner{
+                       Name:        "lightning export-numpy",
+                       Client:      arvados.NewClientFromEnv(),
+                       ProjectUUID: *projectUUID,
+                       RAM:         128000000000,
+                       VCPUs:       2,
+                       Priority:    *priority,
+               }
+               err = runner.TranslatePaths(inputFilename)
+               if err != nil {
+                       return 1
+               }
+               runner.Args = []string{"export-numpy", "-local=true", fmt.Sprintf("-one-hot=%v", *onehot), "-i", *inputFilename, "-o", "/mnt/output/library.npy"}
+               var output string
+               output, err = runner.Run()
+               if err != nil {
+                       return 1
+               }
+               fmt.Fprintln(stdout, output+"/library.npy")
+               return 0
+       }
+
+       var input io.ReadCloser
+       if *inputFilename == "-" {
+               input = ioutil.NopCloser(stdin)
+       } else {
+               input, err = os.Open(*inputFilename)
+               if err != nil {
+                       return 1
+               }
+               defer input.Close()
+       }
+       cgs, err := ReadCompactGenomes(input)
        if err != nil {
                return 1
        }
-       cols := 0
-       for _, cg := range cgs {
-               if cols < len(cg.Variants) {
-                       cols = len(cg.Variants)
-               }
+       err = input.Close()
+       if err != nil {
+               return 1
        }
-       rows := len(cgs)
-       out := make([]uint16, rows*cols)
-       for row, cg := range cgs {
-               for i, v := range cg.Variants {
-                       out[row*cols+i] = uint16(v)
+       sort.Slice(cgs, func(i, j int) bool { return cgs[i].Name < cgs[j].Name })
+
+       out, rows, cols := cgs2array(cgs)
+
+       var output io.WriteCloser
+       if *outputFilename == "-" {
+               output = nopCloser{stdout}
+       } else {
+               output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
+               if err != nil {
+                       return 1
                }
+               defer output.Close()
        }
-       w := bufio.NewWriter(cmd.output)
-       npw, err := gonpy.NewWriter(nopCloser{w})
+       bufw := bufio.NewWriter(output)
+       npw, err := gonpy.NewWriter(nopCloser{bufw})
        if err != nil {
                return 1
        }
+       if *onehot {
+               out, cols = recodeOnehot(out, cols)
+       }
        npw.Shape = []int{rows, cols}
        npw.WriteUint16(out)
-       err = w.Flush()
+       err = bufw.Flush()
+       if err != nil {
+               return 1
+       }
+       err = output.Close()
        if err != nil {
                return 1
        }
        return 0
 }
 
+// cgs2array flattens the variant lists of cgs into one row-major matrix
+// with a row per genome, in the order given.
+//
+// cols is the length of the longest Variants list; rows is len(cgs).
+// Genomes with fewer than cols variants are padded on the right with
+// zeroes. Each variant is converted to uint16.
+func cgs2array(cgs []CompactGenome) (data []uint16, rows, cols int) {
+       // The widest variant list determines the matrix width.
+       for i := range cgs {
+               if n := len(cgs[i].Variants); n > cols {
+                       cols = n
+               }
+       }
+       rows = len(cgs)
+       data = make([]uint16, rows*cols)
+       for r := range cgs {
+               // rowdata is the window of data belonging to genome r.
+               rowdata := data[r*cols : (r+1)*cols]
+               for c, variant := range cgs[r].Variants {
+                       rowdata[c] = uint16(variant)
+               }
+       }
+       return data, rows, cols
+}
+
+// recodeOnehot converts a row-major matrix of variant numbers into a
+// one-hot encoded row-major matrix.
+//
+// in holds rows of incols values each. Every input column expands to as
+// many output columns as the largest value found in that input column: a
+// value v > 0 sets the v'th of those output columns to 1, and a value of
+// 0 leaves all of them 0. An input column that is 0 in every row
+// therefore contributes no output columns.
+//
+// It returns the recoded matrix and its column count. If incols is not
+// positive there is nothing to recode, so an empty matrix with zero
+// columns is returned instead of dividing by zero. Trailing elements of
+// in that do not fill a complete row are ignored.
+func recodeOnehot(in []uint16, incols int) ([]uint16, int) {
+       if incols <= 0 {
+               // No columns to recode; also avoids dividing by zero below.
+               return nil, 0
+       }
+       rows := len(in) / incols
+
+       // maxvalue[col] is the largest value seen in input column col,
+       // which is also the number of output columns it expands to.
+       maxvalue := make([]uint16, incols)
+       for row := 0; row < rows; row++ {
+               for col := 0; col < incols; col++ {
+                       if v := in[row*incols+col]; maxvalue[col] < v {
+                               maxvalue[col] = v
+                       }
+               }
+       }
+
+       // outcol[col] is the index of the first output column that
+       // belongs to input column col.
+       outcol := make([]int, incols)
+       outcols := 0
+       for incol, v := range maxvalue {
+               outcol[incol] = outcols
+               outcols += int(v)
+       }
+
+       out := make([]uint16, rows*outcols)
+       for row := 0; row < rows; row++ {
+               for col := 0; col < incols; col++ {
+                       if v := in[row*incols+col]; v > 0 {
+                               // Value v selects the v'th (1-based) output
+                               // column within this input column's group.
+                               out[row*outcols+outcol[col]+int(v)-1] = 1
+                       }
+               }
+       }
+       return out, outcols
+}
+
+// nopCloser wraps an io.Writer so it can be used where a closable writer
+// is expected (RunCommand assigns nopCloser{stdout} to an io.WriteCloser).
+// NOTE(review): the Close method is defined outside this view; presumably
+// it does nothing and leaves the wrapped writer open -- confirm.
 type nopCloser struct {
        io.Writer
 }