16 "git.arvados.org/arvados.git/sdk/go/arvados"
17 "github.com/kshedden/gonpy"
18 log "github.com/sirupsen/logrus"
// exportNumpy carries the state for the "export-numpy" command. Its fields
// are not visible in this excerpt; the methods in this file access
// cmd.filter (flag registration, filtering, and the MaxVariants, MinCoverage
// and MaxTag settings).
21 type exportNumpy struct {
// RunCommand implements the "export-numpy" command. It parses args and then
// works in one of two modes: in container mode it re-invokes export-numpy
// with -local=true through an arvadosContainerRunner and prints the location
// of the resulting matrix.npy on stdout; in local mode it loads a tile
// library from the input, applies cmd.filter, and writes the compact genomes
// as a numpy array (optionally recoded as one-hot), plus optional TSV side
// files.
//
// prog is not used in the visible lines. Errors are printed to stderr. The
// int result is presumably a process exit status; the return statements are
// not visible in this excerpt -- TODO confirm.
25 func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
// Report the error on stderr.
29 fmt.Fprintf(stderr, "%s\n", err)
// ContinueOnError makes Parse return an error instead of exiting; usage and
// parse errors are written to stderr.
32 flags := flag.NewFlagSet("", flag.ContinueOnError)
33 flags.SetOutput(stderr)
34 pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
35 runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
36 projectUUID := flags.String("project", "", "project `UUID` for output data")
37 priority := flags.Int("priority", 500, "container request priority")
// "-" selects stdin / stdout for the input and output files.
38 inputFilename := flags.String("i", "-", "input `file`")
39 outputFilename := flags.String("o", "-", "output `file`")
40 annotationsFilename := flags.String("output-annotations", "", "output `file` for tile variant annotations tsv")
41 librefsFilename := flags.String("output-onehot2tilevar", "", "when using -one-hot, create tsv `file` mapping column# to tag# and variant#")
42 onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot")
// The filter registers its own flags on the same flag set.
43 cmd.filter.Flags(flags)
44 err = flags.Parse(args)
// A help request (flag.ErrHelp) is handled separately from real parse errors.
45 if err == flag.ErrHelp {
48 } else if err != nil {
// Serve on the -pprof address using the default mux (nil handler).
// ListenAndServe only returns on failure, and that error is logged.
54 log.Println(http.ListenAndServe(*pprof, nil))
// Container mode: a caller-chosen output file is not supported, because the
// in-container run writes to fixed paths under /mnt/output.
59 if *outputFilename != "-" {
60 err = errors.New("cannot specify output file in container mode: not implemented")
63 runner := arvadosContainerRunner{
64 Name: "lightning export-numpy",
65 Client: arvados.NewClientFromEnv(),
66 ProjectUUID: *projectUUID,
// TranslatePaths receives a pointer to the input filename, presumably so it
// can rewrite it to a path usable inside the container -- confirm against
// arvadosContainerRunner.
71 err = runner.TranslatePaths(inputFilename)
// Arguments for the in-container invocation: the same command in local mode,
// with fixed output paths and the one-hot and filter settings forwarded.
75 runner.Args = []string{"export-numpy", "-local=true",
76 fmt.Sprintf("-one-hot=%v", *onehot),
78 "-o", "/mnt/output/matrix.npy",
79 "-output-annotations", "/mnt/output/annotations.tsv",
80 "-output-onehot2tilevar", "/mnt/output/onehot2tilevar.tsv",
81 "-max-variants", fmt.Sprintf("%d", cmd.filter.MaxVariants),
// NOTE(review): %f prints 6 decimal places, so (assuming MinCoverage is a
// float) a value with more precision is rounded when forwarded -- confirm
// this is acceptable.
82 "-min-coverage", fmt.Sprintf("%f", cmd.filter.MinCoverage),
83 "-max-tag", fmt.Sprintf("%d", cmd.filter.MaxTag),
// Run the container and print where the matrix ended up, relative to the
// value returned by runner.Run.
86 output, err = runner.Run()
90 fmt.Fprintln(stdout, output+"/matrix.npy")
// Local mode: read from stdin (wrapped so it satisfies io.ReadCloser) or
// from the named file.
94 var input io.ReadCloser
95 if *inputFilename == "-" {
96 input = ioutil.NopCloser(stdin)
98 input, err = os.Open(*inputFilename)
// Load the tile library from the gob-encoded input.
104 tilelib := &tileLibrary{
105 includeNoCalls: true,
106 retainTileSequences: true,
107 compactGenomes: map[string][]tileVariantID{},
109 err = tilelib.LoadGob(context.Background(), input, nil)
118 log.Info("filtering")
119 cmd.filter.Apply(tilelib)
// Optional annotations TSV.
123 if *annotationsFilename != "" {
124 log.Infof("writing annotations")
125 var annow io.WriteCloser
// NOTE(review): no O_TRUNC in the visible flags, so writing over an existing,
// longer file would leave its old tail in place -- confirm whether that is
// intended.
126 annow, err = os.OpenFile(*annotationsFilename, os.O_CREATE|os.O_WRONLY, 0666)
131 err = (&annotatecmd{maxTileSize: 5000}).exportTileDiffs(annow, tilelib)
// Flatten the compact genomes into a rows x cols uint16 array.
141 log.Info("building numpy array")
142 out, rows, cols := cgs2array(tilelib.compactGenomes)
143 var output io.WriteCloser
144 if *outputFilename == "-" {
// nopCloser gives stdout a Close method that does nothing, so closing output
// does not close the caller's stdout.
145 output = nopCloser{stdout}
// NOTE(review): mode 0777 marks the data file executable; the other output
// files in this file are created with 0666. O_TRUNC is also absent here, so
// an existing longer file keeps its old tail -- confirm both.
147 output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
// bufio.Writer has no Close method; nopCloser supplies a no-op one for
// gonpy.NewWriter. The buffered data must be flushed after writing; that call
// is not visible in this excerpt.
153 bufw := bufio.NewWriter(output)
154 npw, err := gonpy.NewWriter(nopCloser{bufw})
// One-hot mode: replace the array and column count with the recoded version,
// and optionally write the column -> (tag, variant) mapping.
159 log.Info("recoding to onehot")
160 recoded, librefs, recodedcols := recodeOnehot(out, cols)
161 out, cols = recoded, recodedcols
162 if *librefsFilename != "" {
163 log.Infof("writing onehot column mapping")
164 err = cmd.writeLibRefs(*librefsFilename, tilelib, librefs)
// Declare the 2-D shape before the data is written.
170 log.Info("writing numpy")
171 npw.Shape = []int{rows, cols}
// writeLibRefs writes librefs to the file named fnm as tab-separated text,
// one line per entry: the entry's index in librefs, its Tag, and its Variant.
// tilelib is not used in the visible lines.
184 func (*exportNumpy) writeLibRefs(fnm string, tilelib *tileLibrary, librefs []tileLibRef) error {
// NOTE(review): no O_TRUNC in the visible flags, so writing over an existing,
// longer file would leave its old tail in place -- confirm whether that is
// intended.
185 f, err := os.OpenFile(fnm, os.O_CREATE|os.O_WRONLY, 0666)
190 for i, libref := range librefs {
// i is the position in librefs, which RunCommand obtains from recodeOnehot.
191 _, err = fmt.Fprintf(f, "%d\t%d\t%d\n", i, libref.Tag, libref.Variant)
// cgs2array flattens the compact genomes in cgs into a single []uint16 laid
// out row by row with a stride of cols. Rows are ordered by genome name
// (sorted with sort.Strings), so the result does not depend on map iteration
// order. Entries past the end of a shorter genome keep the zero value from
// make.
//
// The assignments to rows and cols are not visible in this excerpt;
// presumably rows is the number of genomes and cols the longest genome
// length -- TODO confirm.
199 func cgs2array(cgs map[string][]tileVariantID) (data []uint16, rows, cols int) {
// Collect and sort the genome names to fix the row order.
201 for name := range cgs {
202 cgnames = append(cgnames, name)
204 sort.Strings(cgnames)
// Pass over every genome (loop body not visible here).
207 for _, cg := range cgs {
212 data = make([]uint16, rows*cols)
213 for row, name := range cgnames {
214 for i, v := range cgs[name] {
// Each variant ID is converted to uint16 and stored at (row, i).
215 data[row*cols+i] = uint16(v)
// recodeOnehot recodes a flat matrix with incols columns per row into a
// one-hot matrix. For each input column it finds the largest value seen
// (maxv) and emits one libref per value 1..maxv, with Tag set to the input
// column index and Variant set to the value. A non-zero input value v in
// column col sets the output cell at offset outcol[col]+v-1 in the same row
// to 1; a zero input value sets nothing.
//
// It returns the recoded matrix, the librefs, and the output column count.
// The number of dropped all-zero input columns is logged.
//
// NOTE(review): len(in) / incols panics with a divide by zero when incols is
// 0 -- confirm callers never pass an empty matrix.
221 func recodeOnehot(in []uint16, incols int) (out []uint16, librefs []tileLibRef, outcols int) {
222 rows := len(in) / incols
// First pass: largest value per input column.
223 maxvalue := make([]uint16, incols)
224 for row := 0; row < rows; row++ {
225 for col := 0; col < incols; col++ {
226 if v := in[row*incols+col]; maxvalue[col] < v {
// outcol[c] is the output column where input column c's block starts.
231 outcol := make([]int, incols)
233 for incol, maxv := range maxvalue {
234 outcol[incol] = outcols
// One libref per value of this column. The statement that advances outcols
// is not visible in this excerpt.
238 for v := 1; v <= int(maxv); v++ {
239 librefs = append(librefs, tileLibRef{Tag: tagID(incol), Variant: tileVariantID(v)})
243 log.Printf("recodeOnehot: dropped %d input cols with zero maxvalue", dropped)
// Second pass: fill the output, walking the input with the running index
// inidx (its increment is not visible in this excerpt).
245 out = make([]uint16, rows*outcols)
246 for inidx, row := 0, 0; row < rows; row++ {
247 outrow := out[row*outcols:]
248 for col := 0; col < incols; col++ {
249 if v := in[inidx]; v > 0 {
250 outrow[outcol[col]+int(v)-1] = 1
// nopCloser wraps a value and adds a Close method that does nothing. In this
// file it wraps stdout and a bufio.Writer where an io.WriteCloser is needed.
// The struct's fields are not visible in this excerpt.
258 type nopCloser struct {
// Close does nothing and always returns nil.
262 func (nopCloser) Close() error { return nil }