17 "git.arvados.org/arvados.git/sdk/go/arvados"
18 "github.com/kshedden/gonpy"
19 log "github.com/sirupsen/logrus"
22 type exportNumpy struct {
26 func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
30 fmt.Fprintf(stderr, "%s\n", err)
33 flags := flag.NewFlagSet("", flag.ContinueOnError)
34 flags.SetOutput(stderr)
35 pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
36 runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
37 projectUUID := flags.String("project", "", "project `UUID` for output data")
38 priority := flags.Int("priority", 500, "container request priority")
39 inputFilename := flags.String("i", "-", "input `file`")
40 outputFilename := flags.String("o", "-", "output `file`")
41 annotationsFilename := flags.String("output-annotations", "", "output `file` for tile variant annotations csv")
42 librefsFilename := flags.String("output-onehot2tilevar", "", "when using -one-hot, create csv `file` mapping column# to tag# and variant#")
43 labelsFilename := flags.String("output-labels", "", "output `file` for genome labels csv")
44 onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot")
45 cmd.filter.Flags(flags)
46 err = flags.Parse(args)
47 if err == flag.ErrHelp {
50 } else if err != nil {
56 log.Println(http.ListenAndServe(*pprof, nil))
61 if *outputFilename != "-" {
62 err = errors.New("cannot specify output file in container mode: not implemented")
65 runner := arvadosContainerRunner{
66 Name: "lightning export-numpy",
67 Client: arvados.NewClientFromEnv(),
68 ProjectUUID: *projectUUID,
73 err = runner.TranslatePaths(inputFilename)
77 runner.Args = []string{"export-numpy", "-local=true",
78 fmt.Sprintf("-one-hot=%v", *onehot),
80 "-o", "/mnt/output/matrix.npy",
81 "-output-annotations", "/mnt/output/annotations.csv",
82 "-output-onehot2tilevar", "/mnt/output/onehot2tilevar.csv",
83 "-output-labels", "/mnt/output/labels.csv",
84 "-max-variants", fmt.Sprintf("%d", cmd.filter.MaxVariants),
85 "-min-coverage", fmt.Sprintf("%f", cmd.filter.MinCoverage),
86 "-max-tag", fmt.Sprintf("%d", cmd.filter.MaxTag),
89 output, err = runner.Run()
93 fmt.Fprintln(stdout, output+"/matrix.npy")
97 var input io.ReadCloser
98 if *inputFilename == "-" {
99 input = ioutil.NopCloser(stdin)
101 input, err = os.Open(*inputFilename)
107 tilelib := &tileLibrary{
109 retainTileSequences: true,
110 compactGenomes: map[string][]tileVariantID{},
112 err = tilelib.LoadGob(context.Background(), input, strings.HasSuffix(*inputFilename, ".gz"), nil)
121 log.Info("filtering")
122 cmd.filter.Apply(tilelib)
126 if *annotationsFilename != "" {
127 log.Infof("writing annotations")
128 var annow io.WriteCloser
129 annow, err = os.OpenFile(*annotationsFilename, os.O_CREATE|os.O_WRONLY, 0666)
134 err = (&annotatecmd{maxTileSize: 5000}).exportTileDiffs(annow, tilelib)
144 log.Info("building numpy array")
145 out, rows, cols, names := cgs2array(tilelib)
147 if *labelsFilename != "" {
148 log.Infof("writing labels to %s", *labelsFilename)
150 f, err = os.OpenFile(*labelsFilename, os.O_CREATE|os.O_WRONLY, 0777)
155 for i, name := range names {
156 _, err = fmt.Fprintf(f, "%d,%q\n", i, trimFilenameForLabel(name))
158 err = fmt.Errorf("write %s: %w", *labelsFilename, err)
164 err = fmt.Errorf("close %s: %w", *labelsFilename, err)
169 log.Info("writing numpy file")
170 var output io.WriteCloser
171 if *outputFilename == "-" {
172 output = nopCloser{stdout}
174 output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
180 bufw := bufio.NewWriter(output)
181 npw, err := gonpy.NewWriter(nopCloser{bufw})
186 log.Info("recoding to onehot")
187 recoded, librefs, recodedcols := recodeOnehot(out, cols)
188 out, cols = recoded, recodedcols
189 if *librefsFilename != "" {
190 log.Infof("writing onehot column mapping")
191 err = cmd.writeLibRefs(*librefsFilename, tilelib, librefs)
197 log.Info("writing numpy")
198 npw.Shape = []int{rows, cols}
211 func (*exportNumpy) writeLibRefs(fnm string, tilelib *tileLibrary, librefs []tileLibRef) error {
212 f, err := os.OpenFile(fnm, os.O_CREATE|os.O_WRONLY, 0666)
217 for i, libref := range librefs {
218 _, err = fmt.Fprintf(f, "%d,%d,%d\n", i, libref.Tag, libref.Variant)
226 func cgs2array(tilelib *tileLibrary) (data []int16, rows, cols int, cgnames []string) {
227 for name := range tilelib.compactGenomes {
228 cgnames = append(cgnames, name)
230 sort.Strings(cgnames)
232 rows = len(tilelib.compactGenomes)
233 for _, cg := range tilelib.compactGenomes {
239 // flag low-quality tile variants so we can change to -1 below
240 lowqual := make([]map[tileVariantID]bool, cols/2)
241 for tag, variants := range tilelib.variant {
243 for varidx, hash := range variants {
244 if len(tilelib.seq[hash]) == 0 {
246 lq = map[tileVariantID]bool{}
249 lq[tileVariantID(varidx+1)] = true
254 data = make([]int16, rows*cols)
255 for row, name := range cgnames {
256 for i, v := range tilelib.compactGenomes[name] {
257 if v > 0 && lowqual[i/2][v] {
258 data[row*cols+i] = -1
260 data[row*cols+i] = int16(v)
268 func recodeOnehot(in []int16, incols int) (out []int16, librefs []tileLibRef, outcols int) {
269 rows := len(in) / incols
270 maxvalue := make([]int16, incols)
271 for row := 0; row < rows; row++ {
272 for col := 0; col < incols; col++ {
273 if v := in[row*incols+col]; maxvalue[col] < v {
278 outcol := make([]int, incols)
280 for incol, maxv := range maxvalue {
281 outcol[incol] = outcols
285 for v := 1; v <= int(maxv); v++ {
286 librefs = append(librefs, tileLibRef{Tag: tagID(incol), Variant: tileVariantID(v)})
290 log.Printf("recodeOnehot: dropped %d input cols with zero maxvalue", dropped)
292 out = make([]int16, rows*outcols)
293 for inidx, row := 0, 0; row < rows; row++ {
294 outrow := out[row*outcols:]
295 for col := 0; col < incols; col++ {
296 if v := in[inidx]; v > 0 {
297 outrow[outcol[col]+int(v)-1] = 1
305 type nopCloser struct {
309 func (nopCloser) Close() error { return nil }
311 func trimFilenameForLabel(s string) string {
312 if i := strings.LastIndex(s, "/"); i >= 0 {
315 s = strings.TrimSuffix(s, ".gz")
316 s = strings.TrimSuffix(s, ".fa")
317 s = strings.TrimSuffix(s, ".fasta")
318 s = strings.TrimSuffix(s, ".1")
319 s = strings.TrimSuffix(s, ".2")
320 s = strings.TrimSuffix(s, ".gz")
321 s = strings.TrimSuffix(s, ".vcf")