Update memory-size log message.
[lightning.git] / exportnumpy.go
index 41f7356660a0385f5e26d7fa144f80961bf63c03..27c86a32107a98faf1952eaffe5fae477faa7213 100644 (file)
@@ -1,3 +1,7 @@
+// Copyright (C) The Lightning Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
 package lightning
 
 import (
@@ -42,8 +46,8 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
        runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
        projectUUID := flags.String("project", "", "project `UUID` for output data")
        priority := flags.Int("priority", 500, "container request priority")
-       inputFilename := flags.String("i", "-", "input `file`")
-       outputDir := flags.String("output-dir", "/tmp", "output `directory`")
+       inputDir := flags.String("input-dir", "./in", "input `directory`")
+       outputDir := flags.String("output-dir", "./out", "output `directory`")
        annotationsFilename := flags.String("output-annotations", "", "output `file` for tile variant annotations csv")
        librefsFilename := flags.String("output-onehot2tilevar", "", "when using -one-hot, create csv `file` mapping column# to tag# and variant#")
        labelsFilename := flags.String("output-labels", "", "output `file` for genome labels csv")
@@ -77,25 +81,23 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
                        KeepCache:   1,
                        APIAccess:   true,
                }
-               err = runner.TranslatePaths(inputFilename, regionsFilename)
+               err = runner.TranslatePaths(inputDir, regionsFilename)
                if err != nil {
                        return 1
                }
                runner.Args = []string{"export-numpy", "-local=true",
                        "-pprof", ":6060",
                        fmt.Sprintf("-one-hot=%v", *onehot),
-                       "-i", *inputFilename,
+                       "-input-dir", *inputDir,
                        "-output-dir", "/mnt/output",
                        "-output-annotations", "/mnt/output/annotations.csv",
                        "-output-onehot2tilevar", "/mnt/output/onehot2tilevar.csv",
                        "-output-labels", "/mnt/output/labels.csv",
                        "-regions", *regionsFilename,
                        "-expand-regions", fmt.Sprintf("%d", *expandRegions),
-                       "-max-variants", fmt.Sprintf("%d", cmd.filter.MaxVariants),
-                       "-min-coverage", fmt.Sprintf("%f", cmd.filter.MinCoverage),
-                       "-max-tag", fmt.Sprintf("%d", cmd.filter.MaxTag),
                        "-chunks", fmt.Sprintf("%d", *chunks),
                }
+               runner.Args = append(runner.Args, cmd.filter.Args()...)
                var output string
                output, err = runner.Run()
                if err != nil {
@@ -105,27 +107,12 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
                return 0
        }
 
-       var input io.ReadCloser
-       if *inputFilename == "-" {
-               input = ioutil.NopCloser(stdin)
-       } else {
-               input, err = open(*inputFilename)
-               if err != nil {
-                       return 1
-               }
-               defer input.Close()
-       }
-       input = ioutil.NopCloser(bufio.NewReaderSize(input, 8*1024*1024))
        tilelib := &tileLibrary{
                retainNoCalls:       true,
                retainTileSequences: true,
                compactGenomes:      map[string][]tileVariantID{},
        }
-       err = tilelib.LoadGob(context.Background(), input, strings.HasSuffix(*inputFilename, ".gz"), nil)
-       if err != nil {
-               return 1
-       }
-       err = input.Close()
+       err = tilelib.LoadDir(context.Background(), *inputDir)
        if err != nil {
                return 1
        }
@@ -264,19 +251,23 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
                                return
                        }
                        defer f.Close()
-                       npw, err := gonpy.NewWriter(f)
+                       // gonpy closes our writer and ignores errors. Give it a nopCloser so we can close f properly.
+                       npw, err := gonpy.NewWriter(nopCloser{f})
                        if err != nil {
                                lastErr.Store(err)
                                return
                        }
                        npw.Shape = []int{len(names), len(pdis) * 2}
-                       npw.WriteInt8(data)
-                       // gonpy closes f and ignores errors, doh.
-                       // err = f.Close()
-                       // if err != nil {
-                       //      lastErr.Store(err)
-                       //      return
-                       // }
+                       err = npw.WriteInt8(data)
+                       if err != nil {
+                               lastErr.Store(err)
+                               return
+                       }
+                       err = f.Close()
+                       if err != nil {
+                               lastErr.Store(err)
+                               return
+                       }
                }()
        }
        wg.Wait()
@@ -372,7 +363,7 @@ func lowqual(tilelib *tileLibrary) (lowqual []map[tileVariantID]bool) {
        for tag, variants := range tilelib.variant {
                lq := lowqual[tag]
                for varidx, hash := range variants {
-                       if len(tilelib.seq[hash]) == 0 {
+                       if len(tilelib.hashSequence(hash)) == 0 {
                                if lq == nil {
                                        lq = map[tileVariantID]bool{}
                                        lowqual[tag] = lq
@@ -413,22 +404,20 @@ func cgs2array(tilelib *tileLibrary, names []string, lowqual []map[tileVariantID
        return
 }
 
-func chooseTiles(tilelib *tileLibrary, regionsFilename string, expandRegions int) (drop []bool, err error) {
-       if regionsFilename == "" {
-               return
-       }
+func makeMask(regionsFilename string, expandRegions int) (*mask, error) {
+       log.Printf("makeMask: reading %s", regionsFilename)
        rfile, err := zopen(regionsFilename)
        if err != nil {
-               return
+               return nil, err
        }
        defer rfile.Close()
-       regions, err := ioutil.ReadAll(rfile)
+       regions, err := io.ReadAll(rfile)
        if err != nil {
-               return
+               return nil, err
        }
 
-       log.Print("chooseTiles: building mask")
-       mask := &mask{}
+       log.Print("makeMask: building mask")
+       var mask mask
        for _, line := range bytes.Split(regions, []byte{'\n'}) {
                if bytes.HasPrefix(line, []byte{'#'}) {
                        continue
@@ -452,14 +441,24 @@ func chooseTiles(tilelib *tileLibrary, regionsFilename string, expandRegions int
                                // GFF/GTF
                                end++
                        } else {
-                               err = fmt.Errorf("cannot parse input line as BED or GFF/GTF: %q", line)
-                               return
+                               return nil, fmt.Errorf("cannot parse input line as BED or GFF/GTF: %q", line)
                        }
                }
                mask.Add(refseqname, start-expandRegions, end+expandRegions)
        }
-       log.Print("chooseTiles: mask.Freeze")
+       log.Print("makeMask: mask.Freeze")
        mask.Freeze()
+       return &mask, nil
+}
+
+func chooseTiles(tilelib *tileLibrary, regionsFilename string, expandRegions int) (drop []bool, err error) {
+       if regionsFilename == "" {
+               return
+       }
+       mask, err := makeMask(regionsFilename, expandRegions)
+       if err != nil {
+               return
+       }
 
        tagset := tilelib.taglib.Tags()
        if len(tagset) == 0 {