Fix divide by zero.
[lightning.git] / import.go
index a43737232d06fb65816cb297f72e776b4d84de0f..f7e099b4e6718509514f28dfec911157a5c5ddbe 100644 (file)
--- a/import.go
+++ b/import.go
@@ -8,7 +8,6 @@ import (
        "errors"
        "flag"
        "fmt"
-       "image/color/palette"
        "io"
        "net/http"
        _ "net/http/pprof"
@@ -24,6 +23,7 @@ import (
        "time"
 
        "git.arvados.org/arvados.git/sdk/go/arvados"
+       "github.com/lucasb-eyer/go-colorful"
        log "github.com/sirupsen/logrus"
        "gonum.org/v1/plot"
        "gonum.org/v1/plot/plotter"
@@ -32,16 +32,17 @@ import (
 )
 
 type importer struct {
-       tagLibraryFile string
-       refFile        string
-       outputFile     string
-       projectUUID    string
-       runLocal       bool
-       skipOOO        bool
-       outputTiles    bool
-       includeNoCalls bool
-       outputStats    string
-       encoder        *gob.Encoder
+       tagLibraryFile      string
+       refFile             string
+       outputFile          string
+       projectUUID         string
+       runLocal            bool
+       skipOOO             bool
+       outputTiles         bool
+       saveIncompleteTiles bool
+       outputStats         string
+       matchChromosome     *regexp.Regexp
+       encoder             *gob.Encoder
 }
 
 func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
@@ -60,8 +61,9 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
        flags.BoolVar(&cmd.runLocal, "local", false, "run on local host (default: run in an arvados container)")
        flags.BoolVar(&cmd.skipOOO, "skip-ooo", false, "skip out-of-order tags")
        flags.BoolVar(&cmd.outputTiles, "output-tiles", false, "include tile variant sequences in output file")
-       flags.BoolVar(&cmd.includeNoCalls, "include-no-calls", false, "treat tiles with no-calls as regular tiles")
+       flags.BoolVar(&cmd.saveIncompleteTiles, "save-incomplete-tiles", false, "treat tiles with no-calls as regular tiles")
        flags.StringVar(&cmd.outputStats, "output-stats", "", "output stats to `file` (json)")
+       matchChromosome := flags.String("match-chromosome", "^(chr)?([0-9]+|X|Y|MT?)$", "import chromosomes that match the given `regexp`")
        priority := flags.Int("priority", 500, "container request priority")
        pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
        loglevel := flags.String("loglevel", "info", "logging threshold (trace, debug, info, warn, error, fatal, or panic)")
@@ -91,13 +93,18 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
        }
        log.SetLevel(lvl)
 
+       cmd.matchChromosome, err = regexp.Compile(*matchChromosome)
+       if err != nil {
+               return 1
+       }
+
        if !cmd.runLocal {
                runner := arvadosContainerRunner{
                        Name:        "lightning import",
                        Client:      arvados.NewClientFromEnv(),
                        ProjectUUID: cmd.projectUUID,
-                       RAM:         60000000000,
-                       VCPUs:       16,
+                       RAM:         80000000000,
+                       VCPUs:       32,
                        Priority:    *priority,
                }
                err = runner.TranslatePaths(&cmd.tagLibraryFile, &cmd.refFile, &cmd.outputFile)
@@ -112,7 +119,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
                        }
                }
                if cmd.outputFile == "-" {
-                       cmd.outputFile = "/mnt/output/library.gob"
+                       cmd.outputFile = "/mnt/output/library.gob.gz"
                } else {
                        // Not yet implemented, but this should write
                        // the collection to an existing collection,
@@ -125,7 +132,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
                        "-loglevel=" + *loglevel,
                        fmt.Sprintf("-skip-ooo=%v", cmd.skipOOO),
                        fmt.Sprintf("-output-tiles=%v", cmd.outputTiles),
-                       fmt.Sprintf("-include-no-calls=%v", cmd.includeNoCalls),
+                       fmt.Sprintf("-save-incomplete-tiles=%v", cmd.saveIncompleteTiles),
                        "-output-stats", "/mnt/output/stats.json",
                        "-tag-library", cmd.tagLibraryFile,
                        "-ref", cmd.refFile,
@@ -136,7 +143,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
                if err != nil {
                        return 1
                }
-               fmt.Fprintln(stdout, output+"/library.gob")
+               fmt.Fprintln(stdout, output+"/library.gob.gz")
                return 0
        }
 
@@ -150,20 +157,25 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
                return 1
        }
 
-       var output io.WriteCloser
+       var outw, outf io.WriteCloser
        if cmd.outputFile == "-" {
-               output = nopCloser{stdout}
+               outw = nopCloser{stdout}
        } else {
-               output, err = os.OpenFile(cmd.outputFile, os.O_CREATE|os.O_WRONLY, 0777)
+               outf, err = os.OpenFile(cmd.outputFile, os.O_CREATE|os.O_WRONLY, 0777)
                if err != nil {
                        return 1
                }
-               defer output.Close()
+               defer outf.Close()
+               if strings.HasSuffix(cmd.outputFile, ".gz") {
+                       outw = gzip.NewWriter(outf)
+               } else {
+                       outw = outf
+               }
        }
-       bufw := bufio.NewWriter(output)
+       bufw := bufio.NewWriter(outw)
        cmd.encoder = gob.NewEncoder(bufw)
 
-       tilelib := &tileLibrary{taglib: taglib, includeNoCalls: cmd.includeNoCalls, skipOOO: cmd.skipOOO}
+       tilelib := &tileLibrary{taglib: taglib, retainNoCalls: cmd.saveIncompleteTiles, skipOOO: cmd.skipOOO}
        if cmd.outputTiles {
                cmd.encoder.Encode(LibraryEntry{TagSet: taglib.Tags()})
                tilelib.encoder = cmd.encoder
@@ -182,10 +194,16 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
        if err != nil {
                return 1
        }
-       err = output.Close()
+       err = outw.Close()
        if err != nil {
                return 1
        }
+       if outf != nil && outf != outw {
+               err = outf.Close()
+               if err != nil {
+                       return 1
+               }
+       }
        return 0
 }
 
@@ -203,7 +221,7 @@ func (cmd *importer) tileFasta(tilelib *tileLibrary, infile string) (tileSeq, []
                }
                defer input.Close()
        }
-       return tilelib.TileFasta(infile, input)
+       return tilelib.TileFasta(infile, input, cmd.matchChromosome)
 }
 
 func (cmd *importer) loadTagLibrary() (*tagLibrary, error) {
@@ -469,7 +487,7 @@ func (cmd *importer) tileGVCF(tilelib *tileLibrary, infile string, phase int) (t
                return
        }
        defer consensus.Wait()
-       tileseq, stats, err = tilelib.TileFasta(fmt.Sprintf("%s phase %d", infile, phase+1), stdout)
+       tileseq, stats, err = tilelib.TileFasta(fmt.Sprintf("%s phase %d", infile, phase+1), stdout, cmd.matchChromosome)
        if err != nil {
                return
        }
@@ -538,13 +556,22 @@ func (cmd *importstatsplot) Plot(stdin io.Reader, stdout io.Writer) error {
                })
        }
 
+       labels := []string{}
+       for label := range data {
+               labels = append(labels, label)
+       }
+       sort.Strings(labels)
+       palette, err := colorful.SoftPalette(len(labels))
+       if err != nil {
+               return err
+       }
        nextInPalette := 0
-       for label, xys := range data {
-               s, err := plotter.NewScatter(xys)
+       for idx, label := range labels {
+               s, err := plotter.NewScatter(data[label])
                if err != nil {
                        return err
                }
-               s.GlyphStyle.Color = palette.Plan9[nextInPalette%len(palette.Plan9)]
+               s.GlyphStyle.Color = palette[idx]
                s.GlyphStyle.Radius = vg.Millimeter / 2
                s.GlyphStyle.Shape = draw.CrossGlyph{}
                nextInPalette += 7