Gzip gob files.
[lightning.git] / import.go
index aa83a70798c8a0c062a652a81afbbee24a761ecc..629fccac2fc049646a58c23fbbbd11a14292ef52 100644 (file)
--- a/import.go
+++ b/import.go
@@ -23,20 +23,25 @@ import (
        "time"
 
        "git.arvados.org/arvados.git/sdk/go/arvados"
+       "github.com/lucasb-eyer/go-colorful"
        log "github.com/sirupsen/logrus"
+       "gonum.org/v1/plot"
+       "gonum.org/v1/plot/plotter"
+       "gonum.org/v1/plot/vg"
+       "gonum.org/v1/plot/vg/draw"
 )
 
 type importer struct {
-       tagLibraryFile string
-       refFile        string
-       outputFile     string
-       projectUUID    string
-       runLocal       bool
-       skipOOO        bool
-       outputTiles    bool
-       includeNoCalls bool
-       outputStats    string
-       encoder        *gob.Encoder
+       tagLibraryFile      string
+       refFile             string
+       outputFile          string
+       projectUUID         string
+       runLocal            bool
+       skipOOO             bool
+       outputTiles         bool
+       saveIncompleteTiles bool
+       outputStats         string
+       encoder             *gob.Encoder
 }
 
 func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
@@ -55,7 +60,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
        flags.BoolVar(&cmd.runLocal, "local", false, "run on local host (default: run in an arvados container)")
        flags.BoolVar(&cmd.skipOOO, "skip-ooo", false, "skip out-of-order tags")
        flags.BoolVar(&cmd.outputTiles, "output-tiles", false, "include tile variant sequences in output file")
-       flags.BoolVar(&cmd.includeNoCalls, "include-no-calls", false, "treat tiles with no-calls as regular tiles")
+       flags.BoolVar(&cmd.saveIncompleteTiles, "save-incomplete-tiles", false, "treat tiles with no-calls as regular tiles")
        flags.StringVar(&cmd.outputStats, "output-stats", "", "output stats to `file` (json)")
        priority := flags.Int("priority", 500, "container request priority")
        pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
@@ -91,8 +96,8 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
                        Name:        "lightning import",
                        Client:      arvados.NewClientFromEnv(),
                        ProjectUUID: cmd.projectUUID,
-                       RAM:         60000000000,
-                       VCPUs:       16,
+                       RAM:         80000000000,
+                       VCPUs:       32,
                        Priority:    *priority,
                }
                err = runner.TranslatePaths(&cmd.tagLibraryFile, &cmd.refFile, &cmd.outputFile)
@@ -107,7 +112,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
                        }
                }
                if cmd.outputFile == "-" {
-                       cmd.outputFile = "/mnt/output/library.gob"
+                       cmd.outputFile = "/mnt/output/library.gob.gz"
                } else {
                        // Not yet implemented, but this should write
                        // the collection to an existing collection,
@@ -120,7 +125,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
                        "-loglevel=" + *loglevel,
                        fmt.Sprintf("-skip-ooo=%v", cmd.skipOOO),
                        fmt.Sprintf("-output-tiles=%v", cmd.outputTiles),
-                       fmt.Sprintf("-include-no-calls=%v", cmd.includeNoCalls),
+                       fmt.Sprintf("-save-incomplete-tiles=%v", cmd.saveIncompleteTiles),
                        "-output-stats", "/mnt/output/stats.json",
                        "-tag-library", cmd.tagLibraryFile,
                        "-ref", cmd.refFile,
@@ -131,7 +136,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
                if err != nil {
                        return 1
                }
-               fmt.Fprintln(stdout, output+"/library.gob")
+               fmt.Fprintln(stdout, output+"/library.gob.gz")
                return 0
        }
 
@@ -145,20 +150,25 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
                return 1
        }
 
-       var output io.WriteCloser
+       var outw, outf io.WriteCloser
        if cmd.outputFile == "-" {
-               output = nopCloser{stdout}
+               outw = nopCloser{stdout}
        } else {
-               output, err = os.OpenFile(cmd.outputFile, os.O_CREATE|os.O_WRONLY, 0777)
+               outf, err = os.OpenFile(cmd.outputFile, os.O_CREATE|os.O_WRONLY, 0777)
                if err != nil {
                        return 1
                }
-               defer output.Close()
+               defer outf.Close()
+               if strings.HasSuffix(cmd.outputFile, ".gz") {
+                       outw = gzip.NewWriter(outf)
+               } else {
+                       outw = outf
+               }
        }
-       bufw := bufio.NewWriter(output)
+       bufw := bufio.NewWriter(outw)
        cmd.encoder = gob.NewEncoder(bufw)
 
-       tilelib := &tileLibrary{taglib: taglib, includeNoCalls: cmd.includeNoCalls, skipOOO: cmd.skipOOO}
+       tilelib := &tileLibrary{taglib: taglib, retainNoCalls: cmd.saveIncompleteTiles, skipOOO: cmd.skipOOO}
        if cmd.outputTiles {
                cmd.encoder.Encode(LibraryEntry{TagSet: taglib.Tags()})
                tilelib.encoder = cmd.encoder
@@ -177,10 +187,16 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
        if err != nil {
                return 1
        }
-       err = output.Close()
+       err = outw.Close()
        if err != nil {
                return 1
        }
+       if outf != nil && outf != outw {
+               err = outf.Close()
+               if err != nil {
+                       return 1
+               }
+       }
        return 0
 }
 
@@ -497,3 +513,73 @@ func flatten(variants [][]tileVariantID) []tileVariantID {
        }
        return flat
 }
+
+// importstatsplot is a command that reads import stats as JSON from
+// stdin and writes a scatter plot of them to stdout. It has no
+// configuration of its own.
+type importstatsplot struct{}
+
+func (cmd *importstatsplot) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+       err := cmd.Plot(stdin, stdout)
+       if err != nil {
+               log.Errorf("%s", err)
+               return 1
+       }
+       return 0
+}
+
+// Plot reads a JSON array of import stats from stdin and writes an
+// 8in x 6in SVG scatter plot to stdout.
+//
+// Each stats entry becomes one point: X is InputCoverage/InputLength
+// and Y is TileCoverage/InputCoverage. Points are grouped by
+// InputLabel, and each label gets its own palette color. The X axis
+// is limited to [0.65, 1].
+//
+// Entries whose InputLength or InputCoverage is zero have no
+// well-defined ratio; they are skipped (with a logged warning)
+// rather than yielding NaN/Inf coordinates, which the scatter plotter
+// would reject and which would abort the whole plot.
+//
+// Plot returns any error encountered while decoding the input,
+// building the plot, or writing the output.
+func (cmd *importstatsplot) Plot(stdin io.Reader, stdout io.Writer) error {
+       var stats []importStats
+       err := json.NewDecoder(stdin).Decode(&stats)
+       if err != nil {
+               return err
+       }
+
+       p, err := plot.New()
+       if err != nil {
+               return err
+       }
+       p.Title.Text = "coverage preserved by import (excl X<0.65)"
+       p.X.Label.Text = "input base calls ÷ sequence length"
+       p.Y.Label.Text = "output base calls ÷ input base calls"
+       p.Add(plotter.NewGrid())
+
+       // Group the points by input label so each label can be drawn
+       // as its own series.
+       data := map[string]plotter.XYs{}
+       skipped := 0
+       for _, stat := range stats {
+               if stat.InputLength == 0 || stat.InputCoverage == 0 {
+                       // Dividing by zero would produce NaN or Inf.
+                       skipped++
+                       continue
+               }
+               data[stat.InputLabel] = append(data[stat.InputLabel], plotter.XY{
+                       X: float64(stat.InputCoverage) / float64(stat.InputLength),
+                       Y: float64(stat.TileCoverage) / float64(stat.InputCoverage),
+               })
+       }
+       if skipped > 0 {
+               log.Warnf("skipped %d stats entries with zero input length or coverage", skipped)
+       }
+
+       // Map iteration order is random; sort the labels so the
+       // label-to-color assignment is the same on every run.
+       labels := make([]string, 0, len(data))
+       for label := range data {
+               labels = append(labels, label)
+       }
+       sort.Strings(labels)
+       palette, err := colorful.SoftPalette(len(labels))
+       if err != nil {
+               return err
+       }
+       for idx, label := range labels {
+               s, err := plotter.NewScatter(data[label])
+               if err != nil {
+                       return err
+               }
+               s.GlyphStyle.Color = palette[idx]
+               s.GlyphStyle.Radius = vg.Millimeter / 2
+               s.GlyphStyle.Shape = draw.CrossGlyph{}
+               p.Add(s)
+               // Legend entries are currently disabled.
+               if false {
+                       p.Legend.Add(label, s)
+               }
+       }
+       // Matches the "excl X<0.65" note in the title.
+       p.X.Min = 0.65
+       p.X.Max = 1
+
+       w, err := p.WriterTo(8*vg.Inch, 6*vg.Inch, "svg")
+       if err != nil {
+               return err
+       }
+       _, err = w.WriteTo(stdout)
+       return err
+}