Gzip gob files.
[lightning.git] / export.go
index b157e00411a84f8aeb61462812e4bfee38c36332..b6b160ddb79311fe3a5964b25e2ca0efe482abdc 100644 (file)
--- a/export.go
+++ b/export.go
@@ -88,7 +88,7 @@ func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, std
                        Name:        "lightning export",
                        Client:      arvados.NewClientFromEnv(),
                        ProjectUUID: *projectUUID,
-                       RAM:         128000000000,
+                       RAM:         240000000000,
                        VCPUs:       32,
                        Priority:    *priority,
                }
@@ -132,9 +132,9 @@ func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, std
        var mtx sync.Mutex
        var cgs []CompactGenome
        tilelib := tileLibrary{
-               includeNoCalls: true,
+               retainNoCalls: true,
        }
-       err = tilelib.LoadGob(context.Background(), input, func(cg CompactGenome) {
+       err = tilelib.LoadGob(context.Background(), input, strings.HasSuffix(*inputFilename, ".gz"), func(cg CompactGenome) {
                if *pick != "" && *pick != cg.Name {
                        return
                }
@@ -188,7 +188,7 @@ func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, std
                bedbufw = bufio.NewWriter(bedout)
        }
 
-       err = cmd.export(bufw, bedout, input, tilelib.taglib.keylen, refseq, cgs)
+       err = cmd.export(bufw, bedout, input, strings.HasSuffix(*inputFilename, ".gz"), tilelib.taglib.keylen, refseq, cgs)
        if err != nil {
                return 1
        }
@@ -217,7 +217,7 @@ func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, std
        return 0
 }
 
-func (cmd *exporter) export(out, bedout io.Writer, librdr io.Reader, taglen int, refseq map[string][]tileLibRef, cgs []CompactGenome) error {
+func (cmd *exporter) export(out, bedout io.Writer, librdr io.Reader, gz bool, taglen int, refseq map[string][]tileLibRef, cgs []CompactGenome) error {
        need := map[tileLibRef]bool{}
        var seqnames []string
        for seqname, librefs := range refseq {
@@ -242,7 +242,7 @@ func (cmd *exporter) export(out, bedout io.Writer, librdr io.Reader, taglen int,
 
        log.Infof("export: loading %d tile variants", len(need))
        tileVariant := map[tileLibRef]TileVariant{}
-       err := DecodeLibrary(librdr, func(ent *LibraryEntry) error {
+       err := DecodeLibrary(librdr, gz, func(ent *LibraryEntry) error {
                for _, tv := range ent.TileVariants {
                        libref := tileLibRef{Tag: tv.Tag, Variant: tv.Variant}
                        if need[libref] {
@@ -321,7 +321,7 @@ func (cmd *exporter) exportSeq(outw, bedw io.Writer, taglen int, seqname string,
        variantAt := map[int][]hgvs.Variant{} // variantAt[chromOffset][genomeIndex*2+phase]
        for refstep, libref := range reftiles {
                reftile := tileVariant[libref]
-               coverage := int64(0) // number of ref bases that are called in genomes -- max is len(reftile.Sequence)*len(cgs)*2
+               tagcoverage := 0 // number of times the start tag was found in genomes -- max is len(cgs)*2
                for cgidx, cg := range cgs {
                        for phase := 0; phase < 2; phase++ {
                                if len(cg.Variants) <= int(libref.Tag)*2+phase {
@@ -331,10 +331,17 @@ func (cmd *exporter) exportSeq(outw, bedw io.Writer, taglen int, seqname string,
                                if variant == 0 {
                                        continue
                                }
-                               genometile := tileVariant[tileLibRef{Tag: libref.Tag, Variant: variant}]
+                               tagcoverage++
                                if variant == libref.Variant {
                                        continue
                                }
+                               genometile := tileVariant[tileLibRef{Tag: libref.Tag, Variant: variant}]
+                               if len(genometile.Sequence) == 0 {
+                                       // Hash is known but sequence
+                                       // is not, e.g., retainNoCalls
+                                       // was false during import
+                                       continue
+                               }
                                refSequence := reftile.Sequence
                                // If needed, extend the reference
                                // sequence up to the tag at the end
@@ -362,7 +369,6 @@ func (cmd *exporter) exportSeq(outw, bedw io.Writer, taglen int, seqname string,
                                        }
                                        varslice[cgidx*2+phase] = v
                                }
-                               coverage += int64(len(reftile.Sequence))
                        }
                }
                refpos += len(reftile.Sequence) - taglen
@@ -400,7 +406,7 @@ func (cmd *exporter) exportSeq(outw, bedw io.Writer, taglen int, seqname string,
                        }
                        thickend := refpos
                        // coverage score, 0 to 1000
-                       score := 1000 * coverage / int64(len(reftile.Sequence)) / int64(len(cgs)) / 2
+                       score := 1000 * tagcoverage / len(cgs) / 2
                        fmt.Fprintf(bedw, "%s %d %d %d %d . %d %d\n",
                                seqname, tilestart, tileend,
                                libref.Tag,