Export hgvs-onehot.
[lightning.git] / import.go
index f4e0cd0008c218cf23553e12be9d174031e06d2c..d229160a8bff9ad4d7f7f37f7c10f3481f3a2aa9 100644 (file)
--- a/import.go
+++ b/import.go
@@ -1,4 +1,4 @@
-package main
+package lightning
 
 import (
        "bufio"
@@ -42,6 +42,7 @@ type importer struct {
        outputStats         string
        matchChromosome     *regexp.Regexp
        encoder             *gob.Encoder
+       retainAfterEncoding bool // keep imported genomes/refseqs in memory after writing to disk
        batchArgs
 }
 
@@ -180,8 +181,8 @@ func (cmd *importer) runBatches(stdout io.Writer, inputs []string) error {
                Client:      arvadosClientFromEnv,
                ProjectUUID: cmd.projectUUID,
                APIAccess:   true,
-               RAM:         300000000000,
-               VCPUs:       64,
+               RAM:         700000000000,
+               VCPUs:       96,
                Priority:    cmd.priority,
                KeepCache:   1,
        }
@@ -361,7 +362,6 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
                                var kept, dropped int
                                variants[1], kept, dropped = tseqs.Variants()
                                log.Printf("%s found %d unique tags plus %d repeats", infile2, kept, dropped)
-
                                return err
                        }
                } else if fastaFilenameRe.MatchString(infile) {
@@ -380,6 +380,16 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
                                        totlen += len(tseq)
                                }
                                log.Printf("%s tiled %d seqs, total len %d", infile, len(tseqs), totlen)
+
+                               if cmd.retainAfterEncoding {
+                                       tilelib.mtx.Lock()
+                                       if tilelib.refseqs == nil {
+                                               tilelib.refseqs = map[string]map[string][]tileLibRef{}
+                                       }
+                                       tilelib.refseqs[infile] = tseqs
+                                       tilelib.mtx.Unlock()
+                               }
+
                                return cmd.encoder.Encode(LibraryEntry{
                                        CompactSequences: []CompactSequence{{Name: infile, TileSequences: tseqs}},
                                })
@@ -411,8 +421,9 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
                        if len(errs) > 0 {
                                return
                        }
+                       variants := flatten(variants)
                        err := cmd.encoder.Encode(LibraryEntry{
-                               CompactGenomes: []CompactGenome{{Name: infile, Variants: flatten(variants)}},
+                               CompactGenomes: []CompactGenome{{Name: infile, Variants: variants}},
                        })
                        if err != nil {
                                select {
@@ -420,6 +431,14 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
                                default:
                                }
                        }
+                       if cmd.retainAfterEncoding {
+                               tilelib.mtx.Lock()
+                               if tilelib.compactGenomes == nil {
+                                       tilelib.compactGenomes = make(map[string][]tileVariantID)
+                               }
+                               tilelib.compactGenomes[infile] = variants
+                               tilelib.mtx.Unlock()
+                       }
                }()
        }
        go close(todo)
@@ -452,6 +471,13 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
                }()
        }
        tileJobs.Wait()
+       if len(errs) > 0 {
+               // Must not wait on encodeJobs in this case. If the
+               // tileJobs goroutines exited early, some funcs in
+               // todo haven't been called, so the corresponding
+               // encodeJobs will wait forever.
+               return <-errs
+       }
        encodeJobs.Wait()
 
        go close(errs)