X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/827cbfd60abe637ace2e24bee80a0b4c11507602..74ae10fa596b295fdd0904cf0568a3b3d7c5e0a9:/import.go diff --git a/import.go b/import.go index f9d3a1a0ba..2f1a268511 100644 --- a/import.go +++ b/import.go @@ -1,4 +1,8 @@ -package main +// Copyright (C) The Lightning Authors. All rights reserved. +// +// SPDX-License-Identifier: AGPL-3.0 + +package lightning import ( "bufio" @@ -42,6 +46,7 @@ type importer struct { outputStats string matchChromosome *regexp.Regexp encoder *gob.Encoder + retainAfterEncoding bool // keep imported genomes/refseqs in memory after writing to disk batchArgs } @@ -180,8 +185,8 @@ func (cmd *importer) runBatches(stdout io.Writer, inputs []string) error { Client: arvadosClientFromEnv, ProjectUUID: cmd.projectUUID, APIAccess: true, - RAM: 300000000000, - VCPUs: 64, + RAM: 700000000000, + VCPUs: 96, Priority: cmd.priority, KeepCache: 1, } @@ -229,7 +234,7 @@ func (cmd *importer) runBatches(stdout io.Writer, inputs []string) error { return nil } -func (cmd *importer) tileFasta(tilelib *tileLibrary, infile string) (tileSeq, []importStats, error) { +func (cmd *importer) tileFasta(tilelib *tileLibrary, infile string, isRef bool) (tileSeq, []importStats, error) { var input io.ReadCloser input, err := open(infile) if err != nil { @@ -244,7 +249,7 @@ func (cmd *importer) tileFasta(tilelib *tileLibrary, infile string) (tileSeq, [] } defer input.Close() } - return tilelib.TileFasta(infile, input, cmd.matchChromosome) + return tilelib.TileFasta(infile, input, cmd.matchChromosome, isRef) } func (cmd *importer) loadTagLibrary() (*tagLibrary, error) { @@ -344,7 +349,7 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error { defer phases.Done() log.Printf("%s starting", infile) defer log.Printf("%s done", infile) - tseqs, stats, err := cmd.tileFasta(tilelib, infile) + tseqs, stats, err := cmd.tileFasta(tilelib, infile, false) allstats[idx*2] = stats var kept, dropped int variants[0], kept, dropped = tseqs.Variants() @@ -356,12 +361,11 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error { defer phases.Done() log.Printf("%s starting", infile2) defer log.Printf("%s done", infile2) - tseqs, stats, err := cmd.tileFasta(tilelib, infile2) + tseqs, stats, err := cmd.tileFasta(tilelib, infile2, false) allstats[idx*2+1] = stats var kept, dropped int variants[1], kept, dropped = tseqs.Variants() log.Printf("%s found %d unique tags plus %d repeats", infile2, kept, dropped) - return err } } else if fastaFilenameRe.MatchString(infile) { @@ -370,7 +374,7 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error { defer phases.Done() log.Printf("%s starting", infile) defer log.Printf("%s done", infile) - tseqs, stats, err := cmd.tileFasta(tilelib, infile) + tseqs, stats, err := cmd.tileFasta(tilelib, infile, true) allstats[idx*2] = stats if err != nil { return err @@ -380,6 +384,16 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error { totlen += len(tseq) } log.Printf("%s tiled %d seqs, total len %d", infile, len(tseqs), totlen) + + if cmd.retainAfterEncoding { + tilelib.mtx.Lock() + if tilelib.refseqs == nil { + tilelib.refseqs = map[string]map[string][]tileLibRef{} + } + tilelib.refseqs[infile] = tseqs + tilelib.mtx.Unlock() + } + return cmd.encoder.Encode(LibraryEntry{ CompactSequences: []CompactSequence{{Name: infile, TileSequences: tseqs}}, }) @@ -411,8 +425,9 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error { if len(errs) > 0 { return } + variants := flatten(variants) err := cmd.encoder.Encode(LibraryEntry{ - CompactGenomes: []CompactGenome{{Name: infile, Variants: flatten(variants)}}, + CompactGenomes: []CompactGenome{{Name: infile, Variants: variants}}, }) if err != nil { select { @@ -420,6 +435,14 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error { default: } } + if cmd.retainAfterEncoding { + tilelib.mtx.Lock() + if tilelib.compactGenomes == nil { + tilelib.compactGenomes = make(map[string][]tileVariantID) + } + tilelib.compactGenomes[infile] = variants + tilelib.mtx.Unlock() + } }() } go close(todo) @@ -517,7 +540,7 @@ func (cmd *importer) tileGVCF(tilelib *tileLibrary, infile string, phase int) (t return } defer consensus.Wait() - tileseq, stats, err = tilelib.TileFasta(fmt.Sprintf("%s phase %d", infile, phase+1), stdout, cmd.matchChromosome) + tileseq, stats, err = tilelib.TileFasta(fmt.Sprintf("%s phase %d", infile, phase+1), stdout, cmd.matchChromosome, false) if err != nil { return }