From 83983adc7d02cb2b460eede7341313d54379c8c3 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Thu, 12 Nov 2020 18:56:48 -0500 Subject: [PATCH] When not saving incomplete tilevars, still save hashes/indexes. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- annotate.go | 6 ++++-- example-su92l-1kg.sh | 9 +++++++-- export.go | 8 +++++++- exportnumpy.go | 2 +- import.go | 30 +++++++++++++++--------------- merge.go | 4 ++-- pca.go | 2 +- pipeline_test.go | 4 ++-- tilelib.go | 21 +++++++++++++-------- 9 files changed, 52 insertions(+), 34 deletions(-) diff --git a/annotate.go b/annotate.go index 7787ae2318..34d1c97af5 100644 --- a/annotate.go +++ b/annotate.go @@ -109,7 +109,7 @@ func (cmd *annotatecmd) RunCommand(prog string, args []string, stdin io.Reader, bufw := bufio.NewWriter(output) tilelib := &tileLibrary{ - includeNoCalls: true, + retainNoCalls: true, retainTileSequences: true, } err = tilelib.LoadGob(context.Background(), input, nil) @@ -232,7 +232,9 @@ func (cmd *annotatecmd) annotateSequence(throttle *throttle, outch chan<- string for variant := 1; variant <= len(tvs); variant++ { variant, hash := tileVariantID(variant), tvs[variant-1] tileseq := tilelib.TileVariantSequence(tileLibRef{Tag: tag, Variant: variant}) - if len(tileseq) < taglen { + if len(tileseq) == 0 { + continue + } else if len(tileseq) < taglen { return fmt.Errorf("tilevar %d,%d has sequence len %d < taglen %d", tag, variant, len(tileseq), taglen) } var refpart []byte diff --git a/example-su92l-1kg.sh b/example-su92l-1kg.sh index 5de4a4603b..c4e2b54386 100755 --- a/example-su92l-1kg.sh +++ b/example-su92l-1kg.sh @@ -19,11 +19,11 @@ genome=$(lightning ref2genome -project ${project} -priority ${priority} -r fasta=$(lightning vcf2fasta -project ${project} -priority ${priority} -ref ${ref_fa} -genome ${genome} -mask=true ${gvcf}) ; echo fasta=${fasta} # fasta=su92l-4zz18-9nq05jifgz7iult -ref37_lib=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -include-no-calls ${ref37_fa}) ; echo ref37_lib=${ref37_lib} +ref37_lib=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -save-incomplete-tiles ${ref37_fa}) ; echo ref37_lib=${ref37_lib} # ref37_lib=su92l-4zz18-vnhlv3g6yp1azls/library.gob # 539s -ref38_lib=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -include-no-calls ${ref_fa}) ; echo ref38_lib=${ref38_lib} +ref38_lib=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -save-incomplete-tiles ${ref_fa}) ; echo ref38_lib=${ref38_lib} # ref38_lib=su92l-4zz18-swebknshfwsvys6/library.gob unfiltered=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true ${fasta}) ; echo unfiltered=${unfiltered} @@ -37,6 +37,10 @@ exportvcf=$(lightning export -project ${project} -priority ${priority} -i # exportvcf=su92l-4zz18-gz4svr6zyvipueu/export.csv # 5506s +exporthgvs=$(lightning export -project ${project} -priority ${priority} -i ${merged38} -output-format hgvs -ref /mnt/su92l-4zz18-u77iyyy7cb05xqv/hg38.fa.gz -output-bed hg38.bed) ; echo exporthgvs=${exporthgvs} +# +# + stats=$(lightning stats -project ${project} -priority ${priority} -i ${merged}) ; echo stats=${stats} filtered=$(lightning filter -project ${project} -priority ${priority} -i ${merged} -min-coverage "0.9" -max-variants "30") ; echo filtered=${filtered} @@ -58,3 +62,4 @@ numpy=$(lightning export-numpy -project ${project} -priority ${priority} -i # 6155s # pcapy=$(lightning pca -project ${project} -priority ${priority} -i ${numpy}) ; echo pcapy=${pcapy} comvar=$(lightning numpy-comvar -project ${project} -priority ${priority} -i ${numpy} -annotations ${numpy%/matrix.npy}/annotations.tsv) ; echo comvar=${comvar} +# comvar=su92l-4zz18-s1yhngobdvcoc2e/commonvariants.csv diff --git a/export.go b/export.go index a37ec725bf..79e75a794d 100644 --- a/export.go +++ b/export.go @@ -132,7 +132,7 @@ func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, std var mtx sync.Mutex var cgs []CompactGenome tilelib := tileLibrary{ - includeNoCalls: true, + retainNoCalls: true, } err = tilelib.LoadGob(context.Background(), input, func(cg CompactGenome) { if *pick != "" && *pick != cg.Name { @@ -336,6 +336,12 @@ func (cmd *exporter) exportSeq(outw, bedw io.Writer, taglen int, seqname string, continue } genometile := tileVariant[tileLibRef{Tag: libref.Tag, Variant: variant}] + if len(genometile.Sequence) == 0 { + // Hash is known but sequence + // is not, e.g., retainNoCalls + // was false during import + continue + } refSequence := reftile.Sequence // If needed, extend the reference // sequence up to the tag at the end diff --git a/exportnumpy.go b/exportnumpy.go index 208d7a6323..d67ac6fee7 100644 --- a/exportnumpy.go +++ b/exportnumpy.go @@ -102,7 +102,7 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader, defer input.Close() } tilelib := &tileLibrary{ - includeNoCalls: true, + retainNoCalls: true, retainTileSequences: true, compactGenomes: map[string][]tileVariantID{}, } diff --git a/import.go b/import.go index 4a82055973..f6ca80fed6 100644 --- a/import.go +++ b/import.go @@ -32,16 +32,16 @@ import ( ) type importer struct { - tagLibraryFile string - refFile string - outputFile string - projectUUID string - runLocal bool - skipOOO bool - outputTiles bool - includeNoCalls bool - outputStats string - encoder *gob.Encoder + tagLibraryFile string + refFile string + outputFile string + projectUUID string + runLocal bool + skipOOO bool + outputTiles bool + saveIncompleteTiles bool + outputStats string + encoder *gob.Encoder } func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { @@ -60,7 +60,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std flags.BoolVar(&cmd.runLocal, "local", false, "run on local host (default: run in an arvados container)") flags.BoolVar(&cmd.skipOOO, "skip-ooo", false, "skip out-of-order tags") flags.BoolVar(&cmd.outputTiles, "output-tiles", false, "include tile variant sequences in output file") - flags.BoolVar(&cmd.includeNoCalls, "include-no-calls", false, "treat tiles with no-calls as regular tiles") + flags.BoolVar(&cmd.saveIncompleteTiles, "save-incomplete-tiles", false, "treat tiles with no-calls as regular tiles") flags.StringVar(&cmd.outputStats, "output-stats", "", "output stats to `file` (json)") priority := flags.Int("priority", 500, "container request priority") pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`") @@ -96,8 +96,8 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std Name: "lightning import", Client: arvados.NewClientFromEnv(), ProjectUUID: cmd.projectUUID, - RAM: 60000000000, - VCPUs: 16, + RAM: 80000000000, + VCPUs: 32, Priority: *priority, } err = runner.TranslatePaths(&cmd.tagLibraryFile, &cmd.refFile, &cmd.outputFile) @@ -125,7 +125,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std "-loglevel=" + *loglevel, fmt.Sprintf("-skip-ooo=%v", cmd.skipOOO), fmt.Sprintf("-output-tiles=%v", cmd.outputTiles), - fmt.Sprintf("-include-no-calls=%v", cmd.includeNoCalls), + fmt.Sprintf("-save-incomplete-tiles=%v", cmd.saveIncompleteTiles), "-output-stats", "/mnt/output/stats.json", "-tag-library", cmd.tagLibraryFile, "-ref", cmd.refFile, @@ -163,7 +163,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std bufw := bufio.NewWriter(output) cmd.encoder = gob.NewEncoder(bufw) - tilelib := &tileLibrary{taglib: taglib, includeNoCalls: cmd.includeNoCalls, skipOOO: cmd.skipOOO} + tilelib := &tileLibrary{taglib: taglib, retainNoCalls: cmd.saveIncompleteTiles, skipOOO: cmd.skipOOO} if cmd.outputTiles { cmd.encoder.Encode(LibraryEntry{TagSet: taglib.Tags()}) tilelib.encoder = cmd.encoder diff --git a/merge.go b/merge.go index c51dcc8dee..1807393b59 100644 --- a/merge.go +++ b/merge.go @@ -127,8 +127,8 @@ func (cmd *merger) doMerge() error { cmd.errs = make(chan error, 1) cmd.tilelib = &tileLibrary{ - encoder: encoder, - includeNoCalls: true, + encoder: encoder, + retainNoCalls: true, } cmd.mapped = map[string]map[tileLibRef]tileVariantID{} diff --git a/pca.go b/pca.go index 8c19e5623c..65c69eb6e3 100644 --- a/pca.go +++ b/pca.go @@ -139,7 +139,7 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout } log.Print("reading") tilelib := tileLibrary{ - includeNoCalls: true, + retainNoCalls: true, compactGenomes: map[string][]tileVariantID{}, } err = tilelib.LoadGob(context.Background(), input, nil) diff --git a/pipeline_test.go b/pipeline_test.go index ef7af393eb..dddadcd805 100644 --- a/pipeline_test.go +++ b/pipeline_test.go @@ -63,7 +63,7 @@ func (s *pipelineSuite) TestImportMerge(c *check.C) { args := []string{"-local=true", "-o=" + libfile[i], "-skip-ooo=true", "-output-tiles", "-tag-library", "testdata/tags"} if i == 0 { // ref only - args = append(args, "-include-no-calls") + args = append(args, "-save-incomplete-tiles") } args = append(args, infile) code := (&importer{}).RunCommand("lightning import", args, bytes.NewReader(nil), &bytes.Buffer{}, os.Stderr) @@ -127,7 +127,7 @@ chr2 471 GG AA 0/1 0/0 bedout, err := ioutil.ReadFile(tmpdir + "/export.bed") c.Check(err, check.IsNil) c.Logf("%s", string(bedout)) - c.Check(string(bedout), check.Equals, `chr1 0 248 0 500 . 0 224 + c.Check(string(bedout), check.Equals, `chr1 0 248 0 1000 . 0 224 chr1 224 372 1 1000 . 248 348 chr1 348 496 2 1000 . 372 472 chr1 472 572 3 1000 . 496 572 diff --git a/tilelib.go b/tilelib.go index 4100cbf70a..b1550f67eb 100644 --- a/tilelib.go +++ b/tilelib.go @@ -50,7 +50,7 @@ func (tseq tileSeq) Variants() ([]tileVariantID, int, int) { } type tileLibrary struct { - includeNoCalls bool + retainNoCalls bool skipOOO bool retainTileSequences bool @@ -336,7 +336,7 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, basesOut += countBases(job.fasta[last.pos+last.taglen : f.pos+f.taglen]) } else { // If we dropped this tile - // (because !includeNoCalls), + // (because !retainNoCalls), // set taglen=0 so the // overlapping tag is counted // toward coverage on the @@ -386,12 +386,12 @@ func (tilelib *tileLibrary) Len() int { // Return a tileLibRef for a tile with the given tag and sequence, // adding the sequence to the library if needed. func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef { - if !tilelib.includeNoCalls { + dropSeq := false + if !tilelib.retainNoCalls { for _, b := range seq { if b != 'a' && b != 'c' && b != 'g' && b != 't' { - // return "tile not found" if seq has any - // no-calls - return tileLibRef{Tag: tag} + dropSeq = true + break } } } @@ -424,7 +424,7 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef { } tilelib.variants++ tilelib.variant[tag] = append(tilelib.variant[tag], seqhash) - if tilelib.retainTileSequences { + if tilelib.retainTileSequences && !dropSeq { if tilelib.seq == nil { tilelib.seq = map[[blake2b.Size256]byte][]byte{} } @@ -434,12 +434,17 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef { tilelib.mtx.Unlock() if tilelib.encoder != nil { + saveSeq := seq + if dropSeq { + // Save the hash, but not the sequence + saveSeq = nil + } tilelib.encoder.Encode(LibraryEntry{ TileVariants: []TileVariant{{ Tag: tag, Variant: variant, Blake2b: seqhash, - Sequence: seq, + Sequence: saveSeq, }}, }) } -- 2.30.2