When not saving incomplete tilevars, still save hashes/indexes.
authorTom Clegg <tom@tomclegg.ca>
Thu, 12 Nov 2020 23:56:48 +0000 (18:56 -0500)
committerTom Clegg <tom@tomclegg.ca>
Thu, 12 Nov 2020 23:56:48 +0000 (18:56 -0500)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

annotate.go
example-su92l-1kg.sh
export.go
exportnumpy.go
import.go
merge.go
pca.go
pipeline_test.go
tilelib.go

index 7787ae231838430840d6ae4ef5631ac08750f9b6..34d1c97af586acc4f2def992b261e03f7d0d0b5b 100644 (file)
@@ -109,7 +109,7 @@ func (cmd *annotatecmd) RunCommand(prog string, args []string, stdin io.Reader,
        bufw := bufio.NewWriter(output)
 
        tilelib := &tileLibrary{
-               includeNoCalls:      true,
+               retainNoCalls:       true,
                retainTileSequences: true,
        }
        err = tilelib.LoadGob(context.Background(), input, nil)
@@ -232,7 +232,9 @@ func (cmd *annotatecmd) annotateSequence(throttle *throttle, outch chan<- string
                for variant := 1; variant <= len(tvs); variant++ {
                        variant, hash := tileVariantID(variant), tvs[variant-1]
                        tileseq := tilelib.TileVariantSequence(tileLibRef{Tag: tag, Variant: variant})
-                       if len(tileseq) < taglen {
+                       if len(tileseq) == 0 {
+                               continue
+                       } else if len(tileseq) < taglen {
                                return fmt.Errorf("tilevar %d,%d has sequence len %d < taglen %d", tag, variant, len(tileseq), taglen)
                        }
                        var refpart []byte
index 5de4a4603b50a14398fda298627143739fe83c71..c4e2b543869fa9fa259090ab2995b98f69e72c83 100755 (executable)
@@ -19,11 +19,11 @@ genome=$(lightning     ref2genome   -project ${project} -priority ${priority} -r
 fasta=$(lightning      vcf2fasta    -project ${project} -priority ${priority} -ref ${ref_fa} -genome ${genome} -mask=true ${gvcf})                     ; echo fasta=${fasta}
 # fasta=su92l-4zz18-9nq05jifgz7iult
 
-ref37_lib=$(lightning  import       -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -include-no-calls ${ref37_fa}) ; echo ref37_lib=${ref37_lib}
+ref37_lib=$(lightning  import       -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -save-incomplete-tiles ${ref37_fa}) ; echo ref37_lib=${ref37_lib}
 # ref37_lib=su92l-4zz18-vnhlv3g6yp1azls/library.gob
 # 539s
 
-ref38_lib=$(lightning  import       -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -include-no-calls ${ref_fa}) ; echo ref38_lib=${ref38_lib}
+ref38_lib=$(lightning  import       -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -save-incomplete-tiles ${ref_fa}) ; echo ref38_lib=${ref38_lib}
 # ref38_lib=su92l-4zz18-swebknshfwsvys6/library.gob
 
 unfiltered=$(lightning import       -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true ${fasta})       ; echo unfiltered=${unfiltered}
@@ -37,6 +37,10 @@ exportvcf=$(lightning  export       -project ${project} -priority ${priority} -i
 # exportvcf=su92l-4zz18-gz4svr6zyvipueu/export.csv
 # 5506s
 
+exporthgvs=$(lightning export       -project ${project} -priority ${priority} -i ${merged38} -output-format hgvs -ref /mnt/su92l-4zz18-u77iyyy7cb05xqv/hg38.fa.gz -output-bed hg38.bed) ; echo exporthgvs=${exporthgvs}
+# 
+# 
+
 stats=$(lightning      stats        -project ${project} -priority ${priority} -i ${merged})                                                            ; echo stats=${stats}
 
 filtered=$(lightning   filter       -project ${project} -priority ${priority} -i ${merged} -min-coverage "0.9" -max-variants "30")                     ; echo filtered=${filtered}
@@ -58,3 +62,4 @@ numpy=$(lightning      export-numpy -project ${project} -priority ${priority} -i
 # 6155s
 # pcapy=$(lightning      pca          -project ${project} -priority ${priority} -i ${numpy})                                                             ; echo pcapy=${pcapy}
 comvar=$(lightning     numpy-comvar -project ${project} -priority ${priority} -i ${numpy} -annotations ${numpy%/matrix.npy}/annotations.tsv)           ; echo comvar=${comvar}
+# comvar=su92l-4zz18-s1yhngobdvcoc2e/commonvariants.csv
index a37ec725bf80f4318eae2d676803391b93797414..79e75a794d98359ee4f0893d3c77499f9c0ddc29 100644 (file)
--- a/export.go
+++ b/export.go
@@ -132,7 +132,7 @@ func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, std
        var mtx sync.Mutex
        var cgs []CompactGenome
        tilelib := tileLibrary{
-               includeNoCalls: true,
+               retainNoCalls: true,
        }
        err = tilelib.LoadGob(context.Background(), input, func(cg CompactGenome) {
                if *pick != "" && *pick != cg.Name {
@@ -336,6 +336,12 @@ func (cmd *exporter) exportSeq(outw, bedw io.Writer, taglen int, seqname string,
                                        continue
                                }
                                genometile := tileVariant[tileLibRef{Tag: libref.Tag, Variant: variant}]
+                               if len(genometile.Sequence) == 0 {
+                                       // Hash is known but sequence
+                                       // is not, e.g., retainNoCalls
+                                       // was false during import
+                                       continue
+                               }
                                refSequence := reftile.Sequence
                                // If needed, extend the reference
                                // sequence up to the tag at the end
index 208d7a63231f97e7adfad49c32528a1574b221ff..d67ac6fee73d1842015b55afa36c0b92b64c77c5 100644 (file)
@@ -102,7 +102,7 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
                defer input.Close()
        }
        tilelib := &tileLibrary{
-               includeNoCalls:      true,
+               retainNoCalls:       true,
                retainTileSequences: true,
                compactGenomes:      map[string][]tileVariantID{},
        }
index 4a820559733a9746e0224919f14ff3fff8d2255d..f6ca80fed6e9aa5b461f1c7f31d42563de8b1862 100644 (file)
--- a/import.go
+++ b/import.go
@@ -32,16 +32,16 @@ import (
 )
 
 type importer struct {
-       tagLibraryFile string
-       refFile        string
-       outputFile     string
-       projectUUID    string
-       runLocal       bool
-       skipOOO        bool
-       outputTiles    bool
-       includeNoCalls bool
-       outputStats    string
-       encoder        *gob.Encoder
+       tagLibraryFile      string
+       refFile             string
+       outputFile          string
+       projectUUID         string
+       runLocal            bool
+       skipOOO             bool
+       outputTiles         bool
+       saveIncompleteTiles bool
+       outputStats         string
+       encoder             *gob.Encoder
 }
 
 func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
@@ -60,7 +60,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
        flags.BoolVar(&cmd.runLocal, "local", false, "run on local host (default: run in an arvados container)")
        flags.BoolVar(&cmd.skipOOO, "skip-ooo", false, "skip out-of-order tags")
        flags.BoolVar(&cmd.outputTiles, "output-tiles", false, "include tile variant sequences in output file")
-       flags.BoolVar(&cmd.includeNoCalls, "include-no-calls", false, "treat tiles with no-calls as regular tiles")
+       flags.BoolVar(&cmd.saveIncompleteTiles, "save-incomplete-tiles", false, "treat tiles with no-calls as regular tiles")
        flags.StringVar(&cmd.outputStats, "output-stats", "", "output stats to `file` (json)")
        priority := flags.Int("priority", 500, "container request priority")
        pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
@@ -96,8 +96,8 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
                        Name:        "lightning import",
                        Client:      arvados.NewClientFromEnv(),
                        ProjectUUID: cmd.projectUUID,
-                       RAM:         60000000000,
-                       VCPUs:       16,
+                       RAM:         80000000000,
+                       VCPUs:       32,
                        Priority:    *priority,
                }
                err = runner.TranslatePaths(&cmd.tagLibraryFile, &cmd.refFile, &cmd.outputFile)
@@ -125,7 +125,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
                        "-loglevel=" + *loglevel,
                        fmt.Sprintf("-skip-ooo=%v", cmd.skipOOO),
                        fmt.Sprintf("-output-tiles=%v", cmd.outputTiles),
-                       fmt.Sprintf("-include-no-calls=%v", cmd.includeNoCalls),
+                       fmt.Sprintf("-save-incomplete-tiles=%v", cmd.saveIncompleteTiles),
                        "-output-stats", "/mnt/output/stats.json",
                        "-tag-library", cmd.tagLibraryFile,
                        "-ref", cmd.refFile,
@@ -163,7 +163,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
        bufw := bufio.NewWriter(output)
        cmd.encoder = gob.NewEncoder(bufw)
 
-       tilelib := &tileLibrary{taglib: taglib, includeNoCalls: cmd.includeNoCalls, skipOOO: cmd.skipOOO}
+       tilelib := &tileLibrary{taglib: taglib, retainNoCalls: cmd.saveIncompleteTiles, skipOOO: cmd.skipOOO}
        if cmd.outputTiles {
                cmd.encoder.Encode(LibraryEntry{TagSet: taglib.Tags()})
                tilelib.encoder = cmd.encoder
index c51dcc8dee9f47ef4c2ac72aad090b193f69abeb..1807393b593b7d2c4e59ac3b138e3532b977391b 100644 (file)
--- a/merge.go
+++ b/merge.go
@@ -127,8 +127,8 @@ func (cmd *merger) doMerge() error {
 
        cmd.errs = make(chan error, 1)
        cmd.tilelib = &tileLibrary{
-               encoder:        encoder,
-               includeNoCalls: true,
+               encoder:       encoder,
+               retainNoCalls: true,
        }
 
        cmd.mapped = map[string]map[tileLibRef]tileVariantID{}
diff --git a/pca.go b/pca.go
index 8c19e5623c9f6cf239f57a9f71e601a3bc424f70..65c69eb6e3a495657fad6284230fc015058e47e4 100644 (file)
--- a/pca.go
+++ b/pca.go
@@ -139,7 +139,7 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout
        }
        log.Print("reading")
        tilelib := tileLibrary{
-               includeNoCalls: true,
+               retainNoCalls:  true,
                compactGenomes: map[string][]tileVariantID{},
        }
        err = tilelib.LoadGob(context.Background(), input, nil)
index ef7af393ebaa60d2c71618d9ebf419444e24d724..dddadcd805d87e8c746b3b43f3709e93f3af723f 100644 (file)
@@ -63,7 +63,7 @@ func (s *pipelineSuite) TestImportMerge(c *check.C) {
                        args := []string{"-local=true", "-o=" + libfile[i], "-skip-ooo=true", "-output-tiles", "-tag-library", "testdata/tags"}
                        if i == 0 {
                                // ref only
-                               args = append(args, "-include-no-calls")
+                               args = append(args, "-save-incomplete-tiles")
                        }
                        args = append(args, infile)
                        code := (&importer{}).RunCommand("lightning import", args, bytes.NewReader(nil), &bytes.Buffer{}, os.Stderr)
@@ -127,7 +127,7 @@ chr2        471     GG      AA      0/1     0/0
        bedout, err := ioutil.ReadFile(tmpdir + "/export.bed")
        c.Check(err, check.IsNil)
        c.Logf("%s", string(bedout))
-       c.Check(string(bedout), check.Equals, `chr1 0 248 0 500 . 0 224
+       c.Check(string(bedout), check.Equals, `chr1 0 248 0 1000 . 0 224
 chr1 224 372 1 1000 . 248 348
 chr1 348 496 2 1000 . 372 472
 chr1 472 572 3 1000 . 496 572
index 4100cbf70ab3fb4ac7febee89d3350e0d8203f70..b1550f67eb64141b0cfe328a4da23ad93d184215 100644 (file)
@@ -50,7 +50,7 @@ func (tseq tileSeq) Variants() ([]tileVariantID, int, int) {
 }
 
 type tileLibrary struct {
-       includeNoCalls      bool
+       retainNoCalls       bool
        skipOOO             bool
        retainTileSequences bool
 
@@ -336,7 +336,7 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq,
                                basesOut += countBases(job.fasta[last.pos+last.taglen : f.pos+f.taglen])
                        } else {
                                // If we dropped this tile
-                               // (because !includeNoCalls),
+                               // (because !retainNoCalls),
                                // set taglen=0 so the
                                // overlapping tag is counted
                                // toward coverage on the
@@ -386,12 +386,12 @@ func (tilelib *tileLibrary) Len() int {
 // Return a tileLibRef for a tile with the given tag and sequence,
 // adding the sequence to the library if needed.
 func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
-       if !tilelib.includeNoCalls {
+       dropSeq := false
+       if !tilelib.retainNoCalls {
                for _, b := range seq {
                        if b != 'a' && b != 'c' && b != 'g' && b != 't' {
-                               // return "tile not found" if seq has any
-                               // no-calls
-                               return tileLibRef{Tag: tag}
+                               dropSeq = true
+                               break
                        }
                }
        }
@@ -424,7 +424,7 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
        }
        tilelib.variants++
        tilelib.variant[tag] = append(tilelib.variant[tag], seqhash)
-       if tilelib.retainTileSequences {
+       if tilelib.retainTileSequences && !dropSeq {
                if tilelib.seq == nil {
                        tilelib.seq = map[[blake2b.Size256]byte][]byte{}
                }
@@ -434,12 +434,17 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
        tilelib.mtx.Unlock()
 
        if tilelib.encoder != nil {
+               saveSeq := seq
+               if dropSeq {
+                       // Save the hash, but not the sequence
+                       saveSeq = nil
+               }
                tilelib.encoder.Encode(LibraryEntry{
                        TileVariants: []TileVariant{{
                                Tag:      tag,
                                Variant:  variant,
                                Blake2b:  seqhash,
-                               Sequence: seq,
+                               Sequence: saveSeq,
                        }},
                })
        }