From f50e2e9f8c572357ecab7117991efb7c9bea13e7 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Wed, 14 Oct 2020 14:33:01 -0400 Subject: [PATCH] Export HGVS. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- cmd.go | 1 + exporthgvs.go | 306 ++++++++++++++++++++++++++++++ gob.go | 12 +- hgvs/diff.go | 2 + import.go | 50 +++-- merge.go | 7 - pipeline_test.go | 22 ++- testdata/pipeline1/input1.1.fasta | 6 +- testdata/pipeline1/input1.2.fasta | 4 +- tilelib.go | 121 ++++++++---- 10 files changed, 456 insertions(+), 75 deletions(-) create mode 100644 exporthgvs.go diff --git a/cmd.go b/cmd.go index 8fc3636d6c..a7c3e196b1 100644 --- a/cmd.go +++ b/cmd.go @@ -19,6 +19,7 @@ var ( "ref2genome": &ref2genome{}, "vcf2fasta": &vcf2fasta{}, "import": &importer{}, + "export-hgvs": &exportHGVS{}, "export-numpy": &exportNumpy{}, "filter": &filterer{}, "build-docker-image": &buildDockerImage{}, diff --git a/exporthgvs.go b/exporthgvs.go new file mode 100644 index 0000000000..b3865591d1 --- /dev/null +++ b/exporthgvs.go @@ -0,0 +1,306 @@ +package main + +import ( + "bufio" + "bytes" + "context" + "errors" + "flag" + "fmt" + "io" + "net/http" + _ "net/http/pprof" + "os" + "sort" + "strings" + "sync" + "time" + + "git.arvados.org/arvados.git/sdk/go/arvados" + "github.com/arvados/lightning/hgvs" + log "github.com/sirupsen/logrus" +) + +type exportHGVS struct { +} + +func (cmd *exportHGVS) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { + var err error + defer func() { + if err != nil { + fmt.Fprintf(stderr, "%s\n", err) + } + }() + flags := flag.NewFlagSet("", flag.ContinueOnError) + flags.SetOutput(stderr) + pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`") + runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)") + projectUUID := flags.String("project", "", "project `UUID` for output data") + priority := flags.Int("priority", 500, "container request priority") + refname := flags.String("ref", "", "reference genome `name`") + inputFilename := flags.String("i", "-", "input `file` (library)") + outputFilename := flags.String("o", "-", "fasta output `file`") + pick := flags.String("pick", "", "`name` of single genome to export") + err = flags.Parse(args) + if err == flag.ErrHelp { + err = nil + return 0 + } else if err != nil { + return 2 + } + + if *pprof != "" { + go func() { + log.Println(http.ListenAndServe(*pprof, nil)) + }() + } + + if !*runlocal { + if *outputFilename != "-" { + err = errors.New("cannot specify output file in container mode: not implemented") + return 1 + } + runner := arvadosContainerRunner{ + Name: "lightning export-hgvs", + Client: arvados.NewClientFromEnv(), + ProjectUUID: *projectUUID, + RAM: 128000000000, + VCPUs: 2, + Priority: *priority, + } + err = runner.TranslatePaths(inputFilename) + if err != nil { + return 1 + } + runner.Args = []string{"export-hgvs", "-local=true", "-pick", *pick, "-ref", *refname, "-i", *inputFilename, "-o", "/mnt/output/export.csv"} + var output string + output, err = runner.Run() + if err != nil { + return 1 + } + fmt.Fprintln(stdout, output+"/export.csv") + return 0 + } + + input, err := os.Open(*inputFilename) + if err != nil { + return 1 + } + defer input.Close() + + // Error out early if seeking doesn't work on the input file. + _, err = input.Seek(0, io.SeekEnd) + if err != nil { + return 1 + } + _, err = input.Seek(0, io.SeekStart) + if err != nil { + return 1 + } + + var mtx sync.Mutex + var cgs []CompactGenome + var tilelib tileLibrary + err = tilelib.LoadGob(context.Background(), input, func(cg CompactGenome) { + if *pick != "" && *pick != cg.Name { + return + } + log.Debugf("export: pick %q", cg.Name) + mtx.Lock() + defer mtx.Unlock() + cgs = append(cgs, cg) + }) + if err != nil { + return 1 + } + sort.Slice(cgs, func(i, j int) bool { return cgs[i].Name < cgs[j].Name }) + log.Printf("export: pick %q => %d genomes", *pick, len(cgs)) + + refseq, ok := tilelib.refseqs[*refname] + if !ok { + err = fmt.Errorf("reference name %q not found in input; have %v", *refname, func() (names []string) { + for name := range tilelib.refseqs { + names = append(names, name) + } + return + }()) + return 1 + } + + _, err = input.Seek(0, io.SeekStart) + if err != nil { + return 1 + } + + var output io.WriteCloser + if *outputFilename == "-" { + output = nopCloser{stdout} + } else { + output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777) + if err != nil { + return 1 + } + defer output.Close() + } + bufw := bufio.NewWriter(output) + err = cmd.export(bufw, input, tilelib.taglib.keylen, refseq, cgs) + if err != nil { + return 1 + } + err = bufw.Flush() + if err != nil { + return 1 + } + err = output.Close() + if err != nil { + return 1 + } + err = input.Close() + if err != nil { + return 1 + } + return 0 +} + +func (cmd *exportHGVS) export(out io.Writer, librdr io.Reader, taglen int, refseq map[string][]tileLibRef, cgs []CompactGenome) error { + need := map[tileLibRef]bool{} + var seqnames []string + for seqname, librefs := range refseq { + seqnames = append(seqnames, seqname) + for _, libref := range librefs { + need[libref] = true + } + } + sort.Strings(seqnames) + + for _, cg := range cgs { + for i, variant := range cg.Variants { + if variant == 0 { + continue + } + libref := tileLibRef{Tag: tagID(i / 2), Variant: variant} + need[libref] = true + } + } + + log.Infof("export: loading %d tile variants", len(need)) + tileVariant := map[tileLibRef]TileVariant{} + err := DecodeLibrary(librdr, func(ent *LibraryEntry) error { + for _, tv := range ent.TileVariants { + libref := tileLibRef{Tag: tv.Tag, Variant: tv.Variant} + if need[libref] { + tileVariant[libref] = tv + } + } + return nil + }) + if err != nil { + return err + } + + log.Infof("export: loaded %d tile variants", len(tileVariant)) + var missing []tileLibRef + for libref := range need { + if _, ok := tileVariant[libref]; !ok { + missing = append(missing, libref) + } + } + if len(missing) > 0 { + if limit := 100; len(missing) > limit { + log.Warnf("first %d missing tiles: %v", limit, missing[:limit]) + } else { + log.Warnf("missing tiles: %v", missing) + } + return fmt.Errorf("%d needed tiles are missing from library", len(missing)) + } + + refpos := 0 + for _, seqname := range seqnames { + variantAt := map[int][]hgvs.Variant{} // variantAt[chromOffset][genomeIndex*2+phase] + for refstep, libref := range refseq[seqname] { + reftile := tileVariant[libref] + for cgidx, cg := range cgs { + for phase := 0; phase < 2; phase++ { + if len(cg.Variants) <= int(libref.Tag)*2+phase { + continue + } + variant := cg.Variants[int(libref.Tag)*2+phase] + if variant == 0 { + continue + } + genometile := tileVariant[tileLibRef{Tag: libref.Tag, Variant: variant}] + if variant == libref.Variant { + continue + } + refSequence := reftile.Sequence + // If needed, extend the + // reference sequence up to + // the tag at the end of the + // genometile sequence. + refstepend := refstep + 1 + for refstepend < len(refseq[seqname]) && len(refSequence) >= taglen && !bytes.EqualFold(refSequence[len(refSequence)-taglen:], genometile.Sequence[len(genometile.Sequence)-taglen:]) { + if &refSequence[0] == &reftile.Sequence[0] { + refSequence = append([]byte(nil), refSequence...) + } + refSequence = append(refSequence, tileVariant[refseq[seqname][refstepend]].Sequence...) + refstepend++ + } + vars, _ := hgvs.Diff(strings.ToUpper(string(refSequence)), strings.ToUpper(string(genometile.Sequence)), time.Second) + for _, v := range vars { + v.Position += refpos + log.Debugf("%s seq %s phase %d tag %d tile diff %s\n", cg.Name, seqname, phase, libref.Tag, v.String()) + varslice := variantAt[v.Position] + if varslice == nil { + varslice = make([]hgvs.Variant, len(cgs)*2) + } + varslice[cgidx*2+phase] = v + variantAt[v.Position] = varslice + } + } + } + refpos += len(reftile.Sequence) - taglen + + // Flush entries from variantAt that are + // behind refpos. Flush all entries if this is + // the last reftile of the path/chromosome. + var flushpos []int + lastrefstep := refstep == len(refseq[seqname])-1 + for pos := range variantAt { + if lastrefstep || pos <= refpos { + flushpos = append(flushpos, pos) + } + } + sort.Slice(flushpos, func(i, j int) bool { return flushpos[i] < flushpos[j] }) + for _, pos := range flushpos { + varslice := variantAt[pos] + delete(variantAt, pos) + for i := range varslice { + if varslice[i].Position == 0 { + varslice[i].Position = pos + } + } + for i := 0; i < len(cgs); i++ { + if i > 0 { + out.Write([]byte{'\t'}) + } + var1, var2 := varslice[i*2], varslice[i*2+1] + if var1.Position == 0 && var2.Position == 0 { + out.Write([]byte{'.'}) + } else if var1 == var2 { + fmt.Fprintf(out, "%s:g.%s", seqname, var1.String()) + } else { + if var1.Position == 0 { + var1.Position = pos + } + if var2.Position == 0 { + var2.Position = pos + } + fmt.Fprintf(out, "%s:g.[%s];[%s]", seqname, var1.String(), var2.String()) + } + } + out.Write([]byte{'\n'}) + } + } + } + return nil +} diff --git a/gob.go b/gob.go index 2fb061c5b1..a8401d8af8 100644 --- a/gob.go +++ b/gob.go @@ -14,6 +14,11 @@ type CompactGenome struct { Variants []tileVariantID } +type CompactSequence struct { + Name string + TileSequences map[string][]tileLibRef +} + type TileVariant struct { Tag tagID Variant tileVariantID @@ -22,9 +27,10 @@ type TileVariant struct { } type LibraryEntry struct { - TagSet [][]byte - CompactGenomes []CompactGenome - TileVariants []TileVariant + TagSet [][]byte + CompactGenomes []CompactGenome + CompactSequences []CompactSequence + TileVariants []TileVariant } func ReadCompactGenomes(rdr io.Reader) ([]CompactGenome, error) { diff --git a/hgvs/diff.go b/hgvs/diff.go index a6366eb36f..e5377d6bac 100644 --- a/hgvs/diff.go +++ b/hgvs/diff.go @@ -15,6 +15,8 @@ type Variant struct { func (v *Variant) String() string { switch { + case len(v.New) == 0 && len(v.Ref) == 0: + return fmt.Sprintf("%d=", v.Position) case len(v.New) == 0 && len(v.Ref) == 1: return fmt.Sprintf("%ddel", v.Position) case len(v.New) == 0: diff --git a/import.go b/import.go index 860ac3eb7a..71ab688e98 100644 --- a/import.go +++ b/import.go @@ -317,12 +317,20 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error { log.Printf("%s starting", infile) defer log.Printf("%s done", infile) tseqs, err := cmd.tileFasta(tilelib, infile) - var kept, dropped int - variants[0], kept, dropped = tseqs.Variants() - variants[1] = variants[0] - log.Printf("%s found %d unique tags plus %d repeats", infile, kept, dropped) - return err + if err != nil { + return err + } + totlen := 0 + for _, tseq := range tseqs { + totlen += len(tseq) + } + log.Printf("%s tiled %d seqs, total len %d", infile, len(tseqs), totlen) + return cmd.encoder.Encode(LibraryEntry{ + CompactSequences: []CompactSequence{{Name: infile, TileSequences: tseqs}}, + }) } + // Don't write out a CompactGenomes entry + continue } else if vcfFilenameRe.MatchString(infile) { for phase := 0; phase < 2; phase++ { phase := phase @@ -347,20 +355,8 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error { if len(errs) > 0 { return } - ntags := len(variants[0]) - if ntags < len(variants[1]) { - ntags = len(variants[1]) - } - flat := make([]tileVariantID, ntags*2) - for i := 0; i < ntags; i++ { - for hap := 0; hap < 2; hap++ { - if i < len(variants[hap]) { - flat[i*2+hap] = variants[hap][i] - } - } - } err := cmd.encoder.Encode(LibraryEntry{ - CompactGenomes: []CompactGenome{{Name: infile, Variants: flat}}, + CompactGenomes: []CompactGenome{{Name: infile, Variants: flatten(variants)}}, }) if err != nil { select { @@ -450,3 +446,21 @@ func (cmd *importer) tileGVCF(tilelib *tileLibrary, infile string, phase int) (t } return } + +func flatten(variants [][]tileVariantID) []tileVariantID { + ntags := 0 + for _, v := range variants { + if ntags < len(v) { + ntags = len(v) + } + } + flat := make([]tileVariantID, ntags*2) + for i := 0; i < ntags; i++ { + for hap := 0; hap < 2; hap++ { + if i < len(variants[hap]) { + flat[i*2+hap] = variants[hap][i] + } + } + } + return flat +} diff --git a/merge.go b/merge.go index a55f47228d..c51dcc8dee 100644 --- a/merge.go +++ b/merge.go @@ -129,13 +129,6 @@ func (cmd *merger) doMerge() error { cmd.tilelib = &tileLibrary{ encoder: encoder, includeNoCalls: true, - onLoadGenome: func(cg CompactGenome) { - err := encoder.Encode(LibraryEntry{CompactGenomes: []CompactGenome{cg}}) - if err != nil { - cmd.setError(err) - cancel() - } - }, } cmd.mapped = map[string]map[tileLibRef]tileVariantID{} diff --git a/pipeline_test.go b/pipeline_test.go index eb61d96723..16ad4843e1 100644 --- a/pipeline_test.go +++ b/pipeline_test.go @@ -4,6 +4,7 @@ import ( "bytes" "fmt" "io" + "io/ioutil" "os" "sync" @@ -69,8 +70,27 @@ func (s *pipelineSuite) TestImportMerge(c *check.C) { c.Logf("len(merged) %d", merged.Len()) statsout := &bytes.Buffer{} - code = (&stats{}).RunCommand("lightning stats", []string{"-local"}, merged, statsout, os.Stderr) + code = (&stats{}).RunCommand("lightning stats", []string{"-local"}, bytes.NewReader(merged.Bytes()), statsout, os.Stderr) c.Check(code, check.Equals, 0) c.Check(statsout.Len() > 0, check.Equals, true) c.Logf("%s", statsout.String()) + + c.Check(ioutil.WriteFile(tmpdir+"/merged.gob", merged.Bytes(), 0666), check.IsNil) + + hgvsout := &bytes.Buffer{} + code = (&exportHGVS{}).RunCommand("lightning export-hgvs", []string{"-local", "-ref", "testdata/ref.fasta", "-i", tmpdir + "/merged.gob"}, bytes.NewReader(nil), hgvsout, os.Stderr) + c.Check(code, check.Equals, 0) + c.Check(hgvsout.Len() > 0, check.Equals, true) + c.Logf("%s", hgvsout.String()) + c.Check(hgvsout.String(), check.Equals, `chr1:g.[41_42delinsAA];[41=] +chr1:g.[161=];[161A>T] +chr1:g.[178=];[178A>T] +chr1:g.222_224del +chr1:g.[302=];[302_305delinsAAAA] +chr2:g.[813_826del];[813=] +chr2:g.[830_841delinsAA];[830=] +chr2:g.[887C>A];[887=] +chr2:g.[1042_1044del];[1042=] +chr2:g.[1043=];[1043_1044delinsAA] +`) } diff --git a/testdata/pipeline1/input1.1.fasta b/testdata/pipeline1/input1.1.fasta index e81faaf1d8..bc1b2bd2bf 100644 --- a/testdata/pipeline1/input1.1.fasta +++ b/testdata/pipeline1/input1.1.fasta @@ -1,5 +1,5 @@ >chr1 -ggcgtctacctcgagaagccccgacctctgaataagatctaagaacatctcaagggattgtgtatcttgttgggtgtacgcgcgccagcccgcagcatta +ggcgtctacctcgagaagccccgacctctgaataagatctAAgaacatctcaagggattgtgtatcttgttgggtgtacgcgcgccagcccgcagcatta ggagaactgtgctccgccttcaga ccccttgggtaaaatgccgcgcaatatgttgattacacttgctgcccatctgaaaggtcgccttatcaatcctatgctgaatgccctctaaggagtt acacatgctagcgcgtcggggtgg @@ -12,8 +12,8 @@ tgatagccatcccgcacctagcgcacggaacttcgaccgatcccatattaacgtgtctcttatcctcccgctactgcgtg tttttatagcagtagcggttgcgataatgcgcactaaggtggccataacttagccacacagactgcgacctcggtgtcaatctttaggcgatgactagtg gttattaataataacttatcatca cttccggggaaactagcgtaaaaaccgccgtcgcagtataccaccttatacctgtgccactcaatacaggagttgctcagcccaagacaccaacgactaa -gctctcaaaccttgtatttttctt -atgtcctcgcttggattatggggactcgcagtaaatgacaaccgtgcccgctggccctggccggaccgcgcgtccgtaagtagcgatcgagtcctgcagt +gctctcaaaccttgtaAAAAActt +atgtcctcgcttggattatggggactcgcagtaaatgacaacAgtgcccgctggccctggccggaccgcgcgtccgtaagtagcgatcgagtcctgcagt aaaactgatccaaaaaaaatacaa acaccacgtttataccctgttgagtcgccacgtaacgtattaacgtatgaacggcccggttttcttatctcgcaccgctagtttgccctggcggtcg cctatgagtcaatcctattttcaa diff --git a/testdata/pipeline1/input1.2.fasta b/testdata/pipeline1/input1.2.fasta index 953af8971b..49af080940 100644 --- a/testdata/pipeline1/input1.2.fasta +++ b/testdata/pipeline1/input1.2.fasta @@ -1,9 +1,9 @@ >chr1 ggcgtctacctcgagaagccccgacctctgaataagatctttgaacatctcaagggattgtgtatcttgttgggtgtacgcgcgccagcccgcagcatta ggagaactgtgctccgccttcaga -ccccttgggtaaaatgccgcgcaatatgttgattacacttgctgcccatctgaaaggtcgccttatcaatcctatgctgaatgccctctaaggagtt +ccccttgggtaaaatgccgcgcaatatgttgattacTcttgctgcccatctgaTaggtcgccttatcaatcctatgctgaatgccctctaaggagtt acacatgctagcgcgtcggggtgg -tcgccgcatgggacatgttggggtcccgtagctccggtcgatgtaggcacgcgttttgccgaggagatagatcatcagctgacctatagattcgtctgtc +tcgccgcatgggacatgttggggtcccgtagctccggtcgatgtaggcacgcgAAAAgccgaggagatagatcatcagctgacctatagattcgtctgtc gactctagcagagtggccagccac aaggtttatattcagtcgaaatggacgggtcccgaacttgcacggacctaacgggactcgcctttcgtaataaccgaacctctaggccgccgcgagatca cctcccgagccgagccacccgtca diff --git a/tilelib.go b/tilelib.go index b4ecd4a3af..8be8eb46cf 100644 --- a/tilelib.go +++ b/tilelib.go @@ -17,8 +17,8 @@ import ( type tileVariantID uint16 // 1-based type tileLibRef struct { - tag tagID - variant tileVariantID + Tag tagID + Variant tileVariantID } type tileSeq map[string][]tileLibRef @@ -27,8 +27,8 @@ func (tseq tileSeq) Variants() ([]tileVariantID, int, int) { maxtag := 0 for _, refs := range tseq { for _, ref := range refs { - if maxtag < int(ref.tag) { - maxtag = int(ref.tag) + if maxtag < int(ref.Tag) { + maxtag = int(ref.Tag) } } } @@ -36,12 +36,12 @@ func (tseq tileSeq) Variants() ([]tileVariantID, int, int) { var kept, dropped int for _, refs := range tseq { for _, ref := range refs { - if vars[int(ref.tag)] != 0 { + if vars[int(ref.Tag)] != 0 { dropped++ } else { kept++ } - vars[int(ref.tag)] = ref.variant + vars[int(ref.Tag)] = ref.Variant } } return vars, kept, dropped @@ -52,13 +52,12 @@ type tileLibrary struct { skipOOO bool taglib *tagLibrary variant [][][blake2b.Size256]byte + refseqs map[string]map[string][]tileLibRef // count [][]int // seq map[[blake2b.Size]byte][]byte variants int // if non-nil, write out any tile variants added while tiling encoder *gob.Encoder - // if non-nil, call this func upon loading a genome - onLoadGenome func(CompactGenome) mtx sync.Mutex } @@ -103,16 +102,17 @@ func (tilelib *tileLibrary) loadTileVariants(tvs []TileVariant, variantmap map[t for _, tv := range tvs { // Assign a new variant ID (unique across all inputs) // for each input variant. - variantmap[tileLibRef{tag: tv.Tag, variant: tv.Variant}] = tilelib.getRef(tv.Tag, tv.Sequence).variant + variantmap[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = tilelib.getRef(tv.Tag, tv.Sequence).Variant } return nil } -func (tilelib *tileLibrary) loadGenomes(genomes map[string][]tileVariantID, variantmap map[tileLibRef]tileVariantID, onLoadGenome func(CompactGenome)) error { +func (tilelib *tileLibrary) loadCompactGenomes(cgs []CompactGenome, variantmap map[tileLibRef]tileVariantID, onLoadGenome func(CompactGenome)) error { + log.Debugf("loadCompactGenomes: %d", len(cgs)) var wg sync.WaitGroup errs := make(chan error, 1) - for name, variants := range genomes { - name, variants := name, variants + for _, cg := range cgs { + name, variants := cg.Name, cg.Variants wg.Add(1) go func() { defer wg.Done() @@ -124,7 +124,7 @@ func (tilelib *tileLibrary) loadGenomes(genomes map[string][]tileVariantID, vari continue } tag := tagID(i / 2) - newvariant, ok := variantmap[tileLibRef{tag: tag, variant: variant}] + newvariant, ok := variantmap[tileLibRef{Tag: tag, Variant: variant}] if !ok { err := fmt.Errorf("oops: genome %q has variant %d for tag %d, but that variant was not in its library", name, variant, tag) select { @@ -133,27 +133,22 @@ func (tilelib *tileLibrary) loadGenomes(genomes map[string][]tileVariantID, vari } return } + log.Tracef("loadCompactGenomes: cg %s tag %d variant %d => %d", name, tag, variant, newvariant) variants[i] = newvariant } + if onLoadGenome != nil { + onLoadGenome(cg) + } if tilelib.encoder != nil { - for name, variants := range genomes { - cg := CompactGenome{ - Name: name, - Variants: variants, - } - if onLoadGenome != nil { - onLoadGenome(cg) - } - err := tilelib.encoder.Encode(LibraryEntry{ - CompactGenomes: []CompactGenome{cg}, - }) - if err != nil { - select { - case errs <- err: - default: - } - return + err := tilelib.encoder.Encode(LibraryEntry{ + CompactGenomes: []CompactGenome{cg}, + }) + if err != nil { + select { + case errs <- err: + default: } + return } } }() @@ -163,8 +158,45 @@ func (tilelib *tileLibrary) loadGenomes(genomes map[string][]tileVariantID, vari return <-errs } +func (tilelib *tileLibrary) loadCompactSequences(cseqs []CompactSequence, variantmap map[tileLibRef]tileVariantID) error { + log.Debugf("loadCompactSequences: %d", len(cseqs)) + for _, cseq := range cseqs { + for _, tseq := range cseq.TileSequences { + for i, libref := range tseq { + v, ok := variantmap[libref] + if !ok { + return fmt.Errorf("oops: CompactSequence %q has variant %d for tag %d, but that variant was not in its library", cseq.Name, libref.Variant, libref.Tag) + } + tseq[i].Variant = v + } + } + if tilelib.encoder != nil { + if err := tilelib.encoder.Encode(LibraryEntry{ + CompactSequences: []CompactSequence{cseq}, + }); err != nil { + return err + } + } + } + tilelib.mtx.Lock() + defer tilelib.mtx.Unlock() + if tilelib.refseqs == nil { + tilelib.refseqs = map[string]map[string][]tileLibRef{} + } + for _, cseq := range cseqs { + tilelib.refseqs[cseq.Name] = cseq.TileSequences + } + return nil +} + +// Load library data from rdr. Tile variants might be renumbered in +// the process; in that case, genomes variants will be renumbered to +// match. +// +// If onLoadGenome is non-nil, call it on each CompactGenome entry. func (tilelib *tileLibrary) LoadGob(ctx context.Context, rdr io.Reader, onLoadGenome func(CompactGenome)) error { - genomes := map[string][]tileVariantID{} + cgs := []CompactGenome{} + cseqs := []CompactSequence{} variantmap := map[tileLibRef]tileVariantID{} err := DecodeLibrary(rdr, func(ent *LibraryEntry) error { if ctx.Err() != nil { @@ -172,14 +204,13 @@ func (tilelib *tileLibrary) LoadGob(ctx context.Context, rdr io.Reader, onLoadGe } if err := tilelib.loadTagSet(ent.TagSet); err != nil { return err - } else if err = tilelib.loadTileVariants(ent.TileVariants, variantmap); err != nil { + } + if err := tilelib.loadTileVariants(ent.TileVariants, variantmap); err != nil { return err - } else { - for _, cg := range ent.CompactGenomes { - genomes[cg.Name] = cg.Variants - } - return nil } + cgs = append(cgs, ent.CompactGenomes...) + cseqs = append(cseqs, ent.CompactSequences...) + return nil }) if err != nil { return err @@ -187,7 +218,11 @@ func (tilelib *tileLibrary) LoadGob(ctx context.Context, rdr io.Reader, onLoadGe if ctx.Err() != nil { return ctx.Err() } - err = tilelib.loadGenomes(genomes, variantmap, onLoadGenome) + err = tilelib.loadCompactGenomes(cgs, variantmap, onLoadGenome) + if err != nil { + return err + } + err = tilelib.loadCompactSequences(cseqs, variantmap) if err != nil { return err } @@ -258,11 +293,15 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, log.Tracef("%s %s found[%d] == %#v", filelabel, job.label, i, f) if last.taglen > 0 { path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:f.pos+f.taglen])) + } else { + f.pos = 0 } last = f } if last.taglen > 0 { path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:])) + } else { + log.Warnf("%s %s no tags found", filelabel, job.label) } pathcopy := make([]tileLibRef, len(path)) @@ -289,7 +328,7 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef { if b != 'a' && b != 'c' && b != 'g' && b != 't' { // return "tile not found" if seq has any // no-calls - return tileLibRef{tag: tag} + return tileLibRef{Tag: tag} } } } @@ -320,7 +359,7 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef { for i, varhash := range tilelib.variant[tag] { if varhash == seqhash { tilelib.mtx.Unlock() - return tileLibRef{tag: tag, variant: tileVariantID(i + 1)} + return tileLibRef{Tag: tag, Variant: tileVariantID(i + 1)} } } tilelib.variants++ @@ -339,5 +378,5 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef { }}, }) } - return tileLibRef{tag: tag, variant: variant} + return tileLibRef{Tag: tag, Variant: variant} } -- 2.30.2