From 6c78b1f73e00a341cddb4f4b4b56bbaba25b4289 Mon Sep 17 00:00:00 2001
From: Tom Clegg
Date: Fri, 24 Apr 2020 16:03:10 -0400
Subject: [PATCH] Log # duplicate tags.

Arvados-DCO-1.1-Signed-off-by: Tom Clegg

---
Review notes (this section sits after the "---" separator and is not part
of the commit message):

* tileSeq.Variants() now returns, besides the variant slice, two counters:
  the number of tags recorded for the first time ("kept") and the number of
  repeats ("dropped"). A tag counts as a repeat when a non-zero variant is
  already stored at its index; the later occurrence still overwrites the
  earlier one. A tag whose stored variant is 0 is not recognised as already
  seen (whether variant 0 can occur is not visible in this patch).
* Fixed: the summary line for the second fasta file was logged under
  infile; it now uses infile2, like the adjacent "starting"/"done" lines.
* The counts are logged before err is returned, so the summary line is
  also written when tileFasta/tileGVCF returns an error.
* Leading whitespace was rebuilt from the nesting visible in the hunks and
  may not match the target tree byte for byte; if the patch does not apply
  cleanly, retry with "git apply --ignore-whitespace".

 import.go  | 12 +++++++++---
 tilelib.go | 10 ++++++++--
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/import.go b/import.go
index 36558ca8d6..a396636fcb 100644
--- a/import.go
+++ b/import.go
@@ -264,7 +264,9 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
 				log.Printf("%s starting", infile)
 				defer log.Printf("%s done", infile)
 				tseqs, err := cmd.tileFasta(tilelib, infile)
-				variants[0] = tseqs.Variants()
+				var kept, dropped int
+				variants[0], kept, dropped = tseqs.Variants()
+				log.Printf("%s found %d unique tags plus %d repeats", infile, kept, dropped)
 				return err
 			}
 			infile2 := regexp.MustCompile(`\.1\.fasta(\.gz)?$`).ReplaceAllString(infile, `.2.fasta$1`)
@@ -273,7 +275,9 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
 				log.Printf("%s starting", infile2)
 				defer log.Printf("%s done", infile2)
 				tseqs, err := cmd.tileFasta(tilelib, infile2)
-				variants[1] = tseqs.Variants()
+				var kept, dropped int
+				variants[1], kept, dropped = tseqs.Variants()
+				log.Printf("%s found %d unique tags plus %d repeats", infile2, kept, dropped)
 				return err
 			}
 		} else {
@@ -284,7 +288,9 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
 					log.Printf("%s phase %d starting", infile, phase+1)
 					defer log.Printf("%s phase %d done", infile, phase+1)
 					tseqs, err := cmd.tileGVCF(tilelib, infile, phase)
-					variants[phase] = tseqs.Variants()
+					var kept, dropped int
+					variants[phase], kept, dropped = tseqs.Variants()
+					log.Printf("%s phase %d found %d unique tags plus %d repeats", infile, phase+1, kept, dropped)
 					return err
 				}
 			}
diff --git a/tilelib.go b/tilelib.go
index be60ab2e1d..5b4b30754c 100644
--- a/tilelib.go
+++ b/tilelib.go
@@ -20,7 +20,7 @@ type tileLibRef struct {
 
 type tileSeq map[string][]tileLibRef
 
-func (tseq tileSeq) Variants() []tileVariantID {
+func (tseq tileSeq) Variants() ([]tileVariantID, int, int) {
 	maxtag := 0
 	for _, refs := range tseq {
 		for _, ref := range refs {
@@ -30,12 +30,18 @@ func (tseq tileSeq) Variants() []tileVariantID {
 		}
 	}
 	vars := make([]tileVariantID, maxtag+1)
+	var kept, dropped int
 	for _, refs := range tseq {
 		for _, ref := range refs {
+			if vars[int(ref.tag)] != 0 {
+				dropped++
+			} else {
+				kept++
+			}
 			vars[int(ref.tag)] = ref.variant
 		}
 	}
-	return vars
+	return vars, kept, dropped
 }
 
 type tileLibrary struct {
-- 
2.30.2