Log # duplicate tags.
author Tom Clegg <tom@tomclegg.ca>
Fri, 24 Apr 2020 20:03:10 +0000 (16:03 -0400)
committer Tom Clegg <tom@tomclegg.ca>
Fri, 24 Apr 2020 20:03:10 +0000 (16:03 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

import.go
tilelib.go

index 36558ca8d6a31f1f078db090d4ce87501cb333f4..a396636fcb324628a42b60b07d294d9cbba650ec 100644 (file)
--- a/import.go
+++ b/import.go
@@ -264,7 +264,9 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
                                log.Printf("%s starting", infile)
                                defer log.Printf("%s done", infile)
                                tseqs, err := cmd.tileFasta(tilelib, infile)
-                               variants[0] = tseqs.Variants()
+                               var kept, dropped int
+                               variants[0], kept, dropped = tseqs.Variants()
+                               log.Printf("%s found %d unique tags plus %d repeats", infile, kept, dropped)
                                return err
                        }
                        infile2 := regexp.MustCompile(`\.1\.fasta(\.gz)?$`).ReplaceAllString(infile, `.2.fasta$1`)
@@ -273,7 +275,9 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
                                log.Printf("%s starting", infile2)
                                defer log.Printf("%s done", infile2)
                                tseqs, err := cmd.tileFasta(tilelib, infile2)
-                               variants[1] = tseqs.Variants()
+                               var kept, dropped int
+                               variants[1], kept, dropped = tseqs.Variants()
+                               log.Printf("%s found %d unique tags plus %d repeats", infile, kept, dropped)
                                return err
                        }
                } else {
@@ -284,7 +288,9 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
                                        log.Printf("%s phase %d starting", infile, phase+1)
                                        defer log.Printf("%s phase %d done", infile, phase+1)
                                        tseqs, err := cmd.tileGVCF(tilelib, infile, phase)
-                                       variants[phase] = tseqs.Variants()
+                                       var kept, dropped int
+                                       variants[phase], kept, dropped = tseqs.Variants()
+                                       log.Printf("%s phase %d found %d unique tags plus %d repeats", infile, phase+1, kept, dropped)
                                        return err
                                }
                        }
index be60ab2e1d6024341db4b4fb2397315c733d7a5a..5b4b30754c689f2b09a2c7012e31cfb0462a9e68 100644 (file)
--- a/tilelib.go
+++ b/tilelib.go
@@ -20,7 +20,7 @@ type tileLibRef struct {
 
 type tileSeq map[string][]tileLibRef
 
-func (tseq tileSeq) Variants() []tileVariantID {
+func (tseq tileSeq) Variants() ([]tileVariantID, int, int) {
        maxtag := 0
        for _, refs := range tseq {
                for _, ref := range refs {
@@ -30,12 +30,18 @@ func (tseq tileSeq) Variants() []tileVariantID {
                }
        }
        vars := make([]tileVariantID, maxtag+1)
+       var kept, dropped int
        for _, refs := range tseq {
                for _, ref := range refs {
+                       if vars[int(ref.tag)] != 0 {
+                               dropped++
+                       } else {
+                               kept++
+                       }
                        vars[int(ref.tag)] = ref.variant
                }
        }
-       return vars
+       return vars, kept, dropped
 }
 
 type tileLibrary struct {