Update logged stats.
author Tom Clegg <tom@curii.com>
Fri, 4 Feb 2022 06:11:58 +0000 (01:11 -0500)
committer Tom Clegg <tom@curii.com>
Fri, 4 Feb 2022 06:11:58 +0000 (01:11 -0500)
refs #18664

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

slice_test.go
tilelib.go

index 47c43e1cee8b8a8f9d20663333fb8a18a65df933..b0d02fdcdac28dfcb61ca2f4bdb04c072b01a322 100644 (file)
@@ -96,7 +96,7 @@ func (s *sliceSuite) TestImportAndSlice(c *check.C) {
                dumped, err := ioutil.ReadFile(dumpdir + "/variants.csv")
                c.Assert(err, check.IsNil)
                c.Logf("%s", dumped)
-               c.Check(string(dumped), check.Matches, `(?ms).*\n6,1,1,chr2,349,AAAACTG.*`)
+               c.Check("\n"+string(dumped), check.Matches, `(?ms).*\n6,1,1,chr2,349,AAAACTG.*`)
        }
 
        c.Log("=== slice-numpy ===")
index 0b7fc599182031bb6faab854a8f712aa09c4165d..df6cb73b3be2b9e16418f6d5eb7c5135b7686efe 100644 (file)
@@ -544,12 +544,13 @@ func (tilelib *tileLibrary) dump(out io.Writer) {
 }
 
 type importStats struct {
-       InputFile              string
-       InputLabel             string
-       InputLength            int
-       InputCoverage          int
-       PathLength             int
-       DroppedOutOfOrderTiles int
+       InputFile             string
+       InputLabel            string
+       InputLength           int
+       InputCoverage         int
+       PathLength            int
+       DroppedRepeatedTags   int
+       DroppedOutOfOrderTags int
 }
 
 func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader, matchChromosome *regexp.Regexp, isRef bool) (tileSeq, []importStats, error) {
@@ -606,8 +607,7 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader, matchChro
                        log.Warnf("%s %s no tags found", filelabel, job.label)
                }
 
-               skipped := 0
-
+               droppedDup := 0
                if !tilelib.useDups {
                        // Remove any tags that appeared more than once
                        dup := map[tagID]bool{}
@@ -621,17 +621,19 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader, matchChro
                                        dst++
                                }
                        }
-                       skipped += len(found) - dst
+                       droppedDup = len(found) - dst
+                       log.Infof("%s %s dropping %d non-unique tags", filelabel, job.label, droppedDup)
                        found = found[:dst]
                }
 
+               droppedOOO := 0
                if tilelib.skipOOO {
-                       log.Infof("%s %s keeping longest increasing subsequence", filelabel, job.label)
                        keep := longestIncreasingSubsequence(len(found), func(i int) int { return int(found[i].tagid) })
                        for i, x := range keep {
                                found[i] = found[x]
                        }
-                       skipped += len(found) - len(keep)
+                       droppedOOO = len(found) - len(keep)
+                       log.Infof("%s %s dropping %d out-of-order tags", filelabel, job.label, droppedOOO)
                        found = found[:len(keep)]
                }
 
@@ -670,14 +672,15 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader, matchChro
                ret[job.label] = pathcopy
 
                basesIn := countBases(job.fasta)
-               log.Infof("%s %s fasta in %d coverage in %d path len %d low-quality %d skipped-out-of-order %d", filelabel, job.label, len(job.fasta), basesIn, len(path), lowquality, skipped)
+               log.Infof("%s %s fasta in %d coverage in %d path len %d low-quality %d", filelabel, job.label, len(job.fasta), basesIn, len(path), lowquality)
                stats = append(stats, importStats{
-                       InputFile:              filelabel,
-                       InputLabel:             job.label,
-                       InputLength:            len(job.fasta),
-                       InputCoverage:          basesIn,
-                       PathLength:             len(path),
-                       DroppedOutOfOrderTiles: skipped,
+                       InputFile:             filelabel,
+                       InputLabel:            job.label,
+                       InputLength:           len(job.fasta),
+                       InputCoverage:         basesIn,
+                       PathLength:            len(path),
+                       DroppedOutOfOrderTags: droppedOOO,
+                       DroppedRepeatedTags:   droppedDup,
                })
 
                totalPathLen += len(path)