Export labels.csv with numpy array.
author Tom Clegg <tom@tomclegg.ca>
Wed, 25 Nov 2020 06:07:46 +0000 (01:07 -0500)
committer Tom Clegg <tom@tomclegg.ca>
Wed, 25 Nov 2020 06:07:46 +0000 (01:07 -0500)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

annotate.go
exportnumpy.go
pca.go
pipeline_test.go

diff --git a/annotate.go b/annotate.go
index ff4ab887175cfb64c43af6ae6afd93da35399ec8..d731f6934a9f147d714a898440268ed48af00c70 100644 (file)
--- a/annotate.go
+++ b/annotate.go
@@ -75,13 +75,13 @@ func (cmd *annotatecmd) RunCommand(prog string, args []string, stdin io.Reader,
                if err != nil {
                        return 1
                }
-               runner.Args = []string{"annotate", "-local=true", fmt.Sprintf("-variant-hash=%v", cmd.variantHash), "-max-tile-size", strconv.Itoa(cmd.maxTileSize), "-i", *inputFilename, "-o", "/mnt/output/tilevariants.tsv"}
+               runner.Args = []string{"annotate", "-local=true", fmt.Sprintf("-variant-hash=%v", cmd.variantHash), "-max-tile-size", strconv.Itoa(cmd.maxTileSize), "-i", *inputFilename, "-o", "/mnt/output/tilevariants.csv"}
                var output string
                output, err = runner.Run()
                if err != nil {
                        return 1
                }
-               fmt.Fprintln(stdout, output+"/tilevariants.tsv")
+               fmt.Fprintln(stdout, output+"/tilevariants.csv")
                return 0
        }
 
@@ -277,9 +277,9 @@ func (cmd *annotatecmd) annotateSequence(throttle *throttle, outch chan<- string
                                        }
                                        refnamefield := ""
                                        if refnamecol {
-                                               refnamefield = "\t" + refname
+                                               refnamefield = "," + trimFilenameForLabel(refname)
                                        }
-                                       outch <- fmt.Sprintf("%d\t%s%s\t%s:g.%s\n", tag, varid, refnamefield, seqname, diff.String())
+                                       outch <- fmt.Sprintf("%d,%s%s,%s:g.%s\n", tag, varid, refnamefield, seqname, diff.String())
                                }
                        }()
                }
diff --git a/exportnumpy.go b/exportnumpy.go
index 3736ef0184cc61cfc1797951c08d989c14a83d4e..eba0234bfe216e12384306f73b97fec04e34a673 100644 (file)
--- a/exportnumpy.go
+++ b/exportnumpy.go
@@ -38,8 +38,9 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
        priority := flags.Int("priority", 500, "container request priority")
        inputFilename := flags.String("i", "-", "input `file`")
        outputFilename := flags.String("o", "-", "output `file`")
-       annotationsFilename := flags.String("output-annotations", "", "output `file` for tile variant annotations tsv")
-       librefsFilename := flags.String("output-onehot2tilevar", "", "when using -one-hot, create tsv `file` mapping column# to tag# and variant#")
+       annotationsFilename := flags.String("output-annotations", "", "output `file` for tile variant annotations csv")
+       librefsFilename := flags.String("output-onehot2tilevar", "", "when using -one-hot, create csv `file` mapping column# to tag# and variant#")
+       labelsFilename := flags.String("output-labels", "", "output `file` for genome labels csv")
        onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot")
        cmd.filter.Flags(flags)
        err = flags.Parse(args)
@@ -77,8 +78,9 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
                        fmt.Sprintf("-one-hot=%v", *onehot),
                        "-i", *inputFilename,
                        "-o", "/mnt/output/matrix.npy",
-                       "-output-annotations", "/mnt/output/annotations.tsv",
-                       "-output-onehot2tilevar", "/mnt/output/onehot2tilevar.tsv",
+                       "-output-annotations", "/mnt/output/annotations.csv",
+                       "-output-onehot2tilevar", "/mnt/output/onehot2tilevar.csv",
+                       "-output-labels", "/mnt/output/labels.csv",
                        "-max-variants", fmt.Sprintf("%d", cmd.filter.MaxVariants),
                        "-min-coverage", fmt.Sprintf("%f", cmd.filter.MinCoverage),
                        "-max-tag", fmt.Sprintf("%d", cmd.filter.MaxTag),
@@ -140,7 +142,29 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
        }
 
        log.Info("building numpy array")
-       out, rows, cols := cgs2array(tilelib)
+       out, rows, cols, names := cgs2array(tilelib)
+
+       if *labelsFilename != "" {
+               log.Infof("writing labels to %s", *labelsFilename)
+               var f *os.File
+               f, err = os.OpenFile(*labelsFilename, os.O_CREATE|os.O_WRONLY, 0777)
+               if err != nil {
+                       return 1
+               }
+               defer f.Close()
+               for i, name := range names {
+                       _, err = fmt.Fprintf(f, "%d,%q\n", i, trimFilenameForLabel(name))
+                       if err != nil {
+                               err = fmt.Errorf("write %s: %w", *labelsFilename, err)
+                               return 1
+                       }
+               }
+               err = f.Close()
+               if err != nil {
+                       err = fmt.Errorf("close %s: %w", *labelsFilename, err)
+                       return 1
+               }
+       }
 
        log.Info("writing numpy file")
        var output io.WriteCloser
@@ -191,7 +215,7 @@ func (*exportNumpy) writeLibRefs(fnm string, tilelib *tileLibrary, librefs []til
        }
        defer f.Close()
        for i, libref := range librefs {
-               _, err = fmt.Fprintf(f, "%d\t%d\t%d\n", i, libref.Tag, libref.Variant)
+               _, err = fmt.Fprintf(f, "%d,%d,%d\n", i, libref.Tag, libref.Variant)
                if err != nil {
                        return err
                }
@@ -199,8 +223,7 @@ func (*exportNumpy) writeLibRefs(fnm string, tilelib *tileLibrary, librefs []til
        return f.Close()
 }
 
-func cgs2array(tilelib *tileLibrary) (data []int16, rows, cols int) {
-       var cgnames []string
+func cgs2array(tilelib *tileLibrary) (data []int16, rows, cols int, cgnames []string) {
        for name := range tilelib.compactGenomes {
                cgnames = append(cgnames, name)
        }
@@ -284,3 +307,17 @@ type nopCloser struct {
 }
 
 func (nopCloser) Close() error { return nil }
+
+func trimFilenameForLabel(s string) string {
+       if i := strings.LastIndex(s, "/"); i >= 0 {
+               s = s[i+1:]
+       }
+       s = strings.TrimSuffix(s, ".gz")
+       s = strings.TrimSuffix(s, ".fa")
+       s = strings.TrimSuffix(s, ".fasta")
+       s = strings.TrimSuffix(s, ".1")
+       s = strings.TrimSuffix(s, ".2")
+       s = strings.TrimSuffix(s, ".gz")
+       s = strings.TrimSuffix(s, ".vcf")
+       return s
+}
diff --git a/pca.go b/pca.go
index 044a7eefdfe9d36a2b4bfabfe3d148b5244f2fb3..41e3a7877f2847bfaed98bf3f583a9cd0842841a 100644 (file)
--- a/pca.go
+++ b/pca.go
@@ -162,7 +162,7 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout
        tilelib.Tidy()
 
        log.Print("converting cgs to array")
-       data, rows, cols := cgs2array(tilelib)
+       data, rows, cols, _ := cgs2array(tilelib)
        if *onehot {
                log.Printf("recode one-hot: %d rows, %d cols", rows, cols)
                data, _, cols = recodeOnehot(data, cols)
diff --git a/pipeline_test.go b/pipeline_test.go
index dddadcd805d87e8c746b3b43f3709e93f3af723f..c917b3610de4f8dcf7181d4ebfe04c012c2cb95f 100644 (file)
--- a/pipeline_test.go
+++ b/pipeline_test.go
@@ -143,21 +143,21 @@ chr2 472 572 7 1000 . 496 572
        c.Check(annotateout.Len() > 0, check.Equals, true)
        sorted := sortLines(annotateout.String())
        c.Logf("%s", sorted)
-       c.Check(sorted, check.Equals, sortLines(`0      8d4fe9a63921b   chr1:g.161A>T
-0      8d4fe9a63921b   chr1:g.178A>T
-0      8d4fe9a63921b   chr1:g.1_3delinsGGC
-0      8d4fe9a63921b   chr1:g.222_224del
-0      ba4263ca4199c   chr1:g.1_3delinsGGC
-0      ba4263ca4199c   chr1:g.222_224del
-0      ba4263ca4199c   chr1:g.41_42delinsAA
-1      139890345dbb8   chr1:g.302_305delinsAAAA
-4      cbfca15d241d3   chr2:g.125_127delinsAAA
-4      cbfca15d241d3   chr2:g.1_3delinsAAA
-4      f5fafe9450b02   chr2:g.241_245delinsAAAAA
-4      f5fafe9450b02   chr2:g.291C>A
-4      fe9a71a42adb4   chr2:g.125_127delinsAAA
-6      e36dce85efbef   chr2:g.471_472delinsAA
-6      f81388b184f4a   chr2:g.470_472del
+       c.Check(sorted, check.Equals, sortLines(`0,8d4fe9a63921b,chr1:g.161A>T
+0,8d4fe9a63921b,chr1:g.178A>T
+0,8d4fe9a63921b,chr1:g.1_3delinsGGC
+0,8d4fe9a63921b,chr1:g.222_224del
+0,ba4263ca4199c,chr1:g.1_3delinsGGC
+0,ba4263ca4199c,chr1:g.222_224del
+0,ba4263ca4199c,chr1:g.41_42delinsAA
+1,139890345dbb8,chr1:g.302_305delinsAAAA
+4,cbfca15d241d3,chr2:g.125_127delinsAAA
+4,cbfca15d241d3,chr2:g.1_3delinsAAA
+4,f5fafe9450b02,chr2:g.241_245delinsAAAAA
+4,f5fafe9450b02,chr2:g.291C>A
+4,fe9a71a42adb4,chr2:g.125_127delinsAAA
+6,e36dce85efbef,chr2:g.471_472delinsAA
+6,f81388b184f4a,chr2:g.470_472del
 `))
 }