Option to recode as one-hot for numpy output.
author Tom Clegg <tom@tomclegg.ca>
Fri, 11 Sep 2020 02:33:41 +0000 (22:33 -0400)
committer Tom Clegg <tom@tomclegg.ca>
Fri, 11 Sep 2020 02:52:35 +0000 (22:52 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

example-su92l-1kg.sh
exportnumpy.go
exportnumpy_test.go

index 6ca93ac2784bfb93e16d8393d6cec808fca85cb2..7c66d76575190c807741793cf81209ddaa7cf2e1 100755 (executable)
@@ -18,7 +18,7 @@ genome=$(lightning     ref2genome   -project ${project} -priority ${priority} -r
 fasta=$(lightning      vcf2fasta    -project ${project} -priority ${priority} -ref ${ref_fa} -genome ${genome} -mask=true ${gvcf})
 unfiltered=$(lightning import       -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true ${fasta})
 filtered=$(lightning   filter       -project ${project} -priority ${priority} -i ${unfiltered} -min-coverage "0.9" -max-variants "30")
-numpy=$(lightning      export-numpy -project ${project} -priority ${priority} -i ${filtered})
+numpy=$(lightning      export-numpy -project ${project} -priority ${priority} -i ${filtered} -one-hot)
 pca=$(lightning        pca          -project ${project} -priority ${priority} -i ${numpy})
 plot=$(lightning       plot         -project ${project} -priority ${priority} -i ${pca} -labels-csv ${info}/sample_info.csv -sample-fasta-dir ${fasta})
 echo >&2 "https://workbench2.${plot%%-*}.arvadosapi.com/collections/${plot}"
index 300093cbd968dd28c039afbc265e4b3a610b1037..e7a8cf86099f95cce81782175c35c5324f62525c 100644 (file)
@@ -34,6 +34,7 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
        priority := flags.Int("priority", 500, "container request priority")
        inputFilename := flags.String("i", "-", "input `file`")
        outputFilename := flags.String("o", "-", "output `file`")
+       onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot")
        err = flags.Parse(args)
        if err == flag.ErrHelp {
                err = nil
@@ -65,7 +66,7 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
                if err != nil {
                        return 1
                }
-               runner.Args = []string{"export-numpy", "-local=true", "-i", *inputFilename, "-o", "/mnt/output/library.npy"}
+               runner.Args = []string{"export-numpy", "-local=true", fmt.Sprintf("-one-hot=%v", *onehot), "-i", *inputFilename, "-o", "/mnt/output/library.npy"}
                var output string
                output, err = runner.Run()
                if err != nil {
@@ -123,8 +124,14 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
        if err != nil {
                return 1
        }
-       npw.Shape = []int{rows, cols}
-       npw.WriteUint16(out)
+       if *onehot {
+               out, cols := recodeOnehot(out, cols)
+               npw.Shape = []int{rows, cols}
+               npw.WriteUint8(out)
+       } else {
+               npw.Shape = []int{rows, cols}
+               npw.WriteUint16(out)
+       }
        err = bufw.Flush()
        if err != nil {
                return 1
@@ -136,6 +143,33 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
        return 0
 }
 
// recodeOnehot converts a row-major matrix of variant numbers into a
// row-major one-hot matrix.
//
// in holds len(in)/incols rows of incols values each; trailing values
// that do not fill a complete row are ignored. An input column whose
// largest value is m is expanded to m output columns: a cell with
// value v > 0 sets the v'th of those columns to 1, and a cell with
// value 0 leaves all of them 0. A column containing only zeroes
// therefore contributes no output columns at all.
//
// It returns the recoded matrix and its column count. If incols is
// not positive there is nothing to recode, and the result is an empty
// matrix with 0 columns.
func recodeOnehot(in []uint16, incols int) ([]uint8, int) {
	if incols <= 0 {
		// Guard against dividing by zero (or allocating a
		// negative length) below.
		return []uint8{}, 0
	}
	rows := len(in) / incols

	// Find the largest value in each input column; that is the
	// number of one-hot columns it needs.
	maxvalue := make([]uint16, incols)
	for row := 0; row < rows; row++ {
		for col, v := range in[row*incols : (row+1)*incols] {
			if maxvalue[col] < v {
				maxvalue[col] = v
			}
		}
	}

	// outcol[c] is the index of the first output column that
	// belongs to input column c.
	outcol := make([]int, incols)
	outcols := 0
	for incol, v := range maxvalue {
		outcol[incol] = outcols
		outcols += int(v)
	}

	out := make([]uint8, rows*outcols)
	for row := 0; row < rows; row++ {
		outrow := out[row*outcols : (row+1)*outcols]
		for col, v := range in[row*incols : (row+1)*incols] {
			if v > 0 {
				// Values are 1-based; 0 means "no bit set".
				outrow[outcol[col]+int(v)-1] = 1
			}
		}
	}
	return out, outcols
}
+
 type nopCloser struct {
        io.Writer
 }
index 8051e279a91f52d2ece97a9cb4838f80b357d440..7f22fb01b66535d6377a76be24fbc460cf0638ee 100644 (file)
@@ -45,3 +45,32 @@ func sortUints(variants []uint16) {
                }
        }
 }
+
+func (s *exportSuite) TestOnehot(c *check.C) {
+       for _, trial := range []struct {
+               incols  int
+               in      []uint16
+               outcols int
+               out     []uint8
+       }{
+               {2, []uint16{1, 1, 1, 1}, 2, []uint8{1, 1, 1, 1}},
+               {2, []uint16{1, 1, 1, 2}, 3, []uint8{1, 1, 0, 1, 0, 1}},
+               {
+                       // 2nd column => 3 one-hot columns
+                       // 4th column => 0 one-hot columns
+                       4, []uint16{
+                               1, 1, 0, 0,
+                               1, 2, 1, 0,
+                               1, 3, 0, 0,
+                       }, 5, []uint8{
+                               1, 1, 0, 0, 0,
+                               1, 0, 1, 0, 1,
+                               1, 0, 0, 1, 0,
+                       },
+               },
+       } {
+               out, outcols := recodeOnehot(trial.in, trial.incols)
+               c.Check(out, check.DeepEquals, trial.out)
+               c.Check(outcols, check.Equals, trial.outcols)
+       }
+}