From ce6bc473186a2b7366bb22a31773310fee9dc4d5 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Thu, 10 Sep 2020 22:33:41 -0400 Subject: [PATCH 1/1] Option to recode as one-hot for numpy output. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- example-su92l-1kg.sh | 2 +- exportnumpy.go | 40 +++++++++++++++++++++++++++++++++++++--- exportnumpy_test.go | 29 +++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 4 deletions(-) diff --git a/example-su92l-1kg.sh b/example-su92l-1kg.sh index 6ca93ac278..7c66d76575 100755 --- a/example-su92l-1kg.sh +++ b/example-su92l-1kg.sh @@ -18,7 +18,7 @@ genome=$(lightning ref2genome -project ${project} -priority ${priority} -r fasta=$(lightning vcf2fasta -project ${project} -priority ${priority} -ref ${ref_fa} -genome ${genome} -mask=true ${gvcf}) unfiltered=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true ${fasta}) filtered=$(lightning filter -project ${project} -priority ${priority} -i ${unfiltered} -min-coverage "0.9" -max-variants "30") -numpy=$(lightning export-numpy -project ${project} -priority ${priority} -i ${filtered}) +numpy=$(lightning export-numpy -project ${project} -priority ${priority} -i ${filtered} -one-hot) pca=$(lightning pca -project ${project} -priority ${priority} -i ${numpy}) plot=$(lightning plot -project ${project} -priority ${priority} -i ${pca} -labels-csv ${info}/sample_info.csv -sample-fasta-dir ${fasta}) echo >&2 "https://workbench2.${plot%%-*}.arvadosapi.com/collections/${plot}" diff --git a/exportnumpy.go b/exportnumpy.go index 300093cbd9..e7a8cf8609 100644 --- a/exportnumpy.go +++ b/exportnumpy.go @@ -34,6 +34,7 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader, priority := flags.Int("priority", 500, "container request priority") inputFilename := flags.String("i", "-", "input `file`") outputFilename := flags.String("o", "-", "output `file`") + onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot") 
// recodeOnehot converts a row-major matrix of tile variant numbers
// into a row-major one-hot matrix.
//
// in holds len(in)/incols rows of incols values each. For every input
// column, the largest value M found in that column decides how many
// output columns it is expanded to: M columns, one per variant number
// 1..M. A value v > 0 sets the v-th of those columns to 1 for that
// row; a value of 0 leaves all of them 0. An input column that is 0
// in every row therefore contributes no output columns at all.
//
// It returns the one-hot matrix and its number of columns. If incols
// is not positive there is nothing to recode, and the result is an
// empty matrix with 0 columns (instead of a division-by-zero panic).
// Trailing values that do not fill a whole row are ignored.
func recodeOnehot(in []uint16, incols int) ([]uint8, int) {
	if incols <= 0 {
		// Guard the division below; also covers make() with a
		// negative length.
		return []uint8{}, 0
	}
	rows := len(in) / incols

	// Pass 1: find the largest variant number in each input column.
	maxvalue := make([]uint16, incols)
	for row := 0; row < rows; row++ {
		for col, v := range in[row*incols : (row+1)*incols] {
			if maxvalue[col] < v {
				maxvalue[col] = v
			}
		}
	}

	// Pass 2: assign each input column the offset of its first
	// output column; the running total is the output width.
	outcol := make([]int, incols)
	outcols := 0
	for incol, v := range maxvalue {
		outcol[incol] = outcols
		outcols += int(v)
	}

	// Pass 3: set the single "hot" cell for every non-zero input
	// value. Variant v maps to offset v-1 within its column group.
	out := make([]uint8, rows*outcols)
	for row := 0; row < rows; row++ {
		outrow := out[row*outcols : (row+1)*outcols]
		for col, v := range in[row*incols : (row+1)*incols] {
			if v > 0 {
				outrow[outcol[col]+int(v)-1] = 1
			}
		}
	}
	return out, outcols
}
int + in []uint16 + outcols int + out []uint8 + }{ + {2, []uint16{1, 1, 1, 1}, 2, []uint8{1, 1, 1, 1}}, + {2, []uint16{1, 1, 1, 2}, 3, []uint8{1, 1, 0, 1, 0, 1}}, + { + // 2nd column => 3 one-hot columns + // 4th column => 0 one-hot columns + 4, []uint16{ + 1, 1, 0, 0, + 1, 2, 1, 0, + 1, 3, 0, 0, + }, 5, []uint8{ + 1, 1, 0, 0, 0, + 1, 0, 1, 0, 1, + 1, 0, 0, 1, 0, + }, + }, + } { + out, outcols := recodeOnehot(trial.in, trial.incols) + c.Check(out, check.DeepEquals, trial.out) + c.Check(outcols, check.Equals, trial.outcols) + } +} -- 2.30.2