# Pipeline of lightning subcommands. Each step captures the stdout of one
# subcommand in a variable, and later steps receive that value as an argument
# (via -i, or positionally for import).
# NOTE(review): the captured values are presumably collection identifiers
# rather than data -- confirm against the lightning CLI.
fasta=$(lightning vcf2fasta -project ${project} -priority ${priority} -ref ${ref_fa} -genome ${genome} -mask=true ${gvcf})
unfiltered=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true ${fasta})
filtered=$(lightning filter -project ${project} -priority ${priority} -i ${unfiltered} -min-coverage "0.9" -max-variants "30")
# The next two lines are the removed (-) and added (+) sides of a diff hunk:
# the added side differs only by passing -one-hot to export-numpy.
-numpy=$(lightning export-numpy -project ${project} -priority ${priority} -i ${filtered})
+numpy=$(lightning export-numpy -project ${project} -priority ${priority} -i ${filtered} -one-hot)
pca=$(lightning pca -project ${project} -priority ${priority} -i ${numpy})
plot=$(lightning plot -project ${project} -priority ${priority} -i ${pca} -labels-csv ${info}/sample_info.csv -sample-fasta-dir ${fasta})
# Print a workbench URL for the plot output on stderr. ${plot%%-*} keeps only
# the part of ${plot} before its first "-", which is used as the host prefix.
echo >&2 "https://workbench2.${plot%%-*}.arvadosapi.com/collections/${plot}"
priority := flags.Int("priority", 500, "container request priority")
inputFilename := flags.String("i", "-", "input `file`")
outputFilename := flags.String("o", "-", "output `file`")
+ onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot")
err = flags.Parse(args)
if err == flag.ErrHelp {
err = nil
if err != nil {
return 1
}
- runner.Args = []string{"export-numpy", "-local=true", "-i", *inputFilename, "-o", "/mnt/output/library.npy"}
+ runner.Args = []string{"export-numpy", "-local=true", fmt.Sprintf("-one-hot=%v", *onehot), "-i", *inputFilename, "-o", "/mnt/output/library.npy"}
var output string
output, err = runner.Run()
if err != nil {
if err != nil {
return 1
}
- npw.Shape = []int{rows, cols}
- npw.WriteUint16(out)
+ if *onehot {
+ out, cols := recodeOnehot(out, cols)
+ npw.Shape = []int{rows, cols}
+ npw.WriteUint8(out)
+ } else {
+ npw.Shape = []int{rows, cols}
+ npw.WriteUint16(out)
+ }
err = bufw.Flush()
if err != nil {
return 1
return 0
}
// recodeOnehot converts a row-major matrix of variant numbers into a
// row-major one-hot matrix.
//
// in is read as len(in)/incols rows of incols values each; any
// trailing values that do not fill a whole row are ignored. Each
// input column expands to as many output columns as the largest
// value found in that column, so a column containing only zeros
// contributes no output columns. A value v > 0 sets the v-th output
// column of its input column to 1; a value of 0 leaves all of that
// input column's output columns at 0.
//
// It returns the recoded data and the number of columns per output
// row. If incols is not positive, it returns an empty slice and 0
// instead of panicking.
func recodeOnehot(in []uint16, incols int) ([]uint8, int) {
	if incols <= 0 {
		// Without a positive column count no rows can be
		// delimited, and len(in)/incols would divide by zero.
		return []uint8{}, 0
	}
	rows := len(in) / incols

	// maxvalue[col] is the largest value seen in input column
	// col, i.e., the number of output columns it needs.
	maxvalue := make([]uint16, incols)
	for row := 0; row < rows; row++ {
		for col, v := range in[row*incols : (row+1)*incols] {
			if maxvalue[col] < v {
				maxvalue[col] = v
			}
		}
	}

	// outcol[col] is the index of the first output column
	// belonging to input column col.
	outcol := make([]int, incols)
	outcols := 0
	for col, v := range maxvalue {
		outcol[col] = outcols
		outcols += int(v)
	}

	out := make([]uint8, rows*outcols)
	for row := 0; row < rows; row++ {
		outrow := out[row*outcols : (row+1)*outcols]
		for col, v := range in[row*incols : (row+1)*incols] {
			if v > 0 {
				// Values are 1-based: value 1 maps to
				// the first output column of col.
				outrow[outcol[col]+int(v)-1] = 1
			}
		}
	}
	return out, outcols
}
+
// nopCloser embeds an io.Writer, so the wrapped writer's methods are
// promoted onto this type.
// NOTE(review): the name suggests a no-op Close method is declared
// elsewhere in the file; it is not visible here -- confirm.
type nopCloser struct {
io.Writer
}
}
}
}
+
+func (s *exportSuite) TestOnehot(c *check.C) {
+ for _, trial := range []struct {
+ incols int
+ in []uint16
+ outcols int
+ out []uint8
+ }{
+ {2, []uint16{1, 1, 1, 1}, 2, []uint8{1, 1, 1, 1}},
+ {2, []uint16{1, 1, 1, 2}, 3, []uint8{1, 1, 0, 1, 0, 1}},
+ {
+ // 2nd column => 3 one-hot columns
+ // 4th column => 0 one-hot columns
+ 4, []uint16{
+ 1, 1, 0, 0,
+ 1, 2, 1, 0,
+ 1, 3, 0, 0,
+ }, 5, []uint8{
+ 1, 1, 0, 0, 0,
+ 1, 0, 1, 0, 1,
+ 1, 0, 0, 1, 0,
+ },
+ },
+ } {
+ out, outcols := recodeOnehot(trial.in, trial.incols)
+ c.Check(out, check.DeepEquals, trial.out)
+ c.Check(outcols, check.Equals, trial.outcols)
+ }
+}