}
log.Info("building numpy array")
- out, rows, cols := cgs2array(tilelib.compactGenomes)
+ out, rows, cols := cgs2array(tilelib)
+
+ log.Info("writing numpy file")
var output io.WriteCloser
if *outputFilename == "-" {
output = nopCloser{stdout}
}
log.Info("writing numpy")
npw.Shape = []int{rows, cols}
- npw.WriteUint16(out)
+ npw.WriteInt16(out)
err = bufw.Flush()
if err != nil {
return 1
return f.Close()
}
-func cgs2array(cgs map[string][]tileVariantID) (data []uint16, rows, cols int) {
+func cgs2array(tilelib *tileLibrary) (data []int16, rows, cols int) {
var cgnames []string
- for name := range cgs {
+ for name := range tilelib.compactGenomes {
cgnames = append(cgnames, name)
}
sort.Strings(cgnames)
- rows = len(cgs)
- for _, cg := range cgs {
+ rows = len(tilelib.compactGenomes)
+ for _, cg := range tilelib.compactGenomes {
if cols < len(cg) {
cols = len(cg)
}
}
- data = make([]uint16, rows*cols)
+
+ // flag low-quality tile variants so we can change to -1 below
+ lowqual := make([]map[tileVariantID]bool, cols/2)
+ for tag, variants := range tilelib.variant {
+ lq := lowqual[tag]
+ for varidx, hash := range variants {
+ if len(tilelib.seq[hash]) == 0 {
+ if lq == nil {
+ lq = map[tileVariantID]bool{}
+ lowqual[tag] = lq
+ }
+ lq[tileVariantID(varidx+1)] = true
+ }
+ }
+ }
+
+ data = make([]int16, rows*cols)
for row, name := range cgnames {
- for i, v := range cgs[name] {
- data[row*cols+i] = uint16(v)
+ for i, v := range tilelib.compactGenomes[name] {
+ if v > 0 && lowqual[i/2][v] {
+ data[row*cols+i] = -1
+ } else {
+ data[row*cols+i] = int16(v)
+ }
}
}
+
return
}
-func recodeOnehot(in []uint16, incols int) (out []uint16, librefs []tileLibRef, outcols int) {
+func recodeOnehot(in []int16, incols int) (out []int16, librefs []tileLibRef, outcols int) {
rows := len(in) / incols
- maxvalue := make([]uint16, incols)
+ maxvalue := make([]int16, incols)
for row := 0; row < rows; row++ {
for col := 0; col < incols; col++ {
if v := in[row*incols+col]; maxvalue[col] < v {
}
log.Printf("recodeOnehot: dropped %d input cols with zero maxvalue", dropped)
- out = make([]uint16, rows*outcols)
+ out = make([]int16, rows*outcols)
for inidx, row := 0, 0; row < rows; row++ {
outrow := out[row*outcols:]
for col := 0; col < incols; col++ {
c.Check(exited, check.Equals, 0)
npy, err := gonpy.NewReader(&output)
c.Assert(err, check.IsNil)
- variants, err := npy.GetUint16()
+ variants, err := npy.GetInt16()
c.Assert(err, check.IsNil)
for i := 0; i < 4; i += 2 {
if variants[i] == 1 {
- c.Check(variants[i+1], check.Equals, uint16(2), check.Commentf("i=%d, v=%v", i, variants))
+ c.Check(variants[i+1], check.Equals, int16(2), check.Commentf("i=%d, v=%v", i, variants))
} else {
- c.Check(variants[i], check.Equals, uint16(2), check.Commentf("i=%d, v=%v", i, variants))
+ c.Check(variants[i], check.Equals, int16(2), check.Commentf("i=%d, v=%v", i, variants))
}
}
for i := 4; i < 9; i += 2 {
- c.Check(variants[i], check.Equals, uint16(1), check.Commentf("i=%d, v=%v", i, variants))
+ c.Check(variants[i], check.Equals, int16(1), check.Commentf("i=%d, v=%v", i, variants))
}
}
-func sortUints(variants []uint16) {
+func sortUints(variants []int16) {
for i := 0; i < len(variants); i += 2 {
if variants[i] > variants[i+1] {
for j := 0; j < len(variants); j++ {
func (s *exportSuite) TestOnehot(c *check.C) {
for _, trial := range []struct {
incols int
- in []uint16
+ in []int16
outcols int
- out []uint16
+ out []int16
}{
- {2, []uint16{1, 1, 1, 1}, 2, []uint16{1, 1, 1, 1}},
- {2, []uint16{1, 1, 1, 2}, 3, []uint16{1, 1, 0, 1, 0, 1}},
+ {2, []int16{1, 1, 1, 1}, 2, []int16{1, 1, 1, 1}},
+ {2, []int16{1, 1, 1, 2}, 3, []int16{1, 1, 0, 1, 0, 1}},
{
// 2nd column => 3 one-hot columns
// 4th column => 0 one-hot columns
- 4, []uint16{
+ 4, []int16{
1, 1, 0, 0,
1, 2, 1, 0,
1, 3, 0, 0,
- }, 5, []uint16{
+ }, 5, []int16{
1, 1, 0, 0, 0,
1, 0, 1, 0, 1,
1, 0, 0, 1, 0,
}
log.Print("converting cgs to array")
- data, rows, cols := cgs2array(tilelib.compactGenomes)
+ data, rows, cols := cgs2array(&tilelib)
if *onehot {
log.Printf("recode one-hot: %d rows, %d cols", rows, cols)
data, _, cols = recodeOnehot(data, cols)
return 0
}
-func array2matrix(rows, cols int, data []uint16) mat.Matrix {
+func array2matrix(rows, cols int, data []int16) mat.Matrix {
floatdata := make([]float64, rows*cols)
for i, v := range data {
floatdata[i] = float64(v)