-package main
+package lightning
import (
"bufio"
"net/http"
_ "net/http/pprof"
"os"
+ "path"
"sort"
+ "strings"
"git.arvados.org/arvados.git/sdk/go/arvados"
"github.com/kshedden/gonpy"
+ "github.com/sirupsen/logrus"
log "github.com/sirupsen/logrus"
)
priority := flags.Int("priority", 500, "container request priority")
inputFilename := flags.String("i", "-", "input `file`")
outputFilename := flags.String("o", "-", "output `file`")
- annotationsFilename := flags.String("output-annotations", "", "output `file` for tile variant annotations tsv")
- librefsFilename := flags.String("output-onehot2tilevar", "", "when using -one-hot, create tsv `file` mapping column# to tag# and variant#")
+ annotationsFilename := flags.String("output-annotations", "", "output `file` for tile variant annotations csv")
+ librefsFilename := flags.String("output-onehot2tilevar", "", "when using -one-hot, create csv `file` mapping column# to tag# and variant#")
+ labelsFilename := flags.String("output-labels", "", "output `file` for genome labels csv")
onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot")
cmd.filter.Flags(flags)
err = flags.Parse(args)
Name: "lightning export-numpy",
Client: arvados.NewClientFromEnv(),
ProjectUUID: *projectUUID,
- RAM: 128000000000,
+ RAM: 450000000000,
VCPUs: 32,
Priority: *priority,
+ KeepCache: 1,
+ APIAccess: true,
}
err = runner.TranslatePaths(inputFilename)
if err != nil {
fmt.Sprintf("-one-hot=%v", *onehot),
"-i", *inputFilename,
"-o", "/mnt/output/matrix.npy",
- "-output-annotations", "/mnt/output/annotations.tsv",
- "-output-onehot2tilevar", "/mnt/output/onehot2tilevar.tsv",
+ "-output-annotations", "/mnt/output/annotations.csv",
+ "-output-onehot2tilevar", "/mnt/output/onehot2tilevar.csv",
+ "-output-labels", "/mnt/output/labels.csv",
"-max-variants", fmt.Sprintf("%d", cmd.filter.MaxVariants),
"-min-coverage", fmt.Sprintf("%f", cmd.filter.MinCoverage),
"-max-tag", fmt.Sprintf("%d", cmd.filter.MaxTag),
if *inputFilename == "-" {
input = ioutil.NopCloser(stdin)
} else {
- input, err = os.Open(*inputFilename)
+ input, err = open(*inputFilename)
if err != nil {
return 1
}
defer input.Close()
}
+ input = ioutil.NopCloser(bufio.NewReaderSize(input, 8*1024*1024))
tilelib := &tileLibrary{
retainNoCalls: true,
retainTileSequences: true,
compactGenomes: map[string][]tileVariantID{},
}
- err = tilelib.LoadGob(context.Background(), input, nil)
+ err = tilelib.LoadGob(context.Background(), input, strings.HasSuffix(*inputFilename, ".gz"), nil)
if err != nil {
return 1
}
}
log.Info("building numpy array")
- out, rows, cols := cgs2array(tilelib.compactGenomes)
+ out, rows, cols, names := cgs2array(tilelib)
+
+ if *labelsFilename != "" {
+ log.Infof("writing labels to %s", *labelsFilename)
+ var f *os.File
+ f, err = os.OpenFile(*labelsFilename, os.O_CREATE|os.O_WRONLY, 0777)
+ if err != nil {
+ return 1
+ }
+ defer f.Close()
+ _, outBasename := path.Split(*outputFilename)
+ for i, name := range names {
+ _, err = fmt.Fprintf(f, "%d,%q,%q\n", i, trimFilenameForLabel(name), outBasename)
+ if err != nil {
+ err = fmt.Errorf("write %s: %w", *labelsFilename, err)
+ return 1
+ }
+ }
+ err = f.Close()
+ if err != nil {
+ err = fmt.Errorf("close %s: %w", *labelsFilename, err)
+ return 1
+ }
+ }
+
+ log.Info("writing numpy file")
var output io.WriteCloser
if *outputFilename == "-" {
output = nopCloser{stdout}
}
}
}
- log.Info("writing numpy")
+ log.WithFields(logrus.Fields{
+ "rows": rows,
+ "cols": cols,
+ }).Info("writing numpy")
npw.Shape = []int{rows, cols}
- npw.WriteUint16(out)
+ npw.WriteInt16(out)
err = bufw.Flush()
if err != nil {
return 1
}
defer f.Close()
for i, libref := range librefs {
- _, err = fmt.Fprintf(f, "%d\t%d\t%d\n", i, libref.Tag, libref.Variant)
+ _, err = fmt.Fprintf(f, "%d,%d,%d\n", i, libref.Tag, libref.Variant)
if err != nil {
return err
}
return f.Close()
}
-func cgs2array(cgs map[string][]tileVariantID) (data []uint16, rows, cols int) {
- var cgnames []string
- for name := range cgs {
+func cgs2array(tilelib *tileLibrary) (data []int16, rows, cols int, cgnames []string) {
+ for name := range tilelib.compactGenomes {
cgnames = append(cgnames, name)
}
- sort.Strings(cgnames)
+ sort.Slice(cgnames, func(i, j int) bool {
+ return trimFilenameForLabel(cgnames[i]) < trimFilenameForLabel(cgnames[j])
+ })
- rows = len(cgs)
- for _, cg := range cgs {
+ rows = len(tilelib.compactGenomes)
+ for _, cg := range tilelib.compactGenomes {
if cols < len(cg) {
cols = len(cg)
}
}
- data = make([]uint16, rows*cols)
+
+ // flag low-quality tile variants so we can change to -1 below
+ lowqual := make([]map[tileVariantID]bool, cols/2)
+ for tag, variants := range tilelib.variant {
+ lq := lowqual[tag]
+ for varidx, hash := range variants {
+ if len(tilelib.seq[hash]) == 0 {
+ if lq == nil {
+ lq = map[tileVariantID]bool{}
+ lowqual[tag] = lq
+ }
+ lq[tileVariantID(varidx+1)] = true
+ }
+ }
+ }
+
+ data = make([]int16, rows*cols)
for row, name := range cgnames {
- for i, v := range cgs[name] {
- data[row*cols+i] = uint16(v)
+ for i, v := range tilelib.compactGenomes[name] {
+ if v > 0 && lowqual[i/2][v] {
+ data[row*cols+i] = -1
+ } else {
+ data[row*cols+i] = int16(v)
+ }
}
}
+
return
}
-func recodeOnehot(in []uint16, incols int) (out []uint16, librefs []tileLibRef, outcols int) {
+func recodeOnehot(in []int16, incols int) (out []int16, librefs []tileLibRef, outcols int) {
rows := len(in) / incols
- maxvalue := make([]uint16, incols)
+ maxvalue := make([]int16, incols)
for row := 0; row < rows; row++ {
for col := 0; col < incols; col++ {
if v := in[row*incols+col]; maxvalue[col] < v {
}
log.Printf("recodeOnehot: dropped %d input cols with zero maxvalue", dropped)
- out = make([]uint16, rows*outcols)
+ out = make([]int16, rows*outcols)
for inidx, row := 0, 0; row < rows; row++ {
outrow := out[row*outcols:]
for col := 0; col < incols; col++ {
}
func (nopCloser) Close() error { return nil }
+
+func trimFilenameForLabel(s string) string {
+ if i := strings.LastIndex(s, "/"); i >= 0 {
+ s = s[i+1:]
+ }
+ s = strings.TrimSuffix(s, ".gz")
+ s = strings.TrimSuffix(s, ".fa")
+ s = strings.TrimSuffix(s, ".fasta")
+ s = strings.TrimSuffix(s, ".1")
+ s = strings.TrimSuffix(s, ".2")
+ s = strings.TrimSuffix(s, ".gz")
+ s = strings.TrimSuffix(s, ".vcf")
+ return s
+}