Separate options to output single/chunked hgvs matrices.
[lightning.git] / slicenumpy.go
1 // Copyright (C) The Lightning Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package lightning
6
7 import (
8         "bufio"
9         "bytes"
10         "flag"
11         "fmt"
12         "io"
13         "io/ioutil"
14         "net/http"
15         _ "net/http/pprof"
16         "os"
17         "regexp"
18         "runtime"
19         "sort"
20         "strconv"
21         "strings"
22         "sync/atomic"
23
24         "git.arvados.org/arvados.git/sdk/go/arvados"
25         "github.com/arvados/lightning/hgvs"
26         "github.com/kshedden/gonpy"
27         log "github.com/sirupsen/logrus"
28         "golang.org/x/crypto/blake2b"
29 )
30
31 type sliceNumpy struct {
32         filter  filter
33         threads int
34 }
35
36 func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
37         var err error
38         defer func() {
39                 if err != nil {
40                         fmt.Fprintf(stderr, "%s\n", err)
41                 }
42         }()
43         flags := flag.NewFlagSet("", flag.ContinueOnError)
44         flags.SetOutput(stderr)
45         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
46         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
47         projectUUID := flags.String("project", "", "project `UUID` for output data")
48         priority := flags.Int("priority", 500, "container request priority")
49         inputDir := flags.String("input-dir", "./in", "input `directory`")
50         outputDir := flags.String("output-dir", "./out", "output `directory`")
51         ref := flags.String("ref", "", "reference name (if blank, choose last one that appears in input)")
52         regionsFilename := flags.String("regions", "", "only output columns/annotations that intersect regions in specified bed `file`")
53         expandRegions := flags.Int("expand-regions", 0, "expand specified regions by `N` base pairs on each side`")
54         mergeOutput := flags.Bool("merge-output", false, "merge output into one matrix.npy and one matrix.annotations.csv")
55         hgvsSingle := flags.Bool("single-hgvs-matrix", false, "also generate hgvs-based matrix")
56         hgvsChunked := flags.Bool("chunked-hgvs-matrix", false, "also generate hgvs-based matrix per chromosome")
57         flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads")
58         cmd.filter.Flags(flags)
59         err = flags.Parse(args)
60         if err == flag.ErrHelp {
61                 err = nil
62                 return 0
63         } else if err != nil {
64                 return 2
65         }
66
67         if *pprof != "" {
68                 go func() {
69                         log.Println(http.ListenAndServe(*pprof, nil))
70                 }()
71         }
72
73         if !*runlocal {
74                 runner := arvadosContainerRunner{
75                         Name:        "lightning slice-numpy",
76                         Client:      arvados.NewClientFromEnv(),
77                         ProjectUUID: *projectUUID,
78                         RAM:         750000000000,
79                         VCPUs:       96,
80                         Priority:    *priority,
81                         KeepCache:   2,
82                         APIAccess:   true,
83                 }
84                 err = runner.TranslatePaths(inputDir, regionsFilename)
85                 if err != nil {
86                         return 1
87                 }
88                 runner.Args = []string{"slice-numpy", "-local=true",
89                         "-pprof=:6060",
90                         "-input-dir=" + *inputDir,
91                         "-output-dir=/mnt/output",
92                         "-threads=" + fmt.Sprintf("%d", cmd.threads),
93                         "-regions=" + *regionsFilename,
94                         "-expand-regions=" + fmt.Sprintf("%d", *expandRegions),
95                         "-merge-output=" + fmt.Sprintf("%v", *mergeOutput),
96                         "-single-hgvs-matrix=" + fmt.Sprintf("%v", *hgvsSingle),
97                         "-chunked-hgvs-matrix=" + fmt.Sprintf("%v", *hgvsChunked),
98                 }
99                 runner.Args = append(runner.Args, cmd.filter.Args()...)
100                 var output string
101                 output, err = runner.Run()
102                 if err != nil {
103                         return 1
104                 }
105                 fmt.Fprintln(stdout, output)
106                 return 0
107         }
108
109         infiles, err := allGobFiles(*inputDir)
110         if err != nil {
111                 return 1
112         }
113         if len(infiles) == 0 {
114                 err = fmt.Errorf("no input files found in %s", *inputDir)
115                 return 1
116         }
117         sort.Strings(infiles)
118
119         var cgnames []string
120         var refseq map[string][]tileLibRef
121         var reftiledata = make(map[tileLibRef][]byte, 11000000)
122         in0, err := open(infiles[0])
123         if err != nil {
124                 return 1
125         }
126
127         matchGenome, err := regexp.Compile(cmd.filter.MatchGenome)
128         if err != nil {
129                 err = fmt.Errorf("-match-genome: invalid regexp: %q", cmd.filter.MatchGenome)
130                 return 1
131         }
132
133         taglen := -1
134         DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error {
135                 if len(ent.TagSet) > 0 {
136                         taglen = len(ent.TagSet[0])
137                 }
138                 for _, cseq := range ent.CompactSequences {
139                         if cseq.Name == *ref || *ref == "" {
140                                 refseq = cseq.TileSequences
141                         }
142                 }
143                 for _, cg := range ent.CompactGenomes {
144                         if matchGenome.MatchString(cg.Name) {
145                                 cgnames = append(cgnames, cg.Name)
146                         }
147                 }
148                 for _, tv := range ent.TileVariants {
149                         if tv.Ref {
150                                 reftiledata[tileLibRef{tv.Tag, tv.Variant}] = tv.Sequence
151                         }
152                 }
153                 return nil
154         })
155         if err != nil {
156                 return 1
157         }
158         in0.Close()
159         if refseq == nil {
160                 err = fmt.Errorf("%s: reference sequence not found", infiles[0])
161                 return 1
162         }
163         if taglen < 0 {
164                 err = fmt.Errorf("tagset not found")
165                 return 1
166         }
167         if len(cgnames) == 0 {
168                 err = fmt.Errorf("no genomes found matching regexp %q", cmd.filter.MatchGenome)
169                 return 1
170         }
171         sort.Strings(cgnames)
172
173         {
174                 labelsFilename := *outputDir + "/labels.csv"
175                 log.Infof("writing labels to %s", labelsFilename)
176                 var f *os.File
177                 f, err = os.Create(labelsFilename)
178                 if err != nil {
179                         return 1
180                 }
181                 defer f.Close()
182                 for i, name := range cgnames {
183                         _, err = fmt.Fprintf(f, "%d,%q\n", i, trimFilenameForLabel(name))
184                         if err != nil {
185                                 err = fmt.Errorf("write %s: %w", labelsFilename, err)
186                                 return 1
187                         }
188                 }
189                 err = f.Close()
190                 if err != nil {
191                         err = fmt.Errorf("close %s: %w", labelsFilename, err)
192                         return 1
193                 }
194         }
195
196         log.Info("indexing reference tiles")
197         type reftileinfo struct {
198                 variant  tileVariantID
199                 seqname  string // chr1
200                 pos      int    // distance from start of chromosome to starttag
201                 tiledata []byte // acgtggcaa...
202         }
203         isdup := map[tagID]bool{}
204         reftile := map[tagID]*reftileinfo{}
205         for seqname, cseq := range refseq {
206                 pos := 0
207                 for _, libref := range cseq {
208                         tiledata := reftiledata[libref]
209                         if len(tiledata) == 0 {
210                                 err = fmt.Errorf("missing tiledata for tag %d variant %d in %s in ref", libref.Tag, libref.Variant, seqname)
211                                 return 1
212                         }
213                         if isdup[libref.Tag] {
214                                 log.Printf("dropping reference tile %+v from %s @ %d, tag not unique", libref, seqname, pos)
215                         } else if reftile[libref.Tag] != nil {
216                                 log.Printf("dropping reference tile %+v from %s @ %d, tag not unique", tileLibRef{Tag: libref.Tag, Variant: reftile[libref.Tag].variant}, reftile[libref.Tag].seqname, reftile[libref.Tag].pos)
217                                 delete(reftile, libref.Tag)
218                                 log.Printf("dropping reference tile %+v from %s @ %d, tag not unique", libref, seqname, pos)
219                                 isdup[libref.Tag] = true
220                         } else {
221                                 reftile[libref.Tag] = &reftileinfo{
222                                         seqname:  seqname,
223                                         variant:  libref.Variant,
224                                         tiledata: tiledata,
225                                         pos:      pos,
226                                 }
227                         }
228                         pos += len(tiledata) - taglen
229                 }
230                 log.Printf("... %s done, len %d", seqname, pos+taglen)
231         }
232
233         var mask *mask
234         if *regionsFilename != "" {
235                 log.Printf("loading regions from %s", *regionsFilename)
236                 mask, err = makeMask(*regionsFilename, *expandRegions)
237                 if err != nil {
238                         return 1
239                 }
240                 log.Printf("before applying mask, len(reftile) == %d", len(reftile))
241                 log.Printf("deleting reftile entries for regions outside %d intervals", mask.Len())
242                 for tag, rt := range reftile {
243                         if !mask.Check(strings.TrimPrefix(rt.seqname, "chr"), rt.pos, rt.pos+len(rt.tiledata)) {
244                                 delete(reftile, tag)
245                         }
246                 }
247                 log.Printf("after applying mask, len(reftile) == %d", len(reftile))
248         }
249
250         var toMerge [][]int16
251         if *mergeOutput || *hgvsSingle || *hgvsChunked {
252                 toMerge = make([][]int16, len(infiles))
253         }
254
255         throttleMem := throttle{Max: cmd.threads} // TODO: estimate using mem and data size
256         throttleNumpyMem := throttle{Max: cmd.threads/2 + 1}
257         log.Info("generating annotations and numpy matrix for each slice")
258         var done int64
259         for infileIdx, infile := range infiles {
260                 infileIdx, infile := infileIdx, infile
261                 throttleMem.Go(func() error {
262                         seq := make(map[tagID][]TileVariant, 50000)
263                         cgs := make(map[string]CompactGenome, len(cgnames))
264                         f, err := open(infile)
265                         if err != nil {
266                                 return err
267                         }
268                         defer f.Close()
269                         log.Infof("%04d: reading %s", infileIdx, infile)
270                         err = DecodeLibrary(f, strings.HasSuffix(infile, ".gz"), func(ent *LibraryEntry) error {
271                                 for _, tv := range ent.TileVariants {
272                                         if tv.Ref {
273                                                 continue
274                                         }
275                                         if mask != nil && reftile[tv.Tag] == nil {
276                                                 // Don't waste
277                                                 // time/memory on
278                                                 // masked-out tiles.
279                                                 continue
280                                         }
281                                         variants := seq[tv.Tag]
282                                         if len(variants) == 0 {
283                                                 variants = make([]TileVariant, 100)
284                                         }
285                                         for len(variants) <= int(tv.Variant) {
286                                                 variants = append(variants, TileVariant{})
287                                         }
288                                         variants[int(tv.Variant)] = tv
289                                         seq[tv.Tag] = variants
290                                 }
291                                 for _, cg := range ent.CompactGenomes {
292                                         if matchGenome.MatchString(cg.Name) {
293                                                 cgs[cg.Name] = cg
294                                         }
295                                 }
296                                 return nil
297                         })
298                         if err != nil {
299                                 return err
300                         }
301                         tagstart := cgs[cgnames[0]].StartTag
302                         tagend := cgs[cgnames[0]].EndTag
303
304                         // TODO: filters
305
306                         log.Infof("%04d: renumber/dedup variants for tags %d-%d", infileIdx, tagstart, tagend)
307                         variantRemap := make([][]tileVariantID, tagend-tagstart)
308                         throttleCPU := throttle{Max: runtime.GOMAXPROCS(0)}
309                         for tag, variants := range seq {
310                                 tag, variants := tag, variants
311                                 throttleCPU.Acquire()
312                                 go func() {
313                                         defer throttleCPU.Release()
314                                         count := make(map[[blake2b.Size256]byte]int, len(variants))
315
316                                         rt := reftile[tag]
317                                         if rt != nil {
318                                                 count[blake2b.Sum256(rt.tiledata)] = 0
319                                         }
320
321                                         for _, cg := range cgs {
322                                                 idx := int(tag-tagstart) * 2
323                                                 if idx < len(cg.Variants) {
324                                                         for allele := 0; allele < 2; allele++ {
325                                                                 v := cg.Variants[idx+allele]
326                                                                 if v > 0 && len(variants[v].Sequence) > 0 {
327                                                                         count[variants[v].Blake2b]++
328                                                                 }
329                                                         }
330                                                 }
331                                         }
332                                         // hash[i] will be the hash of
333                                         // the variant(s) that should
334                                         // be at rank i (0-based).
335                                         hash := make([][blake2b.Size256]byte, 0, len(count))
336                                         for b := range count {
337                                                 hash = append(hash, b)
338                                         }
339                                         sort.Slice(hash, func(i, j int) bool {
340                                                 bi, bj := &hash[i], &hash[j]
341                                                 if ci, cj := count[*bi], count[*bj]; ci != cj {
342                                                         return ci > cj
343                                                 } else {
344                                                         return bytes.Compare((*bi)[:], (*bj)[:]) < 0
345                                                 }
346                                         })
347                                         // rank[b] will be the 1-based
348                                         // new variant number for
349                                         // variants whose hash is b.
350                                         rank := make(map[[blake2b.Size256]byte]tileVariantID, len(hash))
351                                         for i, h := range hash {
352                                                 rank[h] = tileVariantID(i + 1)
353                                         }
354                                         // remap[v] will be the new
355                                         // variant number for original
356                                         // variant number v.
357                                         remap := make([]tileVariantID, len(variants))
358                                         for i, tv := range variants {
359                                                 remap[i] = rank[tv.Blake2b]
360                                         }
361                                         variantRemap[tag-tagstart] = remap
362                                         if rt != nil {
363                                                 rt.variant = rank[blake2b.Sum256(rt.tiledata)]
364                                         }
365                                 }()
366                         }
367                         throttleCPU.Wait()
368
369                         annotationsFilename := fmt.Sprintf("%s/matrix.%04d.annotations.csv", *outputDir, infileIdx)
370                         log.Infof("%04d: writing %s", infileIdx, annotationsFilename)
371                         annof, err := os.Create(annotationsFilename)
372                         if err != nil {
373                                 return err
374                         }
375                         annow := bufio.NewWriterSize(annof, 1<<20)
376                         outcol := 0
377                         for tag := tagstart; tag < tagend; tag++ {
378                                 rt, ok := reftile[tag]
379                                 if !ok {
380                                         if mask == nil {
381                                                 outcol++
382                                         }
383                                         // Excluded by specified
384                                         // regions, or reference does
385                                         // not use any variant of this
386                                         // tile. (TODO: log this?
387                                         // mention it in annotations?)
388                                         continue
389                                 }
390                                 fmt.Fprintf(annow, "%d,%d,%d,=,%s,%d,,,\n", tag, outcol, rt.variant, rt.seqname, rt.pos)
391                                 variants := seq[tag]
392                                 reftilestr := strings.ToUpper(string(rt.tiledata))
393                                 remap := variantRemap[tag-tagstart]
394                                 done := make([]bool, len(variants))
395                                 for v, tv := range variants {
396                                         v := remap[v]
397                                         if v == rt.variant || done[v] {
398                                                 continue
399                                         } else {
400                                                 done[v] = true
401                                         }
402                                         if len(tv.Sequence) < taglen || !bytes.HasSuffix(rt.tiledata, tv.Sequence[len(tv.Sequence)-taglen:]) {
403                                                 fmt.Fprintf(annow, "%d,%d,%d,,%s,%d,,,\n", tag, outcol, v, rt.seqname, rt.pos)
404                                                 continue
405                                         }
406                                         if lendiff := len(rt.tiledata) - len(tv.Sequence); lendiff < -1000 || lendiff > 1000 {
407                                                 fmt.Fprintf(annow, "%d,%d,%d,,%s,%d,,,\n", tag, outcol, v, rt.seqname, rt.pos)
408                                                 continue
409                                         }
410                                         diffs, _ := hgvs.Diff(reftilestr, strings.ToUpper(string(tv.Sequence)), 0)
411                                         for _, diff := range diffs {
412                                                 diff.Position += rt.pos
413                                                 fmt.Fprintf(annow, "%d,%d,%d,%s:g.%s,%s,%d,%s,%s,%s\n", tag, outcol, v, rt.seqname, diff.String(), rt.seqname, diff.Position, diff.Ref, diff.New, diff.Left)
414                                         }
415                                 }
416                                 outcol++
417                         }
418                         err = annow.Flush()
419                         if err != nil {
420                                 return err
421                         }
422                         err = annof.Close()
423                         if err != nil {
424                                 return err
425                         }
426
427                         log.Infof("%04d: preparing numpy", infileIdx)
428                         throttleNumpyMem.Acquire()
429                         rows := len(cgnames)
430                         cols := 2 * outcol
431                         out := make([]int16, rows*cols)
432                         for row, name := range cgnames {
433                                 out := out[row*cols:]
434                                 outcol := 0
435                                 for col, v := range cgs[name].Variants {
436                                         tag := tagstart + tagID(col/2)
437                                         if mask != nil && reftile[tag] == nil {
438                                                 continue
439                                         }
440                                         if variants, ok := seq[tag]; ok && len(variants) > int(v) && len(variants[v].Sequence) > 0 {
441                                                 out[outcol] = int16(variantRemap[tag-tagstart][v])
442                                         } else {
443                                                 out[outcol] = -1
444                                         }
445                                         outcol++
446                                 }
447                         }
448                         seq = nil
449                         throttleNumpyMem.Release()
450
451                         if *mergeOutput || *hgvsSingle || *hgvsChunked {
452                                 log.Infof("%04d: matrix fragment %d rows x %d cols", infileIdx, rows, cols)
453                                 toMerge[infileIdx] = out
454                         }
455                         if !*mergeOutput {
456                                 fnm := fmt.Sprintf("%s/matrix.%04d.npy", *outputDir, infileIdx)
457                                 err = writeNumpyInt16(fnm, out, rows, cols)
458                                 if err != nil {
459                                         return err
460                                 }
461                         }
462                         log.Infof("%s: done (%d/%d)", infile, int(atomic.AddInt64(&done, 1)), len(infiles))
463                         return nil
464                 })
465         }
466         if err = throttleMem.Wait(); err != nil {
467                 return 1
468         }
469         if *mergeOutput || *hgvsSingle || *hgvsChunked {
470                 var annow *bufio.Writer
471                 var annof *os.File
472                 if *mergeOutput {
473                         annoFilename := fmt.Sprintf("%s/matrix.annotations.csv", *outputDir)
474                         annof, err = os.Create(annoFilename)
475                         if err != nil {
476                                 return 1
477                         }
478                         annow = bufio.NewWriterSize(annof, 1<<20)
479                 }
480
481                 rows := len(cgnames)
482                 cols := 0
483                 for _, chunk := range toMerge {
484                         cols += len(chunk) / rows
485                 }
486                 log.Infof("merging output matrix (rows=%d, cols=%d, mem=%d) and annotations", rows, cols, rows*cols*2)
487                 var out []int16
488                 if *mergeOutput {
489                         out = make([]int16, rows*cols)
490                 }
491                 hgvsCols := map[string][2][]int16{} // hgvs -> [[g0,g1,g2,...], [g0,g1,g2,...]] (slice of genomes for each phase)
492                 startcol := 0
493                 for outIdx, chunk := range toMerge {
494                         chunkcols := len(chunk) / rows
495                         if *mergeOutput {
496                                 for row := 0; row < rows; row++ {
497                                         copy(out[row*cols+startcol:], chunk[row*chunkcols:(row+1)*chunkcols])
498                                 }
499                         }
500                         toMerge[outIdx] = nil
501
502                         annotationsFilename := fmt.Sprintf("%s/matrix.%04d.annotations.csv", *outputDir, outIdx)
503                         log.Infof("reading %s", annotationsFilename)
504                         buf, err := os.ReadFile(annotationsFilename)
505                         if err != nil {
506                                 return 1
507                         }
508                         if *mergeOutput {
509                                 err = os.Remove(annotationsFilename)
510                                 if err != nil {
511                                         return 1
512                                 }
513                         }
514                         for _, line := range bytes.Split(buf, []byte{'\n'}) {
515                                 if len(line) == 0 {
516                                         continue
517                                 }
518                                 fields := bytes.SplitN(line, []byte{','}, 9)
519                                 tag, _ := strconv.Atoi(string(fields[0]))
520                                 incol, _ := strconv.Atoi(string(fields[1]))
521                                 tileVariant, _ := strconv.Atoi(string(fields[2]))
522                                 hgvsID := string(fields[3])
523                                 seqname := string(fields[4])
524                                 pos, _ := strconv.Atoi(string(fields[5]))
525                                 refseq := fields[6]
526                                 if hgvsID == "" {
527                                         // Null entry for un-diffable
528                                         // tile variant
529                                         continue
530                                 }
531                                 if hgvsID == "=" {
532                                         // Null entry for ref tile
533                                         continue
534                                 }
535                                 if mask != nil && !mask.Check(strings.TrimPrefix(seqname, "chr"), pos, pos+len(refseq)) {
536                                         // The tile intersects one of
537                                         // the selected regions, but
538                                         // this particular HGVS
539                                         // variant does not.
540                                         continue
541                                 }
542                                 hgvsColPair := hgvsCols[hgvsID]
543                                 if hgvsColPair[0] == nil {
544                                         // values in new columns start
545                                         // out as -1 ("no data yet")
546                                         // or 0 ("=ref") here, may
547                                         // change to 1 ("hgvs variant
548                                         // present") below, either on
549                                         // this line or a future line.
550                                         hgvsColPair = [2][]int16{make([]int16, len(cgnames)), make([]int16, len(cgnames))}
551                                         rt, ok := reftile[tagID(tag)]
552                                         if !ok {
553                                                 err = fmt.Errorf("bug: seeing annotations for tag %d, but it has no reftile entry", tag)
554                                                 return 1
555                                         }
556                                         for ph := 0; ph < 2; ph++ {
557                                                 for row := 0; row < rows; row++ {
558                                                         v := chunk[row*chunkcols+incol*2+ph]
559                                                         if tileVariantID(v) == rt.variant {
560                                                                 hgvsColPair[ph][row] = 0
561                                                         } else {
562                                                                 hgvsColPair[ph][row] = -1
563                                                         }
564                                                 }
565                                         }
566                                         hgvsCols[hgvsID] = hgvsColPair
567                                         hgvsref := hgvs.Variant{
568                                                 Position: pos,
569                                                 Ref:      string(refseq),
570                                                 New:      string(refseq),
571                                         }
572                                         if annow != nil {
573                                                 fmt.Fprintf(annow, "%d,%d,%d,%s:g.%s,%s,%d,%s,%s,%s\n", tag, incol+startcol/2, rt.variant, seqname, hgvsref.String(), seqname, pos, refseq, refseq, fields[8])
574                                         }
575                                 }
576                                 if annow != nil {
577                                         fmt.Fprintf(annow, "%d,%d,%d,%s,%s,%d,%s,%s,%s\n", tag, incol+startcol/2, tileVariant, hgvsID, seqname, pos, refseq, fields[7], fields[8])
578                                 }
579                                 for ph := 0; ph < 2; ph++ {
580                                         for row := 0; row < rows; row++ {
581                                                 v := chunk[row*chunkcols+incol*2+ph]
582                                                 if int(v) == tileVariant {
583                                                         hgvsColPair[ph][row] = 1
584                                                 }
585                                         }
586                                 }
587                         }
588
589                         startcol += chunkcols
590                 }
591                 if *mergeOutput {
592                         err = annow.Flush()
593                         if err != nil {
594                                 return 1
595                         }
596                         err = annof.Close()
597                         if err != nil {
598                                 return 1
599                         }
600                         err = writeNumpyInt16(fmt.Sprintf("%s/matrix.npy", *outputDir), out, rows, cols)
601                         if err != nil {
602                                 return 1
603                         }
604                 }
605                 out = nil
606
607                 var seqnames []string
608                 if *hgvsSingle {
609                         seqnames = []string{""}
610                 }
611                 if *hgvsChunked {
612                         for seqname := range refseq {
613                                 seqnames = append(seqnames, seqname)
614                         }
615                 }
616                 for _, seqname := range seqnames {
617                         basename := "hgvs"
618                         wantPrefix := ""
619                         if seqname == "" {
620                                 cols = len(hgvsCols) * 2
621                         } else {
622                                 basename = "hgvs." + seqname
623                                 wantPrefix = seqname + ":"
624                                 cols = 0
625                                 for hgvsID := range hgvsCols {
626                                         if strings.HasPrefix(hgvsID, wantPrefix) {
627                                                 cols += 2
628                                         }
629                                 }
630                         }
631                         log.Printf("building hgvs-based matrix [%s]: %d rows x %d cols", seqname, rows, cols)
632                         out = make([]int16, rows*cols)
633                         hgvsIDs := make([]string, 0, cols/2)
634                         for hgvsID := range hgvsCols {
635                                 if strings.HasPrefix(hgvsID, wantPrefix) {
636                                         hgvsIDs = append(hgvsIDs, hgvsID)
637                                 }
638                         }
639                         sort.Strings(hgvsIDs)
640                         var hgvsLabels bytes.Buffer
641                         for idx, hgvsID := range hgvsIDs {
642                                 fmt.Fprintf(&hgvsLabels, "%d,%s\n", idx, hgvsID)
643                                 for ph := 0; ph < 2; ph++ {
644                                         hgvscol := hgvsCols[hgvsID][ph]
645                                         for row, val := range hgvscol {
646                                                 out[row*cols+idx*2+ph] = val
647                                         }
648                                 }
649                         }
650                         err = writeNumpyInt16(fmt.Sprintf("%s/%s.npy", *outputDir, basename), out, rows, cols)
651                         if err != nil {
652                                 return 1
653                         }
654
655                         fnm := fmt.Sprintf("%s/%s.annotations.csv", *outputDir, basename)
656                         log.Printf("writing hgvs labels: %s", fnm)
657                         err = ioutil.WriteFile(fnm, hgvsLabels.Bytes(), 0777)
658                         if err != nil {
659                                 return 1
660                         }
661                 }
662         }
663         return 0
664 }
665
666 func writeNumpyInt16(fnm string, out []int16, rows, cols int) error {
667         output, err := os.Create(fnm)
668         if err != nil {
669                 return err
670         }
671         defer output.Close()
672         bufw := bufio.NewWriterSize(output, 1<<26)
673         npw, err := gonpy.NewWriter(nopCloser{bufw})
674         if err != nil {
675                 return err
676         }
677         log.WithFields(log.Fields{
678                 "filename": fnm,
679                 "rows":     rows,
680                 "cols":     cols,
681         }).Infof("writing numpy: %s", fnm)
682         npw.Shape = []int{rows, cols}
683         npw.WriteInt16(out)
684         err = bufw.Flush()
685         if err != nil {
686                 return err
687         }
688         return output.Close()
689 }