19566: Merge branch 'main'
[lightning.git] / tilestats.go
1 // Copyright (C) The Lightning Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package lightning
6
7 import (
8         "bufio"
9         "flag"
10         "fmt"
11         "io"
12         "net/http"
13         _ "net/http/pprof"
14         "os"
15         "path/filepath"
16         "sort"
17         "strings"
18
19         "git.arvados.org/arvados.git/sdk/go/arvados"
20         log "github.com/sirupsen/logrus"
21 )
22
23 type tilingStats struct {
24 }
25
26 func (cmd *tilingStats) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
27         var err error
28         defer func() {
29                 if err != nil {
30                         fmt.Fprintf(stderr, "%s\n", err)
31                 }
32         }()
33         flags := flag.NewFlagSet("", flag.ContinueOnError)
34         flags.SetOutput(stderr)
35         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
36         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
37         projectUUID := flags.String("project", "", "project `UUID` for output data")
38         priority := flags.Int("priority", 500, "container request priority")
39         inputDir := flags.String("input-dir", "./in", "input `directory`")
40         outputDir := flags.String("output-dir", "./out", "output `directory`")
41         err = flags.Parse(args)
42         if err == flag.ErrHelp {
43                 err = nil
44                 return 0
45         } else if err != nil {
46                 return 2
47         } else if flags.NArg() > 0 {
48                 err = fmt.Errorf("errant command line arguments after parsed flags: %v", flags.Args())
49                 return 2
50         }
51
52         if *pprof != "" {
53                 go func() {
54                         log.Println(http.ListenAndServe(*pprof, nil))
55                 }()
56         }
57
58         if !*runlocal {
59                 runner := arvadosContainerRunner{
60                         Name:        "lightning tiling-stats",
61                         Client:      arvados.NewClientFromEnv(),
62                         ProjectUUID: *projectUUID,
63                         RAM:         8000000000,
64                         VCPUs:       2,
65                         Priority:    *priority,
66                         KeepCache:   2,
67                         APIAccess:   true,
68                 }
69                 err = runner.TranslatePaths(inputDir)
70                 if err != nil {
71                         return 1
72                 }
73                 runner.Args = []string{"tiling-stats", "-local=true",
74                         "-pprof=:6060",
75                         "-input-dir=" + *inputDir,
76                         "-output-dir=/mnt/output",
77                 }
78                 var output string
79                 output, err = runner.Run()
80                 if err != nil {
81                         return 1
82                 }
83                 fmt.Fprintln(stdout, output)
84                 return 0
85         }
86
87         infiles, err := allFiles(*inputDir, matchGobFile)
88         if err != nil {
89                 return 1
90         }
91         if len(infiles) == 0 {
92                 err = fmt.Errorf("no input files found in %s", *inputDir)
93                 return 1
94         }
95         sort.Strings(infiles)
96
97         var refseqs []CompactSequence
98         var reftiledata = make(map[tileLibRef][]byte, 11000000)
99         in0, err := open(infiles[0])
100         if err != nil {
101                 return 1
102         }
103         defer in0.Close()
104         var taglen int
105         err = DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error {
106                 if len(ent.TagSet) > 0 {
107                         taglen = len(ent.TagSet[0])
108                 }
109                 refseqs = append(refseqs, ent.CompactSequences...)
110                 for _, tv := range ent.TileVariants {
111                         if tv.Ref {
112                                 reftiledata[tileLibRef{tv.Tag, tv.Variant}] = tv.Sequence
113                         }
114                 }
115                 return nil
116         })
117         if err != nil {
118                 return 1
119         }
120         in0.Close()
121         if len(refseqs) == 0 {
122                 err = fmt.Errorf("%s: reference sequence not found", infiles[0])
123                 return 1
124         }
125         if taglen == 0 {
126                 err = fmt.Errorf("%s: tagset not found", infiles[0])
127                 return 1
128         }
129
130         for _, cseq := range refseqs {
131                 _, basename := filepath.Split(cseq.Name)
132                 bedname := fmt.Sprintf("%s/%s.bed", *outputDir, basename)
133                 log.Infof("writing %s", bedname)
134                 var f *os.File
135                 f, err = os.OpenFile(bedname, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0777)
136                 if err != nil {
137                         return 1
138                 }
139                 defer f.Close()
140                 bufw := bufio.NewWriterSize(f, 1<<24)
141                 seqnames := make([]string, 0, len(cseq.TileSequences))
142                 for seqname := range cseq.TileSequences {
143                         seqnames = append(seqnames, seqname)
144                 }
145                 sort.Strings(seqnames)
146                 // Mark duplicate tags (tags that place more than once
147                 // on the reference)
148                 duptag := map[tagID]bool{}
149                 for _, seqname := range seqnames {
150                         for _, libref := range cseq.TileSequences[seqname] {
151                                 if _, seen := duptag[libref.Tag]; seen {
152                                         duptag[libref.Tag] = true
153                                 } else {
154                                         duptag[libref.Tag] = false
155                                 }
156                         }
157                 }
158                 for _, seqname := range seqnames {
159                         pos := 0
160                         for _, libref := range cseq.TileSequences[seqname] {
161                                 if duptag[libref.Tag] {
162                                         continue
163                                 }
164                                 tiledata := reftiledata[libref]
165                                 if len(tiledata) <= taglen {
166                                         err = fmt.Errorf("bogus input data: ref tile libref %v has len %d < taglen %d", libref, len(tiledata), taglen)
167                                         return 1
168                                 }
169                                 score := 1000 * countBases(tiledata) / len(tiledata)
170                                 _, err = fmt.Fprintf(bufw, "%s %d %d %d %d . %d %d\n",
171                                         seqname,
172                                         pos, pos+len(tiledata),
173                                         libref.Tag,
174                                         score,
175                                         pos+taglen, pos+len(tiledata)-taglen)
176                                 if err != nil {
177                                         return 1
178                                 }
179                                 pos += len(tiledata) - taglen
180                         }
181                 }
182                 err = bufw.Flush()
183                 if err != nil {
184                         return 1
185                 }
186                 err = f.Close()
187                 if err != nil {
188                         return 1
189                 }
190         }
191         return 0
192 }