1 // Copyright (C) The Lightning Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
19 "git.arvados.org/arvados.git/sdk/go/arvados"
20 log "github.com/sirupsen/logrus"
23 type tilingStats struct {
26 func (cmd *tilingStats) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
30 fmt.Fprintf(stderr, "%s\n", err)
33 flags := flag.NewFlagSet("", flag.ContinueOnError)
34 flags.SetOutput(stderr)
35 pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
36 runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
37 projectUUID := flags.String("project", "", "project `UUID` for output data")
38 priority := flags.Int("priority", 500, "container request priority")
39 inputDir := flags.String("input-dir", "./in", "input `directory`")
40 outputDir := flags.String("output-dir", "./out", "output `directory`")
41 err = flags.Parse(args)
42 if err == flag.ErrHelp {
45 } else if err != nil {
51 log.Println(http.ListenAndServe(*pprof, nil))
56 runner := arvadosContainerRunner{
57 Name: "lightning tiling-stats",
58 Client: arvados.NewClientFromEnv(),
59 ProjectUUID: *projectUUID,
66 err = runner.TranslatePaths(inputDir)
70 runner.Args = []string{"tiling-stats", "-local=true",
72 "-input-dir=" + *inputDir,
73 "-output-dir=/mnt/output",
76 output, err = runner.Run()
80 fmt.Fprintln(stdout, output)
84 infiles, err := allFiles(*inputDir, matchGobFile)
88 if len(infiles) == 0 {
89 err = fmt.Errorf("no input files found in %s", *inputDir)
94 var refseqs []CompactSequence
95 var reftiledata = make(map[tileLibRef][]byte, 11000000)
96 in0, err := open(infiles[0])
102 err = DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error {
103 if len(ent.TagSet) > 0 {
104 taglen = len(ent.TagSet[0])
106 refseqs = append(refseqs, ent.CompactSequences...)
107 for _, tv := range ent.TileVariants {
109 reftiledata[tileLibRef{tv.Tag, tv.Variant}] = tv.Sequence
118 if len(refseqs) == 0 {
119 err = fmt.Errorf("%s: reference sequence not found", infiles[0])
123 err = fmt.Errorf("%s: tagset not found", infiles[0])
127 for _, cseq := range refseqs {
128 _, basename := filepath.Split(cseq.Name)
129 bedname := fmt.Sprintf("%s/%s.bed", *outputDir, basename)
130 log.Infof("writing %s", bedname)
132 f, err = os.OpenFile(bedname, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0777)
137 bufw := bufio.NewWriterSize(f, 1<<24)
138 seqnames := make([]string, 0, len(cseq.TileSequences))
139 for seqname := range cseq.TileSequences {
140 seqnames = append(seqnames, seqname)
142 sort.Strings(seqnames)
143 // Mark duplicate tags (tags that place more than once
145 duptag := map[tagID]bool{}
146 for _, seqname := range seqnames {
147 for _, libref := range cseq.TileSequences[seqname] {
148 if _, seen := duptag[libref.Tag]; seen {
149 duptag[libref.Tag] = true
151 duptag[libref.Tag] = false
155 for _, seqname := range seqnames {
157 for _, libref := range cseq.TileSequences[seqname] {
158 if duptag[libref.Tag] {
161 tiledata := reftiledata[libref]
162 if len(tiledata) <= taglen {
163 err = fmt.Errorf("bogus input data: ref tile libref %v has len %d < taglen %d", libref, len(tiledata), taglen)
166 score := 1000 * countBases(tiledata) / len(tiledata)
167 _, err = fmt.Fprintf(bufw, "%s %d %d %d %d . %d %d\n",
169 pos, pos+len(tiledata),
172 pos+taglen, pos+len(tiledata)-taglen)
176 pos += len(tiledata) - taglen