Add tilestats cmd.
[lightning.git] / slice_test.go
1 // Copyright (C) The Lightning Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package lightning
6
7 import (
8         "io/ioutil"
9         "os"
10         "os/exec"
11
12         "github.com/kshedden/gonpy"
13         "gopkg.in/check.v1"
14 )
15
16 type sliceSuite struct{}
17
18 var _ = check.Suite(&sliceSuite{})
19
20 func (s *sliceSuite) TestImportAndSlice(c *check.C) {
21         tmpdir := c.MkDir()
22         err := os.Mkdir(tmpdir+"/lib1", 0777)
23         c.Assert(err, check.IsNil)
24         err = os.Mkdir(tmpdir+"/lib2", 0777)
25         c.Assert(err, check.IsNil)
26         err = os.Mkdir(tmpdir+"/lib3", 0777)
27         c.Assert(err, check.IsNil)
28         cwd, err := os.Getwd()
29         c.Assert(err, check.IsNil)
30         err = os.Symlink(cwd+"/testdata/pipeline1", tmpdir+"/pipeline1")
31         c.Assert(err, check.IsNil)
32         err = os.Symlink(cwd+"/testdata/pipeline1", tmpdir+"/pipeline1dup")
33         c.Assert(err, check.IsNil)
34
35         err = ioutil.WriteFile(tmpdir+"/chr1-12-100.bed", []byte("chr1\t12\t100\ttest.1\n"), 0644)
36         c.Check(err, check.IsNil)
37
38         c.Log("=== import testdata/ref ===")
39         exited := (&importer{}).RunCommand("import", []string{
40                 "-local=true",
41                 "-tag-library", "testdata/tags",
42                 "-output-tiles",
43                 "-save-incomplete-tiles",
44                 "-o", tmpdir + "/lib1/library1.gob",
45                 "testdata/ref.fasta",
46         }, nil, os.Stderr, os.Stderr)
47         c.Assert(exited, check.Equals, 0)
48
49         c.Log("=== import testdata/pipeline1 ===")
50         exited = (&importer{}).RunCommand("import", []string{
51                 "-local=true",
52                 "-tag-library", "testdata/tags",
53                 "-output-tiles",
54                 "-o", tmpdir + "/lib2/library2.gob",
55                 tmpdir + "/pipeline1",
56         }, nil, os.Stderr, os.Stderr)
57         c.Assert(exited, check.Equals, 0)
58
59         c.Log("=== import pipeline1dup ===")
60         exited = (&importer{}).RunCommand("import", []string{
61                 "-local=true",
62                 "-tag-library", "testdata/tags",
63                 "-output-tiles",
64                 "-o", tmpdir + "/lib3/library3.gob",
65                 tmpdir + "/pipeline1dup",
66         }, nil, os.Stderr, os.Stderr)
67         c.Assert(exited, check.Equals, 0)
68
69         slicedir := c.MkDir()
70
71         c.Log("=== slice ===")
72         exited = (&slicecmd{}).RunCommand("slice", []string{
73                 "-local=true",
74                 "-output-dir=" + slicedir,
75                 "-tags-per-file=2",
76                 tmpdir + "/lib1",
77                 tmpdir + "/lib2",
78                 tmpdir + "/lib3",
79         }, nil, os.Stderr, os.Stderr)
80         c.Check(exited, check.Equals, 0)
81         out, _ := exec.Command("find", slicedir, "-ls").CombinedOutput()
82         c.Logf("%s", out)
83
84         c.Log("=== dump ===")
85         {
86                 dumpdir := c.MkDir()
87                 exited = (&dump{}).RunCommand("dump", []string{
88                         "-local=true",
89                         "-tags=4,6,7",
90                         "-input-dir=" + slicedir,
91                         "-output-dir=" + dumpdir,
92                 }, nil, os.Stderr, os.Stderr)
93                 c.Check(exited, check.Equals, 0)
94                 out, _ := exec.Command("find", dumpdir, "-ls").CombinedOutput()
95                 c.Logf("%s", out)
96                 dumped, err := ioutil.ReadFile(dumpdir + "/variants.csv")
97                 c.Assert(err, check.IsNil)
98                 c.Logf("%s", dumped)
99                 c.Check("\n"+string(dumped), check.Matches, `(?ms).*\n6,1,1,chr2,349,AAAACTG.*`)
100         }
101
102         c.Log("=== slice-numpy ===")
103         {
104                 npydir := c.MkDir()
105                 exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
106                         "-local=true",
107                         "-input-dir=" + slicedir,
108                         "-output-dir=" + npydir,
109                 }, nil, os.Stderr, os.Stderr)
110                 c.Check(exited, check.Equals, 0)
111                 out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
112                 c.Logf("%s", out)
113
114                 f, err := os.Open(npydir + "/matrix.0000.npy")
115                 c.Assert(err, check.IsNil)
116                 defer f.Close()
117                 npy, err := gonpy.NewReader(f)
118                 c.Assert(err, check.IsNil)
119                 c.Check(npy.Shape, check.DeepEquals, []int{4, 4})
120                 variants, err := npy.GetInt16()
121                 c.Check(variants, check.DeepEquals, []int16{2, 1, 1, 2, -1, -1, 1, 1, 2, 1, 1, 2, -1, -1, 1, 1})
122
123                 annotations, err := ioutil.ReadFile(npydir + "/matrix.0000.annotations.csv")
124                 c.Assert(err, check.IsNil)
125                 c.Logf("%s", annotations)
126                 for _, s := range []string{
127                         "chr1:g.161A>T",
128                         "chr1:g.178A>T",
129                         "chr1:g.1_3delinsGGC",
130                         "chr1:g.222_224del",
131                 } {
132                         c.Check(string(annotations), check.Matches, "(?ms).*"+s+".*")
133                 }
134
135                 annotations, err = ioutil.ReadFile(npydir + "/matrix.0002.annotations.csv")
136                 c.Assert(err, check.IsNil)
137                 c.Logf("%s", annotations)
138                 for _, s := range []string{
139                         ",2,chr2:g.1_3delinsAAA",
140                         ",2,chr2:g.125_127delinsAAA",
141                         ",4,chr2:g.125_127delinsAAA",
142                 } {
143                         c.Check(string(annotations), check.Matches, "(?ms).*"+s+".*")
144                 }
145         }
146
147         c.Log("=== slice-numpy + regions ===")
148         {
149                 npydir := c.MkDir()
150                 exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
151                         "-local=true",
152                         "-regions=" + tmpdir + "/chr1-12-100.bed",
153                         "-input-dir=" + slicedir,
154                         "-output-dir=" + npydir,
155                         "-chunked-hgvs-matrix=true",
156                 }, nil, os.Stderr, os.Stderr)
157                 c.Check(exited, check.Equals, 0)
158                 out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
159                 c.Logf("%s", out)
160
161                 f, err := os.Open(npydir + "/matrix.0000.npy")
162                 c.Assert(err, check.IsNil)
163                 defer f.Close()
164                 npy, err := gonpy.NewReader(f)
165                 c.Assert(err, check.IsNil)
166                 c.Check(npy.Shape, check.DeepEquals, []int{4, 2})
167                 variants, err := npy.GetInt16()
168                 c.Check(variants, check.DeepEquals, []int16{2, 1, -1, -1, 2, 1, -1, -1})
169
170                 annotations, err := ioutil.ReadFile(npydir + "/matrix.0000.annotations.csv")
171                 c.Assert(err, check.IsNil)
172                 c.Logf("%s", annotations)
173                 for _, s := range []string{
174                         "chr1:g.161A>T",
175                         "chr1:g.178A>T",
176                         "chr1:g.1_3delinsGGC",
177                         "chr1:g.222_224del",
178                 } {
179                         c.Check(string(annotations), check.Matches, "(?ms).*"+s+".*")
180                 }
181
182                 for _, fnm := range []string{
183                         npydir + "/matrix.0001.annotations.csv",
184                         npydir + "/matrix.0002.annotations.csv",
185                 } {
186                         annotations, err := ioutil.ReadFile(fnm)
187                         c.Assert(err, check.IsNil)
188                         c.Check(string(annotations), check.Equals, "", check.Commentf(fnm))
189                 }
190         }
191
192         err = ioutil.WriteFile(tmpdir+"/chr1and2-100-200.bed", []byte("chr1\t100\t200\ttest.1\nchr2\t100\t200\ttest.2\n"), 0644)
193         c.Check(err, check.IsNil)
194
195         c.Log("=== slice-numpy + regions + merge ===")
196         {
197                 npydir := c.MkDir()
198                 exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
199                         "-local=true",
200                         "-regions=" + tmpdir + "/chr1and2-100-200.bed",
201                         "-input-dir=" + slicedir,
202                         "-output-dir=" + npydir,
203                         "-merge-output=true",
204                         "-single-hgvs-matrix=true",
205                 }, nil, os.Stderr, os.Stderr)
206                 c.Check(exited, check.Equals, 0)
207                 out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
208                 c.Logf("%s", out)
209
210                 f, err := os.Open(npydir + "/matrix.npy")
211                 c.Assert(err, check.IsNil)
212                 defer f.Close()
213                 npy, err := gonpy.NewReader(f)
214                 c.Assert(err, check.IsNil)
215                 c.Check(npy.Shape, check.DeepEquals, []int{4, 4})
216                 variants, err := npy.GetInt16()
217                 if c.Check(err, check.IsNil) {
218                         c.Check(variants, check.DeepEquals, []int16{
219                                 2, 1, 3, 1,
220                                 -1, -1, 4, 2,
221                                 2, 1, 3, 1,
222                                 -1, -1, 4, 2,
223                         })
224                 }
225
226                 annotations, err := ioutil.ReadFile(npydir + "/matrix.annotations.csv")
227                 c.Assert(err, check.IsNil)
228                 c.Logf("%s", annotations)
229                 for _, s := range []string{
230                         "0,0,1,chr1:g.161A>T",
231                         "0,0,1,chr1:g.178A>T",
232                         "4,1,2,chr2:g.125_127delinsAAA",
233                 } {
234                         c.Check(string(annotations), check.Matches, "(?ms).*"+s+".*")
235                 }
236         }
237
238         c.Log("=== slice-numpy + chunked hgvs matrix ===")
239         {
240                 err = ioutil.WriteFile(tmpdir+"/casecontrol.tsv", []byte(`SampleID      CC
241 pipeline1/input1        1
242 pipeline1/input2        0
243 pipeline1dup/input1     1
244 pipeline1dup/input2     0
245 `), 0600)
246                 c.Assert(err, check.IsNil)
247                 npydir := c.MkDir()
248                 exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
249                         "-local=true",
250                         "-chunked-hgvs-matrix=true",
251                         "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv",
252                         "-chi2-case-control-column=CC",
253                         "-chi2-p-value=0.5",
254                         "-min-coverage=0.75",
255                         "-input-dir=" + slicedir,
256                         "-output-dir=" + npydir,
257                 }, nil, os.Stderr, os.Stderr)
258                 c.Check(exited, check.Equals, 0)
259                 out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
260                 c.Logf("%s", out)
261
262                 annotations, err := ioutil.ReadFile(npydir + "/hgvs.chr2.annotations.csv")
263                 c.Assert(err, check.IsNil)
264                 c.Check(string(annotations), check.Equals, `0,chr2:g.470_472del
265 1,chr2:g.471G>A
266 2,chr2:g.472G>A
267 `)
268         }
269
270         c.Log("=== slice-numpy + onehotChunked ===")
271         {
272                 err = ioutil.WriteFile(tmpdir+"/casecontrol.tsv", []byte(`SampleID      CC
273 pipeline1/input1        1
274 pipeline1/input2        0
275 pipeline1dup/input1     1
276 pipeline1dup/input2     0
277 `), 0600)
278                 c.Assert(err, check.IsNil)
279                 npydir := c.MkDir()
280                 exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
281                         "-local=true",
282                         "-chunked-onehot=true",
283                         "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv",
284                         "-chi2-case-control-column=CC",
285                         "-chi2-p-value=0.5",
286                         "-min-coverage=0.75",
287                         "-input-dir=" + slicedir,
288                         "-output-dir=" + npydir,
289                 }, nil, os.Stderr, os.Stderr)
290                 c.Check(exited, check.Equals, 0)
291                 out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
292                 c.Logf("%s", out)
293
294                 f, err := os.Open(npydir + "/onehot.0002.npy")
295                 c.Assert(err, check.IsNil)
296                 defer f.Close()
297                 npy, err := gonpy.NewReader(f)
298                 c.Assert(err, check.IsNil)
299                 c.Check(npy.Shape, check.DeepEquals, []int{4, 3})
300                 onehot, err := npy.GetInt8()
301                 if c.Check(err, check.IsNil) {
302                         for r := 0; r < npy.Shape[0]; r++ {
303                                 c.Logf("%v", onehot[r*npy.Shape[1]:(r+1)*npy.Shape[1]])
304                         }
305                         c.Check(onehot, check.DeepEquals, []int8{
306                                 0, 1, 0, // input1
307                                 1, 0, 1, // input2
308                                 0, 1, 0, // dup/input1
309                                 1, 0, 1, // dup/input2
310                         })
311                 }
312         }
313
314         c.Log("=== slice-numpy + onehotSingle ===")
315         {
316                 err = ioutil.WriteFile(tmpdir+"/casecontrol.tsv", []byte(`SampleID      CC
317 pipeline1/input1        1
318 pipeline1/input2        0
319 pipeline1dup/input1     1
320 pipeline1dup/input2     0
321 `), 0600)
322                 c.Assert(err, check.IsNil)
323                 npydir := c.MkDir()
324                 exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
325                         "-local=true",
326                         "-single-onehot=true",
327                         "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv",
328                         "-chi2-case-control-column=CC",
329                         "-chi2-p-value=0.5",
330                         "-min-coverage=0.75",
331                         "-input-dir=" + slicedir,
332                         "-output-dir=" + npydir,
333                         "-debug-tag=1",
334                 }, nil, os.Stderr, os.Stderr)
335                 c.Check(exited, check.Equals, 0)
336                 out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
337                 c.Logf("%s", out)
338
339                 f, err := os.Open(npydir + "/onehot.npy")
340                 c.Assert(err, check.IsNil)
341                 defer f.Close()
342                 npy, err := gonpy.NewReader(f)
343                 c.Assert(err, check.IsNil)
344                 c.Check(npy.Shape, check.DeepEquals, []int{2, 16})
345                 onehot, err := npy.GetUint32()
346                 if c.Check(err, check.IsNil) {
347                         for r := 0; r < npy.Shape[0]; r++ {
348                                 c.Logf("%v", onehot[r*npy.Shape[1]:(r+1)*npy.Shape[1]])
349                         }
350                         c.Check(onehot, check.DeepEquals, []uint32{
351                                 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 0, 2,
352                                 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
353                         })
354                 }
355         }
356 }
357
358 func (s *sliceSuite) Test_tv2homhet(c *check.C) {
359         cmd := &sliceNumpy{
360                 cgnames:         []string{"sample1", "sample2", "sample3", "sample4"},
361                 chi2Cases:       []bool{false, true, true, false},
362                 chi2PValue:      .5,
363                 includeVariant1: true,
364         }
365         cgs := map[string]CompactGenome{
366                 "sample1": CompactGenome{
367                         Variants: []tileVariantID{0, 0, 1, 1}, // hom tv=1
368                 },
369                 "sample2": CompactGenome{
370                         Variants: []tileVariantID{0, 0, 5, 5}, // hom tv=2
371                 },
372                 "sample3": CompactGenome{
373                         Variants: []tileVariantID{0, 0, 5, 1}, // het tv=1, tv=2
374                 },
375                 "sample4": CompactGenome{
376                         Variants: []tileVariantID{0, 0, 9, 9}, // hom tv=3
377                 },
378         }
379         maxv := tileVariantID(3)
380         remap := []tileVariantID{0, 1, 0, 0, 0, 2, 0, 0, 0, 3}
381         chunkstarttag := tagID(10)
382         for tag := tagID(10); tag < 12; tag++ {
383                 c.Logf("=== tag %d", tag)
384                 chunk, xref := cmd.tv2homhet(cgs, maxv, remap, tag, chunkstarttag)
385                 c.Logf("chunk len=%d", len(chunk))
386                 for _, x := range chunk {
387                         c.Logf("%+v", x)
388                 }
389                 c.Logf("xref len=%d", len(xref))
390                 for _, x := range xref {
391                         c.Logf("%+v", x)
392                 }
393                 out := onehotcols2int8(chunk)
394                 c.Logf("onehotcols2int8(chunk) len=%d", len(out))
395                 for i := 0; i < len(out); i += len(chunk) {
396                         c.Logf("%+v", out[i:i+len(chunk)])
397                 }
398                 coords := onehotChunk2Indirect(chunk)
399                 c.Logf("onehotChunk2Indirect(chunk) len=%d", len(coords))
400                 for _, x := range coords {
401                         c.Logf("%+v", x)
402                 }
403         }
404 }