19073: Fix dup tag detection.
[lightning.git] / slice_test.go
1 // Copyright (C) The Lightning Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package lightning
6
7 import (
8         "io/ioutil"
9         "os"
10         "os/exec"
11
12         "github.com/kshedden/gonpy"
13         "gopkg.in/check.v1"
14 )
15
16 type sliceSuite struct{}
17
18 var _ = check.Suite(&sliceSuite{})
19
20 func (s *sliceSuite) TestImportAndSlice(c *check.C) {
21         tmpdir := c.MkDir()
22         err := os.Mkdir(tmpdir+"/lib1", 0777)
23         c.Assert(err, check.IsNil)
24         err = os.Mkdir(tmpdir+"/lib2", 0777)
25         c.Assert(err, check.IsNil)
26         err = os.Mkdir(tmpdir+"/lib3", 0777)
27         c.Assert(err, check.IsNil)
28         cwd, err := os.Getwd()
29         c.Assert(err, check.IsNil)
30         err = os.Symlink(cwd+"/testdata/pipeline1", tmpdir+"/pipeline1")
31         c.Assert(err, check.IsNil)
32         err = os.Symlink(cwd+"/testdata/pipeline1", tmpdir+"/pipeline1dup")
33         c.Assert(err, check.IsNil)
34
35         err = ioutil.WriteFile(tmpdir+"/chr1-12-100.bed", []byte("chr1\t12\t100\ttest.1\n"), 0644)
36         c.Check(err, check.IsNil)
37
38         c.Log("=== import testdata/ref ===")
39         exited := (&importer{}).RunCommand("import", []string{
40                 "-local=true",
41                 "-tag-library", "testdata/tags",
42                 "-output-tiles",
43                 "-save-incomplete-tiles",
44                 "-o", tmpdir + "/lib1/library1.gob",
45                 "testdata/ref.fasta",
46         }, nil, os.Stderr, os.Stderr)
47         c.Assert(exited, check.Equals, 0)
48
49         c.Log("=== import testdata/pipeline1 ===")
50         exited = (&importer{}).RunCommand("import", []string{
51                 "-local=true",
52                 "-tag-library", "testdata/tags",
53                 "-output-tiles",
54                 "-o", tmpdir + "/lib2/library2.gob",
55                 tmpdir + "/pipeline1",
56         }, nil, os.Stderr, os.Stderr)
57         c.Assert(exited, check.Equals, 0)
58
59         c.Log("=== import pipeline1dup ===")
60         exited = (&importer{}).RunCommand("import", []string{
61                 "-local=true",
62                 "-tag-library", "testdata/tags",
63                 "-output-tiles",
64                 "-o", tmpdir + "/lib3/library3.gob",
65                 tmpdir + "/pipeline1dup",
66         }, nil, os.Stderr, os.Stderr)
67         c.Assert(exited, check.Equals, 0)
68
69         slicedir := c.MkDir()
70
71         c.Log("=== slice ===")
72         exited = (&slicecmd{}).RunCommand("slice", []string{
73                 "-local=true",
74                 "-output-dir=" + slicedir,
75                 "-tags-per-file=2",
76                 tmpdir + "/lib1",
77                 tmpdir + "/lib2",
78                 tmpdir + "/lib3",
79         }, nil, os.Stderr, os.Stderr)
80         c.Check(exited, check.Equals, 0)
81         out, _ := exec.Command("find", slicedir, "-ls").CombinedOutput()
82         c.Logf("%s", out)
83
84         c.Log("=== dump ===")
85         {
86                 dumpdir := c.MkDir()
87                 exited = (&dump{}).RunCommand("dump", []string{
88                         "-local=true",
89                         "-tags=4,6,7",
90                         "-input-dir=" + slicedir,
91                         "-output-dir=" + dumpdir,
92                 }, nil, os.Stderr, os.Stderr)
93                 c.Check(exited, check.Equals, 0)
94                 out, _ := exec.Command("find", dumpdir, "-ls").CombinedOutput()
95                 c.Logf("%s", out)
96                 dumped, err := ioutil.ReadFile(dumpdir + "/variants.csv")
97                 c.Assert(err, check.IsNil)
98                 c.Logf("%s", dumped)
99                 c.Check("\n"+string(dumped), check.Matches, `(?ms).*\n6,1,1,chr2,349,AAAACTG.*`)
100         }
101
102         c.Log("=== slice-numpy ===")
103         {
104                 npydir := c.MkDir()
105                 exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
106                         "-local=true",
107                         "-input-dir=" + slicedir,
108                         "-output-dir=" + npydir,
109                 }, nil, os.Stderr, os.Stderr)
110                 c.Check(exited, check.Equals, 0)
111                 out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
112                 c.Logf("%s", out)
113
114                 f, err := os.Open(npydir + "/matrix.0000.npy")
115                 c.Assert(err, check.IsNil)
116                 defer f.Close()
117                 npy, err := gonpy.NewReader(f)
118                 c.Assert(err, check.IsNil)
119                 c.Check(npy.Shape, check.DeepEquals, []int{4, 4})
120                 variants, err := npy.GetInt16()
121                 c.Check(variants, check.DeepEquals, []int16{2, 1, 1, 2, -1, -1, 1, 1, 2, 1, 1, 2, -1, -1, 1, 1})
122
123                 annotations, err := ioutil.ReadFile(npydir + "/matrix.0000.annotations.csv")
124                 c.Assert(err, check.IsNil)
125                 c.Logf("%s", annotations)
126                 for _, s := range []string{
127                         "chr1:g.161A>T",
128                         "chr1:g.178A>T",
129                         "chr1:g.1_3delinsGGC",
130                         "chr1:g.222_224del",
131                 } {
132                         c.Check(string(annotations), check.Matches, "(?ms).*"+s+".*")
133                 }
134
135                 annotations, err = ioutil.ReadFile(npydir + "/matrix.0002.annotations.csv")
136                 c.Assert(err, check.IsNil)
137                 c.Logf("%s", annotations)
138                 for _, s := range []string{
139                         ",2,chr2:g.1_3delinsAAA",
140                         ",2,chr2:g.125_127delinsAAA",
141                         ",4,chr2:g.125_127delinsAAA",
142                 } {
143                         c.Check(string(annotations), check.Matches, "(?ms).*"+s+".*")
144                 }
145         }
146
147         c.Log("=== slice-numpy + regions ===")
148         {
149                 npydir := c.MkDir()
150                 exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
151                         "-local=true",
152                         "-regions=" + tmpdir + "/chr1-12-100.bed",
153                         "-input-dir=" + slicedir,
154                         "-output-dir=" + npydir,
155                         "-chunked-hgvs-matrix=true",
156                 }, nil, os.Stderr, os.Stderr)
157                 c.Check(exited, check.Equals, 0)
158                 out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
159                 c.Logf("%s", out)
160
161                 f, err := os.Open(npydir + "/matrix.0000.npy")
162                 c.Assert(err, check.IsNil)
163                 defer f.Close()
164                 npy, err := gonpy.NewReader(f)
165                 c.Assert(err, check.IsNil)
166                 c.Check(npy.Shape, check.DeepEquals, []int{4, 2})
167                 variants, err := npy.GetInt16()
168                 c.Check(variants, check.DeepEquals, []int16{2, 1, -1, -1, 2, 1, -1, -1})
169
170                 annotations, err := ioutil.ReadFile(npydir + "/matrix.0000.annotations.csv")
171                 c.Assert(err, check.IsNil)
172                 c.Logf("%s", annotations)
173                 for _, s := range []string{
174                         "chr1:g.161A>T",
175                         "chr1:g.178A>T",
176                         "chr1:g.1_3delinsGGC",
177                         "chr1:g.222_224del",
178                 } {
179                         c.Check(string(annotations), check.Matches, "(?ms).*"+s+".*")
180                 }
181
182                 for _, fnm := range []string{
183                         npydir + "/matrix.0001.annotations.csv",
184                         npydir + "/matrix.0002.annotations.csv",
185                 } {
186                         annotations, err := ioutil.ReadFile(fnm)
187                         c.Assert(err, check.IsNil)
188                         c.Check(string(annotations), check.Equals, "", check.Commentf(fnm))
189                 }
190         }
191
192         err = ioutil.WriteFile(tmpdir+"/chr1and2-100-200.bed", []byte("chr1\t100\t200\ttest.1\nchr2\t100\t200\ttest.2\n"), 0644)
193         c.Check(err, check.IsNil)
194
195         c.Log("=== slice-numpy + regions + merge ===")
196         {
197                 npydir := c.MkDir()
198                 exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
199                         "-local=true",
200                         "-regions=" + tmpdir + "/chr1and2-100-200.bed",
201                         "-input-dir=" + slicedir,
202                         "-output-dir=" + npydir,
203                         "-merge-output=true",
204                         "-single-hgvs-matrix=true",
205                 }, nil, os.Stderr, os.Stderr)
206                 c.Check(exited, check.Equals, 0)
207                 out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
208                 c.Logf("%s", out)
209
210                 f, err := os.Open(npydir + "/matrix.npy")
211                 c.Assert(err, check.IsNil)
212                 defer f.Close()
213                 npy, err := gonpy.NewReader(f)
214                 c.Assert(err, check.IsNil)
215                 c.Check(npy.Shape, check.DeepEquals, []int{4, 4})
216                 variants, err := npy.GetInt16()
217                 if c.Check(err, check.IsNil) {
218                         c.Check(variants, check.DeepEquals, []int16{
219                                 2, 1, 3, 1,
220                                 -1, -1, 4, 2,
221                                 2, 1, 3, 1,
222                                 -1, -1, 4, 2,
223                         })
224                 }
225
226                 annotations, err := ioutil.ReadFile(npydir + "/matrix.annotations.csv")
227                 c.Assert(err, check.IsNil)
228                 c.Logf("%s", annotations)
229                 for _, s := range []string{
230                         "0,0,1,chr1:g.161A>T",
231                         "0,0,1,chr1:g.178A>T",
232                         "4,1,2,chr2:g.125_127delinsAAA",
233                 } {
234                         c.Check(string(annotations), check.Matches, "(?ms).*"+s+".*")
235                 }
236         }
237
238         c.Log("=== slice-numpy + chunked hgvs matrix ===")
239         {
240                 err = ioutil.WriteFile(tmpdir+"/casecontrol.tsv", []byte(`SampleID      CC
241 pipeline1/input1        1
242 pipeline1/input2        0
243 pipeline1dup/input1     1
244 pipeline1dup/input2     0
245 `), 0600)
246                 c.Assert(err, check.IsNil)
247                 npydir := c.MkDir()
248                 exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
249                         "-local=true",
250                         "-chunked-hgvs-matrix=true",
251                         "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv",
252                         "-chi2-case-control-column=CC",
253                         "-chi2-p-value=0.5",
254                         "-min-coverage=0.75",
255                         "-input-dir=" + slicedir,
256                         "-output-dir=" + npydir,
257                 }, nil, os.Stderr, os.Stderr)
258                 c.Check(exited, check.Equals, 0)
259                 out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
260                 c.Logf("%s", out)
261
262                 annotations, err := ioutil.ReadFile(npydir + "/hgvs.chr2.annotations.csv")
263                 c.Assert(err, check.IsNil)
264                 c.Check(string(annotations), check.Equals, `0,chr2:g.470_472del
265 1,chr2:g.471G>A
266 2,chr2:g.472G>A
267 `)
268         }
269
270         c.Log("=== slice-numpy + onehotChunked ===")
271         {
272                 err = ioutil.WriteFile(tmpdir+"/casecontrol.tsv", []byte(`SampleID      CC
273 pipeline1/input1        1
274 pipeline1/input2        0
275 pipeline1dup/input1     1
276 pipeline1dup/input2     0
277 `), 0600)
278                 c.Assert(err, check.IsNil)
279                 npydir := c.MkDir()
280                 exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
281                         "-local=true",
282                         "-chunked-onehot=true",
283                         "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv",
284                         "-chi2-case-control-column=CC",
285                         "-chi2-p-value=0.5",
286                         "-min-coverage=0.75",
287                         "-input-dir=" + slicedir,
288                         "-output-dir=" + npydir,
289                 }, nil, os.Stderr, os.Stderr)
290                 c.Check(exited, check.Equals, 0)
291                 out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
292                 c.Logf("%s", out)
293
294                 f, err := os.Open(npydir + "/onehot.0002.npy")
295                 c.Assert(err, check.IsNil)
296                 defer f.Close()
297                 npy, err := gonpy.NewReader(f)
298                 c.Assert(err, check.IsNil)
299                 c.Check(npy.Shape, check.DeepEquals, []int{4, 3})
300                 onehot, err := npy.GetInt8()
301                 if c.Check(err, check.IsNil) {
302                         for r := 0; r < npy.Shape[0]; r++ {
303                                 c.Logf("%v", onehot[r*npy.Shape[1]:(r+1)*npy.Shape[1]])
304                         }
305                         c.Check(onehot, check.DeepEquals, []int8{
306                                 0, 1, 0, // input1
307                                 1, 0, 1, // input2
308                                 0, 1, 0, // dup/input1
309                                 1, 0, 1, // dup/input2
310                         })
311                 }
312         }
313
314         c.Log("=== slice-numpy + onehotSingle ===")
315         {
316                 err = ioutil.WriteFile(tmpdir+"/casecontrol.tsv", []byte(`SampleID      CC
317 pipeline1/input1        1
318 pipeline1/input2        0
319 pipeline1dup/input1     1
320 pipeline1dup/input2     0
321 `), 0600)
322                 c.Assert(err, check.IsNil)
323                 npydir := c.MkDir()
324                 exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{
325                         "-local=true",
326                         "-single-onehot=true",
327                         "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv",
328                         "-chi2-case-control-column=CC",
329                         "-chi2-p-value=0.5",
330                         "-min-coverage=0.75",
331                         "-input-dir=" + slicedir,
332                         "-output-dir=" + npydir,
333                         "-debug-tag=1",
334                 }, nil, os.Stderr, os.Stderr)
335                 c.Check(exited, check.Equals, 0)
336                 out, _ := exec.Command("find", npydir, "-ls").CombinedOutput()
337                 c.Logf("%s", out)
338
339                 f, err := os.Open(npydir + "/onehot.npy")
340                 c.Assert(err, check.IsNil)
341                 defer f.Close()
342                 npy, err := gonpy.NewReader(f)
343                 c.Assert(err, check.IsNil)
344                 c.Check(npy.Shape, check.DeepEquals, []int{2, 16})
345                 onehot, err := npy.GetUint32()
346                 if c.Check(err, check.IsNil) {
347                         for r := 0; r < npy.Shape[0]; r++ {
348                                 c.Logf("%v", onehot[r*npy.Shape[1]:(r+1)*npy.Shape[1]])
349                         }
350                         c.Check(onehot, check.DeepEquals, []uint32{
351                                 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 0, 2,
352                                 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
353                         })
354                 }
355
356                 f, err = os.Open(npydir + "/onehot-columns.npy")
357                 c.Assert(err, check.IsNil)
358                 defer f.Close()
359                 npy, err = gonpy.NewReader(f)
360                 c.Assert(err, check.IsNil)
361                 c.Check(npy.Shape, check.DeepEquals, []int{8, 5})
362                 onehotcols, err := npy.GetInt32()
363                 if c.Check(err, check.IsNil) {
364                         for r := 0; r < npy.Shape[0]; r++ {
365                                 c.Logf("%v", onehotcols[r*npy.Shape[1]:(r+1)*npy.Shape[1]])
366                         }
367                         c.Check(onehotcols, check.DeepEquals, []int32{
368                                 0, 0, 1, 4, 4, 4, 6, 6,
369                                 2, 3, 2, 2, 3, 4, 2, 3,
370                                 0, 1, 0, 0, 0, 0, 0, 0,
371                                 157299, 157299, 157299, 157299, 157299, 157299, 157299, 157299,
372                                 803273, 803273, 803273, 803273, 803273, 803273, 803273, 803273,
373                         })
374                 }
375         }
376 }
377
378 func (s *sliceSuite) Test_tv2homhet(c *check.C) {
379         cmd := &sliceNumpy{
380                 cgnames:         []string{"sample1", "sample2", "sample3", "sample4"},
381                 chi2Cases:       []bool{false, true, true, false},
382                 chi2PValue:      .5,
383                 includeVariant1: true,
384         }
385         cgs := map[string]CompactGenome{
386                 "sample1": CompactGenome{
387                         Variants: []tileVariantID{0, 0, 1, 1}, // hom tv=1
388                 },
389                 "sample2": CompactGenome{
390                         Variants: []tileVariantID{0, 0, 5, 5}, // hom tv=2
391                 },
392                 "sample3": CompactGenome{
393                         Variants: []tileVariantID{0, 0, 5, 1}, // het tv=1, tv=2
394                 },
395                 "sample4": CompactGenome{
396                         Variants: []tileVariantID{0, 0, 9, 9}, // hom tv=3
397                 },
398         }
399         maxv := tileVariantID(3)
400         remap := []tileVariantID{0, 1, 0, 0, 0, 2, 0, 0, 0, 3}
401         chunkstarttag := tagID(10)
402         for tag := tagID(10); tag < 12; tag++ {
403                 c.Logf("=== tag %d", tag)
404                 chunk, xref := cmd.tv2homhet(cgs, maxv, remap, tag, chunkstarttag)
405                 c.Logf("chunk len=%d", len(chunk))
406                 for _, x := range chunk {
407                         c.Logf("%+v", x)
408                 }
409                 c.Logf("xref len=%d", len(xref))
410                 for _, x := range xref {
411                         c.Logf("%+v", x)
412                 }
413                 out := onehotcols2int8(chunk)
414                 c.Logf("onehotcols2int8(chunk) len=%d", len(out))
415                 for i := 0; i < len(out); i += len(chunk) {
416                         c.Logf("%+v", out[i:i+len(chunk)])
417                 }
418                 coords := onehotChunk2Indirect(chunk)
419                 c.Logf("onehotChunk2Indirect(chunk) len=%d", len(coords))
420                 for _, x := range coords {
421                         c.Logf("%+v", x)
422                 }
423         }
424 }