Merge branch '1792-workbench-performance'
[arvados.git] / sdk / python / test_collections.py
1 # usage example:
2 #
3 # ARVADOS_API_TOKEN=abc ARVADOS_API_HOST=arvados.local python -m unittest discover
4
5 import unittest
6 import arvados
7 import os
8 import bz2
9 import sys
10 import subprocess
11
class KeepLocalStoreTest(unittest.TestCase):
    """Round-trip a three-byte blob through arvados.Keep put/get."""

    def setUp(self):
        # NOTE(review): presumably makes the SDK store blocks under /tmp
        # instead of talking to a Keep server -- confirm in the arvados SDK.
        os.environ['KEEP_LOCAL_STORE'] = '/tmp'

    def runTest(self):
        # The locator is the md5 of 'foo' followed by '+' and its length.
        locator = 'acbd18db4cc2f85cedef654fccc4a4d8+3'

        stored_as = arvados.Keep.put('foo')
        self.assertEqual(stored_as, locator, 'wrong md5 hash from Keep.put')

        fetched = arvados.Keep.get(locator)
        self.assertEqual(fetched, 'foo', 'wrong data from Keep.get')
18
class LocalCollectionWriterTest(unittest.TestCase):
    """Write three files over two streams and check what finish() returns."""

    def setUp(self):
        os.environ['KEEP_LOCAL_STORE'] = '/tmp'

    def runTest(self):
        # The calls on the writer are stateful, so their order matters.
        writer = arvados.CollectionWriter()

        stream_name = writer.current_stream_name()
        self.assertEqual(stream_name, '.',
                         'current_stream_name() should be "." now')

        # First file: name it, then write its contents.
        writer.set_current_file_name('foo.txt')
        writer.write('foo')
        file_name = writer.current_file_name()
        self.assertEqual(file_name, 'foo.txt',
                         'current_file_name() should be foo.txt now')

        # Second file in the same stream.
        writer.start_new_file('bar.txt')
        writer.write('bar')

        # Third file lives in a new stream; here the data is written
        # before the file is given its name.
        writer.start_new_stream('baz')
        writer.write('baz')
        writer.set_current_file_name('baz.txt')

        manifest_locator = writer.finish()
        self.assertEqual(manifest_locator,
                         '23ca013983d6239e98931cc779e68426+114',
                         'resulting manifest hash is not what I expected')
39
class LocalCollectionReaderTest(unittest.TestCase):
    """Read back the collection produced by LocalCollectionWriterTest."""

    def setUp(self):
        os.environ['KEEP_LOCAL_STORE'] = '/tmp'
        # Run the writer test first; it finishes a collection whose
        # expected hash is the same one this test opens.
        LocalCollectionWriterTest().runTest()

    def runTest(self):
        reader = arvados.CollectionReader('23ca013983d6239e98931cc779e68426+114')

        # One [size, stream name, file name, contents] row per file, in
        # stream order and then file order.
        got = [[f.size(), f.stream_name(), f.name(), f.read(2**26)]
               for stream in reader.all_streams()
               for f in stream.all_files()]
        expected = [[3, '.', 'foo.txt', 'foo'],
                    [3, '.', 'bar.txt', 'bar'],
                    [3, './baz', 'baz.txt', 'baz']]
        self.assertEqual(got,
                         expected,
                         'resulting file list is not what I expected')

        # Successive reads on the first stream: (bytes requested, expected
        # result, failure message). Each read continues where the previous
        # one stopped, so the sequence must be kept in this order.
        stream0 = reader.all_streams()[0]
        read_sequence = [
            (0, '',
             'reading zero bytes should have returned empty string'),
            (2**26, 'foobar',
             'reading entire stream failed'),
            (2**26, None,
             'reading past end of stream should have returned None'),
            (0, '',
             'reading zero bytes should have returned empty string'),
        ]
        for nbytes, want, message in read_sequence:
            self.assertEqual(stream0.read(nbytes), want, message)
69
class LocalCollectionManifestSubsetTest(unittest.TestCase):
    """Check that each file's as_manifest() output reproduces that file.

    Every file of a collection is turned into its own manifest with
    as_manifest(); reading those manifests back must yield exactly the
    expected files, in order.
    """

    def setUp(self):
        os.environ['KEEP_LOCAL_STORE'] = '/tmp'
        # Run the writer test first; it finishes a collection whose
        # expected hash is the same one the first case below opens.
        LocalCollectionWriterTest().runTest()

    def runTest(self):
        # Case 1: the collection referenced by hash.
        self._runTest('23ca013983d6239e98931cc779e68426+114',
                      [[3, '.', 'foo.txt', 'foo'],
                       [3, '.', 'bar.txt', 'bar'],
                       [3, './baz', 'baz.txt', 'baz']])
        # Case 2: literal manifest text, files aligned with the two blocks.
        self._runTest((". %s %s 0:3:foo.txt 3:3:bar.txt\n" %
                       (arvados.Keep.put("foo"),
                        arvados.Keep.put("bar"))),
                      [[3, '.', 'foo.txt', 'foo'],
                       [3, '.', 'bar.txt', 'bar']])
        # Case 3: the second file (2:4) straddles the two blocks.
        self._runTest((". %s %s 0:2:fo.txt 2:4:obar.txt\n" %
                       (arvados.Keep.put("foo"),
                        arvados.Keep.put("bar"))),
                      [[2, '.', 'fo.txt', 'fo'],
                       [4, '.', 'obar.txt', 'obar']])
        # Case 4: includes a zero-length file between the others.
        self._runTest((". %s %s 0:2:fo.txt 2:0:zero.txt 2:2:ob.txt 4:2:ar.txt\n" %
                       (arvados.Keep.put("foo"),
                        arvados.Keep.put("bar"))),
                      [[2, '.', 'fo.txt', 'fo'],
                       [0, '.', 'zero.txt', ''],
                       [2, '.', 'ob.txt', 'ob'],
                       [2, '.', 'ar.txt', 'ar']])

    def _runTest(self, collection, expected):
        """Compare files read via per-file manifests against `expected`.

        collection -- value passed to arvados.CollectionReader (a hash or
                      manifest text, as used by runTest).
        expected   -- list of [size, stream name, file name, contents]
                      rows, in the order the files should appear.

        Fails if any file differs, or if the number of files read back
        is not exactly len(expected).
        """
        cr = arvados.CollectionReader(collection)
        manifest_subsets = [f.as_manifest()
                            for s in cr.all_streams()
                            for f in s.all_files()]
        expect_i = 0
        for m in manifest_subsets:
            for f in arvados.CollectionReader(m).all_files():
                got = [f.size(), f.stream_name(), f.name(), "".join(f.readall(2**26))]
                # Report surplus files as a test failure instead of
                # letting expected[expect_i] raise IndexError.
                self.assertTrue(expect_i < len(expected),
                                'all_files|as_manifest yielded more files than expected: extra file %s' % got)
                self.assertEqual(got,
                                 expected[expect_i],
                                 'all_files|as_manifest did not preserve manifest contents: got %s expected %s' % (got, expected[expect_i]))
                expect_i += 1
        # Without this check a reader that drops files (or returns none at
        # all) would pass, because the loop body would simply not run.
        self.assertEqual(expect_i,
                         len(expected),
                         'all_files|as_manifest yielded %d files, expected %d' % (expect_i, len(expected)))
111
class LocalCollectionReadlineTest(unittest.TestCase):
    """Check how readlines() splits a file's contents into lines."""

    def setUp(self):
        os.environ['KEEP_LOCAL_STORE'] = '/tmp'

    def runTest(self):
        # Blank lines, and a final line with no trailing newline.
        self._runTest("\na\nbcd\n\nefg\nz",
                      ["\n", "a\n", "bcd\n", "\n", "efg\n", "z"])
        # Every line newline-terminated.
        self._runTest("ab\ncd\n",
                      ["ab\n", "cd\n"])

    def _runTest(self, what_in, what_out):
        """Write `what_in` to test.txt and expect readlines() == `what_out`.

        what_in  -- text written as the single file's contents.
        what_out -- list of lines readlines() should yield, in order.
        """
        writer = arvados.CollectionWriter()
        writer.start_new_file('test.txt')
        writer.write(what_in)
        locator = writer.finish()

        reader = arvados.CollectionReader(locator)
        first_file = list(reader.all_files())[0]
        got = list(first_file.readlines())
        self.assertEqual(got,
                         what_out,
                         "readlines did not split lines correctly: %s" % got)
132
class LocalCollectionEmptyFileTest(unittest.TestCase):
    """Check that zero-length files appear in the manifest with size 0."""

    def setUp(self):
        os.environ['KEEP_LOCAL_STORE'] = '/tmp'

    def runTest(self):
        # A collection holding a single empty file.
        only_empty = arvados.CollectionWriter()
        only_empty.start_new_file('zero.txt')
        only_empty.write('')
        self.check_manifest_file_sizes(only_empty.manifest_text(), [0])

        # Empty files mixed with a one-byte file, spread over two streams.
        mixed = arvados.CollectionWriter()
        mixed.start_new_file('zero.txt')
        mixed.write('')
        mixed.start_new_file('one.txt')
        mixed.write('1')
        mixed.start_new_stream('foo')
        mixed.start_new_file('zero.txt')
        mixed.write('')
        self.check_manifest_file_sizes(mixed.manifest_text(), [0,1,0])

    def check_manifest_file_sizes(self, manifest_text, expect_sizes):
        """Assert the files read from `manifest_text` have `expect_sizes`.

        manifest_text -- manifest passed to arvados.CollectionReader.
        expect_sizes  -- list of file sizes, in all_files() order.
        """
        reader = arvados.CollectionReader(manifest_text)
        got_sizes = [f.size() for f in reader.all_files()]
        self.assertEqual(got_sizes, expect_sizes, "got wrong file sizes %s, expected %s" % (got_sizes, expect_sizes))
156
class LocalCollectionBZ2DecompressionTest(unittest.TestCase):
    """Store bz2-compressed text as test.bz2 and read it back line by line.

    The test writes compressed bytes but expects readlines() to yield the
    original uncompressed lines.
    """

    def setUp(self):
        os.environ['KEEP_LOCAL_STORE'] = '/tmp'

    def runTest(self):
        n_lines_in = 2**18
        # 2**18 copies of the same four-byte line.
        data_in = "abc\n" * n_lines_in
        compressed_data_in = bz2.compress(data_in)

        writer = arvados.CollectionWriter()
        writer.start_new_file('test.bz2')
        writer.write(compressed_data_in)
        bz2_manifest = writer.manifest_text()

        reader = arvados.CollectionReader(bz2_manifest)
        first_file = list(reader.all_files())[0]
        lines_seen = 0
        for line in first_file.readlines():
            self.assertEqual(line, "abc\n", "decompression returned wrong data: %s" % line)
            lines_seen += 1
        self.assertEqual(lines_seen,
                         n_lines_in,
                         "decompression returned %d lines instead of %d" % (lines_seen, n_lines_in))
179
class LocalCollectionGzipDecompressionTest(unittest.TestCase):
    """Store gzip-compressed text as test.gz and read it back line by line.

    The input is compressed by running the external `gzip` program; the
    test then expects readlines() to yield the original uncompressed
    lines.
    """

    def setUp(self):
        os.environ['KEEP_LOCAL_STORE'] = '/tmp'

    def runTest(self):
        n_lines_in = 2**18
        # 2**18 copies of the same four-byte line.
        data_in = "abc\n" * n_lines_in

        # gzip flags: -1 fastest compression, -c write to stdout,
        # -n do not store the original name and timestamp.
        p = subprocess.Popen(["gzip", "-1cn"],
                             stdout=subprocess.PIPE,
                             stdin=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=False, close_fds=True)
        compressed_data_in, stderrdata = p.communicate(data_in)
        # Fail here, with gzip's own diagnostics, rather than later with a
        # misleading "decompression returned wrong data" message.
        self.assertEqual(p.returncode,
                         0,
                         "gzip exited with status %s: %s" % (p.returncode, stderrdata))

        cw = arvados.CollectionWriter()
        cw.start_new_file('test.gz')
        cw.write(compressed_data_in)
        gzip_manifest = cw.manifest_text()

        cr = arvados.CollectionReader(gzip_manifest)
        got = 0
        for x in list(cr.all_files())[0].readlines():
            self.assertEqual(x, "abc\n", "decompression returned wrong data: %s" % x)
            got += 1
        self.assertEqual(got,
                         n_lines_in,
                         "decompression returned %d lines instead of %d" % (got, n_lines_in))