Merge branch '12430-output-glob'
[arvados.git] / sdk / python / tests / test_stream.py
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 import bz2
6 import gzip
7 import io
8 import os
9 import unittest
10 import hashlib
11
12 from unittest import mock
13
14 import arvados
15 from arvados import StreamReader, StreamFileReader
16 from arvados._ranges import Range
17
18 from . import arvados_testutil as tutil
19 from . import run_test_server
20
class StreamFileReaderTestCase(unittest.TestCase):
    """Tests for StreamFileReader backed by in-memory mock streams.

    Two fixtures are used: a "count" reader whose contents the tests
    expect to read back as b'123456789', and a "newlines" reader whose
    contents split into the lines checked by check_lines().
    """

    def make_count_reader(self):
        """Return a StreamFileReader named 'count.txt' built from three Range segments."""
        stream = tutil.MockStreamReader('.', '01234', '34567', '67890')
        segments = [Range(1, 0, 3), Range(6, 3, 3), Range(11, 6, 3)]
        return StreamFileReader(stream, segments, 'count.txt')

    def test_read_block_crossing_behavior(self):
        # read() calls will be aligned on block boundaries - see #3663.
        sfile = self.make_count_reader()
        self.assertEqual(b'123', sfile.read(10))

    def test_small_read(self):
        sfile = self.make_count_reader()
        self.assertEqual(b'12', sfile.read(2))

    def test_successive_reads(self):
        # Each read() returns one segment at most; the final read is empty.
        sfile = self.make_count_reader()
        for expect in [b'123', b'456', b'789', b'']:
            self.assertEqual(expect, sfile.read(10))

    def test_readfrom_spans_blocks(self):
        sfile = self.make_count_reader()
        self.assertEqual(b'6789', sfile.readfrom(5, 12))

    def test_small_readfrom_spanning_blocks(self):
        sfile = self.make_count_reader()
        self.assertEqual(b'2345', sfile.readfrom(1, 4))

    def test_readall(self):
        sfile = self.make_count_reader()
        self.assertEqual(b'123456789', b''.join(sfile.readall()))

    def test_one_arg_seek(self):
        # Same checks as test_absolute_seek, but calling seek() without
        # an explicit whence argument.
        self.test_absolute_seek(())

    def test_absolute_seek(self, args=(os.SEEK_SET,)):
        # `args` holds the extra positional arguments passed to seek().
        # It is a tuple (not a list) so the default can never be mutated
        # and shared between calls.
        sfile = self.make_count_reader()
        sfile.seek(6, *args)
        self.assertEqual(b'78', sfile.read(2))
        sfile.seek(4, *args)
        self.assertEqual(b'56', sfile.read(2))

    def test_relative_seek(self, args=(os.SEEK_CUR,)):
        # See test_absolute_seek for why `args` is an immutable tuple.
        sfile = self.make_count_reader()
        self.assertEqual(b'12', sfile.read(2))
        sfile.seek(2, *args)
        self.assertEqual(b'56', sfile.read(2))

    def test_end_seek(self):
        sfile = self.make_count_reader()
        sfile.seek(-6, os.SEEK_END)
        self.assertEqual(b'45', sfile.read(2))

    def test_seek_min_zero(self):
        # A failed seek to a negative offset must leave the position alone.
        sfile = self.make_count_reader()
        self.assertEqual(0, sfile.tell())
        with self.assertRaises(IOError):
            sfile.seek(-2, os.SEEK_SET)
        self.assertEqual(0, sfile.tell())

    def test_seek_max_size(self):
        sfile = self.make_count_reader()
        sfile.seek(2, os.SEEK_END)
        # POSIX permits seeking past end of file.
        self.assertEqual(11, sfile.tell())

    def test_size(self):
        self.assertEqual(9, self.make_count_reader().size())

    def test_tell_after_block_read(self):
        # A read larger than the first segment still stops after 3 bytes.
        sfile = self.make_count_reader()
        sfile.read(5)
        self.assertEqual(3, sfile.tell())

    def test_tell_after_small_read(self):
        sfile = self.make_count_reader()
        sfile.read(1)
        self.assertEqual(1, sfile.tell())

    def test_no_read_after_close(self):
        sfile = self.make_count_reader()
        sfile.close()
        with self.assertRaises(ValueError):
            sfile.read(2)

    def test_context(self):
        with self.make_count_reader() as sfile:
            self.assertFalse(sfile.closed, "reader is closed inside context")
            self.assertEqual(b'12', sfile.read(2))
        self.assertTrue(sfile.closed, "reader is open after context")

    def make_newlines_reader(self):
        """Return a StreamFileReader over two mock blocks of newline-separated text.

        The word 'three' is split across the two blocks.
        """
        stream = tutil.MockStreamReader('.', 'one\ntwo\n\nth', 'ree\nfour\n\n')
        segments = [Range(0, 0, 11), Range(11, 11, 10)]
        return StreamFileReader(stream, segments, 'count.txt')

    def check_lines(self, actual):
        """Assert that `actual` is the full list of lines of the newlines reader."""
        expected = ['one\n', 'two\n', '\n', 'three\n', 'four\n', '\n']
        self.assertEqual(expected, actual)

    def test_readline(self):
        reader = self.make_newlines_reader()
        actual = []
        # Collect lines until readline() returns a falsy value.
        data = reader.readline()
        while data:
            actual.append(data)
            data = reader.readline()
        self.check_lines(actual)

    def test_readlines(self):
        self.check_lines(self.make_newlines_reader().readlines())

    def test_iteration(self):
        self.check_lines(list(iter(self.make_newlines_reader())))

    def test_readline_size(self):
        # readline(size) returns at most `size` characters and never
        # reads past the end of the current line.
        reader = self.make_newlines_reader()
        self.assertEqual('on', reader.readline(2))
        self.assertEqual('e\n', reader.readline(4))
        self.assertEqual('two\n', reader.readline(6))
        self.assertEqual('\n', reader.readline(8))
        self.assertEqual('thre', reader.readline(4))

    def test_readlines_sizehint(self):
        result = self.make_newlines_reader().readlines(8)
        self.assertEqual(['one\n', 'two\n'], result[:2])
        self.assertNotIn('three\n', result)

    def test_name_attribute(self):
        # Test both .name and .name() (for backward compatibility)
        stream = tutil.MockStreamReader()
        sfile = StreamFileReader(stream, [Range(0, 0, 0)], 'nametest')
        self.assertEqual('nametest', sfile.name)
        self.assertEqual('nametest', sfile.name())

    def check_decompressed_name(self, filename, expect):
        """Assert that a reader named `filename` reports `expect` from decompressed_name()."""
        stream = tutil.MockStreamReader('.', '')
        reader = StreamFileReader(stream, [Range(0, 0, 0)], filename)
        self.assertEqual(expect, reader.decompressed_name())

    def test_decompressed_name_uncompressed_file(self):
        self.check_decompressed_name('test.log', 'test.log')

    def test_decompressed_name_gzip_file(self):
        self.check_decompressed_name('test.log.gz', 'test.log')

    def test_decompressed_name_bz2_file(self):
        self.check_decompressed_name('test.log.bz2', 'test.log')

    def check_decompression(self, compress_ext, compress_func):
        """Assert that readall_decompressed() round-trips data from `compress_func`.

        compress_ext is the file extension (without the dot) given to the
        reader; compress_func maps the plain test bytes to the bytes
        stored in the mock stream.
        """
        test_text = b'decompression\ntest\n'
        test_data = compress_func(test_text)
        stream = tutil.MockStreamReader('.', test_data)
        reader = StreamFileReader(stream, [Range(0, 0, len(test_data))],
                                  'test.' + compress_ext)
        self.assertEqual(test_text, b''.join(reader.readall_decompressed()))

    @staticmethod
    def gzip_compress(data):
        """Return `data` (bytes) compressed as an in-memory gzip stream."""
        compressed_data = io.BytesIO()
        with gzip.GzipFile(fileobj=compressed_data, mode='wb') as gzip_file:
            gzip_file.write(data)
        return compressed_data.getvalue()

    def test_no_decompression(self):
        self.check_decompression('log', lambda s: s)

    def test_gzip_decompression(self):
        self.check_decompression('gz', self.gzip_compress)

    def test_bz2_decompression(self):
        self.check_decompression('bz2', bz2.compress)

    def test_readline_then_readlines(self):
        reader = self.make_newlines_reader()
        data = reader.readline()
        self.assertEqual('one\n', data)
        data = reader.readlines()
        self.assertEqual(['two\n', '\n', 'three\n', 'four\n', '\n'], data)

    def test_readline_then_readall(self):
        reader = self.make_newlines_reader()
        data = reader.readline()
        self.assertEqual('one\n', data)
        expected = b''.join([b'two\n', b'\n', b'three\n', b'four\n', b'\n'])
        self.assertEqual(expected, b''.join(reader.readall()))
205
206
class StreamRetryTestMixin:
    """Retry-behaviour tests shared by the stream reader test cases.

    Classes mixing this in must define reader_for(coll_name, **kwargs)
    and read_for_test(reader, size, **kwargs).
    """
    API_COLLECTIONS = run_test_server.fixture('collections')

    def keep_client(self):
        """Build the KeepClient handed to the readers under test."""
        proxy_url = 'http://[%s]:1' % (tutil.TEST_HOST,)
        return arvados.KeepClient(proxy=proxy_url, local_store='')

    def manifest_for(self, coll_name):
        """Look up the manifest text of the named collection fixture."""
        collection = self.API_COLLECTIONS[coll_name]
        return collection['manifest_text']

    @tutil.skip_sleep
    def test_success_without_retries(self):
        with tutil.mock_keep_responses('bar', 200):
            rdr = self.reader_for('bar_file')
            content = self.read_for_test(rdr, 3)
            self.assertEqual(b'bar', content)

    @tutil.skip_sleep
    def test_read_with_instance_retries(self):
        # Retry count is configured on the reader itself.
        with tutil.mock_keep_responses('foo', 500, 200):
            rdr = self.reader_for('foo_file', num_retries=3)
            content = self.read_for_test(rdr, 3)
            self.assertEqual(b'foo', content)

    @tutil.skip_sleep
    def test_read_with_method_retries(self):
        # Retry count is passed to the read call instead of the reader.
        with tutil.mock_keep_responses('foo', 500, 200):
            rdr = self.reader_for('foo_file')
            content = self.read_for_test(rdr, 3, num_retries=3)
            self.assertEqual(b'foo', content)

    @tutil.skip_sleep
    def test_read_instance_retries_exhausted(self):
        with tutil.mock_keep_responses('bar', 500, 500, 500, 500, 200):
            rdr = self.reader_for('bar_file', num_retries=3)
            self.assertRaises(arvados.errors.KeepReadError,
                              self.read_for_test, rdr, 3)

    @tutil.skip_sleep
    def test_read_method_retries_exhausted(self):
        with tutil.mock_keep_responses('bar', 500, 500, 500, 500, 200):
            rdr = self.reader_for('bar_file')
            self.assertRaises(arvados.errors.KeepReadError,
                              self.read_for_test, rdr, 3, num_retries=3)

    @tutil.skip_sleep
    def test_method_retries_take_precedence(self):
        # The per-call num_retries=1 is expected to win over the
        # reader's num_retries=10, so the read must fail.
        with tutil.mock_keep_responses('', 500, 500, 500, 200):
            rdr = self.reader_for('user_agreement', num_retries=10)
            self.assertRaises(arvados.errors.KeepReadError,
                              self.read_for_test, rdr, 10, num_retries=1)
258
259
class StreamReaderTestCase(unittest.TestCase, StreamRetryTestMixin):
    """Run the shared retry tests directly against StreamReader.readfrom()."""

    def reader_for(self, coll_name, **kwargs):
        """Return a StreamReader built from the fixture's manifest tokens."""
        tokens = self.manifest_for(coll_name).split()
        return StreamReader(tokens, self.keep_client(), **kwargs)

    def read_for_test(self, reader, byte_count, **kwargs):
        """Read `byte_count` bytes starting at offset 0 of the stream."""
        return reader.readfrom(0, byte_count, **kwargs)

    def test_manifest_text_without_keep_client(self):
        # Each manifest line must round-trip through a StreamReader that
        # was constructed without a Keep client.
        manifest = self.manifest_for('multilevel_collection_1')
        for stream_line in manifest.rstrip('\n').split('\n'):
            stream_reader = StreamReader(stream_line.split())
            expected = stream_line + '\n'
            self.assertEqual(expected, stream_reader.manifest_text())
273
274
class StreamFileReadTestCase(unittest.TestCase, StreamRetryTestMixin):
    """Run the shared retry tests against read() on a stream's first file."""

    def reader_for(self, coll_name, **kwargs):
        """Return the first file reader of a StreamReader for the fixture."""
        tokens = self.manifest_for(coll_name).split()
        stream = StreamReader(tokens, self.keep_client(), **kwargs)
        return stream.all_files()[0]

    def read_for_test(self, reader, byte_count, **kwargs):
        """Read up to `byte_count` bytes with reader.read()."""
        return reader.read(byte_count, **kwargs)
282
283
class StreamFileReadFromTestCase(StreamFileReadTestCase):
    """Same retry tests as the parent class, reading via readfrom()."""

    def read_for_test(self, reader, byte_count, **kwargs):
        """Read `byte_count` bytes starting at offset 0 of the file."""
        start = 0
        return reader.readfrom(start, byte_count, **kwargs)
287
288
class StreamFileReadAllTestCase(StreamFileReadTestCase):
    """Same retry tests as the parent class, reading via readall()."""

    def read_for_test(self, reader, byte_count, **kwargs):
        """Join every chunk yielded by readall(); `byte_count` is not used."""
        chunks = reader.readall(**kwargs)
        return b''.join(chunks)
292
293
class StreamFileReadAllDecompressedTestCase(StreamFileReadTestCase):
    """Same retry tests as the parent class, reading via readall_decompressed()."""

    def read_for_test(self, reader, byte_count, **kwargs):
        """Join every chunk yielded by readall_decompressed(); `byte_count` is not used."""
        chunks = reader.readall_decompressed(**kwargs)
        return b''.join(chunks)
297
298
class StreamFileReadlinesTestCase(StreamFileReadTestCase):
    """Same retry tests as the parent class, reading via readlines()."""

    def read_for_test(self, reader, byte_count, **kwargs):
        """Join the lines from readlines() and encode them to bytes.

        `byte_count` is not used.
        """
        text = ''.join(reader.readlines(**kwargs))
        return text.encode()
302
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()