Merge branch '17522-arvput-stdin-transcode-fix'
[arvados.git] / sdk / python / tests / test_stream.py
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 from __future__ import absolute_import
6 from builtins import object
7 import bz2
8 import gzip
9 import io
10 import mock
11 import os
12 import unittest
13 import hashlib
14
15 import arvados
16 from arvados import StreamReader, StreamFileReader
17 from arvados._ranges import Range
18
19 from . import arvados_testutil as tutil
20 from . import run_test_server
21
class StreamFileReaderTestCase(unittest.TestCase):
    """Exercise StreamFileReader against in-memory mock streams.

    Two fixtures are used throughout:

    * the "count" reader (make_count_reader), which the tests expect to
      read back as b'123456789';
    * the "newlines" reader (make_newlines_reader), which the tests
      expect to yield the lines checked by check_lines.
    """

    def make_count_reader(self):
        """Return a StreamFileReader named 'count.txt' over a mock stream.

        The mock stream is built from the blocks '01234', '34567' and
        '67890'.  Each Range below has 3 as its last argument; the tests
        in this class expect the resulting file to contain b'123456789'
        split into three 3-byte segments.
        """
        stream = tutil.MockStreamReader('.', '01234', '34567', '67890')
        return StreamFileReader(stream, [Range(1, 0, 3), Range(6, 3, 3), Range(11, 6, 3)],
                                'count.txt')

    def test_read_block_crossing_behavior(self):
        # read() calls will be aligned on block boundaries - see #3663.
        sfile = self.make_count_reader()
        self.assertEqual(b'123', sfile.read(10))

    def test_small_read(self):
        # A read smaller than the first segment returns exactly that many bytes.
        sfile = self.make_count_reader()
        self.assertEqual(b'12', sfile.read(2))

    def test_successive_reads(self):
        # Each oversized read returns one segment; the final read returns b''.
        sfile = self.make_count_reader()
        for expect in [b'123', b'456', b'789', b'']:
            self.assertEqual(expect, sfile.read(10))

    def test_readfrom_spans_blocks(self):
        # Unlike read(), readfrom() is expected to cross segment boundaries.
        sfile = self.make_count_reader()
        self.assertEqual(b'6789', sfile.readfrom(5, 12))

    def test_small_readfrom_spanning_blocks(self):
        sfile = self.make_count_reader()
        self.assertEqual(b'2345', sfile.readfrom(1, 4))

    def test_readall(self):
        # readall() yields chunks; joined together they form the whole file.
        sfile = self.make_count_reader()
        self.assertEqual(b'123456789', b''.join(sfile.readall()))

    def test_one_arg_seek(self):
        # Re-run the absolute seek checks without passing a whence argument.
        self.test_absolute_seek(())

    def test_absolute_seek(self, args=(os.SEEK_SET,)):
        # args holds the extra positional arguments forwarded to seek().
        # The default is an immutable tuple so it cannot be shared and
        # mutated between calls.
        sfile = self.make_count_reader()
        sfile.seek(6, *args)
        self.assertEqual(b'78', sfile.read(2))
        sfile.seek(4, *args)
        self.assertEqual(b'56', sfile.read(2))

    def test_relative_seek(self, args=(os.SEEK_CUR,)):
        # args holds the extra positional arguments forwarded to seek();
        # kept as an immutable tuple for the same reason as above.
        sfile = self.make_count_reader()
        self.assertEqual(b'12', sfile.read(2))
        sfile.seek(2, *args)
        self.assertEqual(b'56', sfile.read(2))

    def test_end_seek(self):
        sfile = self.make_count_reader()
        sfile.seek(-6, os.SEEK_END)
        self.assertEqual(b'45', sfile.read(2))

    def test_seek_min_zero(self):
        # Seeking before the start must raise and leave the position untouched.
        sfile = self.make_count_reader()
        self.assertEqual(0, sfile.tell())
        with self.assertRaises(IOError):
            sfile.seek(-2, os.SEEK_SET)
        self.assertEqual(0, sfile.tell())

    def test_seek_max_size(self):
        sfile = self.make_count_reader()
        sfile.seek(2, os.SEEK_END)
        # POSIX permits seeking past end of file.
        self.assertEqual(11, sfile.tell())

    def test_size(self):
        self.assertEqual(9, self.make_count_reader().size())

    def test_tell_after_block_read(self):
        # read(5) is expected to stop at the segment boundary, so the
        # position advances by 3 rather than 5.
        sfile = self.make_count_reader()
        sfile.read(5)
        self.assertEqual(3, sfile.tell())

    def test_tell_after_small_read(self):
        sfile = self.make_count_reader()
        sfile.read(1)
        self.assertEqual(1, sfile.tell())

    def test_no_read_after_close(self):
        sfile = self.make_count_reader()
        sfile.close()
        self.assertRaises(ValueError, sfile.read, 2)

    def test_context(self):
        # The reader is usable as a context manager and is closed on exit.
        with self.make_count_reader() as sfile:
            self.assertFalse(sfile.closed, "reader is closed inside context")
            self.assertEqual(b'12', sfile.read(2))
        self.assertTrue(sfile.closed, "reader is open after context")

    def make_newlines_reader(self):
        """Return a StreamFileReader over two mock blocks of line-oriented text.

        The first block ends in the middle of the word 'three', so the
        line-reading tests cover a line that continues into the second
        block.
        """
        stream = tutil.MockStreamReader('.', 'one\ntwo\n\nth', 'ree\nfour\n\n')
        return StreamFileReader(stream, [Range(0, 0, 11), Range(11, 11, 10)], 'count.txt')

    def check_lines(self, actual):
        """Assert that actual is the full list of lines of the newlines reader."""
        self.assertEqual(['one\n', 'two\n', '\n', 'three\n', 'four\n', '\n'],
                         actual)

    def test_readline(self):
        # Collect lines one at a time until readline() returns a falsy value.
        reader = self.make_newlines_reader()
        actual = []
        while True:
            data = reader.readline()
            if not data:
                break
            actual.append(data)
        self.check_lines(actual)

    def test_readlines(self):
        self.check_lines(self.make_newlines_reader().readlines())

    def test_iteration(self):
        self.check_lines(list(iter(self.make_newlines_reader())))

    def test_readline_size(self):
        # A size argument caps the returned text but never reads past a newline.
        reader = self.make_newlines_reader()
        self.assertEqual('on', reader.readline(2))
        self.assertEqual('e\n', reader.readline(4))
        self.assertEqual('two\n', reader.readline(6))
        self.assertEqual('\n', reader.readline(8))
        self.assertEqual('thre', reader.readline(4))

    def test_readlines_sizehint(self):
        # With a size hint, the result starts with the first lines and
        # must not include 'three\n'.
        result = self.make_newlines_reader().readlines(8)
        self.assertEqual(['one\n', 'two\n'], result[:2])
        self.assertNotIn('three\n', result)

    def test_name_attribute(self):
        # Test both .name and .name() (for backward compatibility)
        stream = tutil.MockStreamReader()
        sfile = StreamFileReader(stream, [Range(0, 0, 0)], 'nametest')
        self.assertEqual('nametest', sfile.name)
        self.assertEqual('nametest', sfile.name())

    def check_decompressed_name(self, filename, expect):
        """Assert that a reader named filename reports expect as its decompressed name."""
        stream = tutil.MockStreamReader('.', '')
        reader = StreamFileReader(stream, [Range(0, 0, 0)], filename)
        self.assertEqual(expect, reader.decompressed_name())

    def test_decompressed_name_uncompressed_file(self):
        self.check_decompressed_name('test.log', 'test.log')

    def test_decompressed_name_gzip_file(self):
        self.check_decompressed_name('test.log.gz', 'test.log')

    def test_decompressed_name_bz2_file(self):
        self.check_decompressed_name('test.log.bz2', 'test.log')

    def check_decompression(self, compress_ext, compress_func):
        """Assert that readall_decompressed() recovers text packed by compress_func.

        compress_ext is the file extension given to the reader's name
        ('test.' + compress_ext); compress_func maps the plain bytes to
        the bytes stored in the mock stream.
        """
        test_text = b'decompression\ntest\n'
        test_data = compress_func(test_text)
        stream = tutil.MockStreamReader('.', test_data)
        reader = StreamFileReader(stream, [Range(0, 0, len(test_data))],
                                  'test.' + compress_ext)
        self.assertEqual(test_text, b''.join(reader.readall_decompressed()))

    @staticmethod
    def gzip_compress(data):
        """Return data compressed in gzip format, built entirely in memory."""
        compressed_data = io.BytesIO()
        with gzip.GzipFile(fileobj=compressed_data, mode='wb') as gzip_file:
            gzip_file.write(data)
        return compressed_data.getvalue()

    def test_no_decompression(self):
        # An unrecognized extension is passed through the identity function.
        self.check_decompression('log', lambda s: s)

    def test_gzip_decompression(self):
        self.check_decompression('gz', self.gzip_compress)

    def test_bz2_decompression(self):
        self.check_decompression('bz2', bz2.compress)

    def test_readline_then_readlines(self):
        # readlines() continues from where the preceding readline() stopped.
        reader = self.make_newlines_reader()
        data = reader.readline()
        self.assertEqual('one\n', data)
        data = reader.readlines()
        self.assertEqual(['two\n', '\n', 'three\n', 'four\n', '\n'], data)

    def test_readline_then_readall(self):
        # readall() continues from where the preceding readline() stopped.
        reader = self.make_newlines_reader()
        data = reader.readline()
        self.assertEqual('one\n', data)
        self.assertEqual(b''.join([b'two\n', b'\n', b'three\n', b'four\n', b'\n']), b''.join(reader.readall()))
206
207
class StreamRetryTestMixin(object):
    """Retry-behaviour tests shared by the stream reader test cases.

    Classes mixing this in must provide two hooks:

    * reader_for(coll_name, **kwargs) -- build the reader under test
    * read_for_test(reader, size, **kwargs) -- perform the read under test
    """
    API_COLLECTIONS = run_test_server.fixture('collections')

    def keep_client(self):
        """Return a KeepClient using a proxy URL on port 1 of the test host."""
        proxy_url = 'http://[%s]:1' % (tutil.TEST_HOST,)
        return arvados.KeepClient(proxy=proxy_url, local_store='')

    def manifest_for(self, coll_name):
        """Return the 'manifest_text' of the named collection fixture."""
        collection = self.API_COLLECTIONS[coll_name]
        return collection['manifest_text']

    @tutil.skip_sleep
    def test_success_without_retries(self):
        # A single 200 response: the read succeeds straight away.
        with tutil.mock_keep_responses('bar', 200):
            rdr = self.reader_for('bar_file')
            got = self.read_for_test(rdr, 3)
            self.assertEqual(b'bar', got)

    @tutil.skip_sleep
    def test_read_no_default_retry(self):
        # With no retries requested, a 500 response surfaces as KeepReadError.
        with tutil.mock_keep_responses('', 500):
            rdr = self.reader_for('user_agreement')
            self.assertRaises(arvados.errors.KeepReadError,
                              self.read_for_test, rdr, 10)

    @tutil.skip_sleep
    def test_read_with_instance_retries(self):
        # num_retries given when building the reader covers the first 500.
        with tutil.mock_keep_responses('foo', 500, 200):
            rdr = self.reader_for('foo_file', num_retries=3)
            got = self.read_for_test(rdr, 3)
            self.assertEqual(b'foo', got)

    @tutil.skip_sleep
    def test_read_with_method_retries(self):
        # num_retries given on the read call covers the first 500.
        with tutil.mock_keep_responses('foo', 500, 200):
            rdr = self.reader_for('foo_file')
            got = self.read_for_test(rdr, 3, num_retries=3)
            self.assertEqual(b'foo', got)

    @tutil.skip_sleep
    def test_read_instance_retries_exhausted(self):
        # Four 500 responses in a row outlast num_retries=3 on the reader.
        with tutil.mock_keep_responses('bar', 500, 500, 500, 500, 200):
            rdr = self.reader_for('bar_file', num_retries=3)
            self.assertRaises(arvados.errors.KeepReadError,
                              self.read_for_test, rdr, 3)

    @tutil.skip_sleep
    def test_read_method_retries_exhausted(self):
        # Four 500 responses in a row outlast num_retries=3 on the read call.
        with tutil.mock_keep_responses('bar', 500, 500, 500, 500, 200):
            rdr = self.reader_for('bar_file')
            self.assertRaises(arvados.errors.KeepReadError,
                              self.read_for_test, rdr, 3, num_retries=3)

    @tutil.skip_sleep
    def test_method_retries_take_precedence(self):
        # The per-call num_retries=1 is expected to override the reader's
        # num_retries=10, so three 500 responses still cause a failure.
        with tutil.mock_keep_responses('', 500, 500, 500, 200):
            rdr = self.reader_for('user_agreement', num_retries=10)
            self.assertRaises(arvados.errors.KeepReadError,
                              self.read_for_test, rdr, 10, num_retries=1)
266
267
class StreamReaderTestCase(unittest.TestCase, StreamRetryTestMixin):
    """Run the shared retry tests against StreamReader.readfrom()."""

    def reader_for(self, coll_name, **kwargs):
        """Build a StreamReader from the whitespace-split manifest of coll_name."""
        tokens = self.manifest_for(coll_name).split()
        client = self.keep_client()
        return StreamReader(tokens, client, **kwargs)

    def read_for_test(self, reader, byte_count, **kwargs):
        """Read byte_count bytes starting at offset 0 of the stream."""
        start = 0
        return reader.readfrom(start, byte_count, **kwargs)

    def test_manifest_text_without_keep_client(self):
        # Each manifest line, parsed on its own without a Keep client,
        # must serialize back to the same line plus a trailing newline.
        manifest = self.manifest_for('multilevel_collection_1')
        stream_lines = manifest.rstrip('\n').split('\n')
        for stream_line in stream_lines:
            parsed = StreamReader(stream_line.split())
            expected = stream_line + '\n'
            self.assertEqual(expected, parsed.manifest_text())
281
282
class StreamFileReadTestCase(unittest.TestCase, StreamRetryTestMixin):
    """Run the shared retry tests against read() on a stream's first file."""

    def reader_for(self, coll_name, **kwargs):
        """Return the first file reader of a StreamReader built for coll_name."""
        tokens = self.manifest_for(coll_name).split()
        stream = StreamReader(tokens, self.keep_client(), **kwargs)
        files = stream.all_files()
        return files[0]

    def read_for_test(self, reader, byte_count, **kwargs):
        """Read up to byte_count bytes with the file reader's read()."""
        data = reader.read(byte_count, **kwargs)
        return data
290
291
class StreamFileReadFromTestCase(StreamFileReadTestCase):
    """Repeat the file reader retry tests using readfrom() instead of read()."""

    def read_for_test(self, reader, byte_count, **kwargs):
        """Read byte_count bytes starting at offset 0 of the file."""
        start = 0
        return reader.readfrom(start, byte_count, **kwargs)
295
296
class StreamFileReadAllTestCase(StreamFileReadTestCase):
    """Repeat the file reader retry tests using readall()."""

    def read_for_test(self, reader, byte_count, **kwargs):
        """Return every chunk from readall() joined together; byte_count is unused."""
        chunks = reader.readall(**kwargs)
        return b''.join(chunks)
300
301
class StreamFileReadAllDecompressedTestCase(StreamFileReadTestCase):
    """Repeat the file reader retry tests using readall_decompressed()."""

    def read_for_test(self, reader, byte_count, **kwargs):
        """Return every chunk from readall_decompressed() joined; byte_count is unused."""
        chunks = reader.readall_decompressed(**kwargs)
        return b''.join(chunks)
305
306
class StreamFileReadlinesTestCase(StreamFileReadTestCase):
    """Repeat the file reader retry tests using readlines()."""

    def read_for_test(self, reader, byte_count, **kwargs):
        """Return the lines from readlines() joined and encoded; byte_count is unused."""
        lines = reader.readlines(**kwargs)
        text = ''.join(lines)
        return text.encode()
310
# Run this module's tests with the unittest runner when executed as a script.
if __name__ == '__main__':
    unittest.main()