Merge branch '21720-material-ui-upgrade'
[arvados.git] / sdk / python / tests / test_collections.py
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 import ciso8601
6 import copy
7 import datetime
8 import os
9 import random
10 import re
11 import shutil
12 import sys
13 import tempfile
14 import time
15 import unittest
16
17 import arvados
18 import arvados.keep
19 import parameterized
20
21 from arvados._internal.streams import Range, LocatorAndRange, locators_and_ranges
22 from arvados.collection import Collection, CollectionReader
23
24 from . import arvados_testutil as tutil
25 from . import run_test_server
26 from unittest import mock
27
@parameterized.parameterized_class([{"disk_cache": True}, {"disk_cache": False}])
class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
                             tutil.ArvadosBaseTestCase):
    """Collection, manifest-normalization and range-lookup tests.

    The class is parameterized so every test runs once with
    ``disk_cache`` true and once with it false; the flag selects how the
    shared KeepBlockCache is configured in ``setUpClass``.
    """

    disk_cache = False
    MAIN_SERVER = {}

    @classmethod
    def setUpClass(cls):
        """Create the API client and Keep client shared by all tests."""
        super().setUpClass()
        # need admin privileges to make collections with unsigned blocks
        run_test_server.authorize_with('admin')
        # Only the disk-cache variant needs a scratch directory.
        cls._disk_cache_dir = (
            tempfile.mkdtemp(prefix='CollectionsTest-') if cls.disk_cache else None)
        block_cache = arvados.keep.KeepBlockCache(
            disk_cache=cls.disk_cache,
            disk_cache_dir=cls._disk_cache_dir,
        )
        cls.api_client = arvados.api('v1')
        cls.keep_client = arvados.KeepClient(api_client=cls.api_client,
                                             local_store=cls.local_store,
                                             block_cache=block_cache)

    @classmethod
    def tearDownClass(cls):
        """Remove the disk cache directory, then run the base class teardown."""
        try:
            if cls._disk_cache_dir:
                shutil.rmtree(cls._disk_cache_dir)
        finally:
            # setUpClass chains to the base class, so teardown must too;
            # otherwise whatever the base class set up is never undone.
            super().tearDownClass()

    def write_foo_bar_baz(self):
        """Save a three-file collection and return its portable data hash.

        The file data is first written through a throwaway collection;
        the collection that gets saved is built from a literal manifest.
        """
        with arvados.collection.Collection(api_client=self.api_client).open('zzz', 'wb') as f:
            f.write(b'foobar')
            f.flush()
            f.write(b'baz')
        cw = arvados.collection.Collection(
            api_client=self.api_client,
            manifest_locator_or_text=(
                ". 3858f62230ac3c915f300c664312c63f+6 0:3:foo.txt 3:3:bar.txt\n"
                "./baz 73feffa4b7f6bb68e44cf984c85f6e88+3 0:3:baz.txt\n"))
        cw.save_new()
        return cw.portable_data_hash()

    def test_pdh_is_native_str(self):
        """portable_data_hash() returns exactly the built-in str type."""
        pdh = self.write_foo_bar_baz()
        self.assertIs(type(pdh), str)

    def test_keep_local_store(self):
        """A block put into Keep comes back under its md5+size locator."""
        locator = 'acbd18db4cc2f85cedef654fccc4a4d8+3'
        self.assertEqual(self.keep_client.put(b'foo'), locator,
                         'wrong md5 hash from Keep.put')
        self.assertEqual(self.keep_client.get(locator), b'foo',
                         'wrong data from Keep.get')

    def test_local_collection_writer(self):
        """The foo/bar/baz collection has the expected portable data hash."""
        # Write once and reuse the result in the failure message; calling
        # the helper a second time would save another collection.
        pdh = self.write_foo_bar_baz()
        self.assertEqual(pdh,
                         '23ca013983d6239e98931cc779e68426+114',
                         'wrong locator hash: ' + pdh)

    def test_collection_empty_file(self):
        """Zero-length files appear in the manifest with size 0."""
        cw = arvados.collection.Collection(api_client=self.api_client)
        with cw.open('zero.txt', 'wb'):
            pass

        self.assertEqual(cw.manifest_text(), ". d41d8cd98f00b204e9800998ecf8427e+0 0:0:zero.txt\n")
        self.check_manifest_file_sizes(cw.manifest_text(), [0])

        cw = arvados.collection.Collection(api_client=self.api_client)
        with cw.open('zero.txt', 'wb'):
            pass
        with cw.open('one.txt', 'wb') as f:
            f.write(b'1')
        with cw.open('foo/zero.txt', 'wb'):
            pass
        # sorted, that's: [./one.txt, ./zero.txt, foo/zero.txt]
        self.check_manifest_file_sizes(cw.manifest_text(), [1, 0, 0])

    def check_manifest_file_sizes(self, manifest_text, expect_sizes):
        """Assert that the files in manifest_text have sizes expect_sizes.

        Sizes are collected by walking a CollectionReader built from
        manifest_text depth-first, in its iteration order.
        """
        got_sizes = []

        def walk(subdir):
            for fnm in subdir:
                entry = subdir[fnm]
                if isinstance(entry, arvados.arvfile.ArvadosFile):
                    got_sizes.append(entry.size())
                else:
                    walk(entry)

        walk(arvados.CollectionReader(manifest_text, self.api_client))
        self.assertEqual(got_sizes, expect_sizes, "got wrong file sizes %s, expected %s" % (got_sizes, expect_sizes))

    def test_normalized_collection(self):
        """manifest_text(normalize=True) yields the expected normalized manifests."""
        def normalized(manifest):
            return arvados.CollectionReader(
                manifest, self.api_client).manifest_text(normalize=True)

        m1 = (". 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt\n"
              ". 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt\n"
              ". 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt\n")
        self.assertEqual(
            normalized(m1),
            ". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:127:md5sum.txt\n")

        m2 = ". 204e43b8a1185621ca55a94839582e6f+67108864 b9677abbac956bd3e86b1deb28dfac03+67108864 fc15aff2a762b13f521baf042140acec+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:227212247:var-GS000016015-ASM.tsv.bz2\n"
        self.assertEqual(normalized(m2), m2)

        m3 = (". 5348b82a029fd9e971a811ce1f71360b+43 3:40:md5sum.txt\n"
              ". 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt\n"
              ". 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt\n")
        self.assertEqual(
            normalized(m3),
            ". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 3:124:md5sum.txt\n")

        m4 = (". 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar\n"
              "./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz\n"
              "./foo 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar\n")
        self.assertEqual(
            normalized(m4),
            "./foo 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar\n"
            "./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz\n")

        m5 = (". 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar\n"
              "./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz\n"
              "./foo 204e43b8a1185621ca55a94839582e6f+67108864 3:3:bar\n")
        self.assertEqual(
            normalized(m5),
            "./foo 204e43b8a1185621ca55a94839582e6f+67108864 0:6:bar\n"
            "./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz\n")

        # These two data files are expected to come back unchanged.
        with self.data_file('1000G_ref_manifest') as f6:
            m6 = f6.read()
            self.assertEqual(normalized(m6), m6)

        with self.data_file('jlake_manifest') as f7:
            m7 = f7.read()
            self.assertEqual(normalized(m7), m7)

        m8 = "./a\\040b\\040c 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:hello\\040world.txt\n"
        self.assertEqual(normalized(m8), m8)

    def test_locators_and_ranges(self):
        """locators_and_ranges maps (start, size) requests onto block segments."""
        def check(blocks, start, size, expected):
            with self.subTest(start=start, size=size):
                self.assertEqual(locators_and_ranges(blocks, start, size), expected)

        # Six and seven equally sized (10-byte) consecutive blocks.
        names = 'abcdefg'
        blocks2 = [Range(name, i * 10, 10) for i, name in enumerate(names[:6])]
        blocks3 = [Range(name, i * 10, 10) for i, name in enumerate(names)]

        # Requests that fall entirely inside one block.
        for i, name in enumerate(names[:6]):
            check(blocks2, i * 10 + 2, 2, [LocatorAndRange(name, 10, 2, 2)])
        check(blocks2, 62, 2, [])
        check(blocks2, -2, 2, [])

        # Requests that begin exactly on a block boundary.
        for i, name in enumerate(names[:6]):
            check(blocks2, i * 10, 2, [LocatorAndRange(name, 10, 0, 2)])
        check(blocks2, 60, 2, [])

        # Requests that straddle two adjacent blocks.
        for i, name in enumerate(names[:5]):
            check(blocks2, i * 10 + 9, 2,
                  [LocatorAndRange(name, 10, 9, 1),
                   LocatorAndRange(names[i + 1], 10, 0, 1)])
        check(blocks2, 59, 2, [LocatorAndRange('f', 10, 9, 1)])

        for i, name in enumerate(names):
            check(blocks3, i * 10 + 2, 2, [LocatorAndRange(name, 10, 2, 2)])

        # Blocks of unequal size: (start, size, expected segments).
        blocks = [Range('a', 0, 10),
                  Range('b', 10, 15),
                  Range('c', 25, 5)]
        cases = [
            (1, 0, []),
            (0, 5, [LocatorAndRange('a', 10, 0, 5)]),
            (3, 5, [LocatorAndRange('a', 10, 3, 5)]),
            (0, 10, [LocatorAndRange('a', 10, 0, 10)]),
            (0, 11, [LocatorAndRange('a', 10, 0, 10),
                     LocatorAndRange('b', 15, 0, 1)]),
            (1, 11, [LocatorAndRange('a', 10, 1, 9),
                     LocatorAndRange('b', 15, 0, 2)]),
            (0, 25, [LocatorAndRange('a', 10, 0, 10),
                     LocatorAndRange('b', 15, 0, 15)]),
            (0, 30, [LocatorAndRange('a', 10, 0, 10),
                     LocatorAndRange('b', 15, 0, 15),
                     LocatorAndRange('c', 5, 0, 5)]),
            (1, 30, [LocatorAndRange('a', 10, 1, 9),
                     LocatorAndRange('b', 15, 0, 15),
                     LocatorAndRange('c', 5, 0, 5)]),
            (0, 31, [LocatorAndRange('a', 10, 0, 10),
                     LocatorAndRange('b', 15, 0, 15),
                     LocatorAndRange('c', 5, 0, 5)]),
            (15, 5, [LocatorAndRange('b', 15, 5, 5)]),
            (8, 17, [LocatorAndRange('a', 10, 8, 2),
                     LocatorAndRange('b', 15, 0, 15)]),
            (8, 20, [LocatorAndRange('a', 10, 8, 2),
                     LocatorAndRange('b', 15, 0, 15),
                     LocatorAndRange('c', 5, 0, 3)]),
            (26, 2, [LocatorAndRange('c', 5, 1, 2)]),
            (9, 15, [LocatorAndRange('a', 10, 9, 1),
                     LocatorAndRange('b', 15, 0, 14)]),
            (10, 15, [LocatorAndRange('b', 15, 0, 15)]),
            (11, 15, [LocatorAndRange('b', 15, 1, 14),
                      LocatorAndRange('c', 5, 0, 1)]),
        ]
        for start, size, expected in cases:
            check(blocks, start, size, expected)

    class MockKeep(object):
        """Keep stand-in whose get() looks the locator up in ``content``."""

        def __init__(self, content, num_retries=0):
            # num_retries is accepted for signature compatibility and ignored.
            self.content = content
            self.num_prefetch_threads = 1

        def get(self, locator, num_retries=0, prefetch=False):
            return self.content[locator]

    def test_extract_file(self):
        """Per-file manifest_text() reports only the blocks that file uses."""
        m1 = (". 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt\n"
              ". 085c37f02916da1cad16f93c54d899b7+41 0:41:md6sum.txt\n"
              ". 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md7sum.txt\n"
              ". 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 47:80:md8sum.txt\n"
              ". 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt\n")
        coll = arvados.CollectionReader(m1, self.api_client)
        m2 = coll.manifest_text(normalize=True)
        self.assertEqual(m2,
                         ". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt 43:41:md6sum.txt 84:43:md7sum.txt 6:37:md8sum.txt 84:43:md8sum.txt 83:1:md9sum.txt 0:43:md9sum.txt 84:36:md9sum.txt\n")
        self.assertEqual(coll['md5sum.txt'].manifest_text(),
                         ". 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt\n")
        self.assertEqual(coll['md6sum.txt'].manifest_text(),
                         ". 085c37f02916da1cad16f93c54d899b7+41 0:41:md6sum.txt\n")
        self.assertEqual(coll['md7sum.txt'].manifest_text(),
                         ". 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md7sum.txt\n")
        self.assertEqual(coll['md9sum.txt'].manifest_text(),
                         ". 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt\n")
284
285
class CollectionTestMixin(tutil.ApiClientMock):
    """Collection fixture constants plus an API client mock with a Keep proxy."""

    # Records from the 'collections' fixture, keyed by fixture name.
    API_COLLECTIONS = run_test_server.fixture('collections')

    DEFAULT_COLLECTION = API_COLLECTIONS['foo_file']
    DEFAULT_DATA_HASH = DEFAULT_COLLECTION['portable_data_hash']
    DEFAULT_MANIFEST = DEFAULT_COLLECTION['manifest_text']
    DEFAULT_UUID = DEFAULT_COLLECTION['uuid']

    ALT_COLLECTION = API_COLLECTIONS['bar_file']
    ALT_DATA_HASH = ALT_COLLECTION['portable_data_hash']
    ALT_MANIFEST = ALT_COLLECTION['manifest_text']

    def api_client_mock(self, status=200):
        """Return the parent's API client mock with one Keep proxy service mocked.

        ``status`` is passed on to ``mock_keep_services``.
        """
        mocked_client = super().api_client_mock()
        self.mock_keep_services(
            mocked_client, status=status, service_type='proxy', count=1)
        return mocked_client
300
301
@tutil.skip_sleep
class CollectionReaderTestCase(unittest.TestCase, CollectionTestMixin):
    """CollectionReader behavior against a mocked API client."""

    def mock_get_collection(self, api_mock, code, fixturename):
        """Make api_mock's collections().get answer with code and the named fixture."""
        body = self.API_COLLECTIONS.get(fixturename)
        self._mock_api_call(api_mock.collections().get, code, body)

    def api_client_mock(self, status=200):
        """Return a mocked client whose collections().get serves 'foo_file'.

        ``status`` applies to the collection lookup; the parent mock is
        built with its own default status.
        """
        client = super().api_client_mock()
        self.mock_get_collection(client, status, 'foo_file')
        return client

    def test_init_default_retries(self):
        """Without an explicit num_retries the API call uses num_retries=10."""
        client = self.api_client_mock(200)
        reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client)
        reader.manifest_text()
        client.collections().get().execute.assert_called_with(num_retries=10)

    def test_uuid_init_success(self):
        """A reader built from a UUID returns the fixture manifest."""
        client = self.api_client_mock(200)
        reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client,
                                          num_retries=3)
        self.assertEqual(self.DEFAULT_COLLECTION['manifest_text'],
                         reader.manifest_text())
        client.collections().get().execute.assert_called_with(num_retries=3)

    def test_uuid_init_failure_raises_api_error(self):
        """A 500 from the API surfaces as ApiError at construction time."""
        client = self.api_client_mock(500)
        with self.assertRaises(arvados.errors.ApiError):
            arvados.CollectionReader(self.DEFAULT_UUID, api_client=client)

    def test_locator_init(self):
        """A reader built from a portable data hash gets its manifest from the API."""
        client = self.api_client_mock(200)
        # Ensure Keep will not return anything if asked.
        with tutil.mock_keep_responses(None, 404):
            reader = arvados.CollectionReader(self.DEFAULT_DATA_HASH,
                                              api_client=client)
            self.assertEqual(self.DEFAULT_MANIFEST, reader.manifest_text())

    def test_init_no_fallback_to_keep(self):
        """An API 404 raises ApiError even when Keep could serve the manifest."""
        # Do not look up a collection UUID or PDH in Keep.
        for key in [self.DEFAULT_UUID, self.DEFAULT_DATA_HASH]:
            client = self.api_client_mock(404)
            with tutil.mock_keep_responses(self.DEFAULT_MANIFEST, 200):
                with self.assertRaises(arvados.errors.ApiError):
                    arvados.CollectionReader(key, api_client=client)

    def test_init_num_retries_propagated(self):
        """Reading succeeds after two mocked Keep 500s when num_retries=3."""
        # More of an integration test...
        client = self.api_client_mock(200)
        reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client,
                                          num_retries=3)
        with tutil.mock_keep_responses('foo', 500, 500, 200):
            # Use a context manager so the handle is closed even if the
            # assertion fails.
            with reader.open('foo', 'r') as f:
                self.assertEqual('foo', f.read())

    def test_read_nonnormalized_manifest_with_collection_reader(self):
        """A non-normalized manifest is readable and left untouched."""
        # client should be able to use CollectionReader on a manifest without normalizing it
        client = self.api_client_mock(500)
        nonnormal = ". acbd18db4cc2f85cedef654fccc4a4d8+3+Aabadbadbee@abeebdee 0:3:foo.txt 1:0:bar.txt 0:3:foo.txt\n"
        reader = arvados.CollectionReader(
            nonnormal,
            api_client=client, num_retries=0)
        # Ensure stripped_manifest() doesn't mangle our manifest in
        # any way other than stripping hints.
        self.assertEqual(
            re.sub(r'\+[^\d\s\+]+', '', nonnormal),
            reader.stripped_manifest())
        # Ensure stripped_manifest() didn't mutate our reader.
        self.assertEqual(nonnormal, reader.manifest_text())

    def test_read_empty_collection(self):
        """The empty collection has an empty manifest, length 0, and is falsy."""
        client = self.api_client_mock(200)
        self.mock_get_collection(client, 200, 'empty')
        reader = arvados.CollectionReader('d41d8cd98f00b204e9800998ecf8427e+0',
                                          api_client=client)
        self.assertEqual('', reader.manifest_text())
        self.assertEqual(0, len(reader))
        self.assertFalse(reader)

    def test_api_response(self):
        """api_response() returns the collection record served by the API mock."""
        client = self.api_client_mock()
        reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client)
        self.assertEqual(self.DEFAULT_COLLECTION, reader.api_response())

    def check_open_file(self, coll_file, stream_name, file_name, file_size):
        """Assert coll_file is open and has the given stream, name and size."""
        self.assertFalse(coll_file.closed, "returned file is not open")
        self.assertEqual(stream_name, coll_file.stream_name())
        self.assertEqual(file_name, coll_file.name)
        self.assertEqual(file_size, coll_file.size())

    def test_open_collection_file_one_argument(self):
        """open() accepts a single './name' path for a top-level file."""
        client = self.api_client_mock(200)
        reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client)
        cfile = reader.open('./foo', 'rb')
        self.check_open_file(cfile, '.', 'foo', 3)

    def test_open_deep_file(self):
        """open() resolves a file nested two subdirectories deep."""
        coll_name = 'collection_with_files_in_subdir'
        client = self.api_client_mock(200)
        self.mock_get_collection(client, 200, coll_name)
        reader = arvados.CollectionReader(
            self.API_COLLECTIONS[coll_name]['uuid'], api_client=client)
        cfile = reader.open('./subdir2/subdir3/file2_in_subdir3.txt', 'rb')
        self.check_open_file(cfile, './subdir2/subdir3', 'file2_in_subdir3.txt',
                             32)

    def test_open_nonexistent_stream(self):
        """Opening a path under a missing directory raises IOError."""
        client = self.api_client_mock(200)
        reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client)
        self.assertRaises(IOError, reader.open, './nonexistent/foo')

    def test_open_nonexistent_file(self):
        """Opening a missing top-level file raises IOError."""
        client = self.api_client_mock(200)
        reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client)
        self.assertRaises(IOError, reader.open, 'nonexistent')
416
417
class CollectionMethods(run_test_server.TestCaseWithServers):
    """Checks for Collection accessors: keys/values/items and saved metadata."""

    def test_keys_values_items_support_indexing(self):
        """keys(), values() and items() have a length and can be indexed."""
        coll = Collection()
        with coll.open('foo', 'wb') as out:
            out.write(b'foo')
        with coll.open('bar', 'wb') as out:
            out.write(b'bar')

        self.assertEqual(2, len(coll.keys()))
        first_name, second_name = coll.keys()

        self.assertEqual(2, len(coll.values()))
        # Indexing values() must simply not raise; the results are unused.
        coll.values()[0]
        coll.values()[1]

        self.assertEqual(2, len(coll.items()))
        self.assertEqual(first_name, coll.items()[0][0])
        self.assertEqual(second_name, coll.items()[1][0])

    def test_get_properties(self):
        """get_properties() is empty at first and reflects save_new(properties=...)."""
        coll = Collection()
        self.assertEqual(coll.get_properties(), {})
        coll.save_new(properties={"foo": "bar"})
        self.assertEqual(coll.get_properties(), {"foo": "bar"})

    def test_get_trash_at(self):
        """get_trash_at() is None at first and reflects save_new(trash_at=...)."""
        coll = Collection()
        self.assertEqual(coll.get_trash_at(), None)
        requested = datetime.datetime(2111, 1, 1, 11, 11, 11, 111111)
        coll.save_new(trash_at=requested)
        expected = ciso8601.parse_datetime('2111-01-01T11:11:11.111111000Z')
        self.assertEqual(coll.get_trash_at(), expected)
446
447
class CollectionOpenModes(run_test_server.TestCaseWithServers):
    """Which mode strings Collection.open() accepts, and how they behave."""

    def test_open_binary_modes(self):
        """Every binary write/append mode yields a handle that accepts bytes."""
        c = Collection()
        for mode in ['wb', 'wb+', 'ab', 'ab+']:
            with self.subTest(mode=mode):
                with c.open('foo', mode) as f:
                    f.write(b'foo')

    def test_open_invalid_modes(self):
        """Malformed mode strings, the empty string and None are rejected."""
        c = Collection()
        for mode in ['+r', 'aa', '++', 'r+b', 'beer', '', None]:
            # subTest names the offending mode when a case fails.
            with self.subTest(mode=mode):
                with self.assertRaises(Exception):
                    c.open('foo', mode)

    def test_open_text_modes(self):
        """Text modes read and write str.

        The modes are exercised in order against the same file, so modes
        after the first writable one see the content written earlier.
        """
        c = Collection()
        # The handle is opened in binary mode, so seed it with bytes.
        with c.open('foo', 'wb') as f:
            f.write(b'foo')
        for mode in ['r', 'rt', 'r+', 'rt+', 'w', 'wt', 'a', 'at']:
            with self.subTest(mode=mode):
                with c.open('foo', mode) as f:
                    if mode[0] == 'r' and '+' not in mode:
                        self.assertEqual('foo', f.read(3))
                    else:
                        f.write('bar')
                        f.seek(0, os.SEEK_SET)
                        self.assertEqual('bar', f.read(3))
474
475
class TextModes(run_test_server.TestCaseWithServers):
    """Text-mode I/O on multi-byte characters with a 4-byte Keep block size."""

    def setUp(self):
        """Shrink the Keep block size to 4 bytes, remembering the old value."""
        self._saved_block_size = arvados.config.KEEP_BLOCK_SIZE
        arvados.config.KEEP_BLOCK_SIZE = 4
        self.sailboat = '\N{SAILBOAT}'
        self.snowman = '\N{SNOWMAN}'

    def tearDown(self):
        """Put back whatever block size was in effect before setUp."""
        # Restore the saved value rather than a hard-coded constant so the
        # test cannot silently change the setting for later tests.
        arvados.config.KEEP_BLOCK_SIZE = self._saved_block_size

    def test_read_sailboat_across_block_boundary(self):
        """readline() in text mode decodes data stored as a 4-byte and a 3-byte block."""
        c = Collection()
        data = self.sailboat.encode('utf-8')
        with c.open('sailboats', 'wb') as f:
            f.write(data)
            f.write(data[:1])
            f.write(data[1:])
            f.write(b'\n')
        self.assertRegex(c.portable_manifest_text(), r'\+4 .*\+3 ')

        with c.open('sailboats', 'r') as f:
            string = f.readline()
        self.assertEqual(string, self.sailboat+self.sailboat+'\n')

    def test_write_snowman_across_block_boundary(self):
        """Text written in 'w' mode is stored in 4/4/3-byte blocks and reads back intact."""
        c = Collection()
        data = self.snowman
        with c.open('snowmany', 'w') as f:
            f.write(data+data+'\n'+data+'\n')
        self.assertRegex(c.portable_manifest_text(), r'\+4 .*\+4 .*\+3 ')

        with c.open('snowmany', 'r') as f:
            self.assertEqual(f.readline(), self.snowman+self.snowman+'\n')
            self.assertEqual(f.readline(), self.snowman+'\n')
514
515
516 class NewCollectionTestCase(unittest.TestCase, CollectionTestMixin):
517
518     def test_replication_desired_kept_on_load(self):
519         m = '. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n'
520         c1 = Collection(m, replication_desired=1)
521         c1.save_new()
522         loc = c1.manifest_locator()
523         c2 = Collection(loc)
524         self.assertEqual(c1.manifest_text(strip=True), c2.manifest_text(strip=True))
525         self.assertEqual(c1.replication_desired, c2.replication_desired)
526
527     def test_replication_desired_not_loaded_if_provided(self):
528         m = '. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n'
529         c1 = Collection(m, replication_desired=1)
530         c1.save_new()
531         loc = c1.manifest_locator()
532         c2 = Collection(loc, replication_desired=2)
533         self.assertEqual(c1.manifest_text(strip=True), c2.manifest_text(strip=True))
534         self.assertNotEqual(c1.replication_desired, c2.replication_desired)
535
536     def test_storage_classes_desired_kept_on_load(self):
537         m = '. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n'
538         c1 = Collection(m, storage_classes_desired=['archival'])
539         c1.save_new()
540         loc = c1.manifest_locator()
541         c2 = Collection(loc)
542         self.assertEqual(c1.manifest_text(strip=True), c2.manifest_text(strip=True))
543         self.assertEqual(c1.storage_classes_desired(), c2.storage_classes_desired())
544
545     def test_storage_classes_change_after_save(self):
546         m = '. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n'
547         c1 = Collection(m, storage_classes_desired=['archival'])
548         c1.save_new()
549         loc = c1.manifest_locator()
550         c2 = Collection(loc)
551         self.assertEqual(['archival'], c2.storage_classes_desired())
552         c2.save(storage_classes=['highIO'])
553         self.assertEqual(['highIO'], c2.storage_classes_desired())
554         c3 = Collection(loc)
555         self.assertEqual(c1.manifest_text(strip=True), c3.manifest_text(strip=True))
556         self.assertEqual(['highIO'], c3.storage_classes_desired())
557
558     def test_storage_classes_desired_not_loaded_if_provided(self):
559         m = '. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n'
560         c1 = Collection(m, storage_classes_desired=['archival'])
561         c1.save_new()
562         loc = c1.manifest_locator()
563         c2 = Collection(loc, storage_classes_desired=['default'])
564         self.assertEqual(c1.manifest_text(strip=True), c2.manifest_text(strip=True))
565         self.assertNotEqual(c1.storage_classes_desired(), c2.storage_classes_desired())
566
567     def test_init_manifest(self):
568         m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt
569 . 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
570 . 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt
571 """
572         self.assertEqual(m1, CollectionReader(m1).manifest_text(normalize=False))
573         self.assertEqual(". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:127:md5sum.txt\n", CollectionReader(m1).manifest_text(normalize=True))
574
575     def test_init_manifest_with_collision(self):
576         m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt
577 ./md5sum.txt 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
578 """
579         with self.assertRaises(arvados.errors.ArgumentError):
580             self.assertEqual(m1, CollectionReader(m1))
581
582     def test_init_manifest_with_error(self):
583         m1 = """. 0:43:md5sum.txt"""
584         with self.assertRaises(arvados.errors.ArgumentError):
585             self.assertEqual(m1, CollectionReader(m1))
586
587     def test_remove(self):
588         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n')
589         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n", c.portable_manifest_text())
590         self.assertIn("count1.txt", c)
591         c.remove("count1.txt")
592         self.assertNotIn("count1.txt", c)
593         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n", c.portable_manifest_text())
594         with self.assertRaises(arvados.errors.ArgumentError):
595             c.remove("")
596
597     def test_remove_recursive(self):
598         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:a/b/c/d/efg.txt 0:10:xyz.txt\n')
599         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:xyz.txt\n./a/b/c/d 781e5e245d69b566979b86e28d23f2c7+10 0:10:efg.txt\n", c.portable_manifest_text())
600         self.assertIn("a", c)
601         self.assertEqual(1, len(c["a"].keys()))
602         # cannot remove non-empty directory with default recursive=False
603         with self.assertRaises(OSError):
604             c.remove("a/b")
605         with self.assertRaises(OSError):
606             c.remove("a/b/c/d")
607         c.remove("a/b", recursive=True)
608         self.assertEqual(0, len(c["a"].keys()))
609         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:xyz.txt\n./a d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\056\n", c.portable_manifest_text())
610
611     def test_find(self):
612         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n')
613         self.assertIs(c.find("."), c)
614         self.assertIs(c.find("./count1.txt"), c["count1.txt"])
615         self.assertIs(c.find("count1.txt"), c["count1.txt"])
616         with self.assertRaises(IOError):
617             c.find("/.")
618         with self.assertRaises(arvados.errors.ArgumentError):
619             c.find("")
620         self.assertIs(c.find("./nonexistant.txt"), None)
621         self.assertIs(c.find("./nonexistantsubdir/nonexistant.txt"), None)
622
623     def test_escaped_paths_dont_get_unescaped_on_manifest(self):
624         # Dir & file names are literally '\056' (escaped form: \134056)
625         manifest = './\\134056\\040Test d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\134056\n'
626         c = Collection(manifest)
627         self.assertEqual(c.portable_manifest_text(), manifest)
628
629     def test_other_special_chars_on_file_token(self):
630         cases = [
631             ('\\000', '\0'),
632             ('\\011', '\t'),
633             ('\\012', '\n'),
634             ('\\072', ':'),
635             ('\\134400', '\\400'),
636         ]
637         for encoded, decoded in cases:
638             manifest = '. d41d8cd98f00b204e9800998ecf8427e+0 0:0:some%sfile.txt\n' % encoded
639             c = Collection(manifest)
640             self.assertEqual(c.portable_manifest_text(), manifest)
641             self.assertIn('some%sfile.txt' % decoded, c.keys())
642
643     def test_escaped_paths_do_get_unescaped_on_listing(self):
644         # Dir & file names are literally '\056' (escaped form: \134056)
645         manifest = './\\134056\\040Test d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\134056\n'
646         c = Collection(manifest)
647         self.assertIn('\\056 Test', c.keys())
648         self.assertIn('\\056', c['\\056 Test'].keys())
649
650     def test_make_empty_dir_with_escaped_chars(self):
651         c = Collection()
652         c.mkdirs('./Empty\\056Dir')
653         self.assertEqual(c.portable_manifest_text(),
654                          './Empty\\134056Dir d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\056\n')
655
656     def test_make_empty_dir_with_spaces(self):
657         c = Collection()
658         c.mkdirs('./foo bar/baz waz')
659         self.assertEqual(c.portable_manifest_text(),
660                          './foo\\040bar/baz\\040waz d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\056\n')
661
662     def test_remove_in_subdir(self):
663         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n')
664         c.remove("foo/count2.txt")
665         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\056\n", c.portable_manifest_text())
666
667     def test_remove_empty_subdir(self):
668         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n')
669         c.remove("foo/count2.txt")
670         c.remove("foo")
671         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n", c.portable_manifest_text())
672
673     def test_remove_nonempty_subdir(self):
674         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n')
675         with self.assertRaises(IOError):
676             c.remove("foo")
677         c.remove("foo", recursive=True)
678         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n", c.portable_manifest_text())
679
680     def test_copy_to_file_in_dir(self):
681         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
682         c.copy("count1.txt", "foo/count2.txt")
683         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n", c.portable_manifest_text())
684
685     def test_copy_file(self):
686         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
687         c.copy("count1.txt", "count2.txt")
688         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n", c.portable_manifest_text())
689
690     def test_copy_to_existing_dir(self):
691         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n')
692         c.copy("count1.txt", "foo")
693         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n", c.portable_manifest_text())
694
695     def test_copy_to_new_dir(self):
696         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
697         c.copy("count1.txt", "foo/")
698         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n", c.portable_manifest_text())
699
700     def test_rename_file(self):
701         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
702         c.rename("count1.txt", "count2.txt")
703         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n", c.manifest_text())
704
705     def test_move_file_to_dir(self):
706         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
707         c.mkdirs("foo")
708         c.rename("count1.txt", "foo/count2.txt")
709         self.assertEqual("./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n", c.manifest_text())
710
711     def test_move_file_to_other(self):
712         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
713         c2 = Collection()
714         c2.rename("count1.txt", "count2.txt", source_collection=c1)
715         self.assertEqual("", c1.manifest_text())
716         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n", c2.manifest_text())
717
718     def test_clone(self):
719         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n')
720         cl = c.clone()
721         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n", cl.portable_manifest_text())
722
723     def test_diff_del_add(self):
724         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
725         c2 = Collection('. 5348b82a029fd9e971a811ce1f71360b+43 0:10:count2.txt\n')
726         d = c2.diff(c1)
727         self.assertEqual(sorted(d), [
728             ('add', './count1.txt', c1["count1.txt"]),
729             ('del', './count2.txt', c2["count2.txt"]),
730         ])
731         d = c1.diff(c2)
732         self.assertEqual(sorted(d), [
733             ('add', './count2.txt', c2["count2.txt"]),
734             ('del', './count1.txt', c1["count1.txt"]),
735         ])
736         self.assertNotEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
737         c1.apply(d)
738         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
739
740     def test_diff_same(self):
741         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
742         c2 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
743         d = c2.diff(c1)
744         self.assertEqual(d, [('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"])])
745         d = c1.diff(c2)
746         self.assertEqual(d, [('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"])])
747
748         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
749         c1.apply(d)
750         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
751
752     def test_diff_mod(self):
753         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
754         c2 = Collection('. 5348b82a029fd9e971a811ce1f71360b+43 0:10:count1.txt\n')
755         d = c2.diff(c1)
756         self.assertEqual(d, [('mod', './count1.txt', c2["count1.txt"], c1["count1.txt"])])
757         d = c1.diff(c2)
758         self.assertEqual(d, [('mod', './count1.txt', c1["count1.txt"], c2["count1.txt"])])
759
760         self.assertNotEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
761         c1.apply(d)
762         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
763
764     def test_diff_add(self):
765         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
766         c2 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 5348b82a029fd9e971a811ce1f71360b+43 0:10:count1.txt 10:20:count2.txt\n')
767         d = c2.diff(c1)
768         self.assertEqual(sorted(d), [
769             ('del', './count2.txt', c2["count2.txt"]),
770             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
771         ])
772         d = c1.diff(c2)
773         self.assertEqual(sorted(d), [
774             ('add', './count2.txt', c2["count2.txt"]),
775             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
776         ])
777
778         self.assertNotEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
779         c1.apply(d)
780         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
781
782     def test_diff_add_in_subcollection(self):
783         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
784         c2 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 5348b82a029fd9e971a811ce1f71360b+43 0:10:count2.txt\n')
785         d = c2.diff(c1)
786         self.assertEqual(sorted(d), [
787             ('del', './foo', c2["foo"]),
788             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
789         ])
790         d = c1.diff(c2)
791         self.assertEqual(sorted(d), [
792             ('add', './foo', c2["foo"]),
793             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
794         ])
795         self.assertNotEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
796         c1.apply(d)
797         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
798
799     def test_diff_del_add_in_subcollection(self):
800         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 5348b82a029fd9e971a811ce1f71360b+43 0:10:count2.txt\n')
801         c2 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 5348b82a029fd9e971a811ce1f71360b+43 0:3:count3.txt\n')
802         d = c2.diff(c1)
803         self.assertEqual(sorted(d), [
804             ('add', './foo/count2.txt', c1.find("foo/count2.txt")),
805             ('del', './foo/count3.txt', c2.find("foo/count3.txt")),
806             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
807         ])
808         d = c1.diff(c2)
809         self.assertEqual(sorted(d), [
810             ('add', './foo/count3.txt', c2.find("foo/count3.txt")),
811             ('del', './foo/count2.txt', c1.find("foo/count2.txt")),
812             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
813         ])
814
815         self.assertNotEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
816         c1.apply(d)
817         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
818
819     def test_diff_mod_in_subcollection(self):
820         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 5348b82a029fd9e971a811ce1f71360b+43 0:10:count2.txt\n')
821         c2 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:3:foo\n')
822         d = c2.diff(c1)
823         self.assertEqual(sorted(d), [
824             ('mod', './foo', c2["foo"], c1["foo"]),
825             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
826         ])
827         d = c1.diff(c2)
828         self.assertEqual(sorted(d), [
829             ('mod', './foo', c1["foo"], c2["foo"]),
830             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
831         ])
832
833         self.assertNotEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
834         c1.apply(d)
835         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
836
837     def test_conflict_keep_local_change(self):
838         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
839         c2 = Collection('. 5348b82a029fd9e971a811ce1f71360b+43 0:10:count2.txt\n')
840         d = c1.diff(c2)
841         self.assertEqual(sorted(d), [
842             ('add', './count2.txt', c2["count2.txt"]),
843             ('del', './count1.txt', c1["count1.txt"]),
844         ])
845         f = c1.open("count1.txt", "wb")
846         f.write(b"zzzzz")
847
848         # c1 changed, so it should not be deleted.
849         c1.apply(d)
850         self.assertEqual(c1.portable_manifest_text(), ". 95ebc3c7b3b9f1d2c40fec14415d3cb8+5 5348b82a029fd9e971a811ce1f71360b+43 0:5:count1.txt 5:10:count2.txt\n")
851
852     def test_conflict_mod(self):
853         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt')
854         c2 = Collection('. 5348b82a029fd9e971a811ce1f71360b+43 0:10:count1.txt')
855         d = c1.diff(c2)
856         self.assertEqual(d, [('mod', './count1.txt', c1["count1.txt"], c2["count1.txt"])])
857         f = c1.open("count1.txt", "wb")
858         f.write(b"zzzzz")
859
860         # c1 changed, so c2 mod will go to a conflict file
861         c1.apply(d)
862         self.assertRegex(
863             c1.portable_manifest_text(),
864             r"\. 95ebc3c7b3b9f1d2c40fec14415d3cb8\+5 5348b82a029fd9e971a811ce1f71360b\+43 0:5:count1\.txt 5:10:count1\.txt~\d\d\d\d\d\d\d\d-\d\d\d\d\d\d~conflict~$")
865
866     def test_conflict_add(self):
867         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n')
868         c2 = Collection('. 5348b82a029fd9e971a811ce1f71360b+43 0:10:count1.txt\n')
869         d = c1.diff(c2)
870         self.assertEqual(sorted(d), [
871             ('add', './count1.txt', c2["count1.txt"]),
872             ('del', './count2.txt', c1["count2.txt"]),
873         ])
874         f = c1.open("count1.txt", "wb")
875         f.write(b"zzzzz")
876
877         # c1 added count1.txt, so c2 add will go to a conflict file
878         c1.apply(d)
879         self.assertRegex(
880             c1.portable_manifest_text(),
881             r"\. 95ebc3c7b3b9f1d2c40fec14415d3cb8\+5 5348b82a029fd9e971a811ce1f71360b\+43 0:5:count1\.txt 5:10:count1\.txt~\d\d\d\d\d\d\d\d-\d\d\d\d\d\d~conflict~$")
882
883     def test_conflict_del(self):
884         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt')
885         c2 = Collection('. 5348b82a029fd9e971a811ce1f71360b+43 0:10:count1.txt')
886         d = c1.diff(c2)
887         self.assertEqual(d, [('mod', './count1.txt', c1["count1.txt"], c2["count1.txt"])])
888         c1.remove("count1.txt")
889
890         # c1 deleted, so c2 mod will go to a conflict file
891         c1.apply(d)
892         self.assertRegex(
893             c1.portable_manifest_text(),
894             r"\. 5348b82a029fd9e971a811ce1f71360b\+43 0:10:count1\.txt~\d\d\d\d\d\d\d\d-\d\d\d\d\d\d~conflict~$")
895
896     def test_notify(self):
897         c1 = Collection()
898         events = []
899         c1.subscribe(lambda event, collection, name, item: events.append((event, collection, name, item)))
900         f = c1.open("foo.txt", "wb")
901         self.assertEqual(events[0], (arvados.collection.ADD, c1, "foo.txt", f.arvadosfile))
902
903     def test_open_w(self):
904         c1 = Collection(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n")
905         self.assertEqual(c1["count1.txt"].size(), 10)
906         c1.open("count1.txt", "wb").close()
907         self.assertEqual(c1["count1.txt"].size(), 0)
908
909
class NewCollectionTestCaseWithServersAndTokens(run_test_server.TestCaseWithServers):
    """Collection tests that inspect signed block locators in manifests.

    `local_locator_re` matches a locator carrying a "+A" signature hint and
    `remote_locator_re` one carrying a "+R" hint; the tests count how many
    of each appear in a collection's manifest text.
    """
    MAIN_SERVER = {}
    KEEP_SERVER = {}
    local_locator_re = r"[0-9a-f]{32}\+\d+\+A[a-f0-9]{40}@[a-f0-9]{8}"
    remote_locator_re = r"[0-9a-f]{32}\+\d+\+R[a-z]{5}-[a-f0-9]{40}@[a-f0-9]{8}"

    def setUp(self):
        # Remember the real, unpatched put() so that tests which patch
        # KeepClient.put can still delegate to it via side_effect.
        self.keep_put = arvados.keep.KeepClient.put

    def _count_locators(self, pattern, collection):
        # Number of non-overlapping matches of `pattern` in the collection's
        # current manifest text.
        return len(re.findall(pattern, collection.manifest_text()))

    @mock.patch('arvados.keep.KeepClient.put', autospec=True)
    def test_storage_classes_desired(self, put_mock):
        # The storage classes given to the Collection constructor must be
        # forwarded to KeepClient.put() as the `classes` keyword argument.
        put_mock.side_effect = self.keep_put
        c = Collection(storage_classes_desired=['default'])
        with c.open("file.txt", 'wb') as f:
            # The handle is opened in binary mode, so write bytes.
            f.write(b'content')
        c.save_new()
        _, kwargs = put_mock.call_args
        self.assertEqual(['default'], kwargs['classes'])

    @mock.patch('arvados.keep.KeepClient.put', autospec=True)
    def test_repacked_block_submission_get_permission_token(self, mocked_put):
        '''
        Make sure that those blocks that are committed after repacking small ones,
        get their permission tokens assigned on the collection manifest.
        '''
        def wrapped_keep_put(*args, **kwargs):
            # Simulate slow put operations
            time.sleep(1)
            return self.keep_put(*args, **kwargs)

        mocked_put.side_effect = wrapped_keep_put
        c = Collection()
        # Write 70 files ~1MiB each so we force to produce 1 big block by repacking
        # small ones before finishing the upload.
        for i in range(70):
            f = c.open("file_{}.txt".format(i), 'wb')
            # One random lowercase letter repeated, encoded to bytes because
            # the handle is opened in binary mode.
            letter = random.choice('abcdefghijklmnopqrstuvwxyz').encode()
            f.write(letter * (2**20+i))
            f.close(flush=False)
        # We should get 2 blocks with their tokens
        self.assertEqual(self._count_locators(self.local_locator_re, c), 2)

    @mock.patch('arvados.keep.KeepClient.refresh_signature')
    def test_copy_remote_blocks_on_save_new(self, rs_mock):
        # save_new() on a collection that references a remote-signed block
        # must call refresh_signature and end up with a local locator.
        remote_block_loc = "acbd18db4cc2f85cedef654fccc4a4d8+3+Remote-" + "a" * 40 + "@abcdef01"
        local_block_loc = "acbd18db4cc2f85cedef654fccc4a4d8+3+A" + "b" * 40 + "@abcdef01"
        rs_mock.return_value = local_block_loc
        c = Collection(". " + remote_block_loc + " 0:3:foofile.txt\n")
        self.assertEqual(self._count_locators(self.remote_locator_re, c), 1)
        self.assertEqual(self._count_locators(self.local_locator_re, c), 0)
        c.save_new()
        rs_mock.assert_called()
        self.assertEqual(self._count_locators(self.remote_locator_re, c), 0)
        self.assertEqual(self._count_locators(self.local_locator_re, c), 1)

    @mock.patch('arvados.keep.KeepClient.refresh_signature')
    def test_copy_remote_blocks_on_save(self, rs_mock):
        # Same as above, but the remote block arrives by copying a file from
        # another collection into an already saved local one.
        remote_block_loc = "acbd18db4cc2f85cedef654fccc4a4d8+3+Remote-" + "a" * 40 + "@abcdef01"
        local_block_loc = "acbd18db4cc2f85cedef654fccc4a4d8+3+A" + "b" * 40 + "@abcdef01"
        rs_mock.return_value = local_block_loc
        # Remote collection
        remote_c = Collection(". " + remote_block_loc + " 0:3:foofile.txt\n")
        self.assertEqual(self._count_locators(self.remote_locator_re, remote_c), 1)
        # Local collection
        local_c = Collection()
        with local_c.open('barfile.txt', 'wb') as f:
            # The handle is opened in binary mode, so write bytes.
            f.write(b'bar')
        local_c.save_new()
        self.assertEqual(self._count_locators(self.local_locator_re, local_c), 1)
        self.assertEqual(self._count_locators(self.remote_locator_re, local_c), 0)
        # Copy remote file to local collection
        local_c.copy('./foofile.txt', './copied/foofile.txt', remote_c)
        self.assertEqual(self._count_locators(self.local_locator_re, local_c), 1)
        self.assertEqual(self._count_locators(self.remote_locator_re, local_c), 1)
        # Save local collection: remote block should be copied
        local_c.save()
        rs_mock.assert_called()
        self.assertEqual(self._count_locators(self.local_locator_re, local_c), 2)
        self.assertEqual(self._count_locators(self.remote_locator_re, local_c), 0)
999
1000
class NewCollectionTestCaseWithServers(run_test_server.TestCaseWithServers):
    """Tests for saving collections and for how written blocks are packed."""

    def test_preserve_version_on_save(self):
        # Every save bumps the record's version, and preserve_version on the
        # record mirrors the flag passed to the latest save call.
        coll = Collection()

        def fetch_record():
            # Read the record back with a fresh API client each time.
            return arvados.api().collections().get(uuid=coll.manifest_locator()).execute()

        coll.save_new(preserve_version=True)
        record = fetch_record()
        self.assertEqual(record['version'], 1)
        self.assertEqual(record['preserve_version'], True)

        with coll.open("foo.txt", "wb") as writer:
            writer.write(b"foo")
        coll.save(preserve_version=True)
        record = fetch_record()
        self.assertEqual(record['version'], 2)
        self.assertEqual(record['preserve_version'], True)

        with coll.open("bar.txt", "wb") as writer:
            writer.write(b"bar")
        coll.save(preserve_version=False)
        record = fetch_record()
        self.assertEqual(record['version'], 3)
        self.assertEqual(record['preserve_version'], False)

    def test_get_manifest_text_only_committed(self):
        # With only_committed=True, data that has not been flushed yet must
        # show up as a zero-length file in the manifest.
        coll = Collection()

        def committed_manifest():
            return coll._get_manifest_text(".",
                                           strip=False,
                                           normalize=False,
                                           only_committed=True)

        with coll.open("count.txt", "wb") as count_file:
            # One file committed
            with coll.open("foo.txt", "wb") as foo_file:
                foo_file.write(b"foo")
                foo_file.flush() # Force block commit
            count_file.write(b"0123456789")
            # Other file not committed. Block not written to keep yet.
            self.assertEqual(
                committed_manifest(),
                '. acbd18db4cc2f85cedef654fccc4a4d8+3 0:0:count.txt 0:3:foo.txt\n')
            # And now with the file closed...
            count_file.flush() # Force block commit
        self.assertEqual(
            committed_manifest(),
            ". 781e5e245d69b566979b86e28d23f2c7+10 acbd18db4cc2f85cedef654fccc4a4d8+3 0:10:count.txt 10:3:foo.txt\n")

    def test_only_small_blocks_are_packed_together(self):
        coll = Collection()
        # Write a couple of small files,
        for name, payload in (("count.txt", b"0123456789"), ("foo.txt", b"foo")):
            small = coll.open(name, "wb")
            small.write(payload)
            small.close(flush=False)
        # Then, write a big file, it shouldn't be packed with the ones above
        big = coll.open("bigfile.txt", "wb")
        big.write(b"x" * 1024 * 1024 * 33) # 33 MB > KEEP_BLOCK_SIZE/2
        big.close(flush=False)
        expected = '. 2d303c138c118af809f39319e5d507e9+34603008 a8430a058b8fbf408e1931b794dbd6fb+13 0:34603008:bigfile.txt 34603008:10:count.txt 34603018:3:foo.txt\n'
        self.assertEqual(coll.manifest_text("."), expected)

    def test_flush_after_small_block_packing(self):
        # Re-opening a packed file and closing it with flush=True, without
        # writing anything, must leave the manifest unchanged.
        packed = '. a8430a058b8fbf408e1931b794dbd6fb+13 0:10:count.txt 10:3:foo.txt\n'
        coll = Collection()
        # Write a couple of small files,
        for name, payload in (("count.txt", b"0123456789"), ("foo.txt", b"foo")):
            small = coll.open(name, "wb")
            small.write(payload)
            small.close(flush=False)

        self.assertEqual(coll.manifest_text(), packed)

        reopened = coll.open("count.txt", "rb+")
        reopened.close(flush=True)

        self.assertEqual(coll.manifest_text(), packed)

    def test_write_after_small_block_packing2(self):
        # Overwriting the start of a file that was already packed adds a new
        # block for the new bytes; the rest still points at the packed block.
        coll = Collection()
        # Write a couple of small files,
        for name, payload in (("count.txt", b"0123456789"), ("foo.txt", b"foo")):
            small = coll.open(name, "wb")
            small.write(payload)
            small.close(flush=False)

        self.assertEqual(
            coll.manifest_text(),
            '. a8430a058b8fbf408e1931b794dbd6fb+13 0:10:count.txt 10:3:foo.txt\n')

        reopened = coll.open("count.txt", "rb+")
        reopened.write(b"abc")
        reopened.close(flush=False)

        self.assertEqual(
            coll.manifest_text(),
            '. 900150983cd24fb0d6963f7d28e17f72+3 a8430a058b8fbf408e1931b794dbd6fb+13 0:3:count.txt 6:7:count.txt 13:3:foo.txt\n')

    def test_small_block_packing_with_overwrite(self):
        # Two 2-byte files are written, then the first is overwritten in
        # place; both must end up described by one shared 4-byte block.
        coll = Collection()
        coll.open("b1", "wb").close()
        coll["b1"].writeto(0, b"b1", 0)

        coll.open("b2", "wb").close()
        coll["b2"].writeto(0, b"b2", 0)

        coll["b1"].writeto(0, b"1b", 0)

        shared_block = "ed4f3f67c70b02b29c50ce1ea26666bd+4"
        self.assertEqual(shared_block, "ed4f3f67c70b02b29c50ce1ea26666bd+4")
        self.assertEqual(coll.manifest_text(), ". ed4f3f67c70b02b29c50ce1ea26666bd+4 0:2:b1 2:2:b2\n")
        self.assertEqual(coll["b1"].manifest_text(), ". ed4f3f67c70b02b29c50ce1ea26666bd+4 0:2:b1\n")
        self.assertEqual(coll["b2"].manifest_text(), ". ed4f3f67c70b02b29c50ce1ea26666bd+4 2:2:b2\n")
1119
1120
class CollectionCreateUpdateTest(run_test_server.TestCaseWithServers):
    """Tests for creating, saving, diffing and updating saved collections."""
    MAIN_SERVER = {}
    KEEP_SERVER = {}

    def create_count_txt(self):
        """Return a saved empty collection with an unsaved count.txt added.

        Create an empty collection, save it to the API server, then write a
        file, but don't save it.
        """
        empty_pdh = "d41d8cd98f00b204e9800998ecf8427e+0"
        coll = Collection()
        coll.save_new("CollectionCreateUpdateTest", ensure_unique_name=True)
        self.assertEqual(coll.portable_data_hash(), empty_pdh)
        self.assertEqual(coll.api_response()["portable_data_hash"], empty_pdh)

        with coll.open("count.txt", "wb") as writer:
            writer.write(b"0123456789")

        self.assertEqual(
            coll.portable_manifest_text(),
            ". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count.txt\n")

        return coll

    def test_create_and_save(self):
        # save() must store the given properties, storage classes and
        # trash_at on the record and sign the block locator.
        coll = self.create_count_txt()
        coll.save(
            properties={'type' : 'Intermediate'},
            storage_classes=['archive'],
            trash_at=datetime.datetime(2111, 1, 1, 11, 11, 11, 111111),
        )

        signed_manifest = r"^\. 781e5e245d69b566979b86e28d23f2c7\+10\+A[a-f0-9]{40}@[a-f0-9]{8} 0:10:count\.txt$"
        self.assertRegex(coll.manifest_text(), signed_manifest)
        record = coll.api_response()
        self.assertEqual(record["storage_classes_desired"], ['archive'])
        self.assertEqual(record["properties"], {'type' : 'Intermediate'})
        self.assertEqual(record["trash_at"], '2111-01-01T11:11:11.111111000Z')

    def test_create_and_save_new(self):
        # save_new() must store the same attributes as save() does.
        coll = self.create_count_txt()
        coll.save_new(
            properties={'type' : 'Intermediate'},
            storage_classes=['archive'],
            trash_at=datetime.datetime(2111, 1, 1, 11, 11, 11, 111111),
        )

        signed_manifest = r"^\. 781e5e245d69b566979b86e28d23f2c7\+10\+A[a-f0-9]{40}@[a-f0-9]{8} 0:10:count\.txt$"
        self.assertRegex(coll.manifest_text(), signed_manifest)
        record = coll.api_response()
        self.assertEqual(record["storage_classes_desired"], ['archive'])
        self.assertEqual(record["properties"], {'type' : 'Intermediate'})
        self.assertEqual(record["trash_at"], '2111-01-01T11:11:11.111111000Z')

    def test_create_and_save_after_commiting(self):
        # A second save() with different attributes must replace the values
        # stored by the first one.
        coll = self.create_count_txt()
        coll.save(
            properties={'type' : 'Intermediate'},
            storage_classes=['hot'],
            trash_at=datetime.datetime(2111, 1, 1, 11, 11, 11, 111111),
        )
        coll.save(
            properties={'type' : 'Output'},
            storage_classes=['cold'],
            trash_at=datetime.datetime(2222, 2, 2, 22, 22, 22, 222222),
        )

        record = coll.api_response()
        self.assertEqual(record["storage_classes_desired"], ['cold'])
        self.assertEqual(record["properties"], {'type' : 'Output'})
        self.assertEqual(record["trash_at"], '2222-02-02T22:22:22.222222000Z')

    def test_create_diff_apply(self):
        # Diff a saved collection against a locally modified second copy,
        # then apply the diff so both have the same portable data hash.
        c1 = self.create_count_txt()
        c1.save()

        c2 = Collection(c1.manifest_locator())
        with c2.open("count.txt", "wb") as writer:
            writer.write(b"abcdefg")

        changes = c1.diff(c2)
        first_change = (arvados.collection.MOD, u'./count.txt', c1["count.txt"], c2["count.txt"])
        self.assertEqual(changes[0], first_change)

        c1.apply(changes)
        self.assertEqual(c1.portable_data_hash(), c2.portable_data_hash())

    def test_diff_apply_with_token(self):
        # A diff computed between signed manifests is applied to an unsigned
        # collection; the result must carry the signed locator from `other`.
        other_text = ". 7ac66c0f148de9519b8bd264312c4d64+7+A715fd31f8111894f717eb1003c1b0216799dd9ec@54f5dd1a 0:7:count.txt\n"
        baseline = CollectionReader(". 781e5e245d69b566979b86e28d23f2c7+10+A715fd31f8111894f717eb1003c1b0216799dd9ec@54f5dd1a 0:10:count.txt\n")
        unsigned = Collection(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count.txt\n")
        other = CollectionReader(other_text)

        changes = baseline.diff(other)
        self.assertEqual(changes, [('mod', u'./count.txt', unsigned["count.txt"], other["count.txt"])])

        unsigned.apply(changes)

        self.assertEqual(unsigned.manifest_text(), other_text)

    def test_create_and_update(self):
        # update() must bring c1 in line with what c2 saved to the server.
        c1 = self.create_count_txt()
        c1.save()

        c2 = arvados.collection.Collection(c1.manifest_locator())
        with c2.open("count.txt", "wb") as writer:
            writer.write(b"abcdefg")
        c2.save()

        self.assertNotEqual(c1.portable_data_hash(), c2.portable_data_hash())
        c1.update()
        self.assertEqual(c1.portable_data_hash(), c2.portable_data_hash())

    def test_create_and_update_with_conflict(self):
        # c1 has an unsaved local edit when update() pulls in c2's saved
        # change, so c2's content must land in a "~conflict~" file.
        c1 = self.create_count_txt()
        c1.save()

        with c1.open("count.txt", "wb") as writer:
            writer.write(b"XYZ")

        c2 = arvados.collection.Collection(c1.manifest_locator())
        with c2.open("count.txt", "wb") as writer:
            writer.write(b"abcdefg")
        c2.save()

        c1.update()
        conflict_pattern = r"\. e65075d550f9b5bf9992fa1d71a131be\+3\S* 7ac66c0f148de9519b8bd264312c4d64\+7\S* 0:3:count\.txt 3:7:count\.txt~\d\d\d\d\d\d\d\d-\d\d\d\d\d\d~conflict~$"
        self.assertRegex(c1.manifest_text(), conflict_pattern)

    def test_pdh_is_native_str(self):
        # portable_data_hash() must return exactly the built-in str type.
        pdh = self.create_count_txt().portable_data_hash()
        self.assertEqual(str, type(pdh))
1246
1247
# Run the tests in this module when the file is executed as a script.
if "__main__" == __name__:
    unittest.main()