Merge branch '21855-rpm-url-update'
[arvados.git] / sdk / python / tests / test_collections.py
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 import ciso8601
6 import copy
7 import datetime
8 import os
9 import random
10 import re
11 import shutil
12 import sys
13 import tempfile
14 import time
15 import unittest
16
17 import parameterized
18 from unittest import mock
19
20 from . import run_test_server
21 from arvados._ranges import Range, LocatorAndRange, locators_and_ranges
22
23 import arvados
24 import arvados.keep
25
26 from arvados.collection import Collection, CollectionReader
27 from arvados._ranges import Range, LocatorAndRange
28
29 from . import arvados_testutil as tutil
30 from . import run_test_server
31
32 @parameterized.parameterized_class([{"disk_cache": True}, {"disk_cache": False}])
33 class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
34                              tutil.ArvadosBaseTestCase):
35     disk_cache = False
36     MAIN_SERVER = {}
37
38     @classmethod
39     def setUpClass(cls):
40         super(ArvadosCollectionsTest, cls).setUpClass()
41         # need admin privileges to make collections with unsigned blocks
42         run_test_server.authorize_with('admin')
43         if cls.disk_cache:
44             cls._disk_cache_dir = tempfile.mkdtemp(prefix='CollectionsTest-')
45         else:
46             cls._disk_cache_dir = None
47         block_cache = arvados.keep.KeepBlockCache(
48             disk_cache=cls.disk_cache,
49             disk_cache_dir=cls._disk_cache_dir,
50         )
51         cls.api_client = arvados.api('v1')
52         cls.keep_client = arvados.KeepClient(api_client=cls.api_client,
53                                              local_store=cls.local_store,
54                                              block_cache=block_cache)
55
56     @classmethod
57     def tearDownClass(cls):
58         if cls._disk_cache_dir:
59             shutil.rmtree(cls._disk_cache_dir)
60
61     def write_foo_bar_baz(self):
62         with arvados.collection.Collection(api_client=self.api_client).open('zzz', 'wb') as f:
63             f.write(b'foobar')
64             f.flush()
65             f.write(b'baz')
66         cw = arvados.collection.Collection(
67             api_client=self.api_client,
68             manifest_locator_or_text=
69             ". 3858f62230ac3c915f300c664312c63f+6 0:3:foo.txt 3:3:bar.txt\n" +
70             "./baz 73feffa4b7f6bb68e44cf984c85f6e88+3 0:3:baz.txt\n")
71         cw.save_new()
72         return cw.portable_data_hash()
73
74     def test_pdh_is_native_str(self):
75         pdh = self.write_foo_bar_baz()
76         self.assertEqual(type(''), type(pdh))
77
78     def test_keep_local_store(self):
79         self.assertEqual(self.keep_client.put(b'foo'), 'acbd18db4cc2f85cedef654fccc4a4d8+3', 'wrong md5 hash from Keep.put')
80         self.assertEqual(self.keep_client.get('acbd18db4cc2f85cedef654fccc4a4d8+3'), b'foo', 'wrong data from Keep.get')
81
82     def test_local_collection_writer(self):
83         self.assertEqual(self.write_foo_bar_baz(),
84                          '23ca013983d6239e98931cc779e68426+114',
85                          'wrong locator hash: ' + self.write_foo_bar_baz())
86
87     def test_collection_empty_file(self):
88         cw = arvados.collection.Collection(api_client=self.api_client)
89         with cw.open('zero.txt', 'wb') as f:
90             pass
91
92         self.assertEqual(cw.manifest_text(), ". d41d8cd98f00b204e9800998ecf8427e+0 0:0:zero.txt\n")
93         self.check_manifest_file_sizes(cw.manifest_text(), [0])
94
95         cw = arvados.collection.Collection(api_client=self.api_client)
96         with cw.open('zero.txt', 'wb') as f:
97             pass
98         with cw.open('one.txt', 'wb') as f:
99             f.write(b'1')
100         with cw.open('foo/zero.txt', 'wb') as f:
101             pass
102         # sorted, that's: [./one.txt, ./zero.txt, foo/zero.txt]
103         self.check_manifest_file_sizes(cw.manifest_text(), [1,0,0])
104
105     def check_manifest_file_sizes(self, manifest_text, expect_sizes):
106         got_sizes = []
107         def walk(subdir):
108             for fnm in subdir:
109                 if isinstance(subdir[fnm], arvados.arvfile.ArvadosFile):
110                     got_sizes.append(subdir[fnm].size())
111                 else:
112                     walk(subdir[fnm])
113         cr = arvados.CollectionReader(manifest_text, self.api_client)
114         walk(cr)
115         self.assertEqual(got_sizes, expect_sizes, "got wrong file sizes %s, expected %s" % (got_sizes, expect_sizes))
116
    def test_normalized_collection(self):
        # Each case loads a manifest into a CollectionReader and compares
        # manifest_text(normalize=True) with the expected normalized text.

        # m1: three stream lines for the same file are expected to collapse
        # into one line listing all three blocks with a single 0:127 segment
        # (43 + 41 + 43 bytes).
        m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt
"""
        self.assertEqual(arvados.CollectionReader(m1, self.api_client).manifest_text(normalize=True),
                         """. 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:127:md5sum.txt
""")

        # m2: one file spanning four blocks in a single stream line is
        # expected to come back unchanged.
        m2 = """. 204e43b8a1185621ca55a94839582e6f+67108864 b9677abbac956bd3e86b1deb28dfac03+67108864 fc15aff2a762b13f521baf042140acec+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:227212247:var-GS000016015-ASM.tsv.bz2
"""
        self.assertEqual(arvados.CollectionReader(m2, self.api_client).manifest_text(normalize=True), m2)

        # m3: like m1, but the first segment starts 3 bytes into its block,
        # so the merged segment is expected to be 3:124 (40 + 41 + 43).
        m3 = """. 5348b82a029fd9e971a811ce1f71360b+43 3:40:md5sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt
"""
        self.assertEqual(arvados.CollectionReader(m3, self.api_client).manifest_text(normalize=True),
                         """. 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 3:124:md5sum.txt
""")

        # m4: "foo/bar" in stream "." and "bar" in stream "./foo" name the
        # same path but live in different blocks; the expected output keeps
        # them as two segments of ./foo/bar and lists streams in sorted order.
        m4 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
./foo 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar
"""
        self.assertEqual(arvados.CollectionReader(m4, self.api_client).manifest_text(normalize=True),
                         """./foo 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
""")

        # m5: same layout as m4, but both pieces of ./foo/bar are adjacent
        # ranges (0:3 and 3:3) of the same block, so a single 0:6 segment is
        # expected.
        m5 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
./foo 204e43b8a1185621ca55a94839582e6f+67108864 3:3:bar
"""
        self.assertEqual(arvados.CollectionReader(m5, self.api_client).manifest_text(normalize=True),
                         """./foo 204e43b8a1185621ca55a94839582e6f+67108864 0:6:bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
""")

        # m6 / m7: manifests read from test data files are expected to be
        # returned unchanged by normalization.
        with self.data_file('1000G_ref_manifest') as f6:
            m6 = f6.read()
            self.assertEqual(arvados.CollectionReader(m6, self.api_client).manifest_text(normalize=True), m6)

        with self.data_file('jlake_manifest') as f7:
            m7 = f7.read()
            self.assertEqual(arvados.CollectionReader(m7, self.api_client).manifest_text(normalize=True), m7)

        # m8: \040 escapes in stream and file names must be preserved.
        m8 = """./a\\040b\\040c 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:hello\\040world.txt
"""
        self.assertEqual(arvados.CollectionReader(m8, self.api_client).manifest_text(normalize=True), m8)
167
    def test_locators_and_ranges(self):
        # Table-style checks of locators_and_ranges(ranges, start, size).
        # Judging by the expected values below, each LocatorAndRange is
        # (locator, size of that range, offset inside it, bytes taken from it)
        # -- TODO confirm against arvados._ranges.

        # Six consecutive 10-byte ranges covering offsets 0-59.
        blocks2 = [Range('a', 0, 10),
                   Range('b', 10, 10),
                   Range('c', 20, 10),
                   Range('d', 30, 10),
                   Range('e', 40, 10),
                   Range('f', 50, 10)]

        # 2-byte reads starting 2 bytes into each range; reads starting past
        # the end (62) or before the start (-2) are expected to yield nothing.
        self.assertEqual(locators_and_ranges(blocks2,  2,  2), [LocatorAndRange('a', 10, 2, 2)])
        self.assertEqual(locators_and_ranges(blocks2, 12, 2), [LocatorAndRange('b', 10, 2, 2)])
        self.assertEqual(locators_and_ranges(blocks2, 22, 2), [LocatorAndRange('c', 10, 2, 2)])
        self.assertEqual(locators_and_ranges(blocks2, 32, 2), [LocatorAndRange('d', 10, 2, 2)])
        self.assertEqual(locators_and_ranges(blocks2, 42, 2), [LocatorAndRange('e', 10, 2, 2)])
        self.assertEqual(locators_and_ranges(blocks2, 52, 2), [LocatorAndRange('f', 10, 2, 2)])
        self.assertEqual(locators_and_ranges(blocks2, 62, 2), [])
        self.assertEqual(locators_and_ranges(blocks2, -2, 2), [])

        # 2-byte reads starting exactly on each range boundary.
        self.assertEqual(locators_and_ranges(blocks2,  0,  2), [LocatorAndRange('a', 10, 0, 2)])
        self.assertEqual(locators_and_ranges(blocks2, 10, 2), [LocatorAndRange('b', 10, 0, 2)])
        self.assertEqual(locators_and_ranges(blocks2, 20, 2), [LocatorAndRange('c', 10, 0, 2)])
        self.assertEqual(locators_and_ranges(blocks2, 30, 2), [LocatorAndRange('d', 10, 0, 2)])
        self.assertEqual(locators_and_ranges(blocks2, 40, 2), [LocatorAndRange('e', 10, 0, 2)])
        self.assertEqual(locators_and_ranges(blocks2, 50, 2), [LocatorAndRange('f', 10, 0, 2)])
        self.assertEqual(locators_and_ranges(blocks2, 60, 2), [])
        # NOTE(review): this repeats the (-2, 2) assertion from the group above.
        self.assertEqual(locators_and_ranges(blocks2, -2, 2), [])

        # 2-byte reads straddling a boundary return one entry per range
        # touched; the read at 59 is cut short at the end of the data.
        self.assertEqual(locators_and_ranges(blocks2,  9,  2), [LocatorAndRange('a', 10, 9, 1), LocatorAndRange('b', 10, 0, 1)])
        self.assertEqual(locators_and_ranges(blocks2, 19, 2), [LocatorAndRange('b', 10, 9, 1), LocatorAndRange('c', 10, 0, 1)])
        self.assertEqual(locators_and_ranges(blocks2, 29, 2), [LocatorAndRange('c', 10, 9, 1), LocatorAndRange('d', 10, 0, 1)])
        self.assertEqual(locators_and_ranges(blocks2, 39, 2), [LocatorAndRange('d', 10, 9, 1), LocatorAndRange('e', 10, 0, 1)])
        self.assertEqual(locators_and_ranges(blocks2, 49, 2), [LocatorAndRange('e', 10, 9, 1), LocatorAndRange('f', 10, 0, 1)])
        self.assertEqual(locators_and_ranges(blocks2, 59, 2), [LocatorAndRange('f', 10, 9, 1)])


        # Same checks with seven ranges instead of six.
        blocks3 = [Range('a', 0, 10),
                  Range('b', 10, 10),
                  Range('c', 20, 10),
                  Range('d', 30, 10),
                  Range('e', 40, 10),
                  Range('f', 50, 10),
                   Range('g', 60, 10)]

        self.assertEqual(locators_and_ranges(blocks3,  2,  2), [LocatorAndRange('a', 10, 2, 2)])
        self.assertEqual(locators_and_ranges(blocks3, 12, 2), [LocatorAndRange('b', 10, 2, 2)])
        self.assertEqual(locators_and_ranges(blocks3, 22, 2), [LocatorAndRange('c', 10, 2, 2)])
        self.assertEqual(locators_and_ranges(blocks3, 32, 2), [LocatorAndRange('d', 10, 2, 2)])
        self.assertEqual(locators_and_ranges(blocks3, 42, 2), [LocatorAndRange('e', 10, 2, 2)])
        self.assertEqual(locators_and_ranges(blocks3, 52, 2), [LocatorAndRange('f', 10, 2, 2)])
        self.assertEqual(locators_and_ranges(blocks3, 62, 2), [LocatorAndRange('g', 10, 2, 2)])


        # Ranges of unequal size: 10, 15 and 5 bytes (30 bytes in total).
        blocks = [Range('a', 0, 10),
                  Range('b', 10, 15),
                  Range('c', 25, 5)]
        # A zero-length read yields nothing.
        self.assertEqual(locators_and_ranges(blocks, 1, 0), [])
        self.assertEqual(locators_and_ranges(blocks, 0, 5), [LocatorAndRange('a', 10, 0, 5)])
        self.assertEqual(locators_and_ranges(blocks, 3, 5), [LocatorAndRange('a', 10, 3, 5)])
        self.assertEqual(locators_and_ranges(blocks, 0, 10), [LocatorAndRange('a', 10, 0, 10)])

        self.assertEqual(locators_and_ranges(blocks, 0, 11), [LocatorAndRange('a', 10, 0, 10),
                                                              LocatorAndRange('b', 15, 0, 1)])
        self.assertEqual(locators_and_ranges(blocks, 1, 11), [LocatorAndRange('a', 10, 1, 9),
                                                              LocatorAndRange('b', 15, 0, 2)])
        self.assertEqual(locators_and_ranges(blocks, 0, 25), [LocatorAndRange('a', 10, 0, 10),
                                                              LocatorAndRange('b', 15, 0, 15)])

        # Reads reaching or running past the end of the data stop at the
        # last byte of the last range.
        self.assertEqual(locators_and_ranges(blocks, 0, 30), [LocatorAndRange('a', 10, 0, 10),
                                                              LocatorAndRange('b', 15, 0, 15),
                                                              LocatorAndRange('c', 5, 0, 5)])
        self.assertEqual(locators_and_ranges(blocks, 1, 30), [LocatorAndRange('a', 10, 1, 9),
                                                              LocatorAndRange('b', 15, 0, 15),
                                                              LocatorAndRange('c', 5, 0, 5)])
        self.assertEqual(locators_and_ranges(blocks, 0, 31), [LocatorAndRange('a', 10, 0, 10),
                                                              LocatorAndRange('b', 15, 0, 15),
                                                              LocatorAndRange('c', 5, 0, 5)])

        # A read that lies entirely inside the middle range.
        self.assertEqual(locators_and_ranges(blocks, 15, 5), [LocatorAndRange('b', 15, 5, 5)])

        self.assertEqual(locators_and_ranges(blocks, 8, 17), [LocatorAndRange('a', 10, 8, 2),
                                                              LocatorAndRange('b', 15, 0, 15)])

        self.assertEqual(locators_and_ranges(blocks, 8, 20), [LocatorAndRange('a', 10, 8, 2),
                                                              LocatorAndRange('b', 15, 0, 15),
                                                              LocatorAndRange('c', 5, 0, 3)])

        self.assertEqual(locators_and_ranges(blocks, 26, 2), [LocatorAndRange('c', 5, 1, 2)])

        # 15-byte reads starting just before, exactly at and just after the
        # start of the middle range.
        self.assertEqual(locators_and_ranges(blocks, 9, 15), [LocatorAndRange('a', 10, 9, 1),
                                                              LocatorAndRange('b', 15, 0, 14)])
        self.assertEqual(locators_and_ranges(blocks, 10, 15), [LocatorAndRange('b', 15, 0, 15)])
        self.assertEqual(locators_and_ranges(blocks, 11, 15), [LocatorAndRange('b', 15, 1, 14),
                                                               LocatorAndRange('c', 5, 0, 1)])
260
261     class MockKeep(object):
262         def __init__(self, content, num_retries=0):
263             self.content = content
264             self.num_prefetch_threads = 1
265
266         def get(self, locator, num_retries=0, prefetch=False):
267             return self.content[locator]
268
    def test_extract_file(self):
        # Five files spread over three blocks: md5sum/md6sum/md7sum each
        # cover one whole block, while md8sum and md9sum are 80-byte ranges
        # of the three blocks listed in a different order.
        m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md6sum.txt
. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md7sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 47:80:md8sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt
"""
        coll = arvados.CollectionReader(m1, self.api_client)
        # The normalized manifest lists each block once and expresses every
        # file as segments relative to that combined block list.
        m2 = coll.manifest_text(normalize=True)
        self.assertEqual(m2,
                         ". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt 43:41:md6sum.txt 84:43:md7sum.txt 6:37:md8sum.txt 84:43:md8sum.txt 83:1:md9sum.txt 0:43:md9sum.txt 84:36:md9sum.txt\n")
        # The manifest of a single file only mentions the blocks that file
        # uses, with segment offsets relative to those blocks.
        self.assertEqual(coll['md5sum.txt'].manifest_text(),
                         ". 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt\n")
        self.assertEqual(coll['md6sum.txt'].manifest_text(),
                         ". 085c37f02916da1cad16f93c54d899b7+41 0:41:md6sum.txt\n")
        self.assertEqual(coll['md7sum.txt'].manifest_text(),
                         ". 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md7sum.txt\n")
        self.assertEqual(coll['md9sum.txt'].manifest_text(),
                         ". 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt\n")
288
289
class CollectionTestMixin(tutil.ApiClientMock):
    """Collection fixtures plus a mocked API client for collection tests."""

    # Fixture records loaded once, when the class body is executed.
    API_COLLECTIONS = run_test_server.fixture('collections')

    # The 'foo_file' fixture and the fields the tests use most.
    DEFAULT_COLLECTION = API_COLLECTIONS['foo_file']
    DEFAULT_DATA_HASH = DEFAULT_COLLECTION['portable_data_hash']
    DEFAULT_MANIFEST = DEFAULT_COLLECTION['manifest_text']
    DEFAULT_UUID = DEFAULT_COLLECTION['uuid']

    # The 'bar_file' fixture, a second collection to contrast with.
    ALT_COLLECTION = API_COLLECTIONS['bar_file']
    ALT_DATA_HASH = ALT_COLLECTION['portable_data_hash']
    ALT_MANIFEST = ALT_COLLECTION['manifest_text']

    def api_client_mock(self, status=200):
        """Return the parent's mocked API client with one mocked Keep proxy.

        ``status`` is handed to mock_keep_services only; it is not forwarded
        to the parent api_client_mock().
        """
        mocked = super().api_client_mock()
        self.mock_keep_services(mocked, status=status, service_type='proxy', count=1)
        return mocked
304
305
306 @tutil.skip_sleep
307 class CollectionReaderTestCase(unittest.TestCase, CollectionTestMixin):
308     def mock_get_collection(self, api_mock, code, fixturename):
309         body = self.API_COLLECTIONS.get(fixturename)
310         self._mock_api_call(api_mock.collections().get, code, body)
311
312     def api_client_mock(self, status=200):
313         client = super(CollectionReaderTestCase, self).api_client_mock()
314         self.mock_get_collection(client, status, 'foo_file')
315         return client
316
317     def test_init_default_retries(self):
318         client = self.api_client_mock(200)
319         reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client)
320         reader.manifest_text()
321         client.collections().get().execute.assert_called_with(num_retries=10)
322
323     def test_uuid_init_success(self):
324         client = self.api_client_mock(200)
325         reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client,
326                                           num_retries=3)
327         self.assertEqual(self.DEFAULT_COLLECTION['manifest_text'],
328                          reader.manifest_text())
329         client.collections().get().execute.assert_called_with(num_retries=3)
330
331     def test_uuid_init_failure_raises_api_error(self):
332         client = self.api_client_mock(500)
333         with self.assertRaises(arvados.errors.ApiError):
334             reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client)
335
336     def test_locator_init(self):
337         client = self.api_client_mock(200)
338         # Ensure Keep will not return anything if asked.
339         with tutil.mock_keep_responses(None, 404):
340             reader = arvados.CollectionReader(self.DEFAULT_DATA_HASH,
341                                               api_client=client)
342             self.assertEqual(self.DEFAULT_MANIFEST, reader.manifest_text())
343
344     def test_init_no_fallback_to_keep(self):
345         # Do not look up a collection UUID or PDH in Keep.
346         for key in [self.DEFAULT_UUID, self.DEFAULT_DATA_HASH]:
347             client = self.api_client_mock(404)
348             with tutil.mock_keep_responses(self.DEFAULT_MANIFEST, 200):
349                 with self.assertRaises(arvados.errors.ApiError):
350                     reader = arvados.CollectionReader(key, api_client=client)
351
352     def test_init_num_retries_propagated(self):
353         # More of an integration test...
354         client = self.api_client_mock(200)
355         reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client,
356                                           num_retries=3)
357         with tutil.mock_keep_responses('foo', 500, 500, 200):
358             self.assertEqual('foo', reader.open('foo', 'r').read())
359
360     def test_read_nonnormalized_manifest_with_collection_reader(self):
361         # client should be able to use CollectionReader on a manifest without normalizing it
362         client = self.api_client_mock(500)
363         nonnormal = ". acbd18db4cc2f85cedef654fccc4a4d8+3+Aabadbadbee@abeebdee 0:3:foo.txt 1:0:bar.txt 0:3:foo.txt\n"
364         reader = arvados.CollectionReader(
365             nonnormal,
366             api_client=client, num_retries=0)
367         # Ensure stripped_manifest() doesn't mangle our manifest in
368         # any way other than stripping hints.
369         self.assertEqual(
370             re.sub(r'\+[^\d\s\+]+', '', nonnormal),
371             reader.stripped_manifest())
372         # Ensure stripped_manifest() didn't mutate our reader.
373         self.assertEqual(nonnormal, reader.manifest_text())
374
375     def test_read_empty_collection(self):
376         client = self.api_client_mock(200)
377         self.mock_get_collection(client, 200, 'empty')
378         reader = arvados.CollectionReader('d41d8cd98f00b204e9800998ecf8427e+0',
379                                           api_client=client)
380         self.assertEqual('', reader.manifest_text())
381         self.assertEqual(0, len(reader))
382         self.assertFalse(reader)
383
384     def test_api_response(self):
385         client = self.api_client_mock()
386         reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client)
387         self.assertEqual(self.DEFAULT_COLLECTION, reader.api_response())
388
389     def check_open_file(self, coll_file, stream_name, file_name, file_size):
390         self.assertFalse(coll_file.closed, "returned file is not open")
391         self.assertEqual(stream_name, coll_file.stream_name())
392         self.assertEqual(file_name, coll_file.name)
393         self.assertEqual(file_size, coll_file.size())
394
395     def test_open_collection_file_one_argument(self):
396         client = self.api_client_mock(200)
397         reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client)
398         cfile = reader.open('./foo', 'rb')
399         self.check_open_file(cfile, '.', 'foo', 3)
400
401     def test_open_deep_file(self):
402         coll_name = 'collection_with_files_in_subdir'
403         client = self.api_client_mock(200)
404         self.mock_get_collection(client, 200, coll_name)
405         reader = arvados.CollectionReader(
406             self.API_COLLECTIONS[coll_name]['uuid'], api_client=client)
407         cfile = reader.open('./subdir2/subdir3/file2_in_subdir3.txt', 'rb')
408         self.check_open_file(cfile, './subdir2/subdir3', 'file2_in_subdir3.txt',
409                              32)
410
411     def test_open_nonexistent_stream(self):
412         client = self.api_client_mock(200)
413         reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client)
414         self.assertRaises(IOError, reader.open, './nonexistent/foo')
415
416     def test_open_nonexistent_file(self):
417         client = self.api_client_mock(200)
418         reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client)
419         self.assertRaises(IOError, reader.open, 'nonexistent')
420
421
class CollectionMethods(run_test_server.TestCaseWithServers):

    def test_keys_values_items_support_indexing(self):
        # keys(), values() and items() must support len(), unpacking and
        # integer indexing.
        coll = Collection()
        for name, payload in (('foo', b'foo'), ('bar', b'bar')):
            with coll.open(name, 'wb') as out:
                out.write(payload)
        self.assertEqual(2, len(coll.keys()))
        first_name, second_name = coll.keys()
        self.assertEqual(2, len(coll.values()))
        # Indexing values() is itself the check; the results are not inspected.
        coll.values()[0]
        coll.values()[1]
        self.assertEqual(2, len(coll.items()))
        self.assertEqual(first_name, coll.items()[0][0])
        self.assertEqual(second_name, coll.items()[1][0])

    def test_get_properties(self):
        # Properties are empty on a new collection and reflect what was
        # passed to save_new() afterwards.
        coll = Collection()
        before_save = coll.get_properties()
        self.assertEqual(before_save, {})
        coll.save_new(properties={"foo": "bar"})
        after_save = coll.get_properties()
        self.assertEqual(after_save, {"foo": "bar"})

    def test_get_trash_at(self):
        # trash_at is None on a new collection; after saving with a datetime
        # it compares equal to the parsed UTC timestamp below.
        coll = Collection()
        self.assertEqual(coll.get_trash_at(), None)
        deadline = datetime.datetime(2111, 1, 1, 11, 11, 11, 111111)
        coll.save_new(trash_at=deadline)
        expected = ciso8601.parse_datetime('2111-01-01T11:11:11.111111000Z')
        self.assertEqual(coll.get_trash_at(), expected)
450
451
class CollectionOpenModes(run_test_server.TestCaseWithServers):

    def test_open_binary_modes(self):
        # Every binary write/append mode must accept bytes.
        coll = Collection()
        for binary_mode in ('wb', 'wb+', 'ab', 'ab+'):
            with coll.open('foo', binary_mode) as handle:
                handle.write(b'foo')

    def test_open_invalid_modes(self):
        # Malformed mode strings (and None) must be rejected with an exception.
        coll = Collection()
        for bad_mode in ('+r', 'aa', '++', 'r+b', 'beer', '', None):
            self.assertRaises(Exception, coll.open, 'foo', bad_mode)

    def test_open_text_modes(self):
        # Seed the file, then reopen it in each text mode: plain read modes
        # must see the current content, every other mode must be able to
        # write 'bar' and read it back from the start.
        coll = Collection()
        # NOTE(review): a str is written through a 'wb' handle here -- confirm
        # that this is intentional.
        with coll.open('foo', 'wb') as seed:
            seed.write('foo')
        for text_mode in ('r', 'rt', 'r+', 'rt+', 'w', 'wt', 'a', 'at'):
            with coll.open('foo', text_mode) as handle:
                if '+' in text_mode or text_mode[0] != 'r':
                    handle.write('bar')
                    handle.seek(0, os.SEEK_SET)
                    self.assertEqual('bar', handle.read(3))
                else:
                    self.assertEqual('foo', handle.read(3))
478
479
class TextModes(run_test_server.TestCaseWithServers):
    # Text-mode reads and writes of multi-byte characters with a tiny Keep
    # block size, so that encoded characters straddle block boundaries.

    def setUp(self):
        # Remember the configured block size so tearDown restores exactly
        # that value instead of assuming what the default is.
        self._orig_block_size = arvados.config.KEEP_BLOCK_SIZE
        arvados.config.KEEP_BLOCK_SIZE = 4
        self.sailboat = '\N{SAILBOAT}'
        self.snowman = '\N{SNOWMAN}'

    def tearDown(self):
        arvados.config.KEEP_BLOCK_SIZE = self._orig_block_size

    def test_read_sailboat_across_block_boundary(self):
        c = Collection()
        data = self.sailboat.encode('utf-8')
        # Context managers close the file even when an assertion fails.
        with c.open('sailboats', 'wb') as f:
            f.write(data)
            # Write the second sailboat in two pieces, split inside the
            # encoded character.
            f.write(data[:1])
            f.write(data[1:])
            f.write(b'\n')
        # Expect a 4-byte block followed by a 3-byte block.
        self.assertRegex(c.portable_manifest_text(), r'\+4 .*\+3 ')

        with c.open('sailboats', 'r') as f:
            string = f.readline()
            self.assertEqual(string, self.sailboat+self.sailboat+'\n')

    def test_write_snowman_across_block_boundary(self):
        c = Collection()
        data = self.snowman
        with c.open('snowmany', 'w') as f:
            f.write(data+data+'\n'+data+'\n')
        # Expect two 4-byte blocks followed by a 3-byte block.
        self.assertRegex(c.portable_manifest_text(), r'\+4 .*\+4 .*\+3 ')

        with c.open('snowmany', 'r') as f:
            self.assertEqual(f.readline(), self.snowman+self.snowman+'\n')
            self.assertEqual(f.readline(), self.snowman+'\n')
518
519
520 class NewCollectionTestCase(unittest.TestCase, CollectionTestMixin):
521
522     def test_replication_desired_kept_on_load(self):
523         m = '. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n'
524         c1 = Collection(m, replication_desired=1)
525         c1.save_new()
526         loc = c1.manifest_locator()
527         c2 = Collection(loc)
528         self.assertEqual(c1.manifest_text(strip=True), c2.manifest_text(strip=True))
529         self.assertEqual(c1.replication_desired, c2.replication_desired)
530
531     def test_replication_desired_not_loaded_if_provided(self):
532         m = '. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n'
533         c1 = Collection(m, replication_desired=1)
534         c1.save_new()
535         loc = c1.manifest_locator()
536         c2 = Collection(loc, replication_desired=2)
537         self.assertEqual(c1.manifest_text(strip=True), c2.manifest_text(strip=True))
538         self.assertNotEqual(c1.replication_desired, c2.replication_desired)
539
540     def test_storage_classes_desired_kept_on_load(self):
541         m = '. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n'
542         c1 = Collection(m, storage_classes_desired=['archival'])
543         c1.save_new()
544         loc = c1.manifest_locator()
545         c2 = Collection(loc)
546         self.assertEqual(c1.manifest_text(strip=True), c2.manifest_text(strip=True))
547         self.assertEqual(c1.storage_classes_desired(), c2.storage_classes_desired())
548
549     def test_storage_classes_change_after_save(self):
550         m = '. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n'
551         c1 = Collection(m, storage_classes_desired=['archival'])
552         c1.save_new()
553         loc = c1.manifest_locator()
554         c2 = Collection(loc)
555         self.assertEqual(['archival'], c2.storage_classes_desired())
556         c2.save(storage_classes=['highIO'])
557         self.assertEqual(['highIO'], c2.storage_classes_desired())
558         c3 = Collection(loc)
559         self.assertEqual(c1.manifest_text(strip=True), c3.manifest_text(strip=True))
560         self.assertEqual(['highIO'], c3.storage_classes_desired())
561
562     def test_storage_classes_desired_not_loaded_if_provided(self):
563         m = '. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n'
564         c1 = Collection(m, storage_classes_desired=['archival'])
565         c1.save_new()
566         loc = c1.manifest_locator()
567         c2 = Collection(loc, storage_classes_desired=['default'])
568         self.assertEqual(c1.manifest_text(strip=True), c2.manifest_text(strip=True))
569         self.assertNotEqual(c1.storage_classes_desired(), c2.storage_classes_desired())
570
571     def test_init_manifest(self):
572         m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt
573 . 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
574 . 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt
575 """
576         self.assertEqual(m1, CollectionReader(m1).manifest_text(normalize=False))
577         self.assertEqual(". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:127:md5sum.txt\n", CollectionReader(m1).manifest_text(normalize=True))
578
579     def test_init_manifest_with_collision(self):
580         m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt
581 ./md5sum.txt 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
582 """
583         with self.assertRaises(arvados.errors.ArgumentError):
584             self.assertEqual(m1, CollectionReader(m1))
585
586     def test_init_manifest_with_error(self):
587         m1 = """. 0:43:md5sum.txt"""
588         with self.assertRaises(arvados.errors.ArgumentError):
589             self.assertEqual(m1, CollectionReader(m1))
590
591     def test_remove(self):
592         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n')
593         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n", c.portable_manifest_text())
594         self.assertIn("count1.txt", c)
595         c.remove("count1.txt")
596         self.assertNotIn("count1.txt", c)
597         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n", c.portable_manifest_text())
598         with self.assertRaises(arvados.errors.ArgumentError):
599             c.remove("")
600
601     def test_remove_recursive(self):
602         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:a/b/c/d/efg.txt 0:10:xyz.txt\n')
603         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:xyz.txt\n./a/b/c/d 781e5e245d69b566979b86e28d23f2c7+10 0:10:efg.txt\n", c.portable_manifest_text())
604         self.assertIn("a", c)
605         self.assertEqual(1, len(c["a"].keys()))
606         # cannot remove non-empty directory with default recursive=False
607         with self.assertRaises(OSError):
608             c.remove("a/b")
609         with self.assertRaises(OSError):
610             c.remove("a/b/c/d")
611         c.remove("a/b", recursive=True)
612         self.assertEqual(0, len(c["a"].keys()))
613         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:xyz.txt\n./a d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\056\n", c.portable_manifest_text())
614
615     def test_find(self):
616         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n')
617         self.assertIs(c.find("."), c)
618         self.assertIs(c.find("./count1.txt"), c["count1.txt"])
619         self.assertIs(c.find("count1.txt"), c["count1.txt"])
620         with self.assertRaises(IOError):
621             c.find("/.")
622         with self.assertRaises(arvados.errors.ArgumentError):
623             c.find("")
624         self.assertIs(c.find("./nonexistant.txt"), None)
625         self.assertIs(c.find("./nonexistantsubdir/nonexistant.txt"), None)
626
627     def test_escaped_paths_dont_get_unescaped_on_manifest(self):
628         # Dir & file names are literally '\056' (escaped form: \134056)
629         manifest = './\\134056\\040Test d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\134056\n'
630         c = Collection(manifest)
631         self.assertEqual(c.portable_manifest_text(), manifest)
632
633     def test_other_special_chars_on_file_token(self):
634         cases = [
635             ('\\000', '\0'),
636             ('\\011', '\t'),
637             ('\\012', '\n'),
638             ('\\072', ':'),
639             ('\\134400', '\\400'),
640         ]
641         for encoded, decoded in cases:
642             manifest = '. d41d8cd98f00b204e9800998ecf8427e+0 0:0:some%sfile.txt\n' % encoded
643             c = Collection(manifest)
644             self.assertEqual(c.portable_manifest_text(), manifest)
645             self.assertIn('some%sfile.txt' % decoded, c.keys())
646
647     def test_escaped_paths_do_get_unescaped_on_listing(self):
648         # Dir & file names are literally '\056' (escaped form: \134056)
649         manifest = './\\134056\\040Test d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\134056\n'
650         c = Collection(manifest)
651         self.assertIn('\\056 Test', c.keys())
652         self.assertIn('\\056', c['\\056 Test'].keys())
653
654     def test_make_empty_dir_with_escaped_chars(self):
655         c = Collection()
656         c.mkdirs('./Empty\\056Dir')
657         self.assertEqual(c.portable_manifest_text(),
658                          './Empty\\134056Dir d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\056\n')
659
660     def test_make_empty_dir_with_spaces(self):
661         c = Collection()
662         c.mkdirs('./foo bar/baz waz')
663         self.assertEqual(c.portable_manifest_text(),
664                          './foo\\040bar/baz\\040waz d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\056\n')
665
666     def test_remove_in_subdir(self):
667         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n')
668         c.remove("foo/count2.txt")
669         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\056\n", c.portable_manifest_text())
670
671     def test_remove_empty_subdir(self):
672         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n')
673         c.remove("foo/count2.txt")
674         c.remove("foo")
675         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n", c.portable_manifest_text())
676
677     def test_remove_nonempty_subdir(self):
678         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n')
679         with self.assertRaises(IOError):
680             c.remove("foo")
681         c.remove("foo", recursive=True)
682         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n", c.portable_manifest_text())
683
684     def test_copy_to_file_in_dir(self):
685         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
686         c.copy("count1.txt", "foo/count2.txt")
687         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n", c.portable_manifest_text())
688
689     def test_copy_file(self):
690         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
691         c.copy("count1.txt", "count2.txt")
692         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n", c.portable_manifest_text())
693
694     def test_copy_to_existing_dir(self):
695         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n')
696         c.copy("count1.txt", "foo")
697         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:10:count2.txt\n", c.portable_manifest_text())
698
699     def test_copy_to_new_dir(self):
700         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
701         c.copy("count1.txt", "foo/")
702         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n", c.portable_manifest_text())
703
704     def test_rename_file(self):
705         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
706         c.rename("count1.txt", "count2.txt")
707         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n", c.manifest_text())
708
709     def test_move_file_to_dir(self):
710         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
711         c.mkdirs("foo")
712         c.rename("count1.txt", "foo/count2.txt")
713         self.assertEqual("./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n", c.manifest_text())
714
715     def test_move_file_to_other(self):
716         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
717         c2 = Collection()
718         c2.rename("count1.txt", "count2.txt", source_collection=c1)
719         self.assertEqual("", c1.manifest_text())
720         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n", c2.manifest_text())
721
722     def test_clone(self):
723         c = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n')
724         cl = c.clone()
725         self.assertEqual(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n", cl.portable_manifest_text())
726
727     def test_diff_del_add(self):
728         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
729         c2 = Collection('. 5348b82a029fd9e971a811ce1f71360b+43 0:10:count2.txt\n')
730         d = c2.diff(c1)
731         self.assertEqual(sorted(d), [
732             ('add', './count1.txt', c1["count1.txt"]),
733             ('del', './count2.txt', c2["count2.txt"]),
734         ])
735         d = c1.diff(c2)
736         self.assertEqual(sorted(d), [
737             ('add', './count2.txt', c2["count2.txt"]),
738             ('del', './count1.txt', c1["count1.txt"]),
739         ])
740         self.assertNotEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
741         c1.apply(d)
742         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
743
744     def test_diff_same(self):
745         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
746         c2 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
747         d = c2.diff(c1)
748         self.assertEqual(d, [('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"])])
749         d = c1.diff(c2)
750         self.assertEqual(d, [('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"])])
751
752         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
753         c1.apply(d)
754         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
755
756     def test_diff_mod(self):
757         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
758         c2 = Collection('. 5348b82a029fd9e971a811ce1f71360b+43 0:10:count1.txt\n')
759         d = c2.diff(c1)
760         self.assertEqual(d, [('mod', './count1.txt', c2["count1.txt"], c1["count1.txt"])])
761         d = c1.diff(c2)
762         self.assertEqual(d, [('mod', './count1.txt', c1["count1.txt"], c2["count1.txt"])])
763
764         self.assertNotEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
765         c1.apply(d)
766         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
767
768     def test_diff_add(self):
769         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
770         c2 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 5348b82a029fd9e971a811ce1f71360b+43 0:10:count1.txt 10:20:count2.txt\n')
771         d = c2.diff(c1)
772         self.assertEqual(sorted(d), [
773             ('del', './count2.txt', c2["count2.txt"]),
774             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
775         ])
776         d = c1.diff(c2)
777         self.assertEqual(sorted(d), [
778             ('add', './count2.txt', c2["count2.txt"]),
779             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
780         ])
781
782         self.assertNotEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
783         c1.apply(d)
784         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
785
786     def test_diff_add_in_subcollection(self):
787         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
788         c2 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 5348b82a029fd9e971a811ce1f71360b+43 0:10:count2.txt\n')
789         d = c2.diff(c1)
790         self.assertEqual(sorted(d), [
791             ('del', './foo', c2["foo"]),
792             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
793         ])
794         d = c1.diff(c2)
795         self.assertEqual(sorted(d), [
796             ('add', './foo', c2["foo"]),
797             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
798         ])
799         self.assertNotEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
800         c1.apply(d)
801         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
802
803     def test_diff_del_add_in_subcollection(self):
804         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 5348b82a029fd9e971a811ce1f71360b+43 0:10:count2.txt\n')
805         c2 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 5348b82a029fd9e971a811ce1f71360b+43 0:3:count3.txt\n')
806         d = c2.diff(c1)
807         self.assertEqual(sorted(d), [
808             ('add', './foo/count2.txt', c1.find("foo/count2.txt")),
809             ('del', './foo/count3.txt', c2.find("foo/count3.txt")),
810             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
811         ])
812         d = c1.diff(c2)
813         self.assertEqual(sorted(d), [
814             ('add', './foo/count3.txt', c2.find("foo/count3.txt")),
815             ('del', './foo/count2.txt', c1.find("foo/count2.txt")),
816             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
817         ])
818
819         self.assertNotEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
820         c1.apply(d)
821         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
822
823     def test_diff_mod_in_subcollection(self):
824         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n./foo 5348b82a029fd9e971a811ce1f71360b+43 0:10:count2.txt\n')
825         c2 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt 0:3:foo\n')
826         d = c2.diff(c1)
827         self.assertEqual(sorted(d), [
828             ('mod', './foo', c2["foo"], c1["foo"]),
829             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
830         ])
831         d = c1.diff(c2)
832         self.assertEqual(sorted(d), [
833             ('mod', './foo', c1["foo"], c2["foo"]),
834             ('tok', './count1.txt', c2["count1.txt"], c1["count1.txt"]),
835         ])
836
837         self.assertNotEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
838         c1.apply(d)
839         self.assertEqual(c1.portable_manifest_text(), c2.portable_manifest_text())
840
841     def test_conflict_keep_local_change(self):
842         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n')
843         c2 = Collection('. 5348b82a029fd9e971a811ce1f71360b+43 0:10:count2.txt\n')
844         d = c1.diff(c2)
845         self.assertEqual(sorted(d), [
846             ('add', './count2.txt', c2["count2.txt"]),
847             ('del', './count1.txt', c1["count1.txt"]),
848         ])
849         f = c1.open("count1.txt", "wb")
850         f.write(b"zzzzz")
851
852         # c1 changed, so it should not be deleted.
853         c1.apply(d)
854         self.assertEqual(c1.portable_manifest_text(), ". 95ebc3c7b3b9f1d2c40fec14415d3cb8+5 5348b82a029fd9e971a811ce1f71360b+43 0:5:count1.txt 5:10:count2.txt\n")
855
856     def test_conflict_mod(self):
857         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt')
858         c2 = Collection('. 5348b82a029fd9e971a811ce1f71360b+43 0:10:count1.txt')
859         d = c1.diff(c2)
860         self.assertEqual(d, [('mod', './count1.txt', c1["count1.txt"], c2["count1.txt"])])
861         f = c1.open("count1.txt", "wb")
862         f.write(b"zzzzz")
863
864         # c1 changed, so c2 mod will go to a conflict file
865         c1.apply(d)
866         self.assertRegex(
867             c1.portable_manifest_text(),
868             r"\. 95ebc3c7b3b9f1d2c40fec14415d3cb8\+5 5348b82a029fd9e971a811ce1f71360b\+43 0:5:count1\.txt 5:10:count1\.txt~\d\d\d\d\d\d\d\d-\d\d\d\d\d\d~conflict~$")
869
870     def test_conflict_add(self):
871         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count2.txt\n')
872         c2 = Collection('. 5348b82a029fd9e971a811ce1f71360b+43 0:10:count1.txt\n')
873         d = c1.diff(c2)
874         self.assertEqual(sorted(d), [
875             ('add', './count1.txt', c2["count1.txt"]),
876             ('del', './count2.txt', c1["count2.txt"]),
877         ])
878         f = c1.open("count1.txt", "wb")
879         f.write(b"zzzzz")
880
881         # c1 added count1.txt, so c2 add will go to a conflict file
882         c1.apply(d)
883         self.assertRegex(
884             c1.portable_manifest_text(),
885             r"\. 95ebc3c7b3b9f1d2c40fec14415d3cb8\+5 5348b82a029fd9e971a811ce1f71360b\+43 0:5:count1\.txt 5:10:count1\.txt~\d\d\d\d\d\d\d\d-\d\d\d\d\d\d~conflict~$")
886
887     def test_conflict_del(self):
888         c1 = Collection('. 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt')
889         c2 = Collection('. 5348b82a029fd9e971a811ce1f71360b+43 0:10:count1.txt')
890         d = c1.diff(c2)
891         self.assertEqual(d, [('mod', './count1.txt', c1["count1.txt"], c2["count1.txt"])])
892         c1.remove("count1.txt")
893
894         # c1 deleted, so c2 mod will go to a conflict file
895         c1.apply(d)
896         self.assertRegex(
897             c1.portable_manifest_text(),
898             r"\. 5348b82a029fd9e971a811ce1f71360b\+43 0:10:count1\.txt~\d\d\d\d\d\d\d\d-\d\d\d\d\d\d~conflict~$")
899
900     def test_notify(self):
901         c1 = Collection()
902         events = []
903         c1.subscribe(lambda event, collection, name, item: events.append((event, collection, name, item)))
904         f = c1.open("foo.txt", "wb")
905         self.assertEqual(events[0], (arvados.collection.ADD, c1, "foo.txt", f.arvadosfile))
906
907     def test_open_w(self):
908         c1 = Collection(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count1.txt\n")
909         self.assertEqual(c1["count1.txt"].size(), 10)
910         c1.open("count1.txt", "wb").close()
911         self.assertEqual(c1["count1.txt"].size(), 0)
912
913
class NewCollectionTestCaseWithServersAndTokens(run_test_server.TestCaseWithServers):
    """Check which kind of block locators end up in collection manifests.

    The tests count signed local locators and remote locators in the
    manifest text, using the two regular expressions defined below.
    """
    MAIN_SERVER = {}
    KEEP_SERVER = {}
    # Local signed locator: "<32 hex>+<size>+A<40 hex>@<8 hex>".
    local_locator_re = r"[0-9a-f]{32}\+\d+\+A[a-f0-9]{40}@[a-f0-9]{8}"
    # Remote locator: "<32 hex>+<size>+R<5 lowercase letters>-<40 hex>@<8 hex>".
    # The "+Remote-..." locators built in the tests below match this pattern.
    remote_locator_re = r"[0-9a-f]{32}\+\d+\+R[a-z]{5}-[a-f0-9]{40}@[a-f0-9]{8}"

    def setUp(self):
        """Keep a reference to the real ``KeepClient.put``.

        Tests that patch ``KeepClient.put`` forward calls to this reference
        through the mock's ``side_effect``.
        """
        super().setUp()
        self.keep_put = arvados.keep.KeepClient.put

    @mock.patch('arvados.keep.KeepClient.put', autospec=True)
    def test_storage_classes_desired(self, put_mock):
        """Storage classes set on the collection are passed to ``KeepClient.put``."""
        put_mock.side_effect = self.keep_put
        c = Collection(storage_classes_desired=['default'])
        with c.open("file.txt", 'wb') as f:
            # The file is opened in binary mode, so write bytes.
            f.write(b'content')
        c.save_new()
        # Inspect the keyword arguments of the most recent put() call.
        _, kwargs = put_mock.call_args
        self.assertEqual(['default'], kwargs['classes'])

    @mock.patch('arvados.keep.KeepClient.put', autospec=True)
    def test_repacked_block_submission_get_permission_token(self, mocked_put):
        '''
        Make sure that those blocks that are committed after repacking small ones,
        get their permission tokens assigned on the collection manifest.
        '''
        def wrapped_keep_put(*args, **kwargs):
            # Simulate slow put operations
            time.sleep(1)
            return self.keep_put(*args, **kwargs)

        mocked_put.side_effect = wrapped_keep_put
        c = Collection()
        # Write 70 files ~1MiB each so we force to produce 1 big block by repacking
        # small ones before finishing the upload.
        for i in range(70):
            f = c.open("file_{}.txt".format(i), 'wb')
            # The file is opened in binary mode, so write bytes.
            letter = random.choice('abcdefghijklmnopqrstuvwxyz').encode()
            f.write(letter * (2**20+i))
            f.close(flush=False)
        # We should get 2 blocks with their tokens
        self.assertEqual(len(re.findall(self.local_locator_re, c.manifest_text())), 2)

    @mock.patch('arvados.keep.KeepClient.refresh_signature')
    def test_copy_remote_blocks_on_save_new(self, rs_mock):
        """``save_new`` replaces a remote block locator with a local one.

        ``KeepClient.refresh_signature`` is mocked to return the local locator.
        """
        remote_block_loc = "acbd18db4cc2f85cedef654fccc4a4d8+3+Remote-" + "a" * 40 + "@abcdef01"
        local_block_loc = "acbd18db4cc2f85cedef654fccc4a4d8+3+A" + "b" * 40 + "@abcdef01"
        rs_mock.return_value = local_block_loc
        c = Collection(". " + remote_block_loc + " 0:3:foofile.txt\n")
        # Before saving: one remote locator, no local ones.
        self.assertEqual(
            len(re.findall(self.remote_locator_re, c.manifest_text())), 1)
        self.assertEqual(
            len(re.findall(self.local_locator_re, c.manifest_text())), 0)
        c.save_new()
        rs_mock.assert_called()
        # After saving: the remote locator has been replaced by a local one.
        self.assertEqual(
            len(re.findall(self.remote_locator_re, c.manifest_text())), 0)
        self.assertEqual(
            len(re.findall(self.local_locator_re, c.manifest_text())), 1)

    @mock.patch('arvados.keep.KeepClient.refresh_signature')
    def test_copy_remote_blocks_on_save(self, rs_mock):
        """``save`` replaces a remote locator copied in from another collection.

        ``KeepClient.refresh_signature`` is mocked to return the local locator.
        """
        remote_block_loc = "acbd18db4cc2f85cedef654fccc4a4d8+3+Remote-" + "a" * 40 + "@abcdef01"
        local_block_loc = "acbd18db4cc2f85cedef654fccc4a4d8+3+A" + "b" * 40 + "@abcdef01"
        rs_mock.return_value = local_block_loc
        # Remote collection
        remote_c = Collection(". " + remote_block_loc + " 0:3:foofile.txt\n")
        self.assertEqual(
            len(re.findall(self.remote_locator_re, remote_c.manifest_text())), 1)
        # Local collection
        local_c = Collection()
        with local_c.open('barfile.txt', 'wb') as f:
            # The file is opened in binary mode, so write bytes.
            f.write(b'bar')
        local_c.save_new()
        self.assertEqual(
            len(re.findall(self.local_locator_re, local_c.manifest_text())), 1)
        self.assertEqual(
            len(re.findall(self.remote_locator_re, local_c.manifest_text())), 0)
        # Copy remote file to local collection
        local_c.copy('./foofile.txt', './copied/foofile.txt', remote_c)
        self.assertEqual(
            len(re.findall(self.local_locator_re, local_c.manifest_text())), 1)
        self.assertEqual(
            len(re.findall(self.remote_locator_re, local_c.manifest_text())), 1)
        # Save local collection: remote block should be copied
        local_c.save()
        rs_mock.assert_called()
        self.assertEqual(
            len(re.findall(self.local_locator_re, local_c.manifest_text())), 2)
        self.assertEqual(
            len(re.findall(self.remote_locator_re, local_c.manifest_text())), 0)
1003
1004
class NewCollectionTestCaseWithServers(run_test_server.TestCaseWithServers):
    """Tests for version preservation and for packing of small writes.

    Most tests here compare the manifest text produced after a specific
    sequence of writes, closes and flushes, so statement order matters.
    """

    def test_preserve_version_on_save(self):
        """Check ``version`` and ``preserve_version`` after each save.

        The API record is re-fetched after every save; the version goes
        1, 2, 3 and ``preserve_version`` matches the flag passed to that save.
        """
        c = Collection()
        c.save_new(preserve_version=True)
        coll_record = arvados.api().collections().get(uuid=c.manifest_locator()).execute()
        self.assertEqual(coll_record['version'], 1)
        self.assertEqual(coll_record['preserve_version'], True)
        with c.open("foo.txt", "wb") as foo:
            foo.write(b"foo")
        c.save(preserve_version=True)
        coll_record = arvados.api().collections().get(uuid=c.manifest_locator()).execute()
        self.assertEqual(coll_record['version'], 2)
        self.assertEqual(coll_record['preserve_version'], True)
        # NOTE: the handle below is still named "foo" although it writes bar.txt.
        with c.open("bar.txt", "wb") as foo:
            foo.write(b"bar")
        c.save(preserve_version=False)
        coll_record = arvados.api().collections().get(uuid=c.manifest_locator()).execute()
        self.assertEqual(coll_record['version'], 3)
        self.assertEqual(coll_record['preserve_version'], False)

    def test_get_manifest_text_only_committed(self):
        """``only_committed=True`` leaves out data that was not flushed yet."""
        c = Collection()
        with c.open("count.txt", "wb") as f:
            # One file committed
            with c.open("foo.txt", "wb") as foo:
                foo.write(b"foo")
                foo.flush() # Force block commit
            f.write(b"0123456789")
            # Other file not committed. Block not written to keep yet.
            # count.txt therefore shows up with zero length (0:0).
            self.assertEqual(
                c._get_manifest_text(".",
                                     strip=False,
                                     normalize=False,
                                     only_committed=True),
                '. acbd18db4cc2f85cedef654fccc4a4d8+3 0:0:count.txt 0:3:foo.txt\n')
            # And now with the file closed...
            f.flush() # Force block commit
        # Both blocks are committed now, so both files have their full size.
        self.assertEqual(
            c._get_manifest_text(".",
                                 strip=False,
                                 normalize=False,
                                 only_committed=True),
            ". 781e5e245d69b566979b86e28d23f2c7+10 acbd18db4cc2f85cedef654fccc4a4d8+3 0:10:count.txt 10:3:foo.txt\n")

    def test_only_small_blocks_are_packed_together(self):
        """Two small files share one 13-byte block; the big file has its own."""
        c = Collection()
        # Write a couple of small files,
        f = c.open("count.txt", "wb")
        f.write(b"0123456789")
        f.close(flush=False)
        foo = c.open("foo.txt", "wb")
        foo.write(b"foo")
        foo.close(flush=False)
        # Then, write a big file, it shouldn't be packed with the ones above
        big = c.open("bigfile.txt", "wb")
        big.write(b"x" * 1024 * 1024 * 33) # 33 MB > KEEP_BLOCK_SIZE/2
        big.close(flush=False)
        self.assertEqual(
            c.manifest_text("."),
            '. 2d303c138c118af809f39319e5d507e9+34603008 a8430a058b8fbf408e1931b794dbd6fb+13 0:34603008:bigfile.txt 34603008:10:count.txt 34603018:3:foo.txt\n')

    def test_flush_after_small_block_packing(self):
        """Re-opening and flushing a packed file leaves the manifest unchanged."""
        c = Collection()
        # Write a couple of small files,
        f = c.open("count.txt", "wb")
        f.write(b"0123456789")
        f.close(flush=False)
        foo = c.open("foo.txt", "wb")
        foo.write(b"foo")
        foo.close(flush=False)

        # Both files are reported in a single 13-byte block.
        self.assertEqual(
            c.manifest_text(),
            '. a8430a058b8fbf408e1931b794dbd6fb+13 0:10:count.txt 10:3:foo.txt\n')

        # Open for update without writing anything, then close with a flush.
        f = c.open("count.txt", "rb+")
        f.close(flush=True)

        self.assertEqual(
            c.manifest_text(),
            '. a8430a058b8fbf408e1931b794dbd6fb+13 0:10:count.txt 10:3:foo.txt\n')

    def test_write_after_small_block_packing2(self):
        """Overwriting the start of a packed file adds a new block for the new bytes."""
        c = Collection()
        # Write a couple of small files,
        f = c.open("count.txt", "wb")
        f.write(b"0123456789")
        f.close(flush=False)
        foo = c.open("foo.txt", "wb")
        foo.write(b"foo")
        foo.close(flush=False)

        # Both files are reported in a single 13-byte block.
        self.assertEqual(
            c.manifest_text(),
            '. a8430a058b8fbf408e1931b794dbd6fb+13 0:10:count.txt 10:3:foo.txt\n')

        # Overwrite the first three bytes of count.txt.
        f = c.open("count.txt", "rb+")
        f.write(b"abc")
        f.close(flush=False)

        # count.txt is now 3 bytes from the new block followed by the last
        # 7 bytes of its original data in the packed block.
        self.assertEqual(
            c.manifest_text(),
            '. 900150983cd24fb0d6963f7d28e17f72+3 a8430a058b8fbf408e1931b794dbd6fb+13 0:3:count.txt 6:7:count.txt 13:3:foo.txt\n')


    def test_small_block_packing_with_overwrite(self):
        """Two 2-byte files, one of them overwritten, end up in one 4-byte block."""
        c = Collection()
        c.open("b1", "wb").close()
        c["b1"].writeto(0, b"b1", 0)

        c.open("b2", "wb").close()
        c["b2"].writeto(0, b"b2", 0)

        # Overwrite b1 in place before any manifest is requested.
        c["b1"].writeto(0, b"1b", 0)

        self.assertEqual(c.manifest_text(), ". ed4f3f67c70b02b29c50ce1ea26666bd+4 0:2:b1 2:2:b2\n")
        self.assertEqual(c["b1"].manifest_text(), ". ed4f3f67c70b02b29c50ce1ea26666bd+4 0:2:b1\n")
        self.assertEqual(c["b2"].manifest_text(), ". ed4f3f67c70b02b29c50ce1ea26666bd+4 2:2:b2\n")
1123
1124
class CollectionCreateUpdateTest(run_test_server.TestCaseWithServers):
    """Create, save, diff and update collections through the API."""
    MAIN_SERVER = {}
    KEEP_SERVER = {}

    def create_count_txt(self):
        """Return a saved empty collection with an unsaved ``count.txt`` added.

        The collection is first saved empty, then a 10-byte ``count.txt`` is
        written to it without saving again.
        """
        empty_pdh = "d41d8cd98f00b204e9800998ecf8427e+0"
        coll = Collection()
        coll.save_new("CollectionCreateUpdateTest", ensure_unique_name=True)
        self.assertEqual(coll.portable_data_hash(), empty_pdh)
        self.assertEqual(coll.api_response()["portable_data_hash"], empty_pdh)

        with coll.open("count.txt", "wb") as writer:
            writer.write(b"0123456789")

        self.assertEqual(
            coll.portable_manifest_text(),
            ". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count.txt\n")
        return coll

    def test_create_and_save(self):
        """``save`` stores properties, storage classes and ``trash_at``."""
        coll = self.create_count_txt()
        coll.save(
            properties={'type': 'Intermediate'},
            storage_classes=['archive'],
            trash_at=datetime.datetime(2111, 1, 1, 11, 11, 11, 111111),
        )

        # The saved manifest carries a signed locator for the single block.
        signed_manifest_re = r"^\. 781e5e245d69b566979b86e28d23f2c7\+10\+A[a-f0-9]{40}@[a-f0-9]{8} 0:10:count\.txt$"
        self.assertRegex(coll.manifest_text(), signed_manifest_re)
        self.assertEqual(coll.api_response()["storage_classes_desired"], ['archive'])
        self.assertEqual(coll.api_response()["properties"], {'type': 'Intermediate'})
        self.assertEqual(coll.api_response()["trash_at"], '2111-01-01T11:11:11.111111000Z')

    def test_create_and_save_new(self):
        """``save_new`` stores properties, storage classes and ``trash_at``."""
        coll = self.create_count_txt()
        coll.save_new(
            properties={'type': 'Intermediate'},
            storage_classes=['archive'],
            trash_at=datetime.datetime(2111, 1, 1, 11, 11, 11, 111111),
        )

        # The saved manifest carries a signed locator for the single block.
        signed_manifest_re = r"^\. 781e5e245d69b566979b86e28d23f2c7\+10\+A[a-f0-9]{40}@[a-f0-9]{8} 0:10:count\.txt$"
        self.assertRegex(coll.manifest_text(), signed_manifest_re)
        self.assertEqual(coll.api_response()["storage_classes_desired"], ['archive'])
        self.assertEqual(coll.api_response()["properties"], {'type': 'Intermediate'})
        self.assertEqual(coll.api_response()["trash_at"], '2111-01-01T11:11:11.111111000Z')

    def test_create_and_save_after_commiting(self):
        """A second ``save`` overrides the metadata set by the first one."""
        coll = self.create_count_txt()
        coll.save(
            properties={'type': 'Intermediate'},
            storage_classes=['hot'],
            trash_at=datetime.datetime(2111, 1, 1, 11, 11, 11, 111111),
        )
        coll.save(
            properties={'type': 'Output'},
            storage_classes=['cold'],
            trash_at=datetime.datetime(2222, 2, 2, 22, 22, 22, 222222),
        )

        self.assertEqual(coll.api_response()["storage_classes_desired"], ['cold'])
        self.assertEqual(coll.api_response()["properties"], {'type': 'Output'})
        self.assertEqual(coll.api_response()["trash_at"], '2222-02-02T22:22:22.222222000Z')

    def test_create_diff_apply(self):
        """Diff a saved collection against a modified copy and apply the result."""
        saved = self.create_count_txt()
        saved.save()

        # Load the same collection again and change count.txt in the copy.
        modified = Collection(saved.manifest_locator())
        with modified.open("count.txt", "wb") as writer:
            writer.write(b"abcdefg")

        changes = saved.diff(modified)
        expected_first = (
            arvados.collection.MOD,
            './count.txt',
            saved["count.txt"],
            modified["count.txt"],
        )
        self.assertEqual(changes[0], expected_first)

        saved.apply(changes)
        self.assertEqual(saved.portable_data_hash(), modified.portable_data_hash())

    def test_diff_apply_with_token(self):
        """Applying a diff carries the signed locator over from the other side."""
        other_manifest = ". 7ac66c0f148de9519b8bd264312c4d64+7+A715fd31f8111894f717eb1003c1b0216799dd9ec@54f5dd1a 0:7:count.txt\n"
        baseline = CollectionReader(". 781e5e245d69b566979b86e28d23f2c7+10+A715fd31f8111894f717eb1003c1b0216799dd9ec@54f5dd1a 0:10:count.txt\n")
        # Same content as the baseline, but without the signature hint.
        unsigned = Collection(". 781e5e245d69b566979b86e28d23f2c7+10 0:10:count.txt\n")
        other = CollectionReader(other_manifest)

        changes = baseline.diff(other)
        self.assertEqual(
            changes,
            [('mod', './count.txt', unsigned["count.txt"], other["count.txt"])])

        unsigned.apply(changes)

        self.assertEqual(unsigned.manifest_text(), other_manifest)

    def test_create_and_update(self):
        """``update`` brings in a change saved through another instance."""
        first = self.create_count_txt()
        first.save()

        # A second instance of the same collection changes and saves it.
        second = arvados.collection.Collection(first.manifest_locator())
        with second.open("count.txt", "wb") as writer:
            writer.write(b"abcdefg")
        second.save()

        self.assertNotEqual(first.portable_data_hash(), second.portable_data_hash())
        first.update()
        self.assertEqual(first.portable_data_hash(), second.portable_data_hash())

    def test_create_and_update_with_conflict(self):
        """``update`` keeps the local edit and adds the saved one as a conflict file."""
        first = self.create_count_txt()
        first.save()

        # Unsaved local edit on the first instance.
        with first.open("count.txt", "wb") as writer:
            writer.write(b"XYZ")

        # A different edit, saved through a second instance.
        second = arvados.collection.Collection(first.manifest_locator())
        with second.open("count.txt", "wb") as writer:
            writer.write(b"abcdefg")
        second.save()

        first.update()
        conflict_pattern = r"\. e65075d550f9b5bf9992fa1d71a131be\+3\S* 7ac66c0f148de9519b8bd264312c4d64\+7\S* 0:3:count\.txt 3:7:count\.txt~\d\d\d\d\d\d\d\d-\d\d\d\d\d\d~conflict~$"
        self.assertRegex(first.manifest_text(), conflict_pattern)

    def test_pdh_is_native_str(self):
        """``portable_data_hash`` returns exactly the built-in ``str`` type."""
        coll = self.create_count_txt()
        pdh = coll.portable_data_hash()
        self.assertEqual(str, type(pdh))
1250
1251
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()