Merge branch '20257-http-import' refs #20257
[arvados.git] / sdk / python / tests / test_http.py
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 from future import standard_library
6 standard_library.install_aliases()
7
8 import copy
9 import io
10 import functools
11 import hashlib
12 import json
13 import logging
14 import mock
15 import sys
16 import unittest
17 import datetime
18
19 import arvados
20 import arvados.collection
21 import arvados.keep
22 import pycurl
23
24 from arvados.http_to_keep import http_to_keep
25
26 import ruamel.yaml as yaml
27
28 # Turns out there was already "FakeCurl" that serves the same purpose, but
29 # I wrote this before I knew that.  Whoops.
class CurlMock:
    """Minimal stand-in for ``pycurl.Curl`` used by the tests in this file.

    The mock records the options passed to ``setopt`` and, when ``perform``
    is called, replays a canned response through the registered header and
    write callbacks.

    Attributes:
      perform_was_called -- True once ``perform`` has run
      headers            -- response headers replayed by ``perform``
      get_response       -- status code reported for body requests
      head_response      -- status code reported for NOBODY requests
      req_headers        -- last value passed with ``pycurl.HTTPHEADER``
      head               -- True while the handle is in NOBODY mode
      chunk              -- body handed to the write callback; it is not
                            set by the constructor, tests assign it
    """

    def __init__(self, headers=None):
        self.perform_was_called = False
        # A literal ``{}`` default would be a single dict shared by every
        # instance created without arguments; build a fresh one instead.
        self.headers = {} if headers is None else headers
        self.get_response = 200
        self.head_response = 200
        self.req_headers = []
        # Start in "body request" mode so getinfo()/perform() work even
        # when neither NOBODY nor HTTPGET has been set on the handle.
        self.head = False

    def setopt(self, op, *args):
        """Remember the options that the tests later inspect or replay."""
        if op == pycurl.URL:
            self.url = args[0]
        elif op == pycurl.WRITEFUNCTION:
            self.writefn = args[0]
        elif op == pycurl.HEADERFUNCTION:
            self.headerfn = args[0]
        elif op == pycurl.NOBODY:
            self.head = True
        elif op == pycurl.HTTPGET:
            self.head = False
        elif op == pycurl.HTTPHEADER:
            self.req_headers = args[0]

    def getinfo(self, op):
        """Return the canned status code for ``pycurl.RESPONSE_CODE``.

        Any other ``op`` yields None.
        """
        if op == pycurl.RESPONSE_CODE:
            return self.head_response if self.head else self.get_response

    def perform(self):
        """Replay the canned response through the registered callbacks.

        Sends the status line, then every configured header, and finally
        (only for a body request answered with 200) the body ``chunk``.
        """
        self.perform_was_called = True

        status = self.head_response if self.head else self.get_response
        self.headerfn("HTTP/1.1 {} Status\r\n".format(status))

        for k, v in self.headers.items():
            self.headerfn("%s: %s" % (k, v))

        if not self.head and self.get_response == 200:
            self.writefn(self.chunk)
72
73
74 class TestHttpToKeep(unittest.TestCase):
75
76     @mock.patch("pycurl.Curl")
77     @mock.patch("arvados.collection.Collection")
78     def test_http_get(self, collectionmock, curlmock):
79         api = mock.MagicMock()
80
81         api.collections().list().execute.return_value = {
82             "items": []
83         }
84
85         cm = mock.MagicMock()
86         cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
87         cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
88         collectionmock.return_value = cm
89
90         mockobj = CurlMock()
91         mockobj.chunk = b'abc'
92         def init():
93             return mockobj
94         curlmock.side_effect = init
95
96         utcnow = mock.MagicMock()
97         utcnow.return_value = datetime.datetime(2018, 5, 15)
98
99         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
100         self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
101
102         assert mockobj.url == b"http://example.com/file1.txt"
103         assert mockobj.perform_was_called is True
104
105         cm.open.assert_called_with("file1.txt", "wb")
106         cm.save_new.assert_called_with(name="Downloaded from http%3A%2F%2Fexample.com%2Ffile1.txt",
107                                        owner_uuid=None, ensure_unique_name=True)
108
109         api.collections().update.assert_has_calls([
110             mock.call(uuid=cm.manifest_locator(),
111                       body={"collection":{"properties": {'http://example.com/file1.txt': {'Date': 'Tue, 15 May 2018 00:00:00 GMT'}}}})
112         ])
113
114
115     @mock.patch("pycurl.Curl")
116     @mock.patch("arvados.collection.CollectionReader")
117     def test_http_expires(self, collectionmock, curlmock):
118         api = mock.MagicMock()
119
120         api.collections().list().execute.return_value = {
121             "items": [{
122                 "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
123                 "portable_data_hash": "99999999999999999999999999999998+99",
124                 "properties": {
125                     'http://example.com/file1.txt': {
126                         'Date': 'Tue, 15 May 2018 00:00:00 GMT',
127                         'Expires': 'Tue, 17 May 2018 00:00:00 GMT'
128                     }
129                 }
130             }]
131         }
132
133         cm = mock.MagicMock()
134         cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
135         cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
136         cm.keys.return_value = ["file1.txt"]
137         collectionmock.return_value = cm
138
139         mockobj = CurlMock()
140         mockobj.chunk = b'abc'
141         def init():
142             return mockobj
143         curlmock.side_effect = init
144
145         utcnow = mock.MagicMock()
146         utcnow.return_value = datetime.datetime(2018, 5, 16)
147
148         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
149         self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
150
151         assert mockobj.perform_was_called is False
152
153
154     @mock.patch("pycurl.Curl")
155     @mock.patch("arvados.collection.CollectionReader")
156     def test_http_cache_control(self, collectionmock, curlmock):
157         api = mock.MagicMock()
158
159         api.collections().list().execute.return_value = {
160             "items": [{
161                 "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
162                 "portable_data_hash": "99999999999999999999999999999998+99",
163                 "properties": {
164                     'http://example.com/file1.txt': {
165                         'Date': 'Tue, 15 May 2018 00:00:00 GMT',
166                         'Cache-Control': 'max-age=172800'
167                     }
168                 }
169             }]
170         }
171
172         cm = mock.MagicMock()
173         cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
174         cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
175         cm.keys.return_value = ["file1.txt"]
176         collectionmock.return_value = cm
177
178         mockobj = CurlMock()
179         mockobj.chunk = b'abc'
180         def init():
181             return mockobj
182         curlmock.side_effect = init
183
184         utcnow = mock.MagicMock()
185         utcnow.return_value = datetime.datetime(2018, 5, 16)
186
187         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
188         self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
189
190         assert mockobj.perform_was_called is False
191
192
193     @mock.patch("pycurl.Curl")
194     @mock.patch("arvados.collection.Collection")
195     def test_http_expired(self, collectionmock, curlmock):
196         api = mock.MagicMock()
197
198         api.collections().list().execute.return_value = {
199             "items": [{
200                 "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
201                 "portable_data_hash": "99999999999999999999999999999998+99",
202                 "properties": {
203                     'http://example.com/file1.txt': {
204                         'Date': 'Tue, 15 May 2018 00:00:00 GMT',
205                         'Expires': 'Wed, 16 May 2018 00:00:00 GMT'
206                     }
207                 }
208             }]
209         }
210
211         cm = mock.MagicMock()
212         cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz4"
213         cm.portable_data_hash.return_value = "99999999999999999999999999999997+99"
214         cm.keys.return_value = ["file1.txt"]
215         collectionmock.return_value = cm
216
217         mockobj = CurlMock({'Date': 'Thu, 17 May 2018 00:00:00 GMT'})
218         mockobj.chunk = b'def'
219         def init():
220             return mockobj
221         curlmock.side_effect = init
222
223         utcnow = mock.MagicMock()
224         utcnow.return_value = datetime.datetime(2018, 5, 17)
225
226         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
227         self.assertEqual(r, ("99999999999999999999999999999997+99", "file1.txt"))
228
229         assert mockobj.url == b"http://example.com/file1.txt"
230         assert mockobj.perform_was_called is True
231
232         cm.open.assert_called_with("file1.txt", "wb")
233         cm.save_new.assert_called_with(name="Downloaded from http%3A%2F%2Fexample.com%2Ffile1.txt",
234                                        owner_uuid=None, ensure_unique_name=True)
235
236         api.collections().update.assert_has_calls([
237             mock.call(uuid=cm.manifest_locator(),
238                       body={"collection":{"properties": {'http://example.com/file1.txt': {'Date': 'Thu, 17 May 2018 00:00:00 GMT'}}}})
239         ])
240
241
242     @mock.patch("pycurl.Curl")
243     @mock.patch("arvados.collection.CollectionReader")
244     def test_http_etag(self, collectionmock, curlmock):
245         api = mock.MagicMock()
246
247         api.collections().list().execute.return_value = {
248             "items": [{
249                 "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
250                 "portable_data_hash": "99999999999999999999999999999998+99",
251                 "properties": {
252                     'http://example.com/file1.txt': {
253                         'Date': 'Tue, 15 May 2018 00:00:00 GMT',
254                         'Expires': 'Wed, 16 May 2018 00:00:00 GMT',
255                         'Etag': '"123456"'
256                     }
257                 }
258             }]
259         }
260
261         cm = mock.MagicMock()
262         cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
263         cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
264         cm.keys.return_value = ["file1.txt"]
265         collectionmock.return_value = cm
266
267         mockobj = CurlMock({
268             'Date': 'Thu, 17 May 2018 00:00:00 GMT',
269             'Expires': 'Sat, 19 May 2018 00:00:00 GMT',
270             'Etag': '"123456"'
271         })
272         mockobj.chunk = None
273         def init():
274             return mockobj
275         curlmock.side_effect = init
276
277         utcnow = mock.MagicMock()
278         utcnow.return_value = datetime.datetime(2018, 5, 17)
279
280         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
281         self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
282
283         cm.open.assert_not_called()
284
285         api.collections().update.assert_has_calls([
286             mock.call(uuid=cm.manifest_locator(),
287                       body={"collection":{"properties": {'http://example.com/file1.txt': {
288                           'Date': 'Thu, 17 May 2018 00:00:00 GMT',
289                           'Expires': 'Sat, 19 May 2018 00:00:00 GMT',
290                           'Etag': '"123456"'
291                       }}}})
292                       ])
293
294     @mock.patch("pycurl.Curl")
295     @mock.patch("arvados.collection.Collection")
296     def test_http_content_disp(self, collectionmock, curlmock):
297         api = mock.MagicMock()
298
299         api.collections().list().execute.return_value = {
300             "items": []
301         }
302
303         cm = mock.MagicMock()
304         cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
305         cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
306         collectionmock.return_value = cm
307
308         mockobj = CurlMock({"Content-Disposition": "attachment; filename=file1.txt"})
309         mockobj.chunk = "abc"
310         def init():
311             return mockobj
312         curlmock.side_effect = init
313
314         utcnow = mock.MagicMock()
315         utcnow.return_value = datetime.datetime(2018, 5, 15)
316
317         r = http_to_keep(api, None, "http://example.com/download?fn=/file1.txt", utcnow=utcnow)
318         self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
319
320         assert mockobj.url == b"http://example.com/download?fn=/file1.txt"
321
322         cm.open.assert_called_with("file1.txt", "wb")
323         cm.save_new.assert_called_with(name="Downloaded from http%3A%2F%2Fexample.com%2Fdownload%3Ffn%3D%2Ffile1.txt",
324                                        owner_uuid=None, ensure_unique_name=True)
325
326         api.collections().update.assert_has_calls([
327             mock.call(uuid=cm.manifest_locator(),
328                       body={"collection":{"properties": {"http://example.com/download?fn=/file1.txt": {'Date': 'Tue, 15 May 2018 00:00:00 GMT'}}}})
329         ])
330
331     @mock.patch("pycurl.Curl")
332     @mock.patch("arvados.collection.CollectionReader")
333     def test_http_etag_if_none_match(self, collectionmock, curlmock):
334         api = mock.MagicMock()
335
336         api.collections().list().execute.return_value = {
337             "items": [{
338                 "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
339                 "portable_data_hash": "99999999999999999999999999999998+99",
340                 "properties": {
341                     'http://example.com/file1.txt': {
342                         'Date': 'Tue, 15 May 2018 00:00:00 GMT',
343                         'Expires': 'Tue, 16 May 2018 00:00:00 GMT',
344                         'Etag': '"123456"'
345                     }
346                 }
347             }]
348         }
349
350         cm = mock.MagicMock()
351         cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
352         cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
353         cm.keys.return_value = ["file1.txt"]
354         collectionmock.return_value = cm
355
356         mockobj = CurlMock({
357             'Date': 'Tue, 17 May 2018 00:00:00 GMT',
358             'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
359             'Etag': '"123456"'
360         })
361         mockobj.chunk = None
362         mockobj.head_response = 403
363         mockobj.get_response = 304
364         def init():
365             return mockobj
366         curlmock.side_effect = init
367
368         utcnow = mock.MagicMock()
369         utcnow.return_value = datetime.datetime(2018, 5, 17)
370
371         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
372         self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
373
374         print(mockobj.req_headers)
375         assert mockobj.req_headers == ["Accept: application/octet-stream", "If-None-Match: \"123456\""]
376         cm.open.assert_not_called()
377
378         api.collections().update.assert_has_calls([
379             mock.call(uuid=cm.manifest_locator(),
380                       body={"collection":{"properties": {'http://example.com/file1.txt': {
381                           'Date': 'Tue, 17 May 2018 00:00:00 GMT',
382                           'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
383                           'Etag': '"123456"'
384                       }}}})
385                       ])
386
387     @mock.patch("pycurl.Curl")
388     @mock.patch("arvados.collection.CollectionReader")
389     def test_http_prefer_cached_downloads(self, collectionmock, curlmock):
390         api = mock.MagicMock()
391
392         api.collections().list().execute.return_value = {
393             "items": [{
394                 "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
395                 "portable_data_hash": "99999999999999999999999999999998+99",
396                 "properties": {
397                     'http://example.com/file1.txt': {
398                         'Date': 'Tue, 15 May 2018 00:00:00 GMT',
399                         'Expires': 'Tue, 16 May 2018 00:00:00 GMT',
400                         'Etag': '"123456"'
401                     }
402                 }
403             }]
404         }
405
406         cm = mock.MagicMock()
407         cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
408         cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
409         cm.keys.return_value = ["file1.txt"]
410         collectionmock.return_value = cm
411
412         mockobj = CurlMock()
413         def init():
414             return mockobj
415         curlmock.side_effect = init
416
417         utcnow = mock.MagicMock()
418         utcnow.return_value = datetime.datetime(2018, 5, 17)
419
420         r = http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow, prefer_cached_downloads=True)
421         self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
422
423         assert mockobj.perform_was_called is False
424         cm.open.assert_not_called()
425         api.collections().update.assert_not_called()
426
427     @mock.patch("pycurl.Curl")
428     @mock.patch("arvados.collection.CollectionReader")
429     def test_http_varying_url_params(self, collectionmock, curlmock):
430         for prurl in ("http://example.com/file1.txt", "http://example.com/file1.txt?KeyId=123&Signature=456&Expires=789"):
431             api = mock.MagicMock()
432
433             api.collections().list().execute.return_value = {
434                 "items": [{
435                     "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
436                     "portable_data_hash": "99999999999999999999999999999998+99",
437                     "properties": {
438                         prurl: {
439                             'Date': 'Tue, 15 May 2018 00:00:00 GMT',
440                             'Expires': 'Tue, 16 May 2018 00:00:00 GMT',
441                             'Etag': '"123456"'
442                         }
443                     }
444                 }]
445             }
446
447             cm = mock.MagicMock()
448             cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
449             cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
450             cm.keys.return_value = ["file1.txt"]
451             collectionmock.return_value = cm
452
453             mockobj = CurlMock({
454                 'Date': 'Tue, 17 May 2018 00:00:00 GMT',
455                 'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
456                 'Etag': '"123456"'
457             })
458             mockobj.chunk = None
459             def init():
460                 return mockobj
461             curlmock.side_effect = init
462
463             utcnow = mock.MagicMock()
464             utcnow.return_value = datetime.datetime(2018, 5, 17)
465
466             r = http_to_keep(api, None, "http://example.com/file1.txt?KeyId=123&Signature=456&Expires=789",
467                                               utcnow=utcnow, varying_url_params="KeyId,Signature,Expires")
468             self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt"))
469
470             assert mockobj.perform_was_called is True
471             cm.open.assert_not_called()
472
473             api.collections().update.assert_has_calls([
474                 mock.call(uuid=cm.manifest_locator(),
475                           body={"collection":{"properties": {'http://example.com/file1.txt': {
476                               'Date': 'Tue, 17 May 2018 00:00:00 GMT',
477                               'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
478                               'Etag': '"123456"'
479                           }}}})
480                           ])