# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import copy
import io
import functools
import hashlib
import json
import logging
import sys
import unittest
import datetime

from unittest import mock

import arvados
import arvados.collection
import arvados.keep
import pycurl

from arvados._internal import http_to_keep


def _fixed_clock(year, month, day):
    """Return a mock callable that always reports the given (naive) date."""
    clock = mock.MagicMock()
    clock.return_value = datetime.datetime(year, month, day)
    return clock


def _collection_mock(uuid, pdh, filenames=None):
    """Build a MagicMock standing in for a collection object.

    `uuid` is returned by `manifest_locator()` and `pdh` by
    `portable_data_hash()`.  When `filenames` is given it is returned by
    `keys()`; otherwise `keys` is left as an unconfigured mock attribute.
    """
    cm = mock.MagicMock()
    cm.manifest_locator.return_value = uuid
    cm.portable_data_hash.return_value = pdh
    if filenames is not None:
        cm.keys.return_value = filenames
    return cm


# Turns out there was already "FakeCurl" that serves the same purpose, but
# I wrote this before I knew that.  Whoops.
class CurlMock:
    """Minimal stand-in for a `pycurl.Curl` handle.

    It records the options the code under test sets through `setopt`, and
    `perform` replays a canned response: a status line, the configured
    response headers, and (for a 200 GET only) a single body chunk.

    Attributes tests may adjust before the call under test:
      head_response / get_response -- status codes reported for HEAD / GET
      chunk -- body passed to the write callback on a 200 GET
    """

    def __init__(self, headers=None):
        self.perform_was_called = False
        # Avoid a shared mutable default; keep the caller's dict by reference.
        self.headers = {} if headers is None else headers
        self.get_response = 200
        self.head_response = 200
        self.req_headers = []
        # Defaults so getinfo()/perform() do not raise AttributeError when
        # the caller has not (yet) set the corresponding option.
        self.head = False
        self.chunk = b""
        self.url = None

    def setopt(self, op, *args):
        """Record the subset of curl options this mock understands."""
        if op == pycurl.URL:
            self.url = args[0]
        elif op == pycurl.WRITEFUNCTION:
            self.writefn = args[0]
        elif op == pycurl.HEADERFUNCTION:
            self.headerfn = args[0]
        elif op == pycurl.NOBODY:
            self.head = True
        elif op == pycurl.HTTPGET:
            self.head = False
        elif op == pycurl.HTTPHEADER:
            self.req_headers = args[0]

    def getinfo(self, op):
        """Return the canned status for RESPONSE_CODE; None for anything else."""
        if op == pycurl.RESPONSE_CODE:
            return self.head_response if self.head else self.get_response
        return None

    def perform(self):
        """Replay the canned response through the registered callbacks."""
        self.perform_was_called = True

        status = self.head_response if self.head else self.get_response
        self.headerfn("HTTP/1.1 {} Status\r\n".format(status))

        for k, v in self.headers.items():
            self.headerfn("%s: %s" % (k, v))

        # Only a successful GET delivers a body.
        if not self.head and self.get_response == 200:
            self.writefn(self.chunk)


class TestHttpToKeep(unittest.TestCase):
    """Exercise `http_to_keep.http_to_keep` against mocked curl, API and collections."""

    @mock.patch("pycurl.Curl")
    @mock.patch("arvados.collection.Collection")
    def test_http_get(self, collectionmock, curlmock):
        """With no cached collection listed, the URL is fetched and saved as a new collection."""
        api = mock.MagicMock()
        api.collections().list().execute.return_value = {
            "items": []
        }

        cm = _collection_mock("zzzzz-4zz18-zzzzzzzzzzzzzz3",
                              "99999999999999999999999999999998+99")
        collectionmock.return_value = cm

        mockobj = CurlMock()
        mockobj.chunk = b'abc'
        curlmock.side_effect = lambda: mockobj

        utcnow = _fixed_clock(2018, 5, 15)

        r = http_to_keep.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt",
                             'zzzzz-4zz18-zzzzzzzzzzzzzz3', 'http://example.com/file1.txt',
                             datetime.datetime(2018, 5, 15, 0, 0)))

        self.assertEqual(mockobj.url, b"http://example.com/file1.txt")
        self.assertTrue(mockobj.perform_was_called)

        cm.open.assert_called_with("file1.txt", "wb")
        cm.save_new.assert_called_with(name="Downloaded from http%3A%2F%2Fexample.com%2Ffile1.txt",
                                       owner_uuid=None, ensure_unique_name=True)

        api.collections().update.assert_has_calls([
            mock.call(uuid=cm.manifest_locator(),
                      body={"collection":{"properties": {'http://example.com/file1.txt': {'Date': 'Tue, 15 May 2018 00:00:00 GMT'}}}})
        ])

    @mock.patch("pycurl.Curl")
    @mock.patch("arvados.collection.CollectionReader")
    def test_http_expires(self, collectionmock, curlmock):
        """A cached copy whose Expires is after "now" is returned without performing a request."""
        api = mock.MagicMock()
        api.collections().list().execute.return_value = {
            "items": [{
                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
                "portable_data_hash": "99999999999999999999999999999998+99",
                "properties": {
                    'http://example.com/file1.txt': {
                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
                        'Expires': 'Tue, 17 May 2018 00:00:00 GMT'
                    }
                }
            }]
        }

        cm = _collection_mock("zzzzz-4zz18-zzzzzzzzzzzzzz3",
                              "99999999999999999999999999999998+99",
                              ["file1.txt"])
        collectionmock.return_value = cm

        mockobj = CurlMock()
        mockobj.chunk = b'abc'
        curlmock.side_effect = lambda: mockobj

        utcnow = _fixed_clock(2018, 5, 16)

        r = http_to_keep.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt",
                             'zzzzz-4zz18-zzzzzzzzzzzzzz3', 'http://example.com/file1.txt',
                             datetime.datetime(2018, 5, 16, 0, 0)))

        self.assertFalse(mockobj.perform_was_called)

    @mock.patch("pycurl.Curl")
    @mock.patch("arvados.collection.CollectionReader")
    def test_http_cache_control(self, collectionmock, curlmock):
        """A cached copy still within its Cache-Control max-age is returned without a request."""
        api = mock.MagicMock()
        api.collections().list().execute.return_value = {
            "items": [{
                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
                "portable_data_hash": "99999999999999999999999999999998+99",
                "properties": {
                    'http://example.com/file1.txt': {
                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
                        'Cache-Control': 'max-age=172800'
                    }
                }
            }]
        }

        cm = _collection_mock("zzzzz-4zz18-zzzzzzzzzzzzzz3",
                              "99999999999999999999999999999998+99",
                              ["file1.txt"])
        collectionmock.return_value = cm

        mockobj = CurlMock()
        mockobj.chunk = b'abc'
        curlmock.side_effect = lambda: mockobj

        utcnow = _fixed_clock(2018, 5, 16)

        r = http_to_keep.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt",
                             'zzzzz-4zz18-zzzzzzzzzzzzzz3', 'http://example.com/file1.txt',
                             datetime.datetime(2018, 5, 16, 0, 0)))

        self.assertFalse(mockobj.perform_was_called)

    @mock.patch("pycurl.Curl")
    @mock.patch("arvados.collection.Collection")
    def test_http_expired(self, collectionmock, curlmock):
        """A cached copy past its Expires (and without an Etag) is downloaded again."""
        api = mock.MagicMock()
        api.collections().list().execute.return_value = {
            "items": [{
                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
                "portable_data_hash": "99999999999999999999999999999998+99",
                "properties": {
                    'http://example.com/file1.txt': {
                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
                        'Expires': 'Wed, 16 May 2018 00:00:00 GMT'
                    }
                }
            }]
        }

        # The freshly saved collection has a different UUID/PDH than the
        # stale one listed above.
        cm = _collection_mock("zzzzz-4zz18-zzzzzzzzzzzzzz4",
                              "99999999999999999999999999999997+99",
                              ["file1.txt"])
        collectionmock.return_value = cm

        mockobj = CurlMock({'Date': 'Thu, 17 May 2018 00:00:00 GMT'})
        mockobj.chunk = b'def'
        curlmock.side_effect = lambda: mockobj

        utcnow = _fixed_clock(2018, 5, 17)

        r = http_to_keep.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
        self.assertEqual(r, ("99999999999999999999999999999997+99", "file1.txt",
                             'zzzzz-4zz18-zzzzzzzzzzzzzz4', 'http://example.com/file1.txt',
                             datetime.datetime(2018, 5, 17, 0, 0)))

        self.assertEqual(mockobj.url, b"http://example.com/file1.txt")
        self.assertTrue(mockobj.perform_was_called)

        cm.open.assert_called_with("file1.txt", "wb")
        cm.save_new.assert_called_with(name="Downloaded from http%3A%2F%2Fexample.com%2Ffile1.txt",
                                       owner_uuid=None, ensure_unique_name=True)

        api.collections().update.assert_has_calls([
            mock.call(uuid=cm.manifest_locator(),
                      body={"collection":{"properties": {'http://example.com/file1.txt': {'Date': 'Thu, 17 May 2018 00:00:00 GMT'}}}})
        ])

    @mock.patch("pycurl.Curl")
    @mock.patch("arvados.collection.CollectionReader")
    def test_http_etag(self, collectionmock, curlmock):
        """An expired cached copy whose Etag matches the response headers is reused.

        Nothing is written to the collection; only the cached properties
        are updated with the new response headers.
        """
        api = mock.MagicMock()
        api.collections().list().execute.return_value = {
            "items": [{
                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
                "portable_data_hash": "99999999999999999999999999999998+99",
                "properties": {
                    'http://example.com/file1.txt': {
                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
                        'Expires': 'Wed, 16 May 2018 00:00:00 GMT',
                        'Etag': '"123456"'
                    }
                }
            }]
        }

        cm = _collection_mock("zzzzz-4zz18-zzzzzzzzzzzzzz3",
                              "99999999999999999999999999999998+99",
                              ["file1.txt"])
        collectionmock.return_value = cm

        mockobj = CurlMock({
            'Date': 'Thu, 17 May 2018 00:00:00 GMT',
            'Expires': 'Sat, 19 May 2018 00:00:00 GMT',
            'Etag': '"123456"'
        })
        mockobj.chunk = None
        curlmock.side_effect = lambda: mockobj

        utcnow = _fixed_clock(2018, 5, 17)

        r = http_to_keep.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt",
                             'zzzzz-4zz18-zzzzzzzzzzzzzz3', 'http://example.com/file1.txt',
                             datetime.datetime(2018, 5, 17, 0, 0)))

        cm.open.assert_not_called()

        api.collections().update.assert_has_calls([
            mock.call(uuid=cm.manifest_locator(),
                      body={"collection":{"properties": {'http://example.com/file1.txt': {
                          'Date': 'Thu, 17 May 2018 00:00:00 GMT',
                          'Expires': 'Sat, 19 May 2018 00:00:00 GMT',
                          'Etag': '"123456"'
                      }}}})
        ])

    @mock.patch("pycurl.Curl")
    @mock.patch("arvados.collection.Collection")
    def test_http_content_disp(self, collectionmock, curlmock):
        """The filename from a Content-Disposition header names the stored file."""
        api = mock.MagicMock()
        api.collections().list().execute.return_value = {
            "items": []
        }

        cm = _collection_mock("zzzzz-4zz18-zzzzzzzzzzzzzz3",
                              "99999999999999999999999999999998+99")
        collectionmock.return_value = cm

        mockobj = CurlMock({"Content-Disposition": "attachment; filename=file1.txt"})
        # Body chunks are bytes, matching the other tests and the "wb" open mode.
        mockobj.chunk = b"abc"
        curlmock.side_effect = lambda: mockobj

        utcnow = _fixed_clock(2018, 5, 15)

        r = http_to_keep.http_to_keep(api, None, "http://example.com/download?fn=/file1.txt", utcnow=utcnow)
        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt",
                             'zzzzz-4zz18-zzzzzzzzzzzzzz3',
                             'http://example.com/download?fn=/file1.txt',
                             datetime.datetime(2018, 5, 15, 0, 0)))

        self.assertEqual(mockobj.url, b"http://example.com/download?fn=/file1.txt")

        cm.open.assert_called_with("file1.txt", "wb")
        cm.save_new.assert_called_with(name="Downloaded from http%3A%2F%2Fexample.com%2Fdownload%3Ffn%3D%2Ffile1.txt",
                                       owner_uuid=None, ensure_unique_name=True)

        api.collections().update.assert_has_calls([
            mock.call(uuid=cm.manifest_locator(),
                      body={"collection":{"properties": {"http://example.com/download?fn=/file1.txt": {'Date': 'Tue, 15 May 2018 00:00:00 GMT'}}}})
        ])

    @mock.patch("pycurl.Curl")
    @mock.patch("arvados.collection.CollectionReader")
    def test_http_etag_if_none_match(self, collectionmock, curlmock):
        """When HEAD reports 403, the GET carries If-None-Match and a 304 reuses the cache."""
        api = mock.MagicMock()
        api.collections().list().execute.return_value = {
            "items": [{
                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
                "portable_data_hash": "99999999999999999999999999999998+99",
                "properties": {
                    'http://example.com/file1.txt': {
                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
                        'Expires': 'Tue, 16 May 2018 00:00:00 GMT',
                        'Etag': '"123456"'
                    }
                }
            }]
        }

        cm = _collection_mock("zzzzz-4zz18-zzzzzzzzzzzzzz3",
                              "99999999999999999999999999999998+99",
                              ["file1.txt"])
        collectionmock.return_value = cm

        mockobj = CurlMock({
            'Date': 'Tue, 17 May 2018 00:00:00 GMT',
            'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
            'Etag': '"123456"'
        })
        mockobj.chunk = None
        mockobj.head_response = 403
        mockobj.get_response = 304
        curlmock.side_effect = lambda: mockobj

        utcnow = _fixed_clock(2018, 5, 17)

        r = http_to_keep.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt",
                             'zzzzz-4zz18-zzzzzzzzzzzzzz3', 'http://example.com/file1.txt',
                             datetime.datetime(2018, 5, 17, 0, 0)))

        self.assertEqual(mockobj.req_headers,
                         ["Accept: application/octet-stream", "If-None-Match: \"123456\""])
        cm.open.assert_not_called()

        api.collections().update.assert_has_calls([
            mock.call(uuid=cm.manifest_locator(),
                      body={"collection":{"properties": {'http://example.com/file1.txt': {
                          'Date': 'Tue, 17 May 2018 00:00:00 GMT',
                          'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
                          'Etag': '"123456"'
                      }}}})
        ])

    @mock.patch("pycurl.Curl")
    @mock.patch("arvados.collection.CollectionReader")
    def test_http_prefer_cached_downloads(self, collectionmock, curlmock):
        """With prefer_cached_downloads=True an expired cached copy is used as-is.

        No request is performed and the collection properties are not updated.
        """
        api = mock.MagicMock()
        api.collections().list().execute.return_value = {
            "items": [{
                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
                "portable_data_hash": "99999999999999999999999999999998+99",
                "properties": {
                    'http://example.com/file1.txt': {
                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
                        'Expires': 'Tue, 16 May 2018 00:00:00 GMT',
                        'Etag': '"123456"'
                    }
                }
            }]
        }

        cm = _collection_mock("zzzzz-4zz18-zzzzzzzzzzzzzz3",
                              "99999999999999999999999999999998+99",
                              ["file1.txt"])
        collectionmock.return_value = cm

        mockobj = CurlMock()
        curlmock.side_effect = lambda: mockobj

        utcnow = _fixed_clock(2018, 5, 17)

        r = http_to_keep.http_to_keep(api, None, "http://example.com/file1.txt",
                                      utcnow=utcnow, prefer_cached_downloads=True)
        self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt",
                             'zzzzz-4zz18-zzzzzzzzzzzzzz3', 'http://example.com/file1.txt',
                             datetime.datetime(2018, 5, 17, 0, 0)))

        self.assertFalse(mockobj.perform_was_called)
        cm.open.assert_not_called()
        api.collections().update.assert_not_called()

    @mock.patch("pycurl.Curl")
    @mock.patch("arvados.collection.CollectionReader")
    def test_http_varying_url_params(self, collectionmock, curlmock):
        """Query parameters named in varying_url_params are ignored for cache lookup.

        The cached properties are keyed once by the clean URL and once by
        the full signed URL; in both cases the cached collection is reused
        and the updated properties are keyed by the clean URL.
        """
        for prurl in ("http://example.com/file1.txt",
                      "http://example.com/file1.txt?KeyId=123&Signature=456&Expires=789"):
            # subTest identifies which URL form failed.
            with self.subTest(prurl=prurl):
                api = mock.MagicMock()
                api.collections().list().execute.return_value = {
                    "items": [{
                        "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
                        "portable_data_hash": "99999999999999999999999999999998+99",
                        "properties": {
                            prurl: {
                                'Date': 'Tue, 15 May 2018 00:00:00 GMT',
                                'Expires': 'Tue, 16 May 2018 00:00:00 GMT',
                                'Etag': '"123456"'
                            }
                        }
                    }]
                }

                cm = _collection_mock("zzzzz-4zz18-zzzzzzzzzzzzzz3",
                                      "99999999999999999999999999999998+99",
                                      ["file1.txt"])
                collectionmock.return_value = cm

                mockobj = CurlMock({
                    'Date': 'Tue, 17 May 2018 00:00:00 GMT',
                    'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
                    'Etag': '"123456"'
                })
                mockobj.chunk = None
                curlmock.side_effect = lambda mockobj=mockobj: mockobj

                utcnow = _fixed_clock(2018, 5, 17)

                r = http_to_keep.http_to_keep(
                    api, None,
                    "http://example.com/file1.txt?KeyId=123&Signature=456&Expires=789",
                    utcnow=utcnow, varying_url_params="KeyId,Signature,Expires")
                self.assertEqual(r, ("99999999999999999999999999999998+99", "file1.txt",
                                     'zzzzz-4zz18-zzzzzzzzzzzzzz3', 'http://example.com/file1.txt',
                                     datetime.datetime(2018, 5, 17, 0, 0)))

                self.assertTrue(mockobj.perform_was_called)
                cm.open.assert_not_called()

                api.collections().update.assert_has_calls([
                    mock.call(uuid=cm.manifest_locator(),
                              body={"collection":{"properties": {'http://example.com/file1.txt': {
                                  'Date': 'Tue, 17 May 2018 00:00:00 GMT',
                                  'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
                                  'Etag': '"123456"'
                              }}}})
                ])