From 739b1b9ec3662f988ad09509bcc933ce5c23c4e8 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Fri, 25 May 2018 16:45:12 -0400
Subject: [PATCH] 11162: Add tests for http_to_keep.

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
Review notes (free-form text placed after the three-dash separator; it is
not part of the commit message).

What the hunks below change:
 * my_formatdate() now converts its `dt` argument; the removed line read a
   name `now` that is not a parameter of the function.
 * fresh_cache() and remember_headers() take the current time as a new
   `now` argument instead of calling datetime.datetime.utcnow() themselves;
   changed() accepts `now` and forwards it to remember_headers().
 * http_to_keep() gains a `utcnow` keyword (default
   datetime.datetime.utcnow), stores `now = utcnow()` and passes `now` to
   fresh_cache(), changed() and remember_headers(), so the tests can pin
   the clock with a mock.
 * changed() gets an explicit `return True` after the ETag comparison.
 * A missing Content-Length header is tolerated: `cl` is None, the start
   message says "unknown size" and the progress message reports the byte
   count only.
 * Content-Disposition parsing switches from grp.groups(2)/grp.groups(3)
   to grp.group(2)/grp.group(4).
 * The timestamp inside the download loop is renamed from `now` to
   `loopnow`, presumably so it no longer overwrites the new `now`.
 * New tests/test_http.py adds TestHttpToKeep with six cases: plain GET,
   unexpired Expires, unexpired Cache-Control max-age, expired entry that
   is downloaded again, expired entry revalidated by ETag, and a file name
   taken from Content-Disposition.

Points to confirm during review:
 * my_formatdate() passes a datetime obtained from utcnow() through
   time.mktime(), which treats its argument as local time.  The expected
   'Tue, 15 May 2018 00:00:00 GMT' strings in the tests presumably hold
   only when the test host runs in UTC - verify on a non-UTC host.
 * `grp` (the re.search() result) is dereferenced without a None check; a
   Content-Disposition header with no matching filename= part would
   presumably raise AttributeError - confirm whether a fallback to the URL
   path is wanted.
 * Several fixture dates are labelled 'Tue' although only 15 May 2018 can
   be a Tuesday ('Tue, 16 May', 'Tue, 17 May', 'Tue, 19 May').  This looks
   harmless if the date parser ignores the weekday name - confirm.

 sdk/cwl/arvados_cwl/http.py |  58 +++++---
 sdk/cwl/tests/test_http.py  | 286 ++++++++++++++++++++++++++++++++++++
 2 files changed, 321 insertions(+), 23 deletions(-)
 create mode 100644 sdk/cwl/tests/test_http.py

diff --git a/sdk/cwl/arvados_cwl/http.py b/sdk/cwl/arvados_cwl/http.py
index 1ee1607466..32fc1cf90d 100644
--- a/sdk/cwl/arvados_cwl/http.py
+++ b/sdk/cwl/arvados_cwl/http.py
@@ -11,7 +11,8 @@ import logging
 logger = logging.getLogger('arvados.cwl-runner')
 
 def my_formatdate(dt):
-    return email.utils.formatdate(timeval=time.mktime(now.timetuple()), localtime=False, usegmt=True)
+    return email.utils.formatdate(timeval=time.mktime(dt.timetuple()),
+                                  localtime=False, usegmt=True)
 
 def my_parsedate(text):
     parsed = email.utils.parsedate(text)
@@ -20,7 +21,7 @@ def my_parsedate(text):
     else:
         return datetime.datetime(1970, 1, 1)
 
-def fresh_cache(url, properties):
+def fresh_cache(url, properties, now):
     pr = properties[url]
     expires = None
 
@@ -45,20 +46,20 @@ def fresh_cache(url, properties):
     if not expires:
         return False
 
-    return (datetime.datetime.utcnow() < expires)
+    return (now < expires)
 
-def remember_headers(url, properties, headers):
+def remember_headers(url, properties, headers, now):
     properties.setdefault(url, {})
     for h in ("Cache-Control", "ETag", "Expires", "Date", "Content-Length"):
         if h in headers:
             properties[url][h] = headers[h]
     if "Date" not in headers:
-        properties[url]["Date"] = my_formatdate(datetime.datetime.utcnow())
+        properties[url]["Date"] = my_formatdate(now)
 
 
-def changed(url, properties):
+def changed(url, properties, now):
     req = requests.head(url, allow_redirects=True)
-    remember_headers(url, properties, req.headers)
+    remember_headers(url, properties, req.headers, now)
 
     if req.status_code != 200:
         raise Exception("Got status %s" % req.status_code)
@@ -67,19 +68,22 @@ def changed(url, properties):
     if "ETag" in pr and "ETag" in req.headers:
         if pr["ETag"] == req.headers["ETag"]:
             return False
+    return True
 
 
-def http_to_keep(api, project_uuid, url):
+def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow):
     r = api.collections().list(filters=[["properties", "exists", url]]).execute()
 
+    now = utcnow()
+
     for item in r["items"]:
         properties = item["properties"]
-        if fresh_cache(url, properties):
+        if fresh_cache(url, properties, now):
             # Do nothing
             cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
             return "keep:%s/%s" % (item["portable_data_hash"], cr.keys()[0])
 
-        if not changed(url, properties):
+        if not changed(url, properties, now):
             # ETag didn't change, same content, just update headers
             api.collections().update(uuid=item["uuid"], body={"collection":{"properties": properties}}).execute()
             cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
@@ -91,18 +95,23 @@ def http_to_keep(api, project_uuid, url):
     if req.status_code != 200:
         raise Exception("Failed to download '%s' got status %s " % (url, req.status_code))
 
-    remember_headers(url, properties, req.headers)
+    remember_headers(url, properties, req.headers, now)
 
-    logger.info("Downloading %s (%s bytes)", url, properties[url]["Content-Length"])
+    if "Content-Length" in properties[url]:
+        cl = int(properties[url]["Content-Length"])
+        logger.info("Downloading %s (%s bytes)", url, cl)
+    else:
+        cl = None
+        logger.info("Downloading %s (unknown size)", url)
 
     c = arvados.collection.Collection()
 
     if req.headers.get("Content-Disposition"):
         grp = re.search(r'filename=("((\"|[^"])+)"|([^][()<>@,;:\"/?={} ]+))', req.headers["Content-Disposition"])
-        if grp.groups(2):
-            name = grp.groups(2)
+        if grp.group(2):
+            name = grp.group(2)
         else:
-            name = grp.groups(3)
+            name = grp.group(4)
     else:
         name = urlparse.urlparse(url).path.split("/")[-1]
 
@@ -113,14 +122,17 @@ def http_to_keep(api, project_uuid, url):
         for chunk in req.iter_content(chunk_size=1024):
             count += len(chunk)
             f.write(chunk)
-            now = time.time()
-            if (now - checkpoint) > 20:
-                bps = (float(count)/float(now - start))
-                logger.info("%2.1f%% complete, %3.2f MiB/s, %1.0f seconds left",
-                            float(count * 100) / float(properties[url]["Content-Length"]),
-                            bps/(1024*1024),
-                            (int(properties[url]["Content-Length"])-count)/bps)
-                checkpoint = now
+            loopnow = time.time()
+            if (loopnow - checkpoint) > 20:
+                bps = (float(count)/float(loopnow - start))
+                if cl is not None:
+                    logger.info("%2.1f%% complete, %3.2f MiB/s, %1.0f seconds left",
+                                float(count * 100) / float(cl),
+                                bps/(1024*1024),
+                                (cl-count)/bps)
+                else:
+                    logger.info("%d downloaded, %3.2f MiB/s", count, bps/(1024*1024))
+                checkpoint = loopnow
 
     c.save_new(name="Downloaded from %s" % url, owner_uuid=project_uuid, ensure_unique_name=True)
 
diff --git a/sdk/cwl/tests/test_http.py b/sdk/cwl/tests/test_http.py
new file mode 100644
index 0000000000..0c66c39c0b
--- /dev/null
+++ b/sdk/cwl/tests/test_http.py
@@ -0,0 +1,286 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import copy
+import cStringIO
+import functools
+import hashlib
+import json
+import logging
+import mock
+import sys
+import unittest
+import datetime
+
+import arvados
+import arvados.collection
+import arvados_cwl
+import arvados_cwl.runner
+import arvados.keep
+
+from .matcher import JsonDiffMatcher, StripYAMLComments
+from .mock_discovery import get_rootDesc
+
+import arvados_cwl.http
+
+import ruamel.yaml as yaml
+
+
+class TestHttpToKeep(unittest.TestCase):
+
+    @mock.patch("requests.get")
+    @mock.patch("arvados.collection.Collection")
+    def test_http_get(self, collectionmock, getmock):
+        api = mock.MagicMock()
+
+        api.collections().list().execute.return_value = {
+            "items": []
+        }
+
+        cm = mock.MagicMock()
+        cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
+        cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
+        collectionmock.return_value = cm
+
+        req = mock.MagicMock()
+        req.status_code = 200
+        req.headers = {}
+        req.iter_content.return_value = ["abc"]
+        getmock.return_value = req
+
+        utcnow = mock.MagicMock()
+        utcnow.return_value = datetime.datetime(2018, 5, 15)
+
+        r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+
+        getmock.assert_called_with("http://example.com/file1.txt", stream=True, allow_redirects=True)
+
+        cm.open.assert_called_with("file1.txt", "w")
+        cm.save_new.assert_called_with(name="Downloaded from http://example.com/file1.txt",
+                                       owner_uuid=None, ensure_unique_name=True)
+
+        api.collections().update.assert_has_calls([
+            mock.call(uuid=cm.manifest_locator(),
+                      body={"collection":{"properties": {'http://example.com/file1.txt': {'Date': 'Tue, 15 May 2018 00:00:00 GMT'}}}})
+        ])
+
+
+    @mock.patch("requests.get")
+    @mock.patch("arvados.collection.CollectionReader")
+    def test_http_expires(self, collectionmock, getmock):
+        api = mock.MagicMock()
+
+        api.collections().list().execute.return_value = {
+            "items": [{
+                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
+                "portable_data_hash": "99999999999999999999999999999998+99",
+                "properties": {
+                    'http://example.com/file1.txt': {
+                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
+                        'Expires': 'Tue, 17 May 2018 00:00:00 GMT'
+                    }
+                }
+            }]
+        }
+
+        cm = mock.MagicMock()
+        cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
+        cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
+        cm.keys.return_value = ["file1.txt"]
+        collectionmock.return_value = cm
+
+        req = mock.MagicMock()
+        req.status_code = 200
+        req.headers = {}
+        req.iter_content.return_value = ["abc"]
+        getmock.return_value = req
+
+        utcnow = mock.MagicMock()
+        utcnow.return_value = datetime.datetime(2018, 5, 16)
+
+        r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+
+        getmock.assert_not_called()
+
+
+    @mock.patch("requests.get")
+    @mock.patch("arvados.collection.CollectionReader")
+    def test_http_cache_control(self, collectionmock, getmock):
+        api = mock.MagicMock()
+
+        api.collections().list().execute.return_value = {
+            "items": [{
+                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
+                "portable_data_hash": "99999999999999999999999999999998+99",
+                "properties": {
+                    'http://example.com/file1.txt': {
+                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
+                        'Cache-Control': 'max-age=172800'
+                    }
+                }
+            }]
+        }
+
+        cm = mock.MagicMock()
+        cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
+        cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
+        cm.keys.return_value = ["file1.txt"]
+        collectionmock.return_value = cm
+
+        req = mock.MagicMock()
+        req.status_code = 200
+        req.headers = {}
+        req.iter_content.return_value = ["abc"]
+        getmock.return_value = req
+
+        utcnow = mock.MagicMock()
+        utcnow.return_value = datetime.datetime(2018, 5, 16)
+
+        r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+
+        getmock.assert_not_called()
+
+
+    @mock.patch("requests.get")
+    @mock.patch("requests.head")
+    @mock.patch("arvados.collection.Collection")
+    def test_http_expired(self, collectionmock, headmock, getmock):
+        api = mock.MagicMock()
+
+        api.collections().list().execute.return_value = {
+            "items": [{
+                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
+                "portable_data_hash": "99999999999999999999999999999998+99",
+                "properties": {
+                    'http://example.com/file1.txt': {
+                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
+                        'Expires': 'Tue, 16 May 2018 00:00:00 GMT'
+                    }
+                }
+            }]
+        }
+
+        cm = mock.MagicMock()
+        cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz4"
+        cm.portable_data_hash.return_value = "99999999999999999999999999999997+99"
+        cm.keys.return_value = ["file1.txt"]
+        collectionmock.return_value = cm
+
+        req = mock.MagicMock()
+        req.status_code = 200
+        req.headers = {'Date': 'Tue, 17 May 2018 00:00:00 GMT'}
+        req.iter_content.return_value = ["def"]
+        getmock.return_value = req
+        headmock.return_value = req
+
+        utcnow = mock.MagicMock()
+        utcnow.return_value = datetime.datetime(2018, 5, 17)
+
+        r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+        self.assertEqual(r, "keep:99999999999999999999999999999997+99/file1.txt")
+
+        getmock.assert_called_with("http://example.com/file1.txt", stream=True, allow_redirects=True)
+
+        cm.open.assert_called_with("file1.txt", "w")
+        cm.save_new.assert_called_with(name="Downloaded from http://example.com/file1.txt",
+                                       owner_uuid=None, ensure_unique_name=True)
+
+        api.collections().update.assert_has_calls([
+            mock.call(uuid=cm.manifest_locator(),
+                      body={"collection":{"properties": {'http://example.com/file1.txt': {'Date': 'Tue, 17 May 2018 00:00:00 GMT'}}}})
+        ])
+
+
+    @mock.patch("requests.get")
+    @mock.patch("requests.head")
+    @mock.patch("arvados.collection.CollectionReader")
+    def test_http_etag(self, collectionmock, headmock, getmock):
+        api = mock.MagicMock()
+
+        api.collections().list().execute.return_value = {
+            "items": [{
+                "uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz3",
+                "portable_data_hash": "99999999999999999999999999999998+99",
+                "properties": {
+                    'http://example.com/file1.txt': {
+                        'Date': 'Tue, 15 May 2018 00:00:00 GMT',
+                        'Expires': 'Tue, 16 May 2018 00:00:00 GMT',
+                        'ETag': '123456'
+                    }
+                }
+            }]
+        }
+
+        cm = mock.MagicMock()
+        cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
+        cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
+        cm.keys.return_value = ["file1.txt"]
+        collectionmock.return_value = cm
+
+        req = mock.MagicMock()
+        req.status_code = 200
+        req.headers = {
+            'Date': 'Tue, 17 May 2018 00:00:00 GMT',
+            'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
+            'ETag': '123456'
+        }
+        headmock.return_value = req
+
+        utcnow = mock.MagicMock()
+        utcnow.return_value = datetime.datetime(2018, 5, 17)
+
+        r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/file1.txt", utcnow=utcnow)
+        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+
+        getmock.assert_not_called()
+        cm.open.assert_not_called()
+
+        api.collections().update.assert_has_calls([
+            mock.call(uuid=cm.manifest_locator(),
+                      body={"collection":{"properties": {'http://example.com/file1.txt': {
+                          'Date': 'Tue, 17 May 2018 00:00:00 GMT',
+                          'Expires': 'Tue, 19 May 2018 00:00:00 GMT',
+                          'ETag': '123456'
+                      }}}})
+        ])
+
+    @mock.patch("requests.get")
+    @mock.patch("arvados.collection.Collection")
+    def test_http_content_disp(self, collectionmock, getmock):
+        api = mock.MagicMock()
+
+        api.collections().list().execute.return_value = {
+            "items": []
+        }
+
+        cm = mock.MagicMock()
+        cm.manifest_locator.return_value = "zzzzz-4zz18-zzzzzzzzzzzzzz3"
+        cm.portable_data_hash.return_value = "99999999999999999999999999999998+99"
+        collectionmock.return_value = cm
+
+        req = mock.MagicMock()
+        req.status_code = 200
+        req.headers = {"Content-Disposition": "attachment; filename=file1.txt"}
+        req.iter_content.return_value = ["abc"]
+        getmock.return_value = req
+
+        utcnow = mock.MagicMock()
+        utcnow.return_value = datetime.datetime(2018, 5, 15)
+
+        r = arvados_cwl.http.http_to_keep(api, None, "http://example.com/download?fn=/file1.txt", utcnow=utcnow)
+        self.assertEqual(r, "keep:99999999999999999999999999999998+99/file1.txt")
+
+        getmock.assert_called_with("http://example.com/download?fn=/file1.txt", stream=True, allow_redirects=True)
+
+        cm.open.assert_called_with("file1.txt", "w")
+        cm.save_new.assert_called_with(name="Downloaded from http://example.com/download?fn=/file1.txt",
+                                       owner_uuid=None, ensure_unique_name=True)
+
+        api.collections().update.assert_has_calls([
+            mock.call(uuid=cm.manifest_locator(),
+                      body={"collection":{"properties": {"http://example.com/download?fn=/file1.txt": {'Date': 'Tue, 15 May 2018 00:00:00 GMT'}}}})
+        ])
-- 
2.30.2