From 57725b1da407705895e107fa928ce7462c928ff1 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Fri, 26 Jul 2024 09:34:56 -0400 Subject: [PATCH] 19982: Applied patch of Alex's initial work on this There's another branch but it is out of date, so starting a new branch here. Original work on this feature was alex.coleman@curii.com, I'm picking up where she left off. Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- sdk/cwl/arvados_cwl/__init__.py | 1 + sdk/cwl/arvados_cwl/arvcontainer.py | 18 ++++++++- sdk/cwl/tests/test_container.py | 58 ++++++++++++++++++++++++++--- 3 files changed, 71 insertions(+), 6 deletions(-) diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py index 30d91b4094..8b828e2624 100644 --- a/sdk/cwl/arvados_cwl/__init__.py +++ b/sdk/cwl/arvados_cwl/__init__.py @@ -288,6 +288,7 @@ def add_arv_hints(): "http://arvados.org/cwl#OutputCollectionProperties", "http://arvados.org/cwl#KeepCacheTypeRequirement", "http://arvados.org/cwl#OutOfMemoryRetry", + "http://arvados.org/cwl#SpotInstanceRetry", ]) def exit_signal_handler(sigcode, frame): diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py index a340f30e95..aa6e04539a 100644 --- a/sdk/cwl/arvados_cwl/arvcontainer.py +++ b/sdk/cwl/arvados_cwl/arvcontainer.py @@ -489,6 +489,17 @@ class ArvadosContainer(JobBase): logger.debug("Container request was %s", container_request) self.output_callback({}, "permanentFail") + + def spot_instance_retry(self, record, container): + spot_instance_retry_req, _ = self.get_requirement("http://arvados.org/cwl#SpotInstanceRetry") + if spot_instance_retry_req is None: + return False + if container["preemptionNotice"]: + return True + return False + + + def out_of_memory_retry(self, record, container): oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry") if oom_retry_req is None: @@ -545,7 +556,12 @@ class ArvadosContainer(JobBase): self.run(None) retried = True return - + if processStatus == "permanentFail" and self.attempt_count == 1 and self.spot_instance_retry(record, container): + logger.warning("%s Container failed with preemptible instance reclaimed, trying again nonpreemptible") + self.job_runtime.enable_preemptible = False + self.run(None) + retried = True + return if rcode == 137: logger.warning("%s Container may have been killed for using too much RAM. Try resubmitting with a higher 'ramMin' or use the arv:OutOfMemoryRetry feature.", self.arvrunner.label(self)) diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py index 6161670839..42f2326bed 100644 --- a/sdk/cwl/tests/test_container.py +++ b/sdk/cwl/tests/test_container.py @@ -84,8 +84,7 @@ class TestContainer(unittest.TestCase): "construct_tool_object": runner.arv_make_tool, "fetcher_constructor": functools.partial(arvados_cwl.CollectionFetcher, api_client=runner.api, fs_access=fs_access), "loader": Loader({}), - "metadata": cmap({"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"}), - "default_docker_image": "arvados/jobs:"+arvados_cwl.__version__ + "metadata": cmap({"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"}) }) runtimeContext = arvados_cwl.context.ArvRuntimeContext( {"work_api": "containers", @@ -620,7 +619,7 @@ class TestContainer(unittest.TestCase): self.fail("RuntimeStatusLoggingHandler should not be called recursively") - # Test to make sure that an exception raised from + # Test to make sure trunner = mock.MagicMock()hat an exception raised from # get_current_container doesn't cause the logger to raise an # exception @mock.patch("arvados_cwl.util.get_current_container") @@ -1607,8 +1606,7 @@ class TestWorkflow(unittest.TestCase): "make_fs_access": make_fs_access, "loader": document_loader, "metadata": {"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"}, - "construct_tool_object": runner.arv_make_tool, - "default_docker_image": "arvados/jobs:"+arvados_cwl.__version__}) + "construct_tool_object": runner.arv_make_tool}) runtimeContext = arvados_cwl.context.ArvRuntimeContext( {"work_api": "containers", "basedir": "", @@ -1853,3 +1851,53 @@ class TestWorkflow(unittest.TestCase): api._rootDesc = copy.deepcopy(get_rootDesc()) runner = arvados_cwl.executor.ArvCwlExecutor(api) self.assertEqual(runner.work_api, 'containers') + + @mock.patch("arvados.collection.Collection") + def test_spot_instance_retry(self, blah): + arvados_cwl.add_arv_hints() + + api = mock.MagicMock() + + runner = mock.MagicMock() + runner.api = api + runner.num_retries = 0 + runner.ignore_docker_for_reuse = False + runner.intermediate_output_ttl = 0 + runner.secret_store = cwltool.secrets.SecretStore() + + runner.api.containers().get().execute.return_value = { + "state": "Complete", + "output": "abc+123", + "exit_code": 138 # Want exit code to be failure + } + # Add assertions to make sure it reran as nonpreemptible + loadingContext, runtimeContext = self.helper(runner) + arvjob = arvados_cwl.ArvadosContainer(runner, + runtimeContext, + mock.MagicMock(), + {}, + None, + [], + [], + "testjob") + arvjob.output_callback = mock.MagicMock() + arvjob.collect_outputs = mock.MagicMock() + arvjob.successCodes = [0] + arvjob.outdir = "/var/spool/cwl" + arvjob.output_ttl = 3600 + arvjob.uuid = "zzzzz-xvhdp-zzzzzzzzzzzzzz1" + + arvjob.collect_outputs.return_value = {"out": "stuff"} + + arvjob.done({ + "state": "Final", + "log_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz1", + "output_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz2", + "uuid": "zzzzz-xvhdp-zzzzzzzzzzzzzzz", + "container_uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz", + "modified_at": "2017-05-26T12:01:22Z", + "properties": {} + }) + + self.assertTrue(api.container_requests().create.called) + self.assertTrue(arvjob.attempt_count == 2) \ No newline at end of file -- 2.30.2