From 9e76a12ff0b25322f86caf6d5ea70c09cbfd8829 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Mon, 6 Mar 2023 12:42:15 -0500 Subject: [PATCH] 19975: Add integration test for out-of-memory resubmit Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- sdk/cwl/arvados_cwl/arvcontainer.py | 13 +++++++------ sdk/cwl/tests/arvados-tests.yml | 15 +++++++++++++++ sdk/cwl/tests/oom/19975-oom.cwl | 18 ++++++++++++++++++ sdk/cwl/tests/oom/19975-oom3.cwl | 19 +++++++++++++++++++ sdk/cwl/tests/oom/fakeoom.py | 13 +++++++++++++ sdk/cwl/tests/oom/fakeoom.yml | 7 +++++++ sdk/cwl/tests/oom/fakeoom2.py | 13 +++++++++++++ sdk/cwl/tests/oom/fakeoom2.yml | 7 +++++++ sdk/cwl/tests/oom/fakeoom3.py | 14 ++++++++++++++ sdk/cwl/tests/oom/fakeoom3.yml | 7 +++++++ 10 files changed, 120 insertions(+), 6 deletions(-) create mode 100644 sdk/cwl/tests/oom/19975-oom.cwl create mode 100644 sdk/cwl/tests/oom/19975-oom3.cwl create mode 100644 sdk/cwl/tests/oom/fakeoom.py create mode 100644 sdk/cwl/tests/oom/fakeoom.yml create mode 100644 sdk/cwl/tests/oom/fakeoom2.py create mode 100644 sdk/cwl/tests/oom/fakeoom2.yml create mode 100644 sdk/cwl/tests/oom/fakeoom3.py create mode 100644 sdk/cwl/tests/oom/fakeoom3.yml diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py index 632e171b40..aafbc38fc6 100644 --- a/sdk/cwl/arvados_cwl/arvcontainer.py +++ b/sdk/cwl/arvados_cwl/arvcontainer.py @@ -386,12 +386,14 @@ class ArvadosContainer(JobBase): try: ram = runtime_constraints["ram"] + self.uuid = runtimeContext.submit_request_uuid + for i in ramMultiplier: runtime_constraints["ram"] = ram * i - if runtimeContext.submit_request_uuid: + if self.uuid: response = self.arvrunner.api.container_requests().update( - uuid=runtimeContext.submit_request_uuid, + uuid=self.uuid, body=container_request, **extra_submit_params ).execute(num_retries=self.arvrunner.num_retries) @@ -400,7 +402,7 @@ class ArvadosContainer(JobBase): body=container_request, **extra_submit_params ).execute(num_retries=self.arvrunner.num_retries) - runtimeContext.submit_request_uuid = response["uuid"] + self.uuid = response["uuid"] if response["container_uuid"] is not None: break @@ -410,12 +412,11 @@ class ArvadosContainer(JobBase): container_request["state"] = "Committed" response = self.arvrunner.api.container_requests().update( - uuid=runtimeContext.submit_request_uuid, + uuid=self.uuid, body=container_request, **extra_submit_params ).execute(num_retries=self.arvrunner.num_retries) - self.uuid = response["uuid"] self.arvrunner.process_submitted(self) self.attempt_count += 1 @@ -449,7 +450,7 @@ class ArvadosContainer(JobBase): done.logtail(logc, callback, "", maxlen=1000) # Check allocation failure - oom_matches = oom_retry_req.get('memoryErrorRegex') or r'(bad_alloc|out ?of ?memory|container using over 9.% of memory)' + oom_matches = oom_retry_req.get('memoryErrorRegex') or r'(bad_alloc|out ?of ?memory|memory ?error|container using over 9.% of memory)' if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE): return True diff --git a/sdk/cwl/tests/arvados-tests.yml b/sdk/cwl/tests/arvados-tests.yml index f242e63236..a93c64a224 100644 --- a/sdk/cwl/tests/arvados-tests.yml +++ b/sdk/cwl/tests/arvados-tests.yml @@ -479,3 +479,18 @@ } tool: 19678-name-id.cwl doc: "Test issue 19678 - non-string type input parameter called 'name'" + +- job: oom/fakeoom.yml + output: {} + tool: oom/19975-oom.cwl + doc: "Test feature 19975 - retry on exit 137" + +- job: oom/fakeoom2.yml + output: {} + tool: oom/19975-oom.cwl + doc: "Test feature 19975 - retry on memory error" + +- job: oom/fakeoom3.yml + output: {} + tool: oom/19975-oom3.cwl + doc: "Test feature 19975 - retry on custom error" diff --git a/sdk/cwl/tests/oom/19975-oom.cwl b/sdk/cwl/tests/oom/19975-oom.cwl new file mode 100644 index 0000000000..ec80648716 --- /dev/null +++ b/sdk/cwl/tests/oom/19975-oom.cwl @@ -0,0 +1,18 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +cwlVersion: v1.2 +class: CommandLineTool +$namespaces: + arv: "http://arvados.org/cwl#" +hints: + arv:OutOfMemoryRetry: + memoryRetryMultipler: 2 + ResourceRequirement: + ramMin: 256 + arv:APIRequirement: {} +inputs: + fakeoom: File +outputs: [] +arguments: [python3, $(inputs.fakeoom)] diff --git a/sdk/cwl/tests/oom/19975-oom3.cwl b/sdk/cwl/tests/oom/19975-oom3.cwl new file mode 100644 index 0000000000..af3271b847 --- /dev/null +++ b/sdk/cwl/tests/oom/19975-oom3.cwl @@ -0,0 +1,19 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +cwlVersion: v1.2 +class: CommandLineTool +$namespaces: + arv: "http://arvados.org/cwl#" +hints: + arv:OutOfMemoryRetry: + memoryRetryMultipler: 2 + memoryErrorRegex: Whoops + ResourceRequirement: + ramMin: 256 + arv:APIRequirement: {} +inputs: + fakeoom: File +outputs: [] +arguments: [python3, $(inputs.fakeoom)] diff --git a/sdk/cwl/tests/oom/fakeoom.py b/sdk/cwl/tests/oom/fakeoom.py new file mode 100644 index 0000000000..cc0b2ed48e --- /dev/null +++ b/sdk/cwl/tests/oom/fakeoom.py @@ -0,0 +1,13 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import sys +import time +import arvados + +api = arvados.api() +current_container = api.containers().current().execute() + +if current_container["runtime_constraints"]["ram"] < (512*1024*1024): + sys.exit(137) diff --git a/sdk/cwl/tests/oom/fakeoom.yml b/sdk/cwl/tests/oom/fakeoom.yml new file mode 100644 index 0000000000..da95fb6be7 --- /dev/null +++ b/sdk/cwl/tests/oom/fakeoom.yml @@ -0,0 +1,7 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +fakeoom: + class: File + location: fakeoom.py diff --git a/sdk/cwl/tests/oom/fakeoom2.py b/sdk/cwl/tests/oom/fakeoom2.py new file mode 100644 index 0000000000..89bd1f5c3b --- /dev/null +++ b/sdk/cwl/tests/oom/fakeoom2.py @@ -0,0 +1,13 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import sys +import time +import arvados + +api = arvados.api() +current_container = api.containers().current().execute() + +if current_container["runtime_constraints"]["ram"] < (512*1024*1024): + raise MemoryError() diff --git a/sdk/cwl/tests/oom/fakeoom2.yml b/sdk/cwl/tests/oom/fakeoom2.yml new file mode 100644 index 0000000000..4161252e5d --- /dev/null +++ b/sdk/cwl/tests/oom/fakeoom2.yml @@ -0,0 +1,7 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +fakeoom: + class: File + location: fakeoom2.py diff --git a/sdk/cwl/tests/oom/fakeoom3.py b/sdk/cwl/tests/oom/fakeoom3.py new file mode 100644 index 0000000000..460c4a5844 --- /dev/null +++ b/sdk/cwl/tests/oom/fakeoom3.py @@ -0,0 +1,14 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import sys +import time +import arvados + +api = arvados.api() +current_container = api.containers().current().execute() + +if current_container["runtime_constraints"]["ram"] < (512*1024*1024): + print("Whoops") + sys.exit(1) diff --git a/sdk/cwl/tests/oom/fakeoom3.yml b/sdk/cwl/tests/oom/fakeoom3.yml new file mode 100644 index 0000000000..a6fc03ce46 --- /dev/null +++ b/sdk/cwl/tests/oom/fakeoom3.yml @@ -0,0 +1,7 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +fakeoom: + class: File + location: fakeoom3.py -- 2.30.2