From: Peter Amstutz Date: Tue, 7 Mar 2023 18:42:58 +0000 (-0500) Subject: Merge branch '19975-oom-resubmit' refs #19975 X-Git-Tag: 2.6.0~21 X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/765604b096c464b242e60860ace75bd6645fd4e7?hp=efd62fdeffaef4d47710511f7e1ac37d5b4cc5a7 Merge branch '19975-oom-resubmit' refs #19975 Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- diff --git a/doc/user/cwl/cwl-extensions.html.textile.liquid b/doc/user/cwl/cwl-extensions.html.textile.liquid index 197816f4a4..e05072ddf6 100644 --- a/doc/user/cwl/cwl-extensions.html.textile.liquid +++ b/doc/user/cwl/cwl-extensions.html.textile.liquid @@ -71,6 +71,10 @@ hints: arv:UsePreemptible: usePreemptible: true + + arv:OutOfMemoryRetry: + memoryRetryMultipler: 2 + memoryErrorRegex: "custom memory error" {% endcodeblock %} h2(#RunInSingleContainer). arv:RunInSingleContainer @@ -188,6 +192,20 @@ table(table table-bordered table-condensed). |_. Field |_. Type |_. Description | |usePreemptible|boolean|Required, true to opt-in to using preemptible instances, false to opt-out.| + +h2(#OutOfMemoryRetry). arv:OutOfMemoryRetry + +Specify that when a workflow step appears to have failed because it did not request enough RAM, it should be re-submitted with more RAM. Out of memory conditions are detected either by the container being unexpectedly killed (exit code 137) or by matching a pattern in the container's output (see @memoryErrorRegex@). Retrying will increase the base RAM request by the value of @memoryRetryMultipler@. For example, if the original RAM request was 10 GiB and the multiplier is 1.5, then it will re-submit with 15 GiB. + +Containers are only re-submitted once. If it fails a second time after increasing RAM, then the worklow step will still fail. + +Also note that expressions that use @$(runtime.ram)@ (such as dynamic command line parameters) are not reevaluated when the container is resubmitted. + +table(table table-bordered table-condensed). +|_. Field |_. Type |_. Description | +|memoryRetryMultipler|float|Required, the retry will multiply the base memory request by this factor to get the retry memory request.| +|memoryErrorRegex|string|Optional, a custom regex that, if found in the stdout, stderr or crunch-run logging of a program, will trigger a retry with greater RAM. If not provided, the default pattern matches "out of memory" (with or without spaces), "memory error" (with or without spaces), "bad_alloc" and "container using over 90% of memory".| + h2. arv:dockerCollectionPDH This is an optional extension field appearing on the standard @DockerRequirement@. It specifies the portable data hash of the Arvados collection containing the Docker image. If present, it takes precedence over @dockerPull@ or @dockerImageId@. diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py index 52a9a6c208..74ca9312bf 100644 --- a/sdk/cwl/arvados_cwl/__init__.py +++ b/sdk/cwl/arvados_cwl/__init__.py @@ -285,6 +285,7 @@ def add_arv_hints(): "http://arvados.org/cwl#UsePreemptible", "http://arvados.org/cwl#OutputCollectionProperties", "http://arvados.org/cwl#KeepCacheTypeRequirement", + "http://arvados.org/cwl#OutOfMemoryRetry", ]) def exit_signal_handler(sigcode, frame): diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml index fc370eb813..91a05e1254 100644 --- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml +++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml @@ -456,3 +456,30 @@ $graph: only or written to disk and memory-mapped. The disk cache leverages the kernel's virtual memory system so "hot" data will generally still be kept in RAM. + +- name: OutOfMemoryRetry + type: record + extends: cwl:ProcessRequirement + inVocab: false + doc: | + Detect when a failed tool run may have run out of memory, and + re-submit the container with more RAM. + fields: + - name: class + type: string + doc: "'arv:OutOfMemoryRetry" + jsonldPredicate: + _id: "@type" + _type: "@vocab" + - name: memoryErrorRegex + type: string? + doc: | + A regular expression that will be used on the text of stdout + and stderr produced by the tool to determine if a failed job + should be retried with more RAM. By default, searches for the + substrings 'bad_alloc' and 'OutOfMemory'. + - name: memoryRetryMultipler + type: float + doc: | + If the container failed on its first run, re-submit the + container with the RAM request multiplied by this factor. diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml index 69c0ed6cff..458d5a37a7 100644 --- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml +++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml @@ -399,3 +399,30 @@ $graph: only or written to disk and memory-mapped. The disk cache leverages the kernel's virtual memory system so "hot" data will generally still be kept in RAM. + +- name: OutOfMemoryRetry + type: record + extends: cwl:ProcessRequirement + inVocab: false + doc: | + Detect when a failed tool run may have run out of memory, and + re-submit the container with more RAM. + fields: + - name: class + type: string + doc: "'arv:OutOfMemoryRetry" + jsonldPredicate: + _id: "@type" + _type: "@vocab" + - name: memoryErrorRegex + type: string? + doc: | + A regular expression that will be used on the text of stdout + and stderr produced by the tool to determine if a failed job + should be retried with more RAM. By default, searches for the + substrings 'bad_alloc' and 'OutOfMemory'. + - name: memoryRetryMultipler + type: float + doc: | + If the container failed on its first run, re-submit the + container with the RAM request multiplied by this factor. diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml index 86cd06effe..f4246ed70a 100644 --- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml +++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml @@ -401,3 +401,31 @@ $graph: only or written to disk and memory-mapped. The disk cache leverages the kernel's virtual memory system so "hot" data will generally still be kept in RAM. + + +- name: OutOfMemoryRetry + type: record + extends: cwl:ProcessRequirement + inVocab: false + doc: | + Detect when a failed tool run may have run out of memory, and + re-submit the container with more RAM. + fields: + - name: class + type: string + doc: "'arv:OutOfMemoryRetry" + jsonldPredicate: + _id: "@type" + _type: "@vocab" + - name: memoryErrorRegex + type: string? + doc: | + A regular expression that will be used on the text of stdout + and stderr produced by the tool to determine if a failed job + should be retried with more RAM. By default, searches for the + substrings 'bad_alloc' and 'OutOfMemory'. + - name: memoryRetryMultipler + type: float + doc: | + If the container failed on its first run, re-submit the + container with the RAM request multiplied by this factor. diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py index 742906c616..be8e557bd8 100644 --- a/sdk/cwl/arvados_cwl/arvcontainer.py +++ b/sdk/cwl/arvados_cwl/arvcontainer.py @@ -15,6 +15,7 @@ import datetime import ciso8601 import uuid import math +import re import arvados_cwl.util import ruamel.yaml @@ -56,6 +57,7 @@ class ArvadosContainer(JobBase): self.job_runtime = job_runtime self.running = False self.uuid = None + self.attempt_count = 0 def update_pipeline_component(self, r): pass @@ -88,7 +90,7 @@ class ArvadosContainer(JobBase): container_request["output_path"] = self.outdir container_request["cwd"] = self.outdir container_request["priority"] = runtimeContext.priority - container_request["state"] = "Committed" + container_request["state"] = "Uncommitted" container_request.setdefault("properties", {}) container_request["properties"]["cwl_input"] = self.joborder @@ -365,6 +367,12 @@ class ArvadosContainer(JobBase): logger.warning("%s API revision is %s, revision %s is required to support setting properties on output collections.", self.arvrunner.label(self), self.arvrunner.api._rootDesc["revision"], "20220510") + ram_multiplier = [1] + + oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry") + if oom_retry_req and oom_retry_req.get('memoryRetryMultipler'): + ram_multiplier.append(oom_retry_req.get('memoryRetryMultipler')) + if runtimeContext.runnerjob.startswith("arvwf:"): wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob.index("#")] wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(num_retries=self.arvrunner.num_retries) @@ -372,23 +380,45 @@ class ArvadosContainer(JobBase): container_request["name"] = wfrecord["name"] container_request["properties"]["template_uuid"] = wfuuid - self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback) + if self.attempt_count == 0: + self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback) try: - if runtimeContext.submit_request_uuid: - response = self.arvrunner.api.container_requests().update( - uuid=runtimeContext.submit_request_uuid, - body=container_request, - **extra_submit_params - ).execute(num_retries=self.arvrunner.num_retries) - else: - response = self.arvrunner.api.container_requests().create( - body=container_request, - **extra_submit_params - ).execute(num_retries=self.arvrunner.num_retries) + ram = runtime_constraints["ram"] + + self.uuid = runtimeContext.submit_request_uuid + + for i in ram_multiplier: + runtime_constraints["ram"] = ram * i + + if self.uuid: + response = self.arvrunner.api.container_requests().update( + uuid=self.uuid, + body=container_request, + **extra_submit_params + ).execute(num_retries=self.arvrunner.num_retries) + else: + response = self.arvrunner.api.container_requests().create( + body=container_request, + **extra_submit_params + ).execute(num_retries=self.arvrunner.num_retries) + self.uuid = response["uuid"] + + if response["container_uuid"] is not None: + break + + if response["container_uuid"] is None: + runtime_constraints["ram"] = ram * ram_multiplier[self.attempt_count] + + container_request["state"] = "Committed" + response = self.arvrunner.api.container_requests().update( + uuid=self.uuid, + body=container_request, + **extra_submit_params + ).execute(num_retries=self.arvrunner.num_retries) - self.uuid = response["uuid"] self.arvrunner.process_submitted(self) + self.attempt_count += 1 if response["state"] == "Final": logger.info("%s reused container %s", self.arvrunner.label(self), response["container_uuid"]) @@ -399,8 +429,36 @@ class ArvadosContainer(JobBase): logger.debug("Container request was %s", container_request) self.output_callback({}, "permanentFail") + def out_of_memory_retry(self, record, container): + oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry") + if oom_retry_req is None: + return False + + # Sometimes it gets killed with no warning + if container["exit_code"] == 137: + return True + + logc = arvados.collection.CollectionReader(record["log_uuid"], + api_client=self.arvrunner.api, + keep_client=self.arvrunner.keep_client, + num_retries=self.arvrunner.num_retries) + + loglines = [""] + def callback(v1, v2, v3): + loglines[0] = v3 + + done.logtail(logc, callback, "", maxlen=1000) + + # Check allocation failure + oom_matches = oom_retry_req.get('memoryErrorRegex') or r'(bad_alloc|out ?of ?memory|memory ?error|container using over 9.% of memory)' + if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE): + return True + + return False + def done(self, record): outputs = {} + retried = False try: container = self.arvrunner.api.containers().get( uuid=record["container_uuid"] @@ -418,8 +476,17 @@ class ArvadosContainer(JobBase): else: processStatus = "permanentFail" + if processStatus == "permanentFail" and self.attempt_count == 1 and self.out_of_memory_retry(record, container): + logger.warning("%s Container failed with out of memory error, retrying with more RAM.", + self.arvrunner.label(self)) + self.job_runtime.submit_request_uuid = None + self.uuid = None + self.run(None) + retried = True + return + if rcode == 137: - logger.warning("%s Container may have been killed for using too much RAM. Try resubmitting with a higher 'ramMin'.", + logger.warning("%s Container may have been killed for using too much RAM. Try resubmitting with a higher 'ramMin' or use the arv:OutOfMemoryRetry feature.", self.arvrunner.label(self)) else: processStatus = "permanentFail" @@ -464,7 +531,8 @@ class ArvadosContainer(JobBase): logger.exception("%s while getting output object:", self.arvrunner.label(self)) processStatus = "permanentFail" finally: - self.output_callback(outputs, processStatus) + if not retried: + self.output_callback(outputs, processStatus) class RunnerContainer(Runner): diff --git a/sdk/cwl/tests/arvados-tests.yml b/sdk/cwl/tests/arvados-tests.yml index f242e63236..a93c64a224 100644 --- a/sdk/cwl/tests/arvados-tests.yml +++ b/sdk/cwl/tests/arvados-tests.yml @@ -479,3 +479,18 @@ } tool: 19678-name-id.cwl doc: "Test issue 19678 - non-string type input parameter called 'name'" + +- job: oom/fakeoom.yml + output: {} + tool: oom/19975-oom.cwl + doc: "Test feature 19975 - retry on exit 137" + +- job: oom/fakeoom2.yml + output: {} + tool: oom/19975-oom.cwl + doc: "Test feature 19975 - retry on memory error" + +- job: oom/fakeoom3.yml + output: {} + tool: oom/19975-oom3.cwl + doc: "Test feature 19975 - retry on custom error" diff --git a/sdk/cwl/tests/oom/19975-oom.cwl b/sdk/cwl/tests/oom/19975-oom.cwl new file mode 100644 index 0000000000..ec80648716 --- /dev/null +++ b/sdk/cwl/tests/oom/19975-oom.cwl @@ -0,0 +1,18 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +cwlVersion: v1.2 +class: CommandLineTool +$namespaces: + arv: "http://arvados.org/cwl#" +hints: + arv:OutOfMemoryRetry: + memoryRetryMultipler: 2 + ResourceRequirement: + ramMin: 256 + arv:APIRequirement: {} +inputs: + fakeoom: File +outputs: [] +arguments: [python3, $(inputs.fakeoom)] diff --git a/sdk/cwl/tests/oom/19975-oom3.cwl b/sdk/cwl/tests/oom/19975-oom3.cwl new file mode 100644 index 0000000000..af3271b847 --- /dev/null +++ b/sdk/cwl/tests/oom/19975-oom3.cwl @@ -0,0 +1,19 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +cwlVersion: v1.2 +class: CommandLineTool +$namespaces: + arv: "http://arvados.org/cwl#" +hints: + arv:OutOfMemoryRetry: + memoryRetryMultipler: 2 + memoryErrorRegex: Whoops + ResourceRequirement: + ramMin: 256 + arv:APIRequirement: {} +inputs: + fakeoom: File +outputs: [] +arguments: [python3, $(inputs.fakeoom)] diff --git a/sdk/cwl/tests/oom/fakeoom.py b/sdk/cwl/tests/oom/fakeoom.py new file mode 100644 index 0000000000..cc0b2ed48e --- /dev/null +++ b/sdk/cwl/tests/oom/fakeoom.py @@ -0,0 +1,13 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import sys +import time +import arvados + +api = arvados.api() +current_container = api.containers().current().execute() + +if current_container["runtime_constraints"]["ram"] < (512*1024*1024): + sys.exit(137) diff --git a/sdk/cwl/tests/oom/fakeoom.yml b/sdk/cwl/tests/oom/fakeoom.yml new file mode 100644 index 0000000000..da95fb6be7 --- /dev/null +++ b/sdk/cwl/tests/oom/fakeoom.yml @@ -0,0 +1,7 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +fakeoom: + class: File + location: fakeoom.py diff --git a/sdk/cwl/tests/oom/fakeoom2.py b/sdk/cwl/tests/oom/fakeoom2.py new file mode 100644 index 0000000000..89bd1f5c3b --- /dev/null +++ b/sdk/cwl/tests/oom/fakeoom2.py @@ -0,0 +1,13 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import sys +import time +import arvados + +api = arvados.api() +current_container = api.containers().current().execute() + +if current_container["runtime_constraints"]["ram"] < (512*1024*1024): + raise MemoryError() diff --git a/sdk/cwl/tests/oom/fakeoom2.yml b/sdk/cwl/tests/oom/fakeoom2.yml new file mode 100644 index 0000000000..4161252e5d --- /dev/null +++ b/sdk/cwl/tests/oom/fakeoom2.yml @@ -0,0 +1,7 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +fakeoom: + class: File + location: fakeoom2.py diff --git a/sdk/cwl/tests/oom/fakeoom3.py b/sdk/cwl/tests/oom/fakeoom3.py new file mode 100644 index 0000000000..460c4a5844 --- /dev/null +++ b/sdk/cwl/tests/oom/fakeoom3.py @@ -0,0 +1,14 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import sys +import time +import arvados + +api = arvados.api() +current_container = api.containers().current().execute() + +if current_container["runtime_constraints"]["ram"] < (512*1024*1024): + print("Whoops") + sys.exit(1) diff --git a/sdk/cwl/tests/oom/fakeoom3.yml b/sdk/cwl/tests/oom/fakeoom3.yml new file mode 100644 index 0000000000..a6fc03ce46 --- /dev/null +++ b/sdk/cwl/tests/oom/fakeoom3.yml @@ -0,0 +1,7 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +fakeoom: + class: File + location: fakeoom3.py