Merge branch '19975-oom-resubmit' refs #19975

author Peter Amstutz <peter.amstutz@curii.com>

Tue, 7 Mar 2023 18:42:58 +0000 (13:42 -0500)

committer Peter Amstutz <peter.amstutz@curii.com>

Tue, 7 Mar 2023 18:42:58 +0000 (13:42 -0500)
author Peter Amstutz <peter.amstutz@curii.com>
Tue, 7 Mar 2023 18:42:58 +0000 (13:42 -0500)
committer Peter Amstutz <peter.amstutz@curii.com>
Tue, 7 Mar 2023 18:42:58 +0000 (13:42 -0500)
diff --git a/doc/user/cwl/cwl-extensions.html.textile.liquid b/doc/user/cwl/cwl-extensions.html.textile.liquid

index 197816f4a401fd417a6fa208a5954f9fbe78c1f9..e05072ddf6843a9a01599d4854317a49da62b4a1 100644 (file)
--- a/doc/user/cwl/cwl-extensions.html.textile.liquid
+++ b/doc/user/cwl/cwl-extensions.html.textile.liquid
@@ -71,6 +71,10 @@ hints:
  
    arv:UsePreemptible:
      usePreemptible: true
+
+  arv:OutOfMemoryRetry:
+    memoryRetryMultipler: 2
+    memoryErrorRegex: "custom memory error"
  {% endcodeblock %}
  
  h2(#RunInSingleContainer). arv:RunInSingleContainer
@@ -188,6 +192,20 @@ table(table table-bordered table-condensed).
  |_. Field |_. Type |_. Description |
  |usePreemptible|boolean|Required, true to opt-in to using preemptible instances, false to opt-out.|
  
+
+h2(#OutOfMemoryRetry). arv:OutOfMemoryRetry
+
+Specify that when a workflow step appears to have failed because it did not request enough RAM, it should be re-submitted with more RAM.  Out of memory conditions are detected either by the container being unexpectedly killed (exit code 137) or by matching a pattern in the container's output (see @memoryErrorRegex@).  Retrying will increase the base RAM request by the value of @memoryRetryMultipler@.  For example, if the original RAM request was 10 GiB and the multiplier is 1.5, then it will re-submit with 15 GiB.
+
+Containers are only re-submitted once.  If it fails a second time after increasing RAM, then the worklow step will still fail.
+
+Also note that expressions that use @$(runtime.ram)@ (such as dynamic command line parameters) are not reevaluated when the container is resubmitted.
+
+table(table table-bordered table-condensed).
+|_. Field |_. Type |_. Description |
+|memoryRetryMultipler|float|Required, the retry will multiply the base memory request by this factor to get the retry memory request.|
+|memoryErrorRegex|string|Optional, a custom regex that, if found in the stdout, stderr or crunch-run logging of a program, will trigger a retry with greater RAM.  If not provided, the default pattern matches "out of memory" (with or without spaces), "memory error" (with or without spaces), "bad_alloc" and "container using over 90% of memory".|
+
  h2. arv:dockerCollectionPDH
  
  This is an optional extension field appearing on the standard @DockerRequirement@.  It specifies the portable data hash of the Arvados collection containing the Docker image.  If present, it takes precedence over @dockerPull@ or @dockerImageId@.
diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py

index 52a9a6c208cf73a304eb7650e67b5ad5a0d8ab2a..74ca9312bf54b1e965984f8bb893525ee2147e05 100644 (file)
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -285,6 +285,7 @@ def add_arv_hints():
          "http://arvados.org/cwl#UsePreemptible",
          "http://arvados.org/cwl#OutputCollectionProperties",
          "http://arvados.org/cwl#KeepCacheTypeRequirement",
+        "http://arvados.org/cwl#OutOfMemoryRetry",
      ])
  
  def exit_signal_handler(sigcode, frame):
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml

index fc370eb8132bd35466fce0ec19049e141f94d436..91a05e125439952b8023dd197dd620211052d9eb 100644 (file)
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
@@ -456,3 +456,30 @@ $graph:
          only or written to disk and memory-mapped.  The disk cache
          leverages the kernel's virtual memory system so "hot" data will
          generally still be kept in RAM.
+
+- name: OutOfMemoryRetry
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Detect when a failed tool run may have run out of memory, and
+    re-submit the container with more RAM.
+  fields:
+    - name: class
+      type: string
+      doc: "'arv:OutOfMemoryRetry"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: memoryErrorRegex
+      type: string?
+      doc: |
+        A regular expression that will be used on the text of stdout
+        and stderr produced by the tool to determine if a failed job
+        should be retried with more RAM.  By default, searches for the
+        substrings 'bad_alloc' and 'OutOfMemory'.
+    - name: memoryRetryMultipler
+      type: float
+      doc: |
+        If the container failed on its first run, re-submit the
+        container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml

index 69c0ed6cffadfb73a695de7229570563e24de995..458d5a37a7b0339bfd4bc21dd43fa5ac09d0fe86 100644 (file)
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
@@ -399,3 +399,30 @@ $graph:
          only or written to disk and memory-mapped.  The disk cache
          leverages the kernel's virtual memory system so "hot" data will
          generally still be kept in RAM.
+
+- name: OutOfMemoryRetry
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Detect when a failed tool run may have run out of memory, and
+    re-submit the container with more RAM.
+  fields:
+    - name: class
+      type: string
+      doc: "'arv:OutOfMemoryRetry"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: memoryErrorRegex
+      type: string?
+      doc: |
+        A regular expression that will be used on the text of stdout
+        and stderr produced by the tool to determine if a failed job
+        should be retried with more RAM.  By default, searches for the
+        substrings 'bad_alloc' and 'OutOfMemory'.
+    - name: memoryRetryMultipler
+      type: float
+      doc: |
+        If the container failed on its first run, re-submit the
+        container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml

index 86cd06effea70ff2ea8e1f6567e0803b8e4caa27..f4246ed70a5b5f04f240a83b3baf8ec1c67d3827 100644 (file)
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
@@ -401,3 +401,31 @@ $graph:
          only or written to disk and memory-mapped.  The disk cache
          leverages the kernel's virtual memory system so "hot" data will
          generally still be kept in RAM.
+
+
+- name: OutOfMemoryRetry
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Detect when a failed tool run may have run out of memory, and
+    re-submit the container with more RAM.
+  fields:
+    - name: class
+      type: string
+      doc: "'arv:OutOfMemoryRetry"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: memoryErrorRegex
+      type: string?
+      doc: |
+        A regular expression that will be used on the text of stdout
+        and stderr produced by the tool to determine if a failed job
+        should be retried with more RAM.  By default, searches for the
+        substrings 'bad_alloc' and 'OutOfMemory'.
+    - name: memoryRetryMultipler
+      type: float
+      doc: |
+        If the container failed on its first run, re-submit the
+        container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py

index 742906c616514b9bcd2317cfeb4a11697f1c8522..be8e557bd8f9f2e0626eedff68a0dfe65fefc35d 100644 (file)
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -15,6 +15,7 @@ import datetime
  import ciso8601
  import uuid
  import math
+import re
  
  import arvados_cwl.util
  import ruamel.yaml
@@ -56,6 +57,7 @@ class ArvadosContainer(JobBase):
          self.job_runtime = job_runtime
          self.running = False
          self.uuid = None
+        self.attempt_count = 0
  
      def update_pipeline_component(self, r):
          pass
@@ -88,7 +90,7 @@ class ArvadosContainer(JobBase):
          container_request["output_path"] = self.outdir
          container_request["cwd"] = self.outdir
          container_request["priority"] = runtimeContext.priority
-        container_request["state"] = "Committed"
+        container_request["state"] = "Uncommitted"
          container_request.setdefault("properties", {})
  
          container_request["properties"]["cwl_input"] = self.joborder
@@ -365,6 +367,12 @@ class ArvadosContainer(JobBase):
                  logger.warning("%s API revision is %s, revision %s is required to support setting properties on output collections.",
                                 self.arvrunner.label(self), self.arvrunner.api._rootDesc["revision"], "20220510")
  
+        ram_multiplier = [1]
+
+        oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+        if oom_retry_req and oom_retry_req.get('memoryRetryMultipler'):
+            ram_multiplier.append(oom_retry_req.get('memoryRetryMultipler'))
+
          if runtimeContext.runnerjob.startswith("arvwf:"):
              wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob.index("#")]
              wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(num_retries=self.arvrunner.num_retries)
@@ -372,23 +380,45 @@ class ArvadosContainer(JobBase):
                  container_request["name"] = wfrecord["name"]
              container_request["properties"]["template_uuid"] = wfuuid
  
-        self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback)
+        if self.attempt_count == 0:
+            self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback)
  
          try:
-            if runtimeContext.submit_request_uuid:
-                response = self.arvrunner.api.container_requests().update(
-                    uuid=runtimeContext.submit_request_uuid,
-                    body=container_request,
-                    **extra_submit_params
-                ).execute(num_retries=self.arvrunner.num_retries)
-            else:
-                response = self.arvrunner.api.container_requests().create(
-                    body=container_request,
-                    **extra_submit_params
-                ).execute(num_retries=self.arvrunner.num_retries)
+            ram = runtime_constraints["ram"]
+
+            self.uuid = runtimeContext.submit_request_uuid
+
+            for i in ram_multiplier:
+                runtime_constraints["ram"] = ram * i
+
+                if self.uuid:
+                    response = self.arvrunner.api.container_requests().update(
+                        uuid=self.uuid,
+                        body=container_request,
+                        **extra_submit_params
+                    ).execute(num_retries=self.arvrunner.num_retries)
+                else:
+                    response = self.arvrunner.api.container_requests().create(
+                        body=container_request,
+                        **extra_submit_params
+                    ).execute(num_retries=self.arvrunner.num_retries)
+                    self.uuid = response["uuid"]
+
+                if response["container_uuid"] is not None:
+                    break
+
+            if response["container_uuid"] is None:
+                runtime_constraints["ram"] = ram * ram_multiplier[self.attempt_count]
+
+            container_request["state"] = "Committed"
+            response = self.arvrunner.api.container_requests().update(
+                uuid=self.uuid,
+                body=container_request,
+                **extra_submit_params
+            ).execute(num_retries=self.arvrunner.num_retries)
  
-            self.uuid = response["uuid"]
              self.arvrunner.process_submitted(self)
+            self.attempt_count += 1
  
              if response["state"] == "Final":
                  logger.info("%s reused container %s", self.arvrunner.label(self), response["container_uuid"])
@@ -399,8 +429,36 @@ class ArvadosContainer(JobBase):
              logger.debug("Container request was %s", container_request)
              self.output_callback({}, "permanentFail")
  
+    def out_of_memory_retry(self, record, container):
+        oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+        if oom_retry_req is None:
+            return False
+
+        # Sometimes it gets killed with no warning
+        if container["exit_code"] == 137:
+            return True
+
+        logc = arvados.collection.CollectionReader(record["log_uuid"],
+                                                   api_client=self.arvrunner.api,
+                                                   keep_client=self.arvrunner.keep_client,
+                                                   num_retries=self.arvrunner.num_retries)
+
+        loglines = [""]
+        def callback(v1, v2, v3):
+            loglines[0] = v3
+
+        done.logtail(logc, callback, "", maxlen=1000)
+
+        # Check allocation failure
+        oom_matches = oom_retry_req.get('memoryErrorRegex') or r'(bad_alloc|out ?of ?memory|memory ?error|container using over 9.% of memory)'
+        if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
+            return True
+
+        return False
+
      def done(self, record):
          outputs = {}
+        retried = False
          try:
              container = self.arvrunner.api.containers().get(
                  uuid=record["container_uuid"]
@@ -418,8 +476,17 @@ class ArvadosContainer(JobBase):
                  else:
                      processStatus = "permanentFail"
  
+                if processStatus == "permanentFail" and self.attempt_count == 1 and self.out_of_memory_retry(record, container):
+                    logger.warning("%s Container failed with out of memory error, retrying with more RAM.",
+                                 self.arvrunner.label(self))
+                    self.job_runtime.submit_request_uuid = None
+                    self.uuid = None
+                    self.run(None)
+                    retried = True
+                    return
+
                  if rcode == 137:
-                    logger.warning("%s Container may have been killed for using too much RAM.  Try resubmitting with a higher 'ramMin'.",
+                    logger.warning("%s Container may have been killed for using too much RAM.  Try resubmitting with a higher 'ramMin' or use the arv:OutOfMemoryRetry feature.",
                                   self.arvrunner.label(self))
              else:
                  processStatus = "permanentFail"
@@ -464,7 +531,8 @@ class ArvadosContainer(JobBase):
              logger.exception("%s while getting output object:", self.arvrunner.label(self))
              processStatus = "permanentFail"
          finally:
-            self.output_callback(outputs, processStatus)
+            if not retried:
+                self.output_callback(outputs, processStatus)
  
  
  class RunnerContainer(Runner):
diff --git a/sdk/cwl/tests/arvados-tests.yml b/sdk/cwl/tests/arvados-tests.yml

index f242e632366c940ae7193c3add3237eb76bd79ad..a93c64a224c1e83b3d126b720fadeba6e8f59039 100644 (file)
--- a/sdk/cwl/tests/arvados-tests.yml
+++ b/sdk/cwl/tests/arvados-tests.yml
@@ -479,3 +479,18 @@
    }
    tool: 19678-name-id.cwl
    doc: "Test issue 19678 - non-string type input parameter called 'name'"
+
+- job: oom/fakeoom.yml
+  output: {}
+  tool: oom/19975-oom.cwl
+  doc: "Test feature 19975 - retry on exit 137"
+
+- job: oom/fakeoom2.yml
+  output: {}
+  tool: oom/19975-oom.cwl
+  doc: "Test feature 19975 - retry on memory error"
+
+- job: oom/fakeoom3.yml
+  output: {}
+  tool: oom/19975-oom3.cwl
+  doc: "Test feature 19975 - retry on custom error"
diff --git a/sdk/cwl/tests/oom/19975-oom.cwl b/sdk/cwl/tests/oom/19975-oom.cwl

new file mode 100644 (file)

index 0000000..ec80648
--- /dev/null
+++ b/sdk/cwl/tests/oom/19975-oom.cwl
@@ -0,0 +1,18 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+cwlVersion: v1.2
+class: CommandLineTool
+$namespaces:
+  arv: "http://arvados.org/cwl#"
+hints:
+  arv:OutOfMemoryRetry:
+    memoryRetryMultipler: 2
+  ResourceRequirement:
+    ramMin: 256
+  arv:APIRequirement: {}
+inputs:
+  fakeoom: File
+outputs: []
+arguments: [python3, $(inputs.fakeoom)]
diff --git a/sdk/cwl/tests/oom/19975-oom3.cwl b/sdk/cwl/tests/oom/19975-oom3.cwl

new file mode 100644 (file)

index 0000000..af3271b
--- /dev/null
+++ b/sdk/cwl/tests/oom/19975-oom3.cwl
@@ -0,0 +1,19 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+cwlVersion: v1.2
+class: CommandLineTool
+$namespaces:
+  arv: "http://arvados.org/cwl#"
+hints:
+  arv:OutOfMemoryRetry:
+    memoryRetryMultipler: 2
+    memoryErrorRegex: Whoops
+  ResourceRequirement:
+    ramMin: 256
+  arv:APIRequirement: {}
+inputs:
+  fakeoom: File
+outputs: []
+arguments: [python3, $(inputs.fakeoom)]
diff --git a/sdk/cwl/tests/oom/fakeoom.py b/sdk/cwl/tests/oom/fakeoom.py

new file mode 100644 (file)

index 0000000..cc0b2ed
--- /dev/null
+++ b/sdk/cwl/tests/oom/fakeoom.py
@@ -0,0 +1,13 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+import time
+import arvados
+
+api = arvados.api()
+current_container = api.containers().current().execute()
+
+if current_container["runtime_constraints"]["ram"] < (512*1024*1024):
+    sys.exit(137)
diff --git a/sdk/cwl/tests/oom/fakeoom.yml b/sdk/cwl/tests/oom/fakeoom.yml

new file mode 100644 (file)

index 0000000..da95fb6
--- /dev/null
+++ b/sdk/cwl/tests/oom/fakeoom.yml
@@ -0,0 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+fakeoom:
+  class: File
+  location: fakeoom.py
diff --git a/sdk/cwl/tests/oom/fakeoom2.py b/sdk/cwl/tests/oom/fakeoom2.py

new file mode 100644 (file)

index 0000000..89bd1f5
--- /dev/null
+++ b/sdk/cwl/tests/oom/fakeoom2.py
@@ -0,0 +1,13 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+import time
+import arvados
+
+api = arvados.api()
+current_container = api.containers().current().execute()
+
+if current_container["runtime_constraints"]["ram"] < (512*1024*1024):
+    raise MemoryError()
diff --git a/sdk/cwl/tests/oom/fakeoom2.yml b/sdk/cwl/tests/oom/fakeoom2.yml

new file mode 100644 (file)

index 0000000..4161252
--- /dev/null
+++ b/sdk/cwl/tests/oom/fakeoom2.yml
@@ -0,0 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+fakeoom:
+  class: File
+  location: fakeoom2.py
diff --git a/sdk/cwl/tests/oom/fakeoom3.py b/sdk/cwl/tests/oom/fakeoom3.py

new file mode 100644 (file)

index 0000000..460c4a5
--- /dev/null
+++ b/sdk/cwl/tests/oom/fakeoom3.py
@@ -0,0 +1,14 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+import time
+import arvados
+
+api = arvados.api()
+current_container = api.containers().current().execute()
+
+if current_container["runtime_constraints"]["ram"] < (512*1024*1024):
+    print("Whoops")
+    sys.exit(1)
diff --git a/sdk/cwl/tests/oom/fakeoom3.yml b/sdk/cwl/tests/oom/fakeoom3.yml

new file mode 100644 (file)

index 0000000..a6fc03c
--- /dev/null
+++ b/sdk/cwl/tests/oom/fakeoom3.yml
@@ -0,0 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+fakeoom:
+  class: File
+  location: fakeoom3.py
author	Peter Amstutz <peter.amstutz@curii.com>
	Tue, 7 Mar 2023 18:42:58 +0000 (13:42 -0500)
committer	Peter Amstutz <peter.amstutz@curii.com>
	Tue, 7 Mar 2023 18:42:58 +0000 (13:42 -0500)
doc/user/cwl/cwl-extensions.html.textile.liquid		patch \| blob \| history
sdk/cwl/arvados_cwl/__init__.py		patch \| blob \| history
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml		patch \| blob \| history
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml		patch \| blob \| history
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml		patch \| blob \| history
sdk/cwl/arvados_cwl/arvcontainer.py		patch \| blob \| history
sdk/cwl/tests/arvados-tests.yml		patch \| blob \| history
sdk/cwl/tests/oom/19975-oom.cwl	[new file with mode: 0644]	patch \| blob
sdk/cwl/tests/oom/19975-oom3.cwl	[new file with mode: 0644]	patch \| blob
sdk/cwl/tests/oom/fakeoom.py	[new file with mode: 0644]	patch \| blob
sdk/cwl/tests/oom/fakeoom.yml	[new file with mode: 0644]	patch \| blob
sdk/cwl/tests/oom/fakeoom2.py	[new file with mode: 0644]	patch \| blob
sdk/cwl/tests/oom/fakeoom2.yml	[new file with mode: 0644]	patch \| blob
sdk/cwl/tests/oom/fakeoom3.py	[new file with mode: 0644]	patch \| blob
sdk/cwl/tests/oom/fakeoom3.yml	[new file with mode: 0644]	patch \| blob