19975: Add OutOfMemoryRetry extension

author Peter Amstutz <peter.amstutz@curii.com>

Sun, 5 Mar 2023 04:03:19 +0000 (23:03 -0500)

committer Peter Amstutz <peter.amstutz@curii.com>

Sun, 5 Mar 2023 04:03:19 +0000 (23:03 -0500)
author Peter Amstutz <peter.amstutz@curii.com>
Sun, 5 Mar 2023 04:03:19 +0000 (23:03 -0500)
committer Peter Amstutz <peter.amstutz@curii.com>
Sun, 5 Mar 2023 04:03:19 +0000 (23:03 -0500)
diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py

index 52a9a6c208cf73a304eb7650e67b5ad5a0d8ab2a..74ca9312bf54b1e965984f8bb893525ee2147e05 100644 (file)
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -285,6 +285,7 @@ def add_arv_hints():
          "http://arvados.org/cwl#UsePreemptible",
          "http://arvados.org/cwl#OutputCollectionProperties",
          "http://arvados.org/cwl#KeepCacheTypeRequirement",
+        "http://arvados.org/cwl#OutOfMemoryRetry",
      ])
  
  def exit_signal_handler(sigcode, frame):
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml

index fc370eb8132bd35466fce0ec19049e141f94d436..91a05e125439952b8023dd197dd620211052d9eb 100644 (file)
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
@@ -456,3 +456,30 @@ $graph:
          only or written to disk and memory-mapped.  The disk cache
          leverages the kernel's virtual memory system so "hot" data will
          generally still be kept in RAM.
+
+- name: OutOfMemoryRetry
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Detect when a failed tool run may have run out of memory, and
+    re-submit the container with more RAM.
+  fields:
+    - name: class
+      type: string
+      doc: "Always 'arv:OutOfMemoryRetry'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: memoryErrorRegex
+      type: string?
+      doc: |
+        A regular expression that will be used on the text of stdout
+        and stderr produced by the tool to determine if a failed job
+        should be retried with more RAM.  Default (case-insensitive):
+        'bad_alloc', 'out ?of ?memory', 'container using over 9.% of memory'.
+    - name: memoryRetryMultipler
+      type: float
+      doc: |
+        If the container failed on its first run, re-submit the
+        container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml

index 69c0ed6cffadfb73a695de7229570563e24de995..458d5a37a7b0339bfd4bc21dd43fa5ac09d0fe86 100644 (file)
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
@@ -399,3 +399,30 @@ $graph:
          only or written to disk and memory-mapped.  The disk cache
          leverages the kernel's virtual memory system so "hot" data will
          generally still be kept in RAM.
+
+- name: OutOfMemoryRetry
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Detect when a failed tool run may have run out of memory, and
+    re-submit the container with more RAM.
+  fields:
+    - name: class
+      type: string
+      doc: "Always 'arv:OutOfMemoryRetry'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: memoryErrorRegex
+      type: string?
+      doc: |
+        A regular expression that will be used on the text of stdout
+        and stderr produced by the tool to determine if a failed job
+        should be retried with more RAM.  Default (case-insensitive):
+        'bad_alloc', 'out ?of ?memory', 'container using over 9.% of memory'.
+    - name: memoryRetryMultipler
+      type: float
+      doc: |
+        If the container failed on its first run, re-submit the
+        container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml

index 86cd06effea70ff2ea8e1f6567e0803b8e4caa27..f4246ed70a5b5f04f240a83b3baf8ec1c67d3827 100644 (file)
--- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
+++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
@@ -401,3 +401,31 @@ $graph:
          only or written to disk and memory-mapped.  The disk cache
          leverages the kernel's virtual memory system so "hot" data will
          generally still be kept in RAM.
+
+
+- name: OutOfMemoryRetry
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Detect when a failed tool run may have run out of memory, and
+    re-submit the container with more RAM.
+  fields:
+    - name: class
+      type: string
+      doc: "Always 'arv:OutOfMemoryRetry'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: memoryErrorRegex
+      type: string?
+      doc: |
+        A regular expression that will be used on the text of stdout
+        and stderr produced by the tool to determine if a failed job
+        should be retried with more RAM.  By default, searches for the
+        substrings 'bad_alloc' and 'OutOfMemory'.
+    - name: memoryRetryMultipler
+      type: float
+      doc: |
+        If the container failed on its first run, re-submit the
+        container with the RAM request multiplied by this factor.
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py

index e828b16d3091772d72e25b270f7d8269ff9916c6..632e171b40220f90e57663bcbc363f25d5b86dac 100644 (file)
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -367,6 +367,12 @@ class ArvadosContainer(JobBase):
                  logger.warning("%s API revision is %s, revision %s is required to support setting properties on output collections.",
                                 self.arvrunner.label(self), self.arvrunner.api._rootDesc["revision"], "20220510")
  
+        ramMultiplier = [1]
+
+        oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+        if oom_retry_req and oom_retry_req.get('memoryRetryMultipler'):
+            ramMultiplier.append(oom_retry_req.get('memoryRetryMultipler'))
+
          if runtimeContext.runnerjob.startswith("arvwf:"):
              wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob.index("#")]
              wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(num_retries=self.arvrunner.num_retries)
@@ -380,7 +386,7 @@ class ArvadosContainer(JobBase):
          try:
              ram = runtime_constraints["ram"]
  
-            for i in range(1, 4):
+            for i in ramMultiplier:
                  runtime_constraints["ram"] = ram * i
  
                  if runtimeContext.submit_request_uuid:
@@ -400,7 +406,7 @@ class ArvadosContainer(JobBase):
                      break
  
              if response["container_uuid"] is None:
-                runtime_constraints["ram"] = ram * (self.attempt_count+1)
+                runtime_constraints["ram"] = ram * ramMultiplier[self.attempt_count]
  
              container_request["state"] = "Committed"
              response = self.arvrunner.api.container_requests().update(
@@ -423,6 +429,14 @@ class ArvadosContainer(JobBase):
              self.output_callback({}, "permanentFail")
  
      def out_of_memory_retry(self, record, container):
+        oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
+        if oom_retry_req is None:
+            return False
+
+        # Sometimes it gets killed with no warning
+        if container["exit_code"] == 137:
+            return True
+
          logc = arvados.collection.CollectionReader(record["log_uuid"],
                                                     api_client=self.arvrunner.api,
                                                     keep_client=self.arvrunner.keep_client,
@@ -434,14 +448,9 @@ class ArvadosContainer(JobBase):
  
          done.logtail(logc, callback, "", maxlen=1000)
  
-        # Check OOM killed
-        oom_matches = r'container using over 9.% of memory'
-        if container["exit_code"] == 137 and re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
-            return True
-
          # Check allocation failure
-        bad_alloc_matches = r'(bad_alloc|out ?of ?memory)'
-        if re.search(bad_alloc_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
+        oom_matches = oom_retry_req.get('memoryErrorRegex') or r'(bad_alloc|out ?of ?memory|container using over 9.% of memory)'
+        if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
              return True
  
          return False
@@ -466,7 +475,7 @@ class ArvadosContainer(JobBase):
                  else:
                      processStatus = "permanentFail"
  
-                if processStatus == "permanentFail" and self.out_of_memory_retry(record, container):
+                if processStatus == "permanentFail" and self.attempt_count == 1 and self.out_of_memory_retry(record, container):
                      logger.info("%s Container failed with out of memory error, retrying with more RAM.",
                                   self.arvrunner.label(self))
                      self.job_runtime.submit_request_uuid = None
author	Peter Amstutz <peter.amstutz@curii.com>
	Sun, 5 Mar 2023 04:03:19 +0000 (23:03 -0500)
committer	Peter Amstutz <peter.amstutz@curii.com>
	Sun, 5 Mar 2023 04:03:19 +0000 (23:03 -0500)
sdk/cwl/arvados_cwl/__init__.py		patch \| blob \| history
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml		patch \| blob \| history
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml		patch \| blob \| history
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml		patch \| blob \| history
sdk/cwl/arvados_cwl/arvcontainer.py		patch \| blob \| history