19975: Add integration test for out-of-memory resubmit
author Peter Amstutz <peter.amstutz@curii.com>
Mon, 6 Mar 2023 17:42:15 +0000 (12:42 -0500)
committer Peter Amstutz <peter.amstutz@curii.com>
Mon, 6 Mar 2023 19:49:40 +0000 (14:49 -0500)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

sdk/cwl/arvados_cwl/arvcontainer.py
sdk/cwl/tests/arvados-tests.yml
sdk/cwl/tests/oom/19975-oom.cwl [new file with mode: 0644]
sdk/cwl/tests/oom/19975-oom3.cwl [new file with mode: 0644]
sdk/cwl/tests/oom/fakeoom.py [new file with mode: 0644]
sdk/cwl/tests/oom/fakeoom.yml [new file with mode: 0644]
sdk/cwl/tests/oom/fakeoom2.py [new file with mode: 0644]
sdk/cwl/tests/oom/fakeoom2.yml [new file with mode: 0644]
sdk/cwl/tests/oom/fakeoom3.py [new file with mode: 0644]
sdk/cwl/tests/oom/fakeoom3.yml [new file with mode: 0644]

diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 632e171b40220f90e57663bcbc363f25d5b86dac..aafbc38fc61cf55b2629d444b477cae8cfc36aa9 100644 (file)
@@ -386,12 +386,14 @@ class ArvadosContainer(JobBase):
         try:
             ram = runtime_constraints["ram"]
 
+            self.uuid = runtimeContext.submit_request_uuid
+
             for i in ramMultiplier:
                 runtime_constraints["ram"] = ram * i
 
-                if runtimeContext.submit_request_uuid:
+                if self.uuid:
                     response = self.arvrunner.api.container_requests().update(
-                        uuid=runtimeContext.submit_request_uuid,
+                        uuid=self.uuid,
                         body=container_request,
                         **extra_submit_params
                     ).execute(num_retries=self.arvrunner.num_retries)
@@ -400,7 +402,7 @@ class ArvadosContainer(JobBase):
                         body=container_request,
                         **extra_submit_params
                     ).execute(num_retries=self.arvrunner.num_retries)
-                    runtimeContext.submit_request_uuid = response["uuid"]
+                    self.uuid = response["uuid"]
 
                 if response["container_uuid"] is not None:
                     break
@@ -410,12 +412,11 @@ class ArvadosContainer(JobBase):
 
             container_request["state"] = "Committed"
             response = self.arvrunner.api.container_requests().update(
-                uuid=runtimeContext.submit_request_uuid,
+                uuid=self.uuid,
                 body=container_request,
                 **extra_submit_params
             ).execute(num_retries=self.arvrunner.num_retries)
 
-            self.uuid = response["uuid"]
             self.arvrunner.process_submitted(self)
             self.attempt_count += 1
 
@@ -449,7 +450,7 @@ class ArvadosContainer(JobBase):
         done.logtail(logc, callback, "", maxlen=1000)
 
         # Check allocation failure
-        oom_matches = oom_retry_req.get('memoryErrorRegex') or r'(bad_alloc|out ?of ?memory|container using over 9.% of memory)'
+        oom_matches = oom_retry_req.get('memoryErrorRegex') or r'(bad_alloc|out ?of ?memory|memory ?error|container using over 9.% of memory)'
         if re.search(oom_matches, loglines[0], re.IGNORECASE | re.MULTILINE):
             return True
 
diff --git a/sdk/cwl/tests/arvados-tests.yml b/sdk/cwl/tests/arvados-tests.yml
index f242e632366c940ae7193c3add3237eb76bd79ad..a93c64a224c1e83b3d126b720fadeba6e8f59039 100644 (file)
   }
   tool: 19678-name-id.cwl
   doc: "Test issue 19678 - non-string type input parameter called 'name'"
+
+- job: oom/fakeoom.yml
+  output: {}
+  tool: oom/19975-oom.cwl
+  doc: "Test feature 19975 - retry on exit 137"
+
+- job: oom/fakeoom2.yml
+  output: {}
+  tool: oom/19975-oom.cwl
+  doc: "Test feature 19975 - retry on memory error"
+
+- job: oom/fakeoom3.yml
+  output: {}
+  tool: oom/19975-oom3.cwl
+  doc: "Test feature 19975 - retry on custom error"
diff --git a/sdk/cwl/tests/oom/19975-oom.cwl b/sdk/cwl/tests/oom/19975-oom.cwl
new file mode 100644 (file)
index 0000000..ec80648
--- /dev/null
@@ -0,0 +1,18 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+cwlVersion: v1.2
+class: CommandLineTool
+$namespaces:
+  arv: "http://arvados.org/cwl#"
+hints:
+  arv:OutOfMemoryRetry:
+    memoryRetryMultipler: 2
+  ResourceRequirement:
+    ramMin: 256
+  arv:APIRequirement: {}
+inputs:
+  fakeoom: File
+outputs: []
+arguments: [python3, $(inputs.fakeoom)]
diff --git a/sdk/cwl/tests/oom/19975-oom3.cwl b/sdk/cwl/tests/oom/19975-oom3.cwl
new file mode 100644 (file)
index 0000000..af3271b
--- /dev/null
@@ -0,0 +1,19 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+cwlVersion: v1.2
+class: CommandLineTool
+$namespaces:
+  arv: "http://arvados.org/cwl#"
+hints:
+  arv:OutOfMemoryRetry:
+    memoryRetryMultipler: 2
+    memoryErrorRegex: Whoops
+  ResourceRequirement:
+    ramMin: 256
+  arv:APIRequirement: {}
+inputs:
+  fakeoom: File
+outputs: []
+arguments: [python3, $(inputs.fakeoom)]
diff --git a/sdk/cwl/tests/oom/fakeoom.py b/sdk/cwl/tests/oom/fakeoom.py
new file mode 100644 (file)
index 0000000..cc0b2ed
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+import time
+import arvados
+
+api = arvados.api()
+current_container = api.containers().current().execute()
+
+if current_container["runtime_constraints"]["ram"] < (512*1024*1024):
+    sys.exit(137)
diff --git a/sdk/cwl/tests/oom/fakeoom.yml b/sdk/cwl/tests/oom/fakeoom.yml
new file mode 100644 (file)
index 0000000..da95fb6
--- /dev/null
@@ -0,0 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+fakeoom:
+  class: File
+  location: fakeoom.py
diff --git a/sdk/cwl/tests/oom/fakeoom2.py b/sdk/cwl/tests/oom/fakeoom2.py
new file mode 100644 (file)
index 0000000..89bd1f5
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+import time
+import arvados
+
+api = arvados.api()
+current_container = api.containers().current().execute()
+
+if current_container["runtime_constraints"]["ram"] < (512*1024*1024):
+    raise MemoryError()
diff --git a/sdk/cwl/tests/oom/fakeoom2.yml b/sdk/cwl/tests/oom/fakeoom2.yml
new file mode 100644 (file)
index 0000000..4161252
--- /dev/null
@@ -0,0 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+fakeoom:
+  class: File
+  location: fakeoom2.py
diff --git a/sdk/cwl/tests/oom/fakeoom3.py b/sdk/cwl/tests/oom/fakeoom3.py
new file mode 100644 (file)
index 0000000..460c4a5
--- /dev/null
@@ -0,0 +1,14 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+import time
+import arvados
+
+api = arvados.api()
+current_container = api.containers().current().execute()
+
+if current_container["runtime_constraints"]["ram"] < (512*1024*1024):
+    print("Whoops")
+    sys.exit(1)
diff --git a/sdk/cwl/tests/oom/fakeoom3.yml b/sdk/cwl/tests/oom/fakeoom3.yml
new file mode 100644 (file)
index 0000000..a6fc03c
--- /dev/null
@@ -0,0 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+fakeoom:
+  class: File
+  location: fakeoom3.py