Merge branch '17301-cwl-oom' refs #17301
authorPeter Amstutz <peter.amstutz@curii.com>
Mon, 25 Apr 2022 20:06:06 +0000 (16:06 -0400)
committerPeter Amstutz <peter.amstutz@curii.com>
Mon, 25 Apr 2022 20:06:06 +0000 (16:06 -0400)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

sdk/cwl/arvados_cwl/arvcontainer.py
sdk/cwl/arvados_cwl/executor.py
sdk/cwl/tests/test_container.py
sdk/cwl/tests/test_make_output.py
sdk/cwl/tests/test_pathmapper.py
sdk/cwl/tests/test_submit.py

index e2c2f2e67bfef0b7b1bcd0f09f8cd1d212639f78..f75bde81e6cebd655a8378fbd382f18b3bf18d2f 100644 (file)
@@ -392,6 +392,10 @@ class ArvadosContainer(JobBase):
                     processStatus = "success"
                 else:
                     processStatus = "permanentFail"
+
+                if rcode == 137:
+                    logger.warning("%s Container may have been killed for using too much RAM.  Try resubmitting with a higher 'ramMin'.",
+                                 self.arvrunner.label(self))
             else:
                 processStatus = "permanentFail"
 
index 5f24d2407d1ad727a9aa534382973399f89b0090..680ca0b7b2c85df6b2f7d55709205b47ad591ef7 100644 (file)
@@ -261,46 +261,29 @@ The 'jobs' API is no longer supported.
             if current is None:
                 return
             runtime_status = current.get('runtime_status', {})
-            # In case of status being an error, only report the first one.
-            if kind == 'error':
-                if not runtime_status.get('error'):
-                    runtime_status.update({
-                        'error': message
-                    })
-                    if detail is not None:
-                        runtime_status.update({
-                            'errorDetail': detail
-                        })
-                # Further errors are only mentioned as a count.
-                else:
-                    # Get anything before an optional 'and N more' string.
-                    try:
-                        error_msg = re.match(
-                            r'^(.*?)(?=\s*\(and \d+ more\)|$)', runtime_status.get('error')).groups()[0]
-                        more_failures = re.match(
-                            r'.*\(and (\d+) more\)', runtime_status.get('error'))
-                    except TypeError:
-                        # Ignore tests stubbing errors
-                        return
-                    if more_failures:
-                        failure_qty = int(more_failures.groups()[0])
-                        runtime_status.update({
-                            'error': "%s (and %d more)" % (error_msg, failure_qty+1)
-                        })
-                    else:
-                        runtime_status.update({
-                            'error': "%s (and 1 more)" % error_msg
-                        })
-            elif kind in ['warning', 'activity']:
-                # Record the last warning/activity status without regard of
-                # previous occurences.
+            if kind in ('error', 'warning'):
+                updatemessage = runtime_status.get(kind, "")
+                if not updatemessage:
+                    updatemessage = message
+
+                # Subsequent messages tacked on in detail
+                updatedetail = runtime_status.get(kind+'Detail', "")
+                maxlines = 40
+                if updatedetail.count("\n") < maxlines:
+                    if updatedetail:
+                        updatedetail += "\n"
+                    updatedetail += message + "\n"
+
+                    if detail:
+                        updatedetail += detail + "\n"
+
+                    if updatedetail.count("\n") >= maxlines:
+                        updatedetail += "\nSome messages may have been omitted.  Check the full log."
+
                 runtime_status.update({
-                    kind: message
+                    kind: updatemessage,
+                    kind+'Detail': updatedetail,
                 })
-                if detail is not None:
-                    runtime_status.update({
-                        kind+"Detail": detail
-                    })
             else:
                 # Ignore any other status kind
                 return
index 798c5af289322ce2ae23edc26ca7d8a863d50186..975fcdf8a3a25934729201b4e5ec987330311761 100644 (file)
@@ -63,6 +63,13 @@ class TestContainer(unittest.TestCase):
         cwltool.process._names = set()
         arv_docker_clear_cache()
 
+    def tearDown(self):
+        root_logger = logging.getLogger('')
+
+        # Remove existing RuntimeStatusLoggingHandlers if they exist
+        handlers = [h for h in root_logger.handlers if not isinstance(h, arvados_cwl.executor.RuntimeStatusLoggingHandler)]
+        root_logger.handlers = handlers
+
     def helper(self, runner, enable_reuse=True):
         document_loader, avsc_names, schema_metadata, metaschema_loader = cwltool.process.get_schema(INTERNAL_VERSION)
 
@@ -1385,6 +1392,8 @@ class TestWorkflow(unittest.TestCase):
         runner.api.collections().list().execute.return_value = {"items": [{"uuid": "zzzzz-4zz18-zzzzzzzzzzzzzzz",
                                                                            "portable_data_hash": "99999999999999999999999999999993+99"}]}
 
+        runner.api.containers().current().execute.return_value = {}
+
         runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
         runner.ignore_docker_for_reuse = False
         runner.num_retries = 0
index 127b3f372bed10615ce6fa701740201e649b1b6b..fe269592cb50619477e8effdf60ba4a42d4860aa 100644 (file)
@@ -23,6 +23,13 @@ class TestMakeOutput(unittest.TestCase):
         self.api = mock.MagicMock()
         self.api._rootDesc = get_rootDesc()
 
+    def tearDown(self):
+        root_logger = logging.getLogger('')
+
+        # Remove existing RuntimeStatusLoggingHandlers if they exist
+        handlers = [h for h in root_logger.handlers if not isinstance(h, arvados_cwl.executor.RuntimeStatusLoggingHandler)]
+        root_logger.handlers = handlers
+
     @mock.patch("arvados.collection.Collection")
     @mock.patch("arvados.collection.CollectionReader")
     def test_make_output_collection(self, reader, col):
@@ -171,4 +178,4 @@ class TestMakeOutput(unittest.TestCase):
 
         # Check that the file name conflict is resolved and open is called for both
         final.open.assert_any_call("a_file", "wb")
-        final.open.assert_any_call("a_file_2", "wb")
\ No newline at end of file
+        final.open.assert_any_call("a_file_2", "wb")
index b78e89012ad62c5f952476da0553b2d26dac5fd3..cbd5ba1bc1a7b4c7052f40af80cb0ee874a2703b 100644 (file)
@@ -32,6 +32,13 @@ class TestPathmap(unittest.TestCase):
         self.api = mock.MagicMock()
         self.api._rootDesc = get_rootDesc()
 
+    def tearDown(self):
+        root_logger = logging.getLogger('')
+
+        # Remove existing RuntimeStatusLoggingHandlers if they exist
+        handlers = [h for h in root_logger.handlers if not isinstance(h, arvados_cwl.executor.RuntimeStatusLoggingHandler)]
+        root_logger.handlers = handlers
+
     def test_keepref(self):
         """Test direct keep references."""
 
index aecc7a590be046fafce94b588c4af6bf4286aad5..5092fc45756d9f07ae983ba9547e3245147a2cf9 100644 (file)
@@ -353,6 +353,12 @@ class TestSubmit(unittest.TestCase):
         cwltool.process._names = set()
         arvados_cwl.arvdocker.arv_docker_clear_cache()
 
+    def tearDown(self):
+        root_logger = logging.getLogger('')
+
+        # Remove existing RuntimeStatusLoggingHandlers if they exist
+        handlers = [h for h in root_logger.handlers if not isinstance(h, arvados_cwl.executor.RuntimeStatusLoggingHandler)]
+        root_logger.handlers = handlers
 
     @mock.patch("time.sleep")
     @stubs
@@ -1054,9 +1060,6 @@ class TestSubmit(unittest.TestCase):
                          stubs.expect_container_request_uuid + '\n')
         self.assertEqual(exited, 0)
 
-    def tearDown(self):
-        arvados_cwl.arvdocker.arv_docker_clear_cache()
-
     @mock.patch("arvados.commands.keepdocker.find_one_image_hash")
     @mock.patch("cwltool.docker.DockerCommandLineJob.get_image")
     @mock.patch("arvados.api")
@@ -1423,7 +1426,7 @@ class TestSubmit(unittest.TestCase):
             self.assertEqual(exited, 1)
             self.assertRegex(
                 capture_stderr.getvalue(),
-                r"Collection uuid zzzzz-4zz18-zzzzzzzzzzzzzzz not found")
+                r"Collection\s*uuid\s*zzzzz-4zz18-zzzzzzzzzzzzzzz\s*not\s*found")
         finally:
             cwltool_logger.removeHandler(stderr_logger)
 
@@ -1525,6 +1528,13 @@ class TestCreateWorkflow(unittest.TestCase):
         cwltool.process._names = set()
         arvados_cwl.arvdocker.arv_docker_clear_cache()
 
+    def tearDown(self):
+        root_logger = logging.getLogger('')
+
+        # Remove existing RuntimeStatusLoggingHandlers if they exist
+        handlers = [h for h in root_logger.handlers if not isinstance(h, arvados_cwl.executor.RuntimeStatusLoggingHandler)]
+        root_logger.handlers = handlers
+
     @stubs
     def test_create(self, stubs):
         project_uuid = 'zzzzz-j7d0g-zzzzzzzzzzzzzzz'