Merge branch '17301-cwl-oom' refs #17301

author Peter Amstutz <peter.amstutz@curii.com>

Mon, 25 Apr 2022 20:06:06 +0000 (16:06 -0400)

committer Peter Amstutz <peter.amstutz@curii.com>

Mon, 25 Apr 2022 20:06:06 +0000 (16:06 -0400)
author Peter Amstutz <peter.amstutz@curii.com>
Mon, 25 Apr 2022 20:06:06 +0000 (16:06 -0400)
committer Peter Amstutz <peter.amstutz@curii.com>
Mon, 25 Apr 2022 20:06:06 +0000 (16:06 -0400)
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py

index e2c2f2e67bfef0b7b1bcd0f09f8cd1d212639f78..f75bde81e6cebd655a8378fbd382f18b3bf18d2f 100644 (file)
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -392,6 +392,10 @@ class ArvadosContainer(JobBase):
                      processStatus = "success"
                  else:
                      processStatus = "permanentFail"
+
+                if rcode == 137:
+                    logger.warning("%s Container may have been killed for using too much RAM.  Try resubmitting with a higher 'ramMin'.",
+                                 self.arvrunner.label(self))
              else:
                  processStatus = "permanentFail"
  
diff --git a/sdk/cwl/arvados_cwl/executor.py b/sdk/cwl/arvados_cwl/executor.py

index 5f24d2407d1ad727a9aa534382973399f89b0090..680ca0b7b2c85df6b2f7d55709205b47ad591ef7 100644 (file)
--- a/sdk/cwl/arvados_cwl/executor.py
+++ b/sdk/cwl/arvados_cwl/executor.py
@@ -261,46 +261,29 @@ The 'jobs' API is no longer supported.
              if current is None:
                  return
              runtime_status = current.get('runtime_status', {})
-            # In case of status being an error, only report the first one.
-            if kind == 'error':
-                if not runtime_status.get('error'):
-                    runtime_status.update({
-                        'error': message
-                    })
-                    if detail is not None:
-                        runtime_status.update({
-                            'errorDetail': detail
-                        })
-                # Further errors are only mentioned as a count.
-                else:
-                    # Get anything before an optional 'and N more' string.
-                    try:
-                        error_msg = re.match(
-                            r'^(.*?)(?=\s*\(and \d+ more\)|$)', runtime_status.get('error')).groups()[0]
-                        more_failures = re.match(
-                            r'.*\(and (\d+) more\)', runtime_status.get('error'))
-                    except TypeError:
-                        # Ignore tests stubbing errors
-                        return
-                    if more_failures:
-                        failure_qty = int(more_failures.groups()[0])
-                        runtime_status.update({
-                            'error': "%s (and %d more)" % (error_msg, failure_qty+1)
-                        })
-                    else:
-                        runtime_status.update({
-                            'error': "%s (and 1 more)" % error_msg
-                        })
-            elif kind in ['warning', 'activity']:
-                # Record the last warning/activity status without regard of
-                # previous occurences.
+            if kind in ('error', 'warning'):
+                updatemessage = runtime_status.get(kind, "")
+                if not updatemessage:
+                    updatemessage = message
+
+                # Subsequent messages tacked on in detail
+                updatedetail = runtime_status.get(kind+'Detail', "")
+                maxlines = 40
+                if updatedetail.count("\n") < maxlines:
+                    if updatedetail:
+                        updatedetail += "\n"
+                    updatedetail += message + "\n"
+
+                    if detail:
+                        updatedetail += detail + "\n"
+
+                    if updatedetail.count("\n") >= maxlines:
+                        updatedetail += "\nSome messages may have been omitted.  Check the full log."
+
                  runtime_status.update({
-                    kind: message
+                    kind: updatemessage,
+                    kind+'Detail': updatedetail,
                  })
-                if detail is not None:
-                    runtime_status.update({
-                        kind+"Detail": detail
-                    })
              else:
                  # Ignore any other status kind
                  return
diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py

index 798c5af289322ce2ae23edc26ca7d8a863d50186..975fcdf8a3a25934729201b4e5ec987330311761 100644 (file)
--- a/sdk/cwl/tests/test_container.py
+++ b/sdk/cwl/tests/test_container.py
@@ -63,6 +63,13 @@ class TestContainer(unittest.TestCase):
          cwltool.process._names = set()
          arv_docker_clear_cache()
  
+    def tearDown(self):
+        root_logger = logging.getLogger('')
+
+        # Remove existing RuntimeStatusLoggingHandlers if they exist
+        handlers = [h for h in root_logger.handlers if not isinstance(h, arvados_cwl.executor.RuntimeStatusLoggingHandler)]
+        root_logger.handlers = handlers
+
      def helper(self, runner, enable_reuse=True):
          document_loader, avsc_names, schema_metadata, metaschema_loader = cwltool.process.get_schema(INTERNAL_VERSION)
  
@@ -1385,6 +1392,8 @@ class TestWorkflow(unittest.TestCase):
          runner.api.collections().list().execute.return_value = {"items": [{"uuid": "zzzzz-4zz18-zzzzzzzzzzzzzzz",
                                                                             "portable_data_hash": "99999999999999999999999999999993+99"}]}
  
+        runner.api.containers().current().execute.return_value = {}
+
          runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
          runner.ignore_docker_for_reuse = False
          runner.num_retries = 0
diff --git a/sdk/cwl/tests/test_make_output.py b/sdk/cwl/tests/test_make_output.py

index 127b3f372bed10615ce6fa701740201e649b1b6b..fe269592cb50619477e8effdf60ba4a42d4860aa 100644 (file)
--- a/sdk/cwl/tests/test_make_output.py
+++ b/sdk/cwl/tests/test_make_output.py
@@ -23,6 +23,13 @@ class TestMakeOutput(unittest.TestCase):
          self.api = mock.MagicMock()
          self.api._rootDesc = get_rootDesc()
  
+    def tearDown(self):
+        root_logger = logging.getLogger('')
+
+        # Remove existing RuntimeStatusLoggingHandlers if they exist
+        handlers = [h for h in root_logger.handlers if not isinstance(h, arvados_cwl.executor.RuntimeStatusLoggingHandler)]
+        root_logger.handlers = handlers
+
      @mock.patch("arvados.collection.Collection")
      @mock.patch("arvados.collection.CollectionReader")
      def test_make_output_collection(self, reader, col):
@@ -171,4 +178,4 @@ class TestMakeOutput(unittest.TestCase):
  
          # Check that the file name conflict is resolved and open is called for both
          final.open.assert_any_call("a_file", "wb")
-        final.open.assert_any_call("a_file_2", "wb")
-\ No newline at end of file
+        final.open.assert_any_call("a_file_2", "wb")
diff --git a/sdk/cwl/tests/test_pathmapper.py b/sdk/cwl/tests/test_pathmapper.py

index b78e89012ad62c5f952476da0553b2d26dac5fd3..cbd5ba1bc1a7b4c7052f40af80cb0ee874a2703b 100644 (file)
--- a/sdk/cwl/tests/test_pathmapper.py
+++ b/sdk/cwl/tests/test_pathmapper.py
@@ -32,6 +32,13 @@ class TestPathmap(unittest.TestCase):
          self.api = mock.MagicMock()
          self.api._rootDesc = get_rootDesc()
  
+    def tearDown(self):
+        root_logger = logging.getLogger('')
+
+        # Remove existing RuntimeStatusLoggingHandlers if they exist
+        handlers = [h for h in root_logger.handlers if not isinstance(h, arvados_cwl.executor.RuntimeStatusLoggingHandler)]
+        root_logger.handlers = handlers
+
      def test_keepref(self):
          """Test direct keep references."""
  
diff --git a/sdk/cwl/tests/test_submit.py b/sdk/cwl/tests/test_submit.py

index aecc7a590be046fafce94b588c4af6bf4286aad5..5092fc45756d9f07ae983ba9547e3245147a2cf9 100644 (file)
--- a/sdk/cwl/tests/test_submit.py
+++ b/sdk/cwl/tests/test_submit.py
@@ -353,6 +353,12 @@ class TestSubmit(unittest.TestCase):
          cwltool.process._names = set()
          arvados_cwl.arvdocker.arv_docker_clear_cache()
  
+    def tearDown(self):
+        root_logger = logging.getLogger('')
+
+        # Remove existing RuntimeStatusLoggingHandlers if they exist
+        handlers = [h for h in root_logger.handlers if not isinstance(h, arvados_cwl.executor.RuntimeStatusLoggingHandler)]
+        root_logger.handlers = handlers
  
      @mock.patch("time.sleep")
      @stubs
@@ -1054,9 +1060,6 @@ class TestSubmit(unittest.TestCase):
                           stubs.expect_container_request_uuid + '\n')
          self.assertEqual(exited, 0)
  
-    def tearDown(self):
-        arvados_cwl.arvdocker.arv_docker_clear_cache()
-
      @mock.patch("arvados.commands.keepdocker.find_one_image_hash")
      @mock.patch("cwltool.docker.DockerCommandLineJob.get_image")
      @mock.patch("arvados.api")
@@ -1423,7 +1426,7 @@ class TestSubmit(unittest.TestCase):
              self.assertEqual(exited, 1)
              self.assertRegex(
                  capture_stderr.getvalue(),
-                r"Collection uuid zzzzz-4zz18-zzzzzzzzzzzzzzz not found")
+                r"Collection\s*uuid\s*zzzzz-4zz18-zzzzzzzzzzzzzzz\s*not\s*found")
          finally:
              cwltool_logger.removeHandler(stderr_logger)
  
@@ -1525,6 +1528,13 @@ class TestCreateWorkflow(unittest.TestCase):
          cwltool.process._names = set()
          arvados_cwl.arvdocker.arv_docker_clear_cache()
  
+    def tearDown(self):
+        root_logger = logging.getLogger('')
+
+        # Remove existing RuntimeStatusLoggingHandlers if they exist
+        handlers = [h for h in root_logger.handlers if not isinstance(h, arvados_cwl.executor.RuntimeStatusLoggingHandler)]
+        root_logger.handlers = handlers
+
      @stubs
      def test_create(self, stubs):
          project_uuid = 'zzzzz-j7d0g-zzzzzzzzzzzzzzz'
author	Peter Amstutz <peter.amstutz@curii.com>
	Mon, 25 Apr 2022 20:06:06 +0000 (16:06 -0400)
committer	Peter Amstutz <peter.amstutz@curii.com>
	Mon, 25 Apr 2022 20:06:06 +0000 (16:06 -0400)
sdk/cwl/arvados_cwl/arvcontainer.py		patch | blob | history
sdk/cwl/arvados_cwl/executor.py		patch | blob | history
sdk/cwl/tests/test_container.py		patch | blob | history
sdk/cwl/tests/test_make_output.py		patch | blob | history
sdk/cwl/tests/test_pathmapper.py		patch | blob | history
sdk/cwl/tests/test_submit.py		patch | blob | history