13773: Update runtime_status_error() method to report warning & activity status
authorLucas Di Pentima <ldipentima@veritasgenetics.com>
Fri, 7 Sep 2018 17:04:02 +0000 (14:04 -0300)
committerLucas Di Pentima <ldipentima@veritasgenetics.com>
Fri, 7 Sep 2018 17:04:02 +0000 (14:04 -0300)
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima@veritasgenetics.com>

sdk/cwl/arvados_cwl/__init__.py
sdk/cwl/arvados_cwl/arvcontainer.py
sdk/cwl/tests/test_container.py

index 7f56eac8c9061f860b1f1e1236597d66f00a1a1f..b73b768fb8112a638a886485a81a24f6e7031f11 100644 (file)
@@ -194,15 +194,14 @@ http://doc.arvados.org/install/install-api-server.html#disable_api_methods
             self.task_queue.add(partial(j.done, record))
             del self.processes[uuid]
 
-    def runtime_status_error(self, child_label, child_uuid, error_log):
+    def runtime_status_update(self, kind, message, detail=None):
         """
-        Called from a failing child container. Records the first child error
-        on this runner's runtime_status field.
-        On subsequent errors, updates the 'error' key to show how many additional
-        failures happened.
+        Updates the runtime_status field on the runner container.
+        Called from a failing child container: records the first child error
+        or updates the error count on subsequent error statuses.
+        Also called from other parts that need to report errors, warnings or just
+        activity statuses.
         """
-        error_msg = "%s %s failed" % (child_label, child_uuid)
-        logger.info(error_msg)
         with self.workflow_eval_lock:
             try:
                 current = self.api.containers().current().execute(num_retries=self.num_retries)
@@ -212,34 +211,49 @@ http://doc.arvados.org/install/install-api-server.html#disable_api_methods
                     logger.info("Getting current container: %s", e)
                 return
             runtime_status = current.get('runtime_status', {})
-            # Save first fatal error
-            if not runtime_status.get('error'):
-                runtime_status.update({
-                    'error': error_msg,
-                    'errorDetail': error_log or "No error logs available"
-                })
-            # Further errors are only mentioned as a count
-            else:
-                error_msg = re.match(
-                    r'^(.*failed)\s*\(?', runtime_status.get('error')).groups()[0]
-                more_failures = re.match(
-                    r'.*\(.*(\d+) more\)', runtime_status.get('error'))
-                if more_failures:
-                    failure_qty = int(more_failures.groups()[0])
+            # In case of status being an error, only report the first one.
+            if kind == 'error':
+                if not runtime_status.get('error'):
                     runtime_status.update({
-                        'error': "%s (and %d more)" % (error_msg, failure_qty+1)
+                        'error': message,
+                        'errorDetail': detail or "No error logs available"
                     })
+                # Further errors are only mentioned as a count.
                 else:
+                    # Get anything before an optional 'and N more' string.
+                    error_msg = re.match(
+                        r'^(.*?)(?=\s*\(and \d+ more\)|$)', runtime_status.get('error')).groups()[0]
+                    more_failures = re.match(
+                        r'.*\(and (\d+) more\)', runtime_status.get('error'))
+                    if more_failures:
+                        failure_qty = int(more_failures.groups()[0])
+                        runtime_status.update({
+                            'error': "%s (and %d more)" % (error_msg, failure_qty+1)
+                        })
+                    else:
+                        runtime_status.update({
+                            'error': "%s (and 1 more)" % error_msg
+                        })
+            elif kind in ['warning', 'activity']:
+                # Record the last warning/activity status without regard to
+                # previous occurrences.
+                runtime_status.update({
+                    kind: message
+                })
+                if detail is not None:
                     runtime_status.update({
-                        'error': "%s (and 1 more)" % error_msg
+                        kind+"Detail": detail
                     })
+            else:
+                # Ignore any other status kind
+                return
             try:
                 self.api.containers().update(uuid=current['uuid'],
                                             body={
                                                 'runtime_status': runtime_status,
                                             }).execute(num_retries=self.num_retries)
             except Exception as e:
-                logger.error("Updating runtime_status: %s", e)
+                logger.info("Couldn't update runtime_status: %s", e)
 
     def wrapped_callback(self, cb, obj, st):
         with self.workflow_eval_lock:
index d49b650023be644bbd8b33692ac7e31d5835d8fd..670f543520a029d6ce75aaac45ed90c52e0dd54d 100644 (file)
@@ -324,7 +324,10 @@ class ArvadosContainer(JobBase):
                 error_log = done.logtail(
                     logc, logger.error,
                     "%s (%s) error log:" % (label, record["uuid"]), maxlen=40)
-                self.arvrunner.runtime_status_error(label, record["uuid"], error_log)
+                self.arvrunner.runtime_status_update(
+                    "error",
+                    "%s %s failed" % (label, record["uuid"]),
+                    error_log)
 
             if record["output_uuid"]:
                 if self.arvrunner.trash_intermediate or self.arvrunner.intermediate_output_ttl:
index fcc9d1550c9a9cc2d1f06ab6d92c14ad92037a6f..9bac3184f3806f36edbf2555bf67ac166e7a5370 100644 (file)
@@ -542,9 +542,9 @@ class TestContainer(unittest.TestCase):
             "modified_at": "2017-05-26T12:01:22Z"
         })
 
-        runner.runtime_status_error.assert_called_with(
-            '[container testjob]',
-            'zzzzz-xvhdp-zzzzzzzzzzzzzzz',
+        runner.runtime_status_update.assert_called_with(
+            'error',
+            '[container testjob] zzzzz-xvhdp-zzzzzzzzzzzzzzz failed',
             'some error detail'
         )
         arvjob.output_callback.assert_called_with({"out": "stuff"}, "permanentFail")