19744: Report steps with low utilization at end of workflow
author Peter Amstutz <peter.amstutz@curii.com>
Wed, 28 Feb 2024 19:53:29 +0000 (14:53 -0500)
committer Peter Amstutz <peter.amstutz@curii.com>
Wed, 28 Feb 2024 19:53:29 +0000 (14:53 -0500)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

sdk/cwl/arvados_cwl/arvcontainer.py
sdk/cwl/arvados_cwl/executor.py
tools/crunchstat-summary/crunchstat_summary/summarizer.py

index f048e505e8ea0b6d348b2a619798221e843267bc..c3b914ba996a795623c5c9a1f155a2b11098b4d9 100644 (file)
@@ -548,7 +548,7 @@ class ArvadosContainer(JobBase):
 
                     # Post warnings about nodes that are under-utilized.
                     for rc in summarizer._recommend_gen(lambda x: x):
-                        self.usage_report_notes.append(rc)
+                        self.job_runtime.usage_report_notes.append(rc)
 
                 except Exception as e:
                     logger.warning("%s unable to generate resource usage report",
index 28ee60ac3973b5103a7cc71c97836bc562347cc7..432b380aabcd90c4c91ff3d7d72a9af29ab52823 100644 (file)
@@ -930,7 +930,7 @@ The 'jobs' API is no longer supported.
             raise WorkflowException("Workflow did not return a result.")
 
         if runtimeContext.usage_report_notes:
-            logger.info("Resource report notifications:")
+            logger.info("Steps with low resource utilization (possible optimization opportunities):")
             for x in runtimeContext.usage_report_notes:
                 logger.info("  %s", x)
 
index 8a2cda130b331c20807f7039e40576b547aefb3a..bc41fdae33272d3df98ad8c998bf5a05db308120 100644 (file)
@@ -483,7 +483,7 @@ class Summarizer(object):
                 '{} peak RAM usage was only {}% ({} MiB used / {} MiB requested)'
             ).format(
                 self.label,
-                int(100*(used_mib / asked_mib)),
+                int(math.ceil(100*(used_mib / asked_mib))),
                 int(used_mib),
                 int(asked_mib))
 
@@ -497,18 +497,23 @@ class Summarizer(object):
 
         if self.job_tot['net:keep0']['rx'] == 0:
             return
+
+        miss_rate = (float(self.job_tot['keepcache']['miss']) /
+                     float(self.job_tot['keepcalls']['get']))
+
         utilization = (float(self.job_tot['blkio:0:0']['read']) /
                        float(self.job_tot['net:keep0']['rx']))
         # FIXME: the default on this get won't work correctly
         asked_cache = self.existing_constraints.get('keep_cache_ram') or self.existing_constraints.get('keep_cache_disk')
 
-        if utilization < 0.5:
+        if utilization < 0.5 and miss_rate > .05:
             yield recommendformat(
-                '{} Keep cache utilization was {:.2f}% -- '
+                '{} Keep cache utilization was only {:.2f}% and miss rate was {:.2f}% -- '
                 'recommend increasing keep_cache'
             ).format(
                 self.label,
-                utilization * 100.0)
+                utilization * 100.0,
+                miss_rate * 100.0)
 
 
     def _recommend_temp_disk(self, recommendformat):