19744: Add --enable/disable-usage-report
authorPeter Amstutz <peter.amstutz@curii.com>
Tue, 27 Feb 2024 21:28:59 +0000 (16:28 -0500)
committerPeter Amstutz <peter.amstutz@curii.com>
Tue, 27 Feb 2024 21:28:59 +0000 (16:28 -0500)
Now warns about under-utilized nodes.

Also code cleanup from review.

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

sdk/cwl/arvados_cwl/__init__.py
sdk/cwl/arvados_cwl/arvcontainer.py
sdk/cwl/arvados_cwl/context.py
sdk/cwl/arvados_cwl/executor.py
tools/crunchstat-summary/crunchstat_summary/dygraphs.js
tools/crunchstat-summary/crunchstat_summary/summarizer.py

index 9fc00c00171ba4d435a237c4bbb68482421aafaf..7e13488758b10f5ec9f2ac5a61ec31dfaa1ba4f8 100644 (file)
@@ -258,6 +258,10 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
                         default=False, dest="trash_intermediate",
                         help="Do not trash intermediate outputs (default).")
 
+    exgroup = parser.add_mutually_exclusive_group()
+    exgroup.add_argument("--enable-usage-report", dest="enable_usage_report", default=None, action="store_true", help="Create usage_report.html with a summary of each step's resource usage.")
+    exgroup.add_argument("--disable-usage-report", dest="enable_usage_report", default=None, action="store_false", help="Disable usage report.")
+
     parser.add_argument("workflow", default=None, help="The workflow to execute")
     parser.add_argument("job_order", nargs=argparse.REMAINDER, help="The input object to the workflow.")
 
index 70202743c483f29691d1138c06c2cd120ec3fd8a..d4b1e0050dda2a35a2e007bf47bff1c4d4fd1c3f 100644 (file)
@@ -534,17 +534,22 @@ class ArvadosContainer(JobBase):
                 body={"container_request": {"properties": properties}}
             ).execute(num_retries=self.arvrunner.num_retries)
 
-            if logc is not None:
+            if logc is not None and self.job_runtime.enable_usage_report is not False:
                 try:
-                    summerizer = crunchstat_summary.summarizer.ContainerRequestSummarizer(
+                    summarizer = crunchstat_summary.summarizer.ContainerRequestSummarizer(
                         record,
                         collection_object=logc,
                         label=self.name,
                         arv=self.arvrunner.api)
-                    summerizer.run()
+                    summarizer.run()
                     with logc.open("usage_report.html", "wt") as mr:
-                        mr.write(summerizer.html_report())
+                        mr.write(summarizer.html_report())
                     logc.save()
+
+                    # Post warnings about nodes that are under-utilized.
+                    for rc in summarizer._recommend_gen(lambda x: x):
+                        logger.warning(x)
+
                 except Exception as e:
                     logger.warning("%s unable to generate resource usage report",
                                  self.arvrunner.label(self),
@@ -727,6 +732,12 @@ class RunnerContainer(Runner):
         if runtimeContext.prefer_cached_downloads:
             command.append("--prefer-cached-downloads")
 
+        if runtimeContext.enable_usage_report is True:
+            command.append("--enable-usage-report")
+
+        if runtimeContext.enable_usage_report is False:
+            command.append("--disable-usage-report")
+
         if self.fast_parser:
             command.append("--fast-parser")
 
index 0439cb5b15cb64d1c449e39114358d564dc21b86..a90a6d48c33bf800dd5a8d285e3a80fd615b0703 100644 (file)
@@ -46,6 +46,7 @@ class ArvRuntimeContext(RuntimeContext):
         self.cached_docker_lookups = {}
         self.print_keep_deps = False
         self.git_info = {}
+        self.enable_usage_report = None
 
         super(ArvRuntimeContext, self).__init__(kwargs)
 
index 729baffe1d2ef50ba7950ee27f6bb5973ceed563..1b484a17208a9aadf9a98fccc73918d940a724f7 100644 (file)
@@ -70,7 +70,7 @@ class RuntimeStatusLoggingHandler(logging.Handler):
             kind = 'error'
         elif record.levelno >= logging.WARNING:
             kind = 'warning'
-        if kind == 'warning' and (record.name == "salad" or record.name == "crunchstat_summary"):
+        if kind == 'warning' and record.name in ("salad", "crunchstat_summary"):
             # Don't send validation warnings to runtime status,
             # they're noisy and unhelpful.
             return
index 07d9418966887a301c0c4eeb75a3a7bbf1ee570d..76c92107042bf9663324489c6e5ed9a0d4576981 100644 (file)
@@ -40,9 +40,6 @@ window.onload = function() {
         },
     }
     chartdata.forEach(function(section, section_idx) {
-        //var h1 = document.createElement('h1');
-        //h1.appendChild(document.createTextNode(section.label));
-        //document.body.appendChild(h1);
         var chartDiv = document.getElementById("chart");
         section.charts.forEach(function(chart, chart_idx) {
             // Skip chart if every series has zero data points
index 4d5674ec83c1ccf819c8d2427732ad5b40bacb14..a7c2b0a3831c7f667adb1435ce8ce0cfccfb91c5 100644 (file)
@@ -518,30 +518,17 @@ class Summarizer(object):
 
 
     def _recommend_temp_disk(self, recommendformat):
-        """Recommend decreasing temp disk if utilization < 50%.
-
-        This recommendation is disabled for the time being.  It uses
-        the total disk on the node and not the amount of disk
-        requested, so it triggers a false positive basically every
-        time.  To get the amount of disk requested we need to fish it
-        out of the mounts, which is extra work I don't want do right
-        now.
+        """This recommendation is disabled for the time being.  It was
+        using the total disk on the node and not the amount of disk
+        requested, so it would trigger a false positive basically
+        every time.  To get the amount of disk requested we need to
+        fish it out of the mounts, which is extra work I don't want do
+        right now.  You can find the old code at commit 616d135e77
+
         """
 
         return []
 
-        # total = float(self.job_tot['statfs']['total'])
-        # utilization = (float(self.job_tot['statfs']['used']) / total) if total > 0 else 0.0
-
-        # if utilization < 50.0 and total > 0:
-        #     yield recommendformat(
-        #         '{} max temp disk utilization was {:.0f}% of {:.0f} MiB -- '
-        #         'consider reducing "tmpdirMin" and/or "outdirMin"'
-        #     ).format(
-        #         self.label,
-        #         utilization * 100.0,
-        #         total / MB)
-
 
     def _format(self, val):
         """Return a string representation of a stat.