From d49af567353a4597e6a478ff871bdc6d3bd50f08 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Wed, 28 Feb 2024 14:53:29 -0500 Subject: [PATCH 1/1] 19744: Report steps with low utilization at end of workflow Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- sdk/cwl/arvados_cwl/arvcontainer.py | 2 +- sdk/cwl/arvados_cwl/executor.py | 2 +- .../crunchstat_summary/summarizer.py | 13 +++++++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py index f048e505e8..c3b914ba99 100644 --- a/sdk/cwl/arvados_cwl/arvcontainer.py +++ b/sdk/cwl/arvados_cwl/arvcontainer.py @@ -548,7 +548,7 @@ class ArvadosContainer(JobBase): # Post warnings about nodes that are under-utilized. for rc in summarizer._recommend_gen(lambda x: x): - self.usage_report_notes.append(rc) + self.job_runtime.usage_report_notes.append(rc) except Exception as e: logger.warning("%s unable to generate resource usage report", diff --git a/sdk/cwl/arvados_cwl/executor.py b/sdk/cwl/arvados_cwl/executor.py index 28ee60ac39..432b380aab 100644 --- a/sdk/cwl/arvados_cwl/executor.py +++ b/sdk/cwl/arvados_cwl/executor.py @@ -930,7 +930,7 @@ The 'jobs' API is no longer supported. raise WorkflowException("Workflow did not return a result.") if runtimeContext.usage_report_notes: - logger.info("Resource report notifications:") + logger.info("Steps with low resource utilization (possible optimization opportunities):") for x in runtimeContext.usage_report_notes: logger.info(" %s", x) diff --git a/tools/crunchstat-summary/crunchstat_summary/summarizer.py b/tools/crunchstat-summary/crunchstat_summary/summarizer.py index 8a2cda130b..bc41fdae33 100644 --- a/tools/crunchstat-summary/crunchstat_summary/summarizer.py +++ b/tools/crunchstat-summary/crunchstat_summary/summarizer.py @@ -483,7 +483,7 @@ class Summarizer(object): '{} peak RAM usage was only {}% ({} MiB used / {} MiB requested)' ).format( self.label, - int(100*(used_mib / asked_mib)), + int(math.ceil(100*(used_mib / asked_mib))), int(used_mib), int(asked_mib)) @@ -497,18 +497,23 @@ class Summarizer(object): if self.job_tot['net:keep0']['rx'] == 0: return + + miss_rate = (float(self.job_tot['keepcache']['miss']) / + float(self.job_tot['keepcalls']['get'])) + utilization = (float(self.job_tot['blkio:0:0']['read']) / float(self.job_tot['net:keep0']['rx'])) # FIXME: the default on this get won't work correctly asked_cache = self.existing_constraints.get('keep_cache_ram') or self.existing_constraints.get('keep_cache_disk') - if utilization < 0.5: + if utilization < 0.5 and miss_rate > .05: yield recommendformat( - '{} Keep cache utilization was {:.2f}% -- ' + '{} Keep cache utilization was only {:.2f}% and miss rate was {:.2f}% -- ' 'recommend increasing keep_cache' ).format( self.label, - utilization * 100.0) + utilization * 100.0, + miss_rate * 100.0) def _recommend_temp_disk(self, recommendformat): -- 2.30.2