19744: Incorporate runtime constraints and node info into report
author: Peter Amstutz <peter.amstutz@curii.com>
Thu, 22 Feb 2024 00:05:36 +0000 (19:05 -0500)
committer: Peter Amstutz <peter.amstutz@curii.com>
Thu, 22 Feb 2024 00:05:36 +0000 (19:05 -0500)
This makes it much easier to judge CPU and RAM utilization.

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

sdk/cwl/arvados_cwl/arvcontainer.py
tools/crunchstat-summary/crunchstat_summary/reader.py
tools/crunchstat-summary/crunchstat_summary/summarizer.py

index 97b4e45225f47b42d40711da0224c80fc6889153..bf1199a05414c6bc94e64f074d38412496eceae3 100644 (file)
@@ -541,7 +541,7 @@ class ArvadosContainer(JobBase):
                         mr.write(summerizer.html_report())
                     logc.save()
                 except Exception as e:
-                    logger.error("%s unable to generate resource usage report",
+                    logger.warning("%s unable to generate resource usage report",
                                  self.arvrunner.label(self),
                                  exc_info=(e if self.arvrunner.debug else False))
 
index 8ccdbc2fcf04e45ca3ab3ec6e2270933d050ea1c..2af770f51bc7909ed5a2982fc860d990b8bd6158 100644 (file)
@@ -4,6 +4,7 @@
 
 import arvados
 import itertools
+import json
 import queue
 import threading
 
@@ -11,24 +12,26 @@ from crunchstat_summary import logger
 
 
 class CollectionReader(object):
-    def __init__(self, collection_id):
+    def __init__(self, collection_id, api_client=None):
         self._collection_id = collection_id
         self._label = collection_id
         self._readers = []
+        self._api_client = api_client
+        self._collection = arvados.collection.CollectionReader(self._collection_id, api_client=self._api_client)
 
     def __str__(self):
         return self._label
 
     def __iter__(self):
         logger.debug('load collection %s', self._collection_id)
-        collection = arvados.collection.CollectionReader(self._collection_id)
-        filenames = [filename for filename in collection]
+
+        filenames = [filename for filename in self._collection]
         # Crunch2 has multiple stats files
         if len(filenames) > 1:
             filenames = ['crunchstat.txt', 'arv-mount.txt']
         for filename in filenames:
             try:
-                self._readers.append(collection.open(filename))
+                self._readers.append(self._collection.open(filename))
             except IOError:
                 logger.warn('Unable to open %s', filename)
         self._label = "{}/{}".format(self._collection_id, filenames[0])
@@ -43,6 +46,14 @@ class CollectionReader(object):
                 reader.close()
             self._readers = []
 
+    def node_info(self):
+        try:
+            with self._collection.open("node.json") as f:
+                return json.load(f)
+        except IOError:
+            logger.warn('Unable to open node.json')
+        return {}
+
 
 class LiveLogReader(object):
     EOF = None
index 062be3a65a929b91bc818a01de915aafdb439596..a721ff36b60d2be4bc4d4dc625fc05b92590610e 100644 (file)
@@ -277,11 +277,11 @@ class Summarizer(object):
         label = ""
         s = (self.finishtime - self.starttime).total_seconds()
         if s > 86400:
-            label += '{}d'.format(int(s/86400))
+            label += '{}d '.format(int(s/86400))
         if s > 3600:
-            label += '{}h'.format(int(s/3600) % 24)
+            label += '{}h '.format(int(s/3600) % 24)
         if s > 60:
-            label += '{}m'.format(int(s/60) % 60)
+            label += '{}m '.format(int(s/60) % 60)
         label += '{}s'.format(int(s) % 60)
         return label
 
@@ -312,57 +312,97 @@ class Summarizer(object):
         by_single_task = ""
         if len(self.tasks) > 1:
             by_single_task = " by a single task"
+
         metrics = [
             ('Elapsed time',
              self.elapsed_time(),
              None,
              ''),
-                ('CPU time spent{}'.format(by_single_task),
-                 self.stats_max['cpu']['user+sys'],
-                 None,
-                 's'),
-                ('Max CPU usage in a single interval',
-                 self.stats_max['cpu']['user+sys__rate'],
-                 lambda x: x * 100,
-                 '%'),
-                ('Overall CPU usage',
-                 float(self.job_tot['cpu']['user+sys']) /
-                 self.job_tot['time']['elapsed']
-                 if self.job_tot['time']['elapsed'] > 0 else 0,
-                 lambda x: x * 100,
-                 '%'),
-                ('Max memory used{}'.format(by_single_task),
-                 self.stats_max['mem']['rss'],
-                 lambda x: x / 1e9,
-                 'GB'),
-                ('Max network traffic{}'.format(by_single_task),
-                 self.stats_max['net:eth0']['tx+rx'] +
-                 self.stats_max['net:keep0']['tx+rx'],
-                 lambda x: x / 1e9,
-                 'GB'),
-                ('Max network speed in a single interval',
-                 self.stats_max['net:eth0']['tx+rx__rate'] +
-                 self.stats_max['net:keep0']['tx+rx__rate'],
-                 lambda x: x / 1e6,
-                 'MB/s'),
-                ('Keep cache miss rate',
-                 (float(self.job_tot['keepcache']['miss']) /
-                 float(self.job_tot['keepcalls']['get']))
-                 if self.job_tot['keepcalls']['get'] > 0 else 0,
-                 lambda x: x * 100.0,
-                 '%'),
-                ('Keep cache utilization',
-                 (float(self.job_tot['blkio:0:0']['read']) /
-                 float(self.job_tot['net:keep0']['rx']))
-                 if self.job_tot['net:keep0']['rx'] > 0 else 0,
-                 lambda x: x * 100.0,
-                 '%'),
-               ('Temp disk utilization',
-                 (float(self.job_tot['statfs']['used']) /
-                 float(self.job_tot['statfs']['total']))
-                 if self.job_tot['statfs']['total'] > 0 else 0,
-                 lambda x: x * 100.0,
-                '%'),
+
+            ('Estimated cost',
+             '${:.3f}'.format(self.cost),
+             None,
+             '') if self.cost > 0 else None,
+
+            ('Assigned instance type',
+             self.node_info.get('ProviderType'),
+             None,
+             '') if self.node_info.get('ProviderType') else None,
+
+            ('Instance hourly price',
+             '${:.3f}'.format(self.node_info.get('Price')),
+             None,
+             '') if self.node_info.get('Price') else None,
+
+            ('Max CPU usage in a single interval',
+             self.stats_max['cpu']['user+sys__rate'],
+             lambda x: x * 100,
+             '%'),
+
+            ('Overall CPU usage',
+             float(self.job_tot['cpu']['user+sys']) /
+             self.job_tot['time']['elapsed']
+             if self.job_tot['time']['elapsed'] > 0 else 0,
+             lambda x: x * 100,
+             '%'),
+
+            ('Requested CPU cores',
+             self.existing_constraints.get(self._map_runtime_constraint('vcpus')),
+             None,
+             ''),
+
+            ('Instance VCPUs',
+             self.node_info.get('VCPUs'),
+             None,
+             '') if self.node_info.get('VCPUs') else None,
+
+            ('Max memory used{}'.format(by_single_task),
+             self.stats_max['mem']['rss'],
+             lambda x: x / 2**20,
+             'MB'),
+
+            ('Requested RAM',
+             self.existing_constraints.get(self._map_runtime_constraint('ram')),
+             lambda x: x / 2**20,
+             'MB'),
+
+            ('Maximum RAM request for this instance type',
+             (self.node_info.get('RAM') - self.arv_config.get('Containers', {}).get('ReserveExtraRAM', {}))*.95,
+             lambda x: x / 2**20,
+             'MB'),
+
+            ('Max network traffic{}'.format(by_single_task),
+             self.stats_max['net:eth0']['tx+rx'] +
+             self.stats_max['net:keep0']['tx+rx'],
+             lambda x: x / 1e9,
+             'GB'),
+
+            ('Max network speed in a single interval',
+             self.stats_max['net:eth0']['tx+rx__rate'] +
+             self.stats_max['net:keep0']['tx+rx__rate'],
+             lambda x: x / 1e6,
+             'MB/s'),
+
+            ('Keep cache miss rate',
+             (float(self.job_tot['keepcache']['miss']) /
+              float(self.job_tot['keepcalls']['get']))
+             if self.job_tot['keepcalls']['get'] > 0 else 0,
+             lambda x: x * 100.0,
+             '%'),
+
+            ('Keep cache utilization',
+             (float(self.job_tot['blkio:0:0']['read']) /
+              float(self.job_tot['net:keep0']['rx']))
+             if self.job_tot['net:keep0']['rx'] > 0 else 0,
+             lambda x: x * 100.0,
+             '%'),
+
+            ('Temp disk utilization',
+             (float(self.job_tot['statfs']['used']) /
+              float(self.job_tot['statfs']['total']))
+             if self.job_tot['statfs']['total'] > 0 else 0,
+             lambda x: x * 100.0,
+             '%'),
         ]
 
         if len(self.tasks) > 1:
@@ -371,6 +411,8 @@ class Summarizer(object):
                  None,
                  ''))
         for args in metrics:
+            if args is None:
+                continue
             format_string, val, transform, suffix = args
             if val == float('-Inf'):
                 continue
@@ -581,6 +623,7 @@ class ProcessSummarizer(Summarizer):
     def __init__(self, process, label=None, **kwargs):
         rdr = None
         self.process = process
+        arv = kwargs.get("arv") or arvados.api('v1')
         if label is None:
             label = self.process.get('name', self.process['uuid'])
         # Pre-Arvados v1.4 everything is in 'log'
@@ -588,7 +631,7 @@ class ProcessSummarizer(Summarizer):
         log_collection = self.process.get('log', self.process.get('log_uuid'))
         if log_collection and self.process.get('state') != 'Uncommitted': # arvados.util.CR_UNCOMMITTED:
             try:
-                rdr = crunchstat_summary.reader.CollectionReader(log_collection)
+                rdr = crunchstat_summary.reader.CollectionReader(log_collection, api_client=arv)
             except arvados.errors.NotFoundError as e:
                 logger.warning("Trying event logs after failing to read "
                                "log collection %s: %s", self.process['log'], e)
@@ -596,8 +639,14 @@ class ProcessSummarizer(Summarizer):
             uuid = self.process.get('container_uuid', self.process.get('uuid'))
             rdr = crunchstat_summary.reader.LiveLogReader(uuid)
             label = label + ' (partial)'
+        else:
+            self.node_info = rdr.node_info()
+
         super(ProcessSummarizer, self).__init__(rdr, label=label, **kwargs)
         self.existing_constraints = self.process.get('runtime_constraints', {})
+        self.arv_config = arv.config()
+        self.cost = self.process.get('cost', 0)
+
 
 
 class JobSummarizer(ProcessSummarizer):