19744: Incorporate runtime constraints and node info into report
author: Peter Amstutz <peter.amstutz@curii.com>
Thu, 22 Feb 2024 00:05:36 +0000 (19:05 -0500)
committer: Peter Amstutz <peter.amstutz@curii.com>
Thu, 22 Feb 2024 00:05:36 +0000 (19:05 -0500)
This makes it much easier to judge CPU and RAM utilization.

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

sdk/cwl/arvados_cwl/arvcontainer.py
tools/crunchstat-summary/crunchstat_summary/reader.py
tools/crunchstat-summary/crunchstat_summary/summarizer.py

index 97b4e45225f47b42d40711da0224c80fc6889153..bf1199a05414c6bc94e64f074d38412496eceae3 100644 (file)
@@ -541,7 +541,7 @@ class ArvadosContainer(JobBase):
                         mr.write(summerizer.html_report())
                     logc.save()
                 except Exception as e:
-                    logger.error("%s unable to generate resource usage report",
+                    logger.warning("%s unable to generate resource usage report",
                                  self.arvrunner.label(self),
                                  exc_info=(e if self.arvrunner.debug else False))
 
index 8ccdbc2fcf04e45ca3ab3ec6e2270933d050ea1c..2af770f51bc7909ed5a2982fc860d990b8bd6158 100644 (file)
@@ -4,6 +4,7 @@
 
 import arvados
 import itertools
+import json
 import queue
 import threading
 
@@ -11,24 +12,26 @@ from crunchstat_summary import logger
 
 
 class CollectionReader(object):
-    def __init__(self, collection_id):
+    def __init__(self, collection_id, api_client=None):
         self._collection_id = collection_id
         self._label = collection_id
         self._readers = []
+        self._api_client = api_client
+        self._collection = arvados.collection.CollectionReader(self._collection_id, api_client=self._api_client)
 
     def __str__(self):
         return self._label
 
     def __iter__(self):
         logger.debug('load collection %s', self._collection_id)
-        collection = arvados.collection.CollectionReader(self._collection_id)
-        filenames = [filename for filename in collection]
+
+        filenames = [filename for filename in self._collection]
         # Crunch2 has multiple stats files
         if len(filenames) > 1:
             filenames = ['crunchstat.txt', 'arv-mount.txt']
         for filename in filenames:
             try:
-                self._readers.append(collection.open(filename))
+                self._readers.append(self._collection.open(filename))
             except IOError:
                 logger.warn('Unable to open %s', filename)
         self._label = "{}/{}".format(self._collection_id, filenames[0])
@@ -43,6 +46,14 @@ class CollectionReader(object):
                 reader.close()
             self._readers = []
 
+    def node_info(self):
+        try:
+            with self._collection.open("node.json") as f:
+                return json.load(f)
+        except IOError:
+            logger.warn('Unable to open node.json')
+        return {}
+
 
 class LiveLogReader(object):
     EOF = None
index 062be3a65a929b91bc818a01de915aafdb439596..a721ff36b60d2be4bc4d4dc625fc05b92590610e 100644 (file)
@@ -277,11 +277,11 @@ class Summarizer(object):
         label = ""
         s = (self.finishtime - self.starttime).total_seconds()
         if s > 86400:
-            label += '{}d'.format(int(s/86400))
+            label += '{}d '.format(int(s/86400))
         if s > 3600:
-            label += '{}h'.format(int(s/3600) % 24)
+            label += '{}h '.format(int(s/3600) % 24)
         if s > 60:
-            label += '{}m'.format(int(s/60) % 60)
+            label += '{}m '.format(int(s/60) % 60)
         label += '{}s'.format(int(s) % 60)
         return label
 
@@ -312,57 +312,97 @@ class Summarizer(object):
         by_single_task = ""
         if len(self.tasks) > 1:
             by_single_task = " by a single task"
+
         metrics = [
             ('Elapsed time',
              self.elapsed_time(),
              None,
              ''),
-                ('CPU time spent{}'.format(by_single_task),
-                 self.stats_max['cpu']['user+sys'],
-                 None,
-                 's'),
-                ('Max CPU usage in a single interval',
-                 self.stats_max['cpu']['user+sys__rate'],
-                 lambda x: x * 100,
-                 '%'),
-                ('Overall CPU usage',
-                 float(self.job_tot['cpu']['user+sys']) /
-                 self.job_tot['time']['elapsed']
-                 if self.job_tot['time']['elapsed'] > 0 else 0,
-                 lambda x: x * 100,
-                 '%'),
-                ('Max memory used{}'.format(by_single_task),
-                 self.stats_max['mem']['rss'],
-                 lambda x: x / 1e9,
-                 'GB'),
-                ('Max network traffic{}'.format(by_single_task),
-                 self.stats_max['net:eth0']['tx+rx'] +
-                 self.stats_max['net:keep0']['tx+rx'],
-                 lambda x: x / 1e9,
-                 'GB'),
-                ('Max network speed in a single interval',
-                 self.stats_max['net:eth0']['tx+rx__rate'] +
-                 self.stats_max['net:keep0']['tx+rx__rate'],
-                 lambda x: x / 1e6,
-                 'MB/s'),
-                ('Keep cache miss rate',
-                 (float(self.job_tot['keepcache']['miss']) /
-                 float(self.job_tot['keepcalls']['get']))
-                 if self.job_tot['keepcalls']['get'] > 0 else 0,
-                 lambda x: x * 100.0,
-                 '%'),
-                ('Keep cache utilization',
-                 (float(self.job_tot['blkio:0:0']['read']) /
-                 float(self.job_tot['net:keep0']['rx']))
-                 if self.job_tot['net:keep0']['rx'] > 0 else 0,
-                 lambda x: x * 100.0,
-                 '%'),
-               ('Temp disk utilization',
-                 (float(self.job_tot['statfs']['used']) /
-                 float(self.job_tot['statfs']['total']))
-                 if self.job_tot['statfs']['total'] > 0 else 0,
-                 lambda x: x * 100.0,
-                '%'),
+
+            ('Estimated cost',
+             '${:.3f}'.format(self.cost),
+             None,
+             '') if self.cost > 0 else None,
+
+            ('Assigned instance type',
+             self.node_info.get('ProviderType'),
+             None,
+             '') if self.node_info.get('ProviderType') else None,
+
+            ('Instance hourly price',
+             '${:.3f}'.format(self.node_info.get('Price')),
+             None,
+             '') if self.node_info.get('Price') else None,
+
+            ('Max CPU usage in a single interval',
+             self.stats_max['cpu']['user+sys__rate'],
+             lambda x: x * 100,
+             '%'),
+
+            ('Overall CPU usage',
+             float(self.job_tot['cpu']['user+sys']) /
+             self.job_tot['time']['elapsed']
+             if self.job_tot['time']['elapsed'] > 0 else 0,
+             lambda x: x * 100,
+             '%'),
+
+            ('Requested CPU cores',
+             self.existing_constraints.get(self._map_runtime_constraint('vcpus')),
+             None,
+             ''),
+
+            ('Instance VCPUs',
+             self.node_info.get('VCPUs'),
+             None,
+             '') if self.node_info.get('VCPUs') else None,
+
+            ('Max memory used{}'.format(by_single_task),
+             self.stats_max['mem']['rss'],
+             lambda x: x / 2**20,
+             'MB'),
+
+            ('Requested RAM',
+             self.existing_constraints.get(self._map_runtime_constraint('ram')),
+             lambda x: x / 2**20,
+             'MB'),
+
+            ('Maximum RAM request for this instance type',
+             (self.node_info.get('RAM') - self.arv_config.get('Containers', {}).get('ReserveExtraRAM', {}))*.95,
+             lambda x: x / 2**20,
+             'MB'),
+
+            ('Max network traffic{}'.format(by_single_task),
+             self.stats_max['net:eth0']['tx+rx'] +
+             self.stats_max['net:keep0']['tx+rx'],
+             lambda x: x / 1e9,
+             'GB'),
+
+            ('Max network speed in a single interval',
+             self.stats_max['net:eth0']['tx+rx__rate'] +
+             self.stats_max['net:keep0']['tx+rx__rate'],
+             lambda x: x / 1e6,
+             'MB/s'),
+
+            ('Keep cache miss rate',
+             (float(self.job_tot['keepcache']['miss']) /
+              float(self.job_tot['keepcalls']['get']))
+             if self.job_tot['keepcalls']['get'] > 0 else 0,
+             lambda x: x * 100.0,
+             '%'),
+
+            ('Keep cache utilization',
+             (float(self.job_tot['blkio:0:0']['read']) /
+              float(self.job_tot['net:keep0']['rx']))
+             if self.job_tot['net:keep0']['rx'] > 0 else 0,
+             lambda x: x * 100.0,
+             '%'),
+
+            ('Temp disk utilization',
+             (float(self.job_tot['statfs']['used']) /
+              float(self.job_tot['statfs']['total']))
+             if self.job_tot['statfs']['total'] > 0 else 0,
+             lambda x: x * 100.0,
+             '%'),
         ]
 
         if len(self.tasks) > 1:
@@ -371,6 +411,8 @@ class Summarizer(object):
                  None,
                  ''))
         for args in metrics:
+            if args is None:
+                continue
             format_string, val, transform, suffix = args
             if val == float('-Inf'):
                 continue
@@ -581,6 +623,7 @@ class ProcessSummarizer(Summarizer):
     def __init__(self, process, label=None, **kwargs):
         rdr = None
         self.process = process
+        arv = kwargs.get("arv") or arvados.api('v1')
         if label is None:
             label = self.process.get('name', self.process['uuid'])
         # Pre-Arvados v1.4 everything is in 'log'
@@ -588,7 +631,7 @@ class ProcessSummarizer(Summarizer):
         log_collection = self.process.get('log', self.process.get('log_uuid'))
         if log_collection and self.process.get('state') != 'Uncommitted': # arvados.util.CR_UNCOMMITTED:
             try:
-                rdr = crunchstat_summary.reader.CollectionReader(log_collection)
+                rdr = crunchstat_summary.reader.CollectionReader(log_collection, api_client=arv)
             except arvados.errors.NotFoundError as e:
                 logger.warning("Trying event logs after failing to read "
                                "log collection %s: %s", self.process['log'], e)
@@ -596,8 +639,14 @@ class ProcessSummarizer(Summarizer):
             uuid = self.process.get('container_uuid', self.process.get('uuid'))
             rdr = crunchstat_summary.reader.LiveLogReader(uuid)
             label = label + ' (partial)'
+        else:
+            self.node_info = rdr.node_info()
+
         super(ProcessSummarizer, self).__init__(rdr, label=label, **kwargs)
         self.existing_constraints = self.process.get('runtime_constraints', {})
+        self.arv_config = arv.config()
+        self.cost = self.process.get('cost', 0)
+
 
 
 class JobSummarizer(ProcessSummarizer):