7901: Add job stats, elapsed time, summed user+sys and tx+rx, and some human-readable...
authorTom Clegg <tom@curoverse.com>
Thu, 17 Dec 2015 15:05:30 +0000 (10:05 -0500)
committerTom Clegg <tom@curoverse.com>
Thu, 17 Dec 2015 15:06:23 +0000 (10:06 -0500)
tools/crunchstat-summary/crunchstat_summary/summarizer.py
tools/crunchstat-summary/tests/logfile_20151204190335.txt.gz.report
tools/crunchstat-summary/tests/logfile_20151210063411.txt.gz.report
tools/crunchstat-summary/tests/logfile_20151210063439.txt.gz.report

index 65160184d6a48739df7c3ecf552483bb58310ad0..ac0964b30e990724ecb5c7b16fb1cbd940bce25d 100644 (file)
@@ -1,6 +1,8 @@
 from __future__ import print_function
 
 import arvados
+import collections
+import functools
 import gzip
 import re
 import sys
@@ -11,27 +13,46 @@ class Summarizer(object):
         self.args = args
 
     def run(self):
-        stats_max = {}
+        # stats_max: {category: {stat: val}}
+        self.stats_max = collections.defaultdict(
+            functools.partial(collections.defaultdict,
+                              lambda: float('-Inf')))
+        # task_stats: {task_id: {category: {stat: val}}}
+        self.task_stats = collections.defaultdict(
+            functools.partial(collections.defaultdict, dict))
         for line in self._logdata():
+            m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) success in (?P<elapsed>\d+) seconds', line)
+            if m:
+                task_id = m.group('seq')
+                elapsed = int(m.group('elapsed'))
+                self.task_stats[task_id]['time'] = {'elapsed': elapsed}
+                if elapsed > self.stats_max['time']['elapsed']:
+                    self.stats_max['time']['elapsed'] = elapsed
+                continue
             m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) stderr crunchstat: (?P<category>\S+) (?P<current>.*?)( -- interval (?P<interval>.*))?\n', line)
             if not m:
                 continue
             if m.group('category').endswith(':'):
                 # "notice:" etc.
                 continue
+            task_id = m.group('seq')
             this_interval_s = None
             for group in ['current', 'interval']:
                 if not m.group(group):
                     continue
                 category = m.group('category')
-                if category not in stats_max:
-                    stats_max[category] = {}
                 words = m.group(group).split(' ')
+                stats = {}
                 for val, stat in zip(words[::2], words[1::2]):
                     if '.' in val:
-                        val = float(val)
+                        stats[stat] = float(val)
                     else:
-                        val = int(val)
+                        stats[stat] = int(val)
+                if 'user' in stats or 'sys' in stats:
+                    stats['user+sys'] = stats.get('user', 0) + stats.get('sys', 0)
+                if 'tx' in stats or 'rx' in stats:
+                    stats['tx+rx'] = stats.get('tx', 0) + stats.get('rx', 0)
+                for stat, val in stats.iteritems():
                     if group == 'interval':
                         if stat == 'seconds':
                             this_interval_s = val
@@ -44,26 +65,67 @@ class Summarizer(object):
                         else:
                             stat = stat + '__rate'
                             val = val / this_interval_s
-                    if val > stats_max[category].get(stat, float('-Inf')):
-                        stats_max[category][stat] = val
-        self.stats_max = stats_max
+                    else:
+                        self.task_stats[task_id][category][stat] = val
+                    if val > self.stats_max[category][stat]:
+                        self.stats_max[category][stat] = val
 
     def report(self):
         return "\n".join(self._report_gen()) + "\n"
 
     def _report_gen(self):
-        yield "\t".join(['category', 'metric', 'max', 'max_rate'])
-        for category, stat_max in self.stats_max.iteritems():
-            for stat, val in stat_max.iteritems():
+        job_tot = collections.defaultdict(
+            functools.partial(collections.defaultdict, int))
+        for task_id, task_stat in self.task_stats.iteritems():
+            for category, stat_last in task_stat.iteritems():
+                for stat, val in stat_last.iteritems():
+                    if stat in ['cpus', 'cache', 'swap', 'rss']:
+                        # meaningless stats like 16 cpu cores x 5 tasks = 80
+                        continue
+                    job_tot[category][stat] += val
+        yield "\t".join(['category', 'metric', 'task_max', 'task_max_rate', 'job_total'])
+        for category, stat_max in sorted(self.stats_max.iteritems()):
+            for stat, val in sorted(stat_max.iteritems()):
                 if stat.endswith('__rate'):
                     continue
-                if stat+'__rate' in stat_max:
-                    max_rate = '{:.2f}'.format(stat_max[stat+'__rate'])
-                else:
-                    max_rate = '-'
-                if isinstance(val, float):
-                    val = '{:.2f}'.format(val)
-                yield "\t".join([category, stat, str(val), max_rate])
+                max_rate = self._format(stat_max.get(stat+'__rate', '-'))
+                val = self._format(val)
+                tot = self._format(job_tot[category].get(stat, '-'))
+                yield "\t".join([category, stat, str(val), max_rate, tot])
+        for args in (
+                ('Max CPU time spent by a single task: {}s',
+                 self.stats_max['cpu']['user+sys'],
+                 None),
+                ('Max CPU usage in a single interval: {}%',
+                 self.stats_max['cpu']['user+sys__rate'],
+                 lambda x: x * 100),
+                ('Overall CPU usage: {}%',
+                 job_tot['cpu']['user+sys'] / job_tot['time']['elapsed'],
+                 lambda x: x * 100),
+                ('Max memory used by a single task: {}GB',
+                 self.stats_max['mem']['rss'],
+                 lambda x: x / 1e9),
+                ('Max network traffic in a single task: {}GB',
+                 self.stats_max['net:eth0']['tx+rx'],
+                 lambda x: x / 1e9),
+                ('Max network speed in a single interval: {}MB/s',
+                 self.stats_max['net:eth0']['tx+rx__rate'],
+                 lambda x: x / 1e6)):
+            format_string, val, transform = args
+            if val == float('-Inf'):
+                continue
+            if transform:
+                val = transform(val)
+            yield "# "+format_string.format(self._format(val))
+
+    def _format(self, val):
+        """Return a string representation of a stat.
+
+        {:.2f} for floats, default format for everything else."""
+        if isinstance(val, float):
+            return '{:.2f}'.format(val)
+        else:
+            return '{}'.format(val)
 
     def _logdata(self):
         if self.args.log_file:
index 5ac1b28c01952e63fb86833d4cdcadd17a7880e7..ef7beb11c0057009f8465aa2836349ca32235f5a 100644 (file)
@@ -1,20 +1,30 @@
-category       metric  max     max_rate
-net:eth0       rx      1754364530      41658344.87
-net:eth0       tx      38837956        920817.97
-mem    rss     349814784       -
-mem    cache   1678139392      -
-mem    swap    0       -
-mem    pgmajfault      0       -
-keepcalls      put     0       0.00
-keepcalls      get     0       0.00
-keepcache      miss    0       0.00
-keepcache      hit     0       0.00
-blkio:0:0      write   0       0.00
-blkio:0:0      read    0       0.00
-net:keep0      rx      0       0.00
-net:keep0      tx      0       0.00
-cpu    sys     1.92    0.04
-cpu    user    3.83    0.09
-cpu    cpus    8       -
-fuseops        write   0       0.00
-fuseops        read    0       0.00
+category       metric  task_max        task_max_rate   job_total
+blkio:0:0      read    0       0.00    0
+blkio:0:0      write   0       0.00    0
+cpu    cpus    8       -       -
+cpu    sys     1.92    0.04    1.92
+cpu    user    3.83    0.09    3.83
+cpu    user+sys        5.75    0.13    5.75
+fuseops        read    0       0.00    0
+fuseops        write   0       0.00    0
+keepcache      hit     0       0.00    0
+keepcache      miss    0       0.00    0
+keepcalls      get     0       0.00    0
+keepcalls      put     0       0.00    0
+mem    cache   1678139392      -       -
+mem    pgmajfault      0       -       0
+mem    rss     349814784       -       -
+mem    swap    0       -       -
+net:eth0       rx      1754364530      41658344.87     1754364530
+net:eth0       tx      38837956        920817.97       38837956
+net:eth0       tx+rx   1793202486      42579162.83     1793202486
+net:keep0      rx      0       0.00    0
+net:keep0      tx      0       0.00    0
+net:keep0      tx+rx   0       0.00    0
+time   elapsed 80      -       80
+# Max CPU time spent by a single task: 5.75s
+# Max CPU usage in a single interval: 13.00%
+# Overall CPU usage: 7.19%
+# Max memory used by a single task: 0.35GB
+# Max network traffic in a single task: 1.79GB
+# Max network speed in a single interval: 42.58MB/s
index d056317827ecad73581b5e0cec7ce9e291d0b8d5..38af3e7e8c7752eb07532f5a2f4fd54a616eab4e 100644 (file)
@@ -1,10 +1,17 @@
-category       metric  max     max_rate
-mem    rss     856064  -
-mem    cache   12288   -
-mem    swap    0       -
-mem    pgmajfault      0       -
-cpu    sys     0.00    -
-cpu    user    0.00    -
-cpu    cpus    8       -
-net:eth0       rx      90      -
-net:eth0       tx      90      -
+category       metric  task_max        task_max_rate   job_total
+cpu    cpus    8       -       -
+cpu    sys     0.00    -       0.00
+cpu    user    0.00    -       0.00
+cpu    user+sys        0.00    -       0.00
+mem    cache   12288   -       -
+mem    pgmajfault      0       -       0
+mem    rss     856064  -       -
+mem    swap    0       -       -
+net:eth0       rx      90      -       90
+net:eth0       tx      90      -       90
+net:eth0       tx+rx   180     -       180
+time   elapsed 2       -       4
+# Max CPU time spent by a single task: 0.00s
+# Overall CPU usage: 0.00%
+# Max memory used by a single task: 0.00GB
+# Max network traffic in a single task: 0.00GB
index d3939dc971a838e500a932ba5f01751224f0aaf5..7e42d612b753fadb25e4bac7cee7e941e01231e3 100644 (file)
@@ -1,10 +1,17 @@
-category       metric  max     max_rate
-mem    rss     450560  -
-mem    cache   8192    -
-mem    swap    0       -
-mem    pgmajfault      0       -
-cpu    sys     0.00    -
-cpu    user    0.00    -
-cpu    cpus    8       -
-net:eth0       rx      90      -
-net:eth0       tx      90      -
+category       metric  task_max        task_max_rate   job_total
+cpu    cpus    8       -       -
+cpu    sys     0.00    -       0.00
+cpu    user    0.00    -       0.00
+cpu    user+sys        0.00    -       0.00
+mem    cache   8192    -       -
+mem    pgmajfault      0       -       0
+mem    rss     450560  -       -
+mem    swap    0       -       -
+net:eth0       rx      90      -       90
+net:eth0       tx      90      -       90
+net:eth0       tx+rx   180     -       180
+time   elapsed 2       -       3
+# Max CPU time spent by a single task: 0.00s
+# Overall CPU usage: 0.00%
+# Max memory used by a single task: 0.00GB
+# Max network traffic in a single task: 0.00GB