10472: add latent support for rolled up stats
[arvados.git] / tools / crunchstat-summary / crunchstat_summary / summarizer.py
index 2ac12abcba23e381073589cf209915b88a9d8cef..f1c97c282c90612789204672e793f7be935939ae 100644 (file)
@@ -11,6 +11,7 @@ import math
 import re
 import sys
 import threading
+import _strptime
 
 from arvados.api import OrderedJsonModel
 from crunchstat_summary import logger
@@ -59,6 +60,21 @@ class Summarizer(object):
 
         logger.debug("%s: logdata %s", self.label, logdata)
 
+    def run_child(self, uuid):
+        if self._skip_child_jobs:
+            logger.warning('%s: omitting stats from child job %s'
+                           ' because --skip-child-jobs flag is on',
+                           self.label, uuid)
+            return
+        logger.debug('%s: follow %s', self.label, uuid)
+        child_summarizer = JobSummarizer(uuid)
+        child_summarizer.stats_max = self.stats_max
+        child_summarizer.task_stats = self.task_stats
+        child_summarizer.tasks = self.tasks
+        child_summarizer.starttime = self.starttime
+        child_summarizer.run()
+        logger.debug('%s: done %s', self.label, uuid)
+
     def run(self):
         logger.debug("%s: parsing logdata %s", self.label, self._logdata)
         for line in self._logdata:
@@ -79,102 +95,99 @@ class Summarizer(object):
                     self.stats_max['time']['elapsed'] = elapsed
                 continue
 
+            # Old style job logs only - newer style uses job['components']
+            uuid = None
             m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) stderr Queued job (?P<uuid>\S+)$', line)
             if m:
-                uuid = m.group('uuid')
-                if self._skip_child_jobs:
-                    logger.warning('%s: omitting stats from child job %s'
-                                   ' because --skip-child-jobs flag is on',
-                                   self.label, uuid)
-                    continue
-                logger.debug('%s: follow %s', self.label, uuid)
-                child_summarizer = JobSummarizer(uuid)
-                child_summarizer.stats_max = self.stats_max
-                child_summarizer.task_stats = self.task_stats
-                child_summarizer.tasks = self.tasks
-                child_summarizer.starttime = self.starttime
-                child_summarizer.run()
-                logger.debug('%s: done %s', self.label, uuid)
+                self.run_child(m.group('uuid'))
                 continue
 
             m = re.search(r'^(?P<timestamp>[^\s.]+)(\.\d+)? (?P<job_uuid>\S+) \d+ (?P<seq>\d+) stderr crunchstat: (?P<category>\S+) (?P<current>.*?)( -- interval (?P<interval>.*))?\n', line)
             if not m:
                 continue
 
-            if self.label is None:
-                self.label = m.group('job_uuid')
-                logger.debug('%s: using job uuid as label', self.label)
-            if m.group('category').endswith(':'):
-                # "stderr crunchstat: notice: ..."
-                continue
-            elif m.group('category') in ('error', 'caught'):
-                continue
-            elif m.group('category') == 'read':
-                # "stderr crunchstat: read /proc/1234/net/dev: ..."
-                # (crunchstat formatting fixed, but old logs still say this)
-                continue
-            task_id = self.seq_to_uuid[int(m.group('seq'))]
-            task = self.tasks[task_id]
-
-            # Use the first and last crunchstat timestamps as
-            # approximations of starttime and finishtime.
-            timestamp = datetime.datetime.strptime(
-                m.group('timestamp'), '%Y-%m-%d_%H:%M:%S')
-            if not task.starttime:
-                task.starttime = timestamp
-                logger.debug('%s: task %s starttime %s',
-                             self.label, task_id, timestamp)
-            task.finishtime = timestamp
-
-            if not self.starttime:
-                self.starttime = timestamp
-            self.finishtime = timestamp
-
-            this_interval_s = None
-            for group in ['current', 'interval']:
-                if not m.group(group):
+            try:
+                if self.label is None:
+                    self.label = m.group('job_uuid')
+                    logger.debug('%s: using job uuid as label', self.label)
+                if m.group('category').endswith(':'):
+                    # "stderr crunchstat: notice: ..."
                     continue
-                category = m.group('category')
-                words = m.group(group).split(' ')
-                stats = {}
-                for val, stat in zip(words[::2], words[1::2]):
-                    try:
-                        if '.' in val:
-                            stats[stat] = float(val)
-                        else:
-                            stats[stat] = int(val)
-                    except ValueError as e:
-                        raise ValueError(
-                            'Error parsing {} stat in "{}": {!r}'.format(
-                                stat, line, e))
-                if 'user' in stats or 'sys' in stats:
-                    stats['user+sys'] = stats.get('user', 0) + stats.get('sys', 0)
-                if 'tx' in stats or 'rx' in stats:
-                    stats['tx+rx'] = stats.get('tx', 0) + stats.get('rx', 0)
-                for stat, val in stats.iteritems():
-                    if group == 'interval':
-                        if stat == 'seconds':
-                            this_interval_s = val
-                            continue
-                        elif not (this_interval_s > 0):
-                            logger.error(
-                                "BUG? interval stat given with duration {!r}".
-                                format(this_interval_s))
-                            continue
+                elif m.group('category') in ('error', 'caught'):
+                    continue
+                elif m.group('category') == 'read':
+                    # "stderr crunchstat: read /proc/1234/net/dev: ..."
+                    # (crunchstat formatting fixed, but old logs still say this)
+                    continue
+                task_id = self.seq_to_uuid[int(m.group('seq'))]
+                task = self.tasks[task_id]
+
+                # Use the first and last crunchstat timestamps as
+                # approximations of starttime and finishtime.
+                timestamp = datetime.datetime.strptime(
+                    m.group('timestamp'), '%Y-%m-%d_%H:%M:%S')
+                if not task.starttime:
+                    task.starttime = timestamp
+                    logger.debug('%s: task %s starttime %s',
+                                 self.label, task_id, timestamp)
+                task.finishtime = timestamp
+
+                if not self.starttime:
+                    self.starttime = timestamp
+                self.finishtime = timestamp
+
+                this_interval_s = None
+                for group in ['current', 'interval']:
+                    if not m.group(group):
+                        continue
+                    category = m.group('category')
+                    words = m.group(group).split(' ')
+                    stats = {}
+                    for val, stat in zip(words[::2], words[1::2]):
+                        try:
+                            if '.' in val:
+                                stats[stat] = float(val)
+                            else:
+                                stats[stat] = int(val)
+                        except ValueError as e:
+                            raise ValueError(
+                                'Error parsing {} stat: {!r}'.format(
+                                    stat, e))
+                    if 'user' in stats or 'sys' in stats:
+                        stats['user+sys'] = stats.get('user', 0) + stats.get('sys', 0)
+                    if 'tx' in stats or 'rx' in stats:
+                        stats['tx+rx'] = stats.get('tx', 0) + stats.get('rx', 0)
+                    for stat, val in stats.iteritems():
+                        if group == 'interval':
+                            if stat == 'seconds':
+                                this_interval_s = val
+                                continue
+                            elif not (this_interval_s > 0):
+                                logger.error(
+                                    "BUG? interval stat given with duration {!r}".
+                                    format(this_interval_s))
+                                continue
+                            else:
+                                stat = stat + '__rate'
+                                val = val / this_interval_s
+                                if stat in ['user+sys__rate', 'tx+rx__rate']:
+                                    task.series[category, stat].append(
+                                        (timestamp - self.starttime, val))
                         else:
-                            stat = stat + '__rate'
-                            val = val / this_interval_s
-                            if stat in ['user+sys__rate', 'tx+rx__rate']:
+                            if stat in ['rss']:
                                 task.series[category, stat].append(
                                     (timestamp - self.starttime, val))
-                    else:
-                        if stat in ['rss']:
-                            task.series[category, stat].append(
-                                (timestamp - self.starttime, val))
-                        self.task_stats[task_id][category][stat] = val
-                    if val > self.stats_max[category][stat]:
-                        self.stats_max[category][stat] = val
-        logger.debug('%s: done parsing', self.label)
+                            self.task_stats[task_id][category][stat] = val
+                        if val > self.stats_max[category][stat]:
+                            self.stats_max[category][stat] = val
+            except Exception as e:
+                logger.info('Skipping malformed line: {}Error was: {}\n'.format(line, e))
+        logger.debug('%s: done parsing log', self.label)
+
+        # Enabling this will roll up stats for all subjobs into the parent job
+        if False and 'components' in self.job:
+            for cname, component in self.job['components'].iteritems():
+                self.run_child(component)
 
         self.job_tot = collections.defaultdict(
             functools.partial(collections.defaultdict, int))
@@ -401,8 +414,8 @@ class JobSummarizer(Summarizer):
 
 class PipelineSummarizer(object):
     def __init__(self, pipeline_instance_uuid, **kwargs):
-        arv = arvados.api('v1', model=OrderedJsonModel())
-        instance = arv.pipeline_instances().get(
+        self.arv = arvados.api('v1', model=OrderedJsonModel())
+        instance = self.arv.pipeline_instances().get(
             uuid=pipeline_instance_uuid).execute()
         self.summarizers = collections.OrderedDict()
         for cname, component in instance['components'].iteritems():
@@ -410,14 +423,20 @@ class PipelineSummarizer(object):
                 logger.warning(
                     "%s: skipping component with no job assigned", cname)
             else:
-                logger.info(
-                    "%s: job %s", cname, component['job']['uuid'])
-                summarizer = JobSummarizer(component['job'], **kwargs)
-                summarizer.label = '{} {}'.format(
-                    cname, component['job']['uuid'])
-                self.summarizers[cname] = summarizer
+                self.summarize_job(cname, component['job'], **kwargs)
         self.label = pipeline_instance_uuid
 
+    def summarize_job(self, cname, job, **kwargs):
+        uuid = job['uuid']
+        logger.info("%s: job %s", cname, uuid)
+        summarizer = JobSummarizer(job, **kwargs)
+        summarizer.label = '{} {}'.format(cname, uuid)
+        self.summarizers[cname] = summarizer
+        if 'components' in job:
+            for cname, uuid in job['components'].iteritems():
+                subjob = self.arv.jobs().get(uuid=uuid).execute()
+                self.summarize_job(cname, subjob, **kwargs)
+
     def run(self):
         threads = []
         for summarizer in self.summarizers.itervalues():