X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/68fee3aff2bc0e189827956720de58c3e8668bdc..6fa0df704b8b1e52579307d64c2e6d4edfa9aff7:/tools/crunchstat-summary/crunchstat_summary/summarizer.py diff --git a/tools/crunchstat-summary/crunchstat_summary/summarizer.py b/tools/crunchstat-summary/crunchstat_summary/summarizer.py index e8a842d3a9..28afc9e0d5 100644 --- a/tools/crunchstat-summary/crunchstat_summary/summarizer.py +++ b/tools/crunchstat-summary/crunchstat_summary/summarizer.py @@ -2,8 +2,6 @@ # # SPDX-License-Identifier: AGPL-3.0 -from __future__ import print_function - import arvados import collections import crunchstat_summary.dygraphs @@ -24,7 +22,7 @@ from crunchstat_summary import logger # number of GiB. (Actual nodes tend to be sold in sizes like 8 GiB # that have amounts like 7.5 GiB according to the kernel.) AVAILABLE_RAM_RATIO = 0.95 - +MB=2**20 # Workaround datetime.datetime.strptime() thread-safety bug by calling # it once before starting threads. https://bugs.python.org/issue7980 @@ -37,6 +35,7 @@ WEBCHART_CLASS = crunchstat_summary.dygraphs.DygraphsChart class Task(object): def __init__(self): self.starttime = None + self.finishtime = None self.series = collections.defaultdict(list) @@ -70,12 +69,16 @@ class Summarizer(object): def run(self): logger.debug("%s: parsing logdata %s", self.label, self._logdata) - detected_crunch1 = False - for line in self._logdata: - if not detected_crunch1 and '-8i9sb-' in line: - detected_crunch1 = True + with self._logdata as logdata: + self._run(logdata) + + def _run(self, logdata): + self.detected_crunch1 = False + for line in logdata: + if not self.detected_crunch1 and '-8i9sb-' in line: + self.detected_crunch1 = True - if detected_crunch1: + if self.detected_crunch1: m = re.search(r'^\S+ \S+ \d+ (?P\d+) job_task (?P\S+)$', line) if m: seq = int(m.group('seq')) @@ -111,12 +114,14 @@ class Summarizer(object): logger.debug('%s: done %s', self.label, uuid) continue - m = re.search(r'^(?P[^\s.]+)(\.\d+)? (?P\S+) \d+ (?P\d+) stderr crunchstat: (?P\S+) (?P.*?)( -- interval (?P.*))?\n$', line) + # 2017-12-02_17:15:08 e51c5-8i9sb-mfp68stkxnqdd6m 63676 0 stderr crunchstat: keepcalls 0 put 2576 get -- interval 10.0000 seconds 0 put 2576 get + m = re.search(r'^(?P[^\s.]+)(\.\d+)? (?P\S+) \d+ (?P\d+) stderr (?Pcrunchstat: )(?P\S+) (?P.*?)( -- interval (?P.*))?\n$', line) if not m: continue else: # crunch2 - m = re.search(r'^(?P\S+) (?P\S+) (?P.*?)( -- interval (?P.*))?\n$', line) + # 2017-12-01T16:56:24.723509200Z crunchstat: keepcalls 0 put 3 get -- interval 10.0000 seconds 0 put 3 get + m = re.search(r'^(?P\S+) (?Pcrunchstat: )?(?P\S+) (?P.*?)( -- interval (?P.*))?\n$', line) if not m: continue @@ -130,12 +135,12 @@ class Summarizer(object): continue elif m.group('category') in ('error', 'caught'): continue - elif m.group('category') in ['read', 'open', 'cgroup', 'CID']: + elif m.group('category') in ('read', 'open', 'cgroup', 'CID', 'Running'): # "stderr crunchstat: read /proc/1234/net/dev: ..." # (old logs are less careful with unprefixed error messages) continue - if detected_crunch1: + if self.detected_crunch1: task_id = self.seq_to_uuid[int(m.group('seq'))] else: task_id = 'container' @@ -154,15 +159,24 @@ class Summarizer(object): raise ValueError("Cannot parse timestamp {!r}".format( timestamp)) - if not task.starttime: - task.starttime = timestamp + if task.starttime is None: logger.debug('%s: task %s starttime %s', self.label, task_id, timestamp) - task.finishtime = timestamp + if task.starttime is None or timestamp < task.starttime: + task.starttime = timestamp + if task.finishtime is None or timestamp > task.finishtime: + task.finishtime = timestamp - if not self.starttime: + if self.starttime is None or timestamp < task.starttime: self.starttime = timestamp - self.finishtime = timestamp + if self.finishtime is None or timestamp < task.finishtime: + self.finishtime = timestamp + + if (not self.detected_crunch1) and task.starttime is not None and task.finishtime is not None: + elapsed = (task.finishtime - task.starttime).seconds + self.task_stats[task_id]['time'] = {'elapsed': elapsed} + if elapsed > self.stats_max['time']['elapsed']: + self.stats_max['time']['elapsed'] = elapsed this_interval_s = None for group in ['current', 'interval']: @@ -178,14 +192,22 @@ class Summarizer(object): else: stats[stat] = int(val) except ValueError as e: - logger.warning('Error parsing {} stat: {!r}'.format( - stat, e)) + # If the line doesn't start with 'crunchstat:' we + # might have mistaken an error message for a + # structured crunchstat line. + if m.group("crunchstat") is None or m.group("category") == "crunchstat": + logger.warning("%s: log contains message\n %s", self.label, line) + else: + logger.warning( + '%s: Error parsing value %r (stat %r, category %r): %r', + self.label, val, stat, category, e) + logger.warning('%s', line) continue if 'user' in stats or 'sys' in stats: stats['user+sys'] = stats.get('user', 0) + stats.get('sys', 0) if 'tx' in stats or 'rx' in stats: stats['tx+rx'] = stats.get('tx', 0) + stats.get('rx', 0) - for stat, val in stats.iteritems(): + for stat, val in stats.items(): if group == 'interval': if stat == 'seconds': this_interval_s = val @@ -212,9 +234,9 @@ class Summarizer(object): self.job_tot = collections.defaultdict( functools.partial(collections.defaultdict, int)) - for task_id, task_stat in self.task_stats.iteritems(): - for category, stat_last in task_stat.iteritems(): - for stat, val in stat_last.iteritems(): + for task_id, task_stat in self.task_stats.items(): + for category, stat_last in task_stat.items(): + for stat, val in stat_last.items(): if stat in ['cpus', 'cache', 'swap', 'rss']: # meaningless stats like 16 cpu cores x 5 tasks = 80 continue @@ -223,6 +245,8 @@ class Summarizer(object): def long_label(self): label = self.label + if hasattr(self, 'process') and self.process['uuid'] not in label: + label = '{} ({})'.format(label, self.process['uuid']) if self.finishtime: label += ' -- elapsed time ' s = (self.finishtime - self.starttime).total_seconds() @@ -247,8 +271,8 @@ class Summarizer(object): def _text_report_gen(self): yield "\t".join(['category', 'metric', 'task_max', 'task_max_rate', 'job_total']) - for category, stat_max in sorted(self.stats_max.iteritems()): - for stat, val in sorted(stat_max.iteritems()): + for category, stat_max in sorted(self.stats_max.items()): + for stat, val in sorted(stat_max.items()): if stat.endswith('__rate'): continue max_rate = self._format(stat_max.get(stat+'__rate', '-')) @@ -266,7 +290,7 @@ class Summarizer(object): self.stats_max['cpu']['user+sys__rate'], lambda x: x * 100), ('Overall CPU usage: {}%', - self.job_tot['cpu']['user+sys'] / + float(self.job_tot['cpu']['user+sys']) / self.job_tot['time']['elapsed'] if self.job_tot['time']['elapsed'] > 0 else 0, lambda x: x * 100), @@ -307,19 +331,21 @@ class Summarizer(object): def _recommend_cpu(self): """Recommend asking for 4 cores if max CPU usage was 333%""" + constraint_key = self._map_runtime_constraint('vcpus') cpu_max_rate = self.stats_max['cpu']['user+sys__rate'] if cpu_max_rate == float('-Inf'): logger.warning('%s: no CPU usage data', self.label) return used_cores = max(1, int(math.ceil(cpu_max_rate))) - asked_cores = self.existing_constraints.get('min_cores_per_node') + asked_cores = self.existing_constraints.get(constraint_key) if asked_cores is None or used_cores < asked_cores: yield ( '#!! {} max CPU usage was {}% -- ' - 'try runtime_constraints "min_cores_per_node":{}' + 'try runtime_constraints "{}":{}' ).format( self.label, - int(math.ceil(cpu_max_rate*100)), + math.ceil(cpu_max_rate*100), + constraint_key, int(used_cores)) def _recommend_ram(self): @@ -356,40 +382,44 @@ class Summarizer(object): the memory we want -- even if that happens to be 8192 MiB. """ + constraint_key = self._map_runtime_constraint('ram') used_bytes = self.stats_max['mem']['rss'] if used_bytes == float('-Inf'): logger.warning('%s: no memory usage data', self.label) return - used_mib = math.ceil(float(used_bytes) / 1048576) - asked_mib = self.existing_constraints.get('min_ram_mb_per_node') + used_mib = math.ceil(float(used_bytes) / MB) + asked_mib = self.existing_constraints.get(constraint_key) nearlygibs = lambda mebibytes: mebibytes/AVAILABLE_RAM_RATIO/1024 if asked_mib is None or ( math.ceil(nearlygibs(used_mib)) < nearlygibs(asked_mib)): yield ( '#!! {} max RSS was {} MiB -- ' - 'try runtime_constraints "min_ram_mb_per_node":{}' + 'try runtime_constraints "{}":{}' ).format( self.label, int(used_mib), - int(math.ceil(nearlygibs(used_mib))*AVAILABLE_RAM_RATIO*1024)) + constraint_key, + int(math.ceil(nearlygibs(used_mib))*AVAILABLE_RAM_RATIO*1024*(MB)/self._runtime_constraint_mem_unit())) def _recommend_keep_cache(self): """Recommend increasing keep cache if utilization < 80%""" + constraint_key = self._map_runtime_constraint('keep_cache_ram') if self.job_tot['net:keep0']['rx'] == 0: return utilization = (float(self.job_tot['blkio:0:0']['read']) / float(self.job_tot['net:keep0']['rx'])) - asked_mib = self.existing_constraints.get('keep_cache_mb_per_task', 256) + asked_cache = self.existing_constraints.get(constraint_key, 256) if utilization < 0.8: yield ( '#!! {} Keep cache utilization was {:.2f}% -- ' - 'try runtime_constraints "keep_cache_mb_per_task":{} (or more)' + 'try runtime_constraints "{}":{} (or more)' ).format( self.label, utilization * 100.0, - asked_mib*2) + constraint_key, + asked_cache*2*(MB)/self._runtime_constraint_mem_unit()) def _format(self, val): @@ -401,6 +431,22 @@ class Summarizer(object): else: return '{}'.format(val) + def _runtime_constraint_mem_unit(self): + if hasattr(self, 'runtime_constraint_mem_unit'): + return self.runtime_constraint_mem_unit + elif self.detected_crunch1: + return JobSummarizer.runtime_constraint_mem_unit + else: + return ContainerSummarizer.runtime_constraint_mem_unit + + def _map_runtime_constraint(self, key): + if hasattr(self, 'map_runtime_constraint'): + return self.map_runtime_constraint[key] + elif self.detected_crunch1: + return JobSummarizer.map_runtime_constraint[key] + else: + return key + class CollectionSummarizer(Summarizer): def __init__(self, collection_id, **kwargs): @@ -409,11 +455,14 @@ class CollectionSummarizer(Summarizer): self.label = collection_id -def NewSummarizer(process, **kwargs): +def NewSummarizer(process_or_uuid, **kwargs): """Construct with the appropriate subclass for this uuid/object.""" - if not isinstance(process, dict): - uuid = process + if isinstance(process_or_uuid, dict): + process = process_or_uuid + uuid = process['uuid'] + else: + uuid = process_or_uuid process = None arv = arvados.api('v1', model=OrderedJsonModel()) @@ -428,7 +477,7 @@ def NewSummarizer(process, **kwargs): elif '-8i9sb-' in uuid: if process is None: process = arv.jobs().get(uuid=uuid).execute() - klass = JobSummarizer + klass = JobTreeSummarizer elif '-d1hrv-' in uuid: if process is None: process = arv.pipeline_instances().get(uuid=uuid).execute() @@ -462,11 +511,16 @@ class ProcessSummarizer(Summarizer): class JobSummarizer(ProcessSummarizer): - pass + runtime_constraint_mem_unit = MB + map_runtime_constraint = { + 'keep_cache_ram': 'keep_cache_mb_per_task', + 'ram': 'min_ram_mb_per_node', + 'vcpus': 'min_cores_per_node', + } class ContainerSummarizer(ProcessSummarizer): - pass + runtime_constraint_mem_unit = 1 class MultiSummarizer(object): @@ -483,7 +537,7 @@ class MultiSummarizer(object): def run(self): threads = [] - for child in self.children.itervalues(): + for child in self.children.values(): self.throttle.acquire() t = threading.Thread(target=self.run_and_release, args=(child.run, )) t.daemon = True @@ -494,29 +548,69 @@ class MultiSummarizer(object): def text_report(self): txt = '' - for cname, child in self.children.iteritems(): - if len(self.children) > 1: + d = self._descendants() + for child in d.values(): + if len(d) > 1: txt += '### Summary for {} ({})\n'.format( - cname, child.process['uuid']) + child.label, child.process['uuid']) txt += child.text_report() txt += '\n' return txt + def _descendants(self): + """Dict of self and all descendants. + + Nodes with nothing of their own to report (like + MultiSummarizers) are omitted. + """ + d = collections.OrderedDict() + for key, child in self.children.items(): + if isinstance(child, Summarizer): + d[key] = child + if isinstance(child, MultiSummarizer): + d.update(child._descendants()) + return d + def html_report(self): - return WEBCHART_CLASS(self.label, self.children.itervalues()).html() + return WEBCHART_CLASS(self.label, iter(self._descendants().values())).html() + + +class JobTreeSummarizer(MultiSummarizer): + """Summarizes a job and all children listed in its components field.""" + def __init__(self, job, label=None, **kwargs): + arv = arvados.api('v1', model=OrderedJsonModel()) + label = label or job.get('name', job['uuid']) + children = collections.OrderedDict() + children[job['uuid']] = JobSummarizer(job, label=label, **kwargs) + if job.get('components', None): + preloaded = {} + for j in arv.jobs().index( + limit=len(job['components']), + filters=[['uuid','in',list(job['components'].values())]]).execute()['items']: + preloaded[j['uuid']] = j + for cname in sorted(job['components'].keys()): + child_uuid = job['components'][cname] + j = (preloaded.get(child_uuid) or + arv.jobs().get(uuid=child_uuid).execute()) + children[child_uuid] = JobTreeSummarizer(job=j, label=cname, **kwargs) + + super(JobTreeSummarizer, self).__init__( + children=children, + label=label, + **kwargs) class PipelineSummarizer(MultiSummarizer): def __init__(self, instance, **kwargs): children = collections.OrderedDict() - for cname, component in instance['components'].iteritems(): + for cname, component in instance['components'].items(): if 'job' not in component: logger.warning( "%s: skipping component with no job assigned", cname) else: logger.info( "%s: job %s", cname, component['job']['uuid']) - summarizer = JobSummarizer(component['job'], **kwargs) + summarizer = JobTreeSummarizer(component['job'], label=cname, **kwargs) summarizer.label = '{} {}'.format( cname, component['job']['uuid']) children[cname] = summarizer @@ -527,7 +621,7 @@ class PipelineSummarizer(MultiSummarizer): class ContainerTreeSummarizer(MultiSummarizer): - def __init__(self, root, **kwargs): + def __init__(self, root, skip_child_jobs=False, **kwargs): arv = arvados.api('v1', model=OrderedJsonModel()) label = kwargs.pop('label', None) or root.get('name') or root['uuid'] @@ -538,26 +632,38 @@ class ContainerTreeSummarizer(MultiSummarizer): while len(todo) > 0: current = todo.popleft() label = current['name'] + sort_key = current['created_at'] if current['uuid'].find('-xvhdp-') > 0: current = arv.containers().get(uuid=current['container_uuid']).execute() - children[current['uuid']] = ContainerSummarizer( - current, label=label, **kwargs) + + summer = ContainerSummarizer(current, label=label, **kwargs) + summer.sort_key = sort_key + children[current['uuid']] = summer + page_filters = [] while True: - items = arv.container_requests().index( + child_crs = arv.container_requests().index( order=['uuid asc'], filters=page_filters+[ ['requesting_container_uuid', '=', current['uuid']]], - ).execute()['items'] - if not items: + ).execute() + if not child_crs['items']: + break + elif skip_child_jobs: + logger.warning('%s: omitting stats from %d child containers' + ' because --skip-child-jobs flag is on', + label, child_crs['items_available']) break - page_filters = [['uuid', '>', items[-1]['uuid']]] - for cr in items: + page_filters = [['uuid', '>', child_crs['items'][-1]['uuid']]] + for cr in child_crs['items']: if cr['container_uuid']: logger.debug('%s: container req %s', current['uuid'], cr['uuid']) - cr['name'] = label + ' / ' + (cr.get('name') or cr['uuid']) + cr['name'] = cr.get('name') or cr['uuid'] todo.append(cr) + sorted_children = collections.OrderedDict() + for uuid in sorted(list(children.keys()), key=lambda uuid: children[uuid].sort_key): + sorted_children[uuid] = children[uuid] super(ContainerTreeSummarizer, self).__init__( - children=children, + children=sorted_children, label=root['name'], **kwargs)