X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/dc140f2c277a7571669f5bdf492bac6448a0fba0..00a4d2f890cd363725a4d6697bbf92498c866420:/tools/crunchstat-summary/crunchstat_summary/summarizer.py diff --git a/tools/crunchstat-summary/crunchstat_summary/summarizer.py b/tools/crunchstat-summary/crunchstat_summary/summarizer.py index 2879a616ce..d91161c70c 100644 --- a/tools/crunchstat-summary/crunchstat_summary/summarizer.py +++ b/tools/crunchstat-summary/crunchstat_summary/summarizer.py @@ -70,12 +70,16 @@ class Summarizer(object): def run(self): logger.debug("%s: parsing logdata %s", self.label, self._logdata) - detected_crunch1 = False - for line in self._logdata: - if not detected_crunch1 and '-8i9sb-' in line: - detected_crunch1 = True + with self._logdata as logdata: + self._run(logdata) - if detected_crunch1: + def _run(self, logdata): + self.detected_crunch1 = False + for line in logdata: + if not self.detected_crunch1 and '-8i9sb-' in line: + self.detected_crunch1 = True + + if self.detected_crunch1: m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) job_task (?P<task_uuid>\S+)$', line) if m: seq = int(m.group('seq')) @@ -130,12 +134,12 @@ class Summarizer(object): continue elif m.group('category') in ('error', 'caught'): continue - elif m.group('category') in ['read', 'open', 'cgroup', 'CID']: + elif m.group('category') in ('read', 'open', 'cgroup', 'CID', 'Running'): # "stderr crunchstat: read /proc/1234/net/dev: ..." 
# (old logs are less careful with unprefixed error messages) continue - if detected_crunch1: + if self.detected_crunch1: task_id = self.seq_to_uuid[int(m.group('seq'))] else: task_id = 'container' @@ -178,8 +182,10 @@ class Summarizer(object): else: stats[stat] = int(val) except ValueError as e: - logger.warning('Error parsing {} stat: {!r}'.format( - stat, e)) + logger.warning( + 'Error parsing value %r (stat %r, category %r): %r', + val, stat, category, e) + logger.warning('%s', line) continue if 'user' in stats or 'sys' in stats: stats['user+sys'] = stats.get('user', 0) + stats.get('sys', 0) @@ -223,6 +229,8 @@ class Summarizer(object): def long_label(self): label = self.label + if hasattr(self, 'process') and self.process['uuid'] not in label: + label = '{} ({})'.format(label, self.process['uuid']) if self.finishtime: label += ' -- elapsed time ' s = (self.finishtime - self.starttime).total_seconds() @@ -307,19 +315,21 @@ class Summarizer(object): def _recommend_cpu(self): """Recommend asking for 4 cores if max CPU usage was 333%""" + constraint_key = self._map_runtime_constraint('vcpus') cpu_max_rate = self.stats_max['cpu']['user+sys__rate'] if cpu_max_rate == float('-Inf'): logger.warning('%s: no CPU usage data', self.label) return used_cores = max(1, int(math.ceil(cpu_max_rate))) - asked_cores = self.existing_constraints.get('min_cores_per_node') + asked_cores = self.existing_constraints.get(constraint_key) if asked_cores is None or used_cores < asked_cores: yield ( '#!! {} max CPU usage was {}% -- ' - 'try runtime_constraints "min_cores_per_node":{}' + 'try runtime_constraints "{}":{}' ).format( self.label, int(math.ceil(cpu_max_rate*100)), + constraint_key, int(used_cores)) def _recommend_ram(self): @@ -356,40 +366,44 @@ class Summarizer(object): the memory we want -- even if that happens to be 8192 MiB. 
""" + constraint_key = self._map_runtime_constraint('ram') used_bytes = self.stats_max['mem']['rss'] if used_bytes == float('-Inf'): logger.warning('%s: no memory usage data', self.label) return used_mib = math.ceil(float(used_bytes) / 1048576) - asked_mib = self.existing_constraints.get('min_ram_mb_per_node') + asked_mib = self.existing_constraints.get(constraint_key) nearlygibs = lambda mebibytes: mebibytes/AVAILABLE_RAM_RATIO/1024 if asked_mib is None or ( math.ceil(nearlygibs(used_mib)) < nearlygibs(asked_mib)): yield ( '#!! {} max RSS was {} MiB -- ' - 'try runtime_constraints "min_ram_mb_per_node":{}' + 'try runtime_constraints "{}":{}' ).format( self.label, int(used_mib), - int(math.ceil(nearlygibs(used_mib))*AVAILABLE_RAM_RATIO*1024)) + constraint_key, + int(math.ceil(nearlygibs(used_mib))*AVAILABLE_RAM_RATIO*1024*(2**20)/self._runtime_constraint_mem_unit())) def _recommend_keep_cache(self): """Recommend increasing keep cache if utilization < 80%""" + constraint_key = self._map_runtime_constraint('keep_cache_ram') if self.job_tot['net:keep0']['rx'] == 0: return utilization = (float(self.job_tot['blkio:0:0']['read']) / float(self.job_tot['net:keep0']['rx'])) - asked_mib = self.existing_constraints.get('keep_cache_mb_per_task', 256) + asked_mib = self.existing_constraints.get(constraint_key, 256) if utilization < 0.8: yield ( '#!! 
{} Keep cache utilization was {:.2f}% -- ' - 'try runtime_constraints "keep_cache_mb_per_task":{} (or more)' + 'try runtime_constraints "{}":{} (or more)' ).format( self.label, utilization * 100.0, - asked_mib*2) + constraint_key, + asked_mib*2*(2**20)/self._runtime_constraint_mem_unit()) def _format(self, val): @@ -401,6 +415,22 @@ class Summarizer(object): else: return '{}'.format(val) + def _runtime_constraint_mem_unit(self): + if hasattr(self, 'runtime_constraint_mem_unit'): + return self.runtime_constraint_mem_unit + elif self.detected_crunch1: + return JobSummarizer.runtime_constraint_mem_unit + else: + return ContainerSummarizer.runtime_constraint_mem_unit + + def _map_runtime_constraint(self, key): + if hasattr(self, 'map_runtime_constraint'): + return self.map_runtime_constraint[key] + elif self.detected_crunch1: + return JobSummarizer.map_runtime_constraint[key] + else: + return key + class CollectionSummarizer(Summarizer): def __init__(self, collection_id, **kwargs): @@ -409,11 +439,14 @@ class CollectionSummarizer(Summarizer): self.label = collection_id -def NewSummarizer(process, **kwargs): +def NewSummarizer(process_or_uuid, **kwargs): """Construct with the appropriate subclass for this uuid/object.""" - if not isinstance(process, dict): - uuid = process + if isinstance(process_or_uuid, dict): + process = process_or_uuid + uuid = process['uuid'] + else: + uuid = process_or_uuid process = None arv = arvados.api('v1', model=OrderedJsonModel()) @@ -428,7 +461,7 @@ def NewSummarizer(process, **kwargs): elif '-8i9sb-' in uuid: if process is None: process = arv.jobs().get(uuid=uuid).execute() - klass = JobSummarizer + klass = JobTreeSummarizer elif '-d1hrv-' in uuid: if process is None: process = arv.pipeline_instances().get(uuid=uuid).execute() @@ -462,11 +495,16 @@ class ProcessSummarizer(Summarizer): class JobSummarizer(ProcessSummarizer): - pass + runtime_constraint_mem_unit = 1048576 + map_runtime_constraint = { + 'keep_cache_ram': 
'keep_cache_mb_per_task', + 'ram': 'min_ram_mb_per_node', + 'vcpus': 'min_cores_per_node', + } class ContainerSummarizer(ProcessSummarizer): - pass + runtime_constraint_mem_unit = 1 class MultiSummarizer(object): @@ -494,16 +532,56 @@ class MultiSummarizer(object): def text_report(self): txt = '' - for cname, child in self.children.iteritems(): - if len(self.children) > 1: + d = self._descendants() + for child in d.itervalues(): + if len(d) > 1: txt += '### Summary for {} ({})\n'.format( - cname, child.process['uuid']) + child.label, child.process['uuid']) txt += child.text_report() txt += '\n' return txt + def _descendants(self): + """Dict of self and all descendants. + + Nodes with nothing of their own to report (like + MultiSummarizers) are omitted. + """ + d = collections.OrderedDict() + for key, child in self.children.iteritems(): + if isinstance(child, Summarizer): + d[key] = child + if isinstance(child, MultiSummarizer): + d.update(child._descendants()) + return d + def html_report(self): - return WEBCHART_CLASS(self.label, self.children.itervalues()).html() + return WEBCHART_CLASS(self.label, self._descendants().itervalues()).html() + + +class JobTreeSummarizer(MultiSummarizer): + """Summarizes a job and all children listed in its components field.""" + def __init__(self, job, label=None, **kwargs): + arv = arvados.api('v1', model=OrderedJsonModel()) + label = label or job.get('name', job['uuid']) + children = collections.OrderedDict() + children[job['uuid']] = JobSummarizer(job, label=label, **kwargs) + if job.get('components', None): + preloaded = {} + for j in arv.jobs().index( + limit=len(job['components']), + filters=[['uuid','in',job['components'].values()]]).execute()['items']: + preloaded[j['uuid']] = j + for cname in sorted(job['components'].keys()): + child_uuid = job['components'][cname] + j = (preloaded.get(child_uuid) or + arv.jobs().get(uuid=child_uuid).execute()) + children[child_uuid] = JobTreeSummarizer(job=j, label=cname, **kwargs) + + 
super(JobTreeSummarizer, self).__init__( + children=children, + label=label, + **kwargs) class PipelineSummarizer(MultiSummarizer): @@ -516,7 +594,7 @@ class PipelineSummarizer(MultiSummarizer): else: logger.info( "%s: job %s", cname, component['job']['uuid']) - summarizer = JobSummarizer(component['job'], **kwargs) + summarizer = JobTreeSummarizer(component['job'], label=cname, **kwargs) summarizer.label = '{} {}'.format( cname, component['job']['uuid']) children[cname] = summarizer @@ -527,7 +605,7 @@ class PipelineSummarizer(MultiSummarizer): class ContainerTreeSummarizer(MultiSummarizer): - def __init__(self, root, **kwargs): + def __init__(self, root, skip_child_jobs=False, **kwargs): arv = arvados.api('v1', model=OrderedJsonModel()) label = kwargs.pop('label', None) or root.get('name') or root['uuid'] @@ -538,26 +616,38 @@ class ContainerTreeSummarizer(MultiSummarizer): while len(todo) > 0: current = todo.popleft() label = current['name'] + sort_key = current['created_at'] if current['uuid'].find('-xvhdp-') > 0: current = arv.containers().get(uuid=current['container_uuid']).execute() - children[current['uuid']] = ContainerSummarizer( - current, label=label, **kwargs) + + summer = ContainerSummarizer(current, label=label, **kwargs) + summer.sort_key = sort_key + children[current['uuid']] = summer + page_filters = [] while True: - items = arv.container_requests().index( + child_crs = arv.container_requests().index( order=['uuid asc'], filters=page_filters+[ ['requesting_container_uuid', '=', current['uuid']]], - ).execute()['items'] - if not items: + ).execute() + if not child_crs['items']: break - page_filters = [['uuid', '>', items[-1]['uuid']]] - for cr in items: + elif skip_child_jobs: + logger.warning('%s: omitting stats from %d child containers' + ' because --skip-child-jobs flag is on', + label, child_crs['items_available']) + break + page_filters = [['uuid', '>', child_crs['items'][-1]['uuid']]] + for cr in child_crs['items']: if cr['container_uuid']: 
logger.debug('%s: container req %s', current['uuid'], cr['uuid']) cr['name'] = cr.get('name') or cr['uuid'] todo.append(cr) + sorted_children = collections.OrderedDict() + for uuid in sorted(children.keys(), key=lambda uuid: children[uuid].sort_key): + sorted_children[uuid] = children[uuid] super(ContainerTreeSummarizer, self).__init__( - children=children, + children=sorted_children, label=root['name'], **kwargs)