import arvados
import collections
import functools
+import itertools
+import logging
+import math
import re
import sys
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
class Summarizer(object):
- def __init__(self, logdata):
+ existing_constraints = {}
+
+ def __init__(self, logdata, label='job'):
self._logdata = logdata
+ self.label = label
def run(self):
# stats_max: {category: {stat: val}}
this_interval_s = val
continue
elif not (this_interval_s > 0):
- print("BUG? interval stat given with duration {!r}".
- format(this_interval_s),
- file=sys.stderr)
+ logger.error(
+ "BUG? interval stat given with duration {!r}".
+ format(this_interval_s))
continue
else:
stat = stat + '__rate'
self.task_stats[task_id][category][stat] = val
if val > self.stats_max[category][stat]:
self.stats_max[category][stat] = val
-
- def report(self):
- return "\n".join(self._report_gen()) + "\n"
-
- def _report_gen(self):
- job_tot = collections.defaultdict(
+ self.job_tot = collections.defaultdict(
functools.partial(collections.defaultdict, int))
for task_id, task_stat in self.task_stats.iteritems():
for category, stat_last in task_stat.iteritems():
if stat in ['cpus', 'cache', 'swap', 'rss']:
# meaningless stats like 16 cpu cores x 5 tasks = 80
continue
- job_tot[category][stat] += val
+ self.job_tot[category][stat] += val
+
+ def report(self):
+ return "\n".join(itertools.chain(
+ self._report_gen(),
+ self._recommend_gen())) + "\n"
+
+ def _report_gen(self):
yield "\t".join(['category', 'metric', 'task_max', 'task_max_rate', 'job_total'])
for category, stat_max in sorted(self.stats_max.iteritems()):
for stat, val in sorted(stat_max.iteritems()):
continue
max_rate = self._format(stat_max.get(stat+'__rate', '-'))
val = self._format(val)
- tot = self._format(job_tot[category].get(stat, '-'))
+ tot = self._format(self.job_tot[category].get(stat, '-'))
yield "\t".join([category, stat, str(val), max_rate, tot])
for args in (
('Max CPU time spent by a single task: {}s',
self.stats_max['cpu']['user+sys__rate'],
lambda x: x * 100),
('Overall CPU usage: {}%',
- job_tot['cpu']['user+sys'] / job_tot['time']['elapsed'],
+ self.job_tot['cpu']['user+sys'] /
+ self.job_tot['time']['elapsed'],
lambda x: x * 100),
('Max memory used by a single task: {}GB',
self.stats_max['mem']['rss'],
val = transform(val)
yield "# "+format_string.format(self._format(val))
+ def _recommend_gen(self):
+ return itertools.chain(
+ self._recommend_cpu(),
+ self._recommend_ram())
+
+ def _recommend_cpu(self):
+ """Recommend asking for 4 cores if max CPU usage was 333%"""
+
+ cpu_max_rate = self.stats_max['cpu']['user+sys__rate']
+ if cpu_max_rate == float('-Inf'):
+ logger.warning('%s: no CPU usage data', self.label)
+ return
+ used_cores = int(math.ceil(cpu_max_rate))
+ asked_cores = self.existing_constraints.get('min_cores_per_node')
+ if asked_cores is None or used_cores < asked_cores:
+ yield (
+ '#!! {} max CPU usage was {}% -- '
+ 'try runtime_constraints "min_cores_per_node":{}'
+ ).format(
+ self.label,
+ int(math.ceil(cpu_max_rate*100)),
+ int(used_cores))
+
+ def _recommend_ram(self):
+ """Recommend asking for 2048 MiB RAM if max rss was 1248 MiB"""
+
+ used_ram = self.stats_max['mem']['rss']
+ if used_ram == float('-Inf'):
+ logger.warning('%s: no memory usage data', self.label)
+ return
+ used_ram = math.ceil(float(used_ram) / (1<<20))
+ asked_ram = self.existing_constraints.get('min_ram_mb_per_node')
+ if asked_ram is None or math.ceil(used_ram/(1<<10)) < asked_ram/(1<<10):
+ yield (
+ '#!! {} never used more than {} MiB RAM -- '
+ 'try runtime_constraints "min_ram_mb_per_node":{}'
+ ).format(
+ self.label,
+ int(used_ram),
+ int(math.ceil(used_ram/(1<<10))*(1<<10)))
+
def _format(self, val):
"""Return a string representation of a stat.
else:
return '{}'.format(val)
+
class CollectionSummarizer(Summarizer):
def __init__(self, collection_id):
collection = arvados.collection.CollectionReader(collection_id)
raise ValueError(
"collection {} has {} files; need exactly one".format(
collection_id, len(filenames)))
- super(CollectionSummarizer, self).__init__(collection.open(filenames[0]))
+ super(CollectionSummarizer, self).__init__(
+ collection.open(filenames[0]))
+
class JobSummarizer(CollectionSummarizer):
def __init__(self, job):
self.job = arv.jobs().get(uuid=job).execute()
else:
self.job = job
+ self.label = self.job['uuid']
+ self.existing_constraints = self.job.get('runtime_constraints', {})
if not self.job['log']:
raise ValueError(
"job {} has no log; live summary not implemented".format(
self.job['uuid']))
super(JobSummarizer, self).__init__(self.job['log'])
+
class PipelineSummarizer():
def __init__(self, pipeline_instance_uuid):
arv = arvados.api('v1')
self.summarizers = collections.OrderedDict()
for cname, component in instance['components'].iteritems():
if 'job' not in component:
- print("{}: skipping component with no job assigned".format(
- cname), file=sys.stderr)
+ logger.warning(
+ "%s: skipping component with no job assigned", cname)
elif component['job'].get('log') is None:
- print("{}: skipping component with no log available".format(
- cname), file=sys.stderr)
+ logger.warning(
+ "%s: skipping job %s with no log available",
+ cname, component['job'].get('uuid'))
else:
- print("{}: reading log from {}".format(
- cname, component['job']['log']), file=sys.stderr)
- summarizer = CollectionSummarizer(component['job']['log'])
- summarizer.job_uuid = component['job']['uuid']
+ logger.debug(
+ "%s: reading log from %s", cname, component['job']['log'])
+ summarizer = JobSummarizer(component['job'])
+ summarizer.label = cname
self.summarizers[cname] = summarizer
def run(self):
txt = ''
for cname, summarizer in self.summarizers.iteritems():
txt += '### Summary for {} ({})\n'.format(
- cname, summarizer.job_uuid)
+ cname, summarizer.job['uuid'])
txt += summarizer.report()
txt += '\n'
return txt
expect, got, fromfile=expectfile, tofile="(generated)")))
-class ExampleLogsTestCase(ReportDiff):
+class SummarizeFile(ReportDiff):
def test_example_files(self):
for fnm in glob.glob(os.path.join(TESTS_DIR, '*.txt.gz')):
logfile = os.path.join(TESTS_DIR, fnm)
self.diff_known_report(logfile, summarizer)
-class LookupJobUUID(ReportDiff):
- fake_uuid = 'zzzzz-8i9sb-jq0ekny1xou3zoh'
+class SummarizeJob(ReportDiff):
+ fake_job_uuid = 'zzzzz-8i9sb-jjjjjjjjjjjjjjj'
+ fake_log_id = 'fake-log-collection-id'
+ fake_job = {
+ 'uuid': fake_job_uuid,
+ 'log': fake_log_id,
+ }
+ logfile = os.path.join(TESTS_DIR, 'logfile_20151204190335.txt.gz')
@mock.patch('arvados.collection.CollectionReader')
@mock.patch('arvados.api')
- def test_job_uuid(self, mock_api, mock_cr):
- logfile = os.path.join(TESTS_DIR, 'logfile_20151204190335.txt.gz')
- mock_api().jobs().get().execute.return_value = {'log': 'fake-uuid'}
+ def test_job_report(self, mock_api, mock_cr):
+ mock_api().jobs().get().execute.return_value = self.fake_job
mock_cr().__iter__.return_value = ['fake-logfile.txt']
- mock_cr().open.return_value = gzip.open(logfile)
+ mock_cr().open.return_value = gzip.open(self.logfile)
args = crunchstat_summary.command.ArgumentParser().parse_args(
- ['--job', self.fake_uuid])
+ ['--job', self.fake_job_uuid])
summarizer = crunchstat_summary.command.Command(args).summarizer()
summarizer.run()
- self.diff_known_report(logfile, summarizer)
- mock_api().jobs().get.assert_called_with(uuid=self.fake_uuid)
+ self.diff_known_report(self.logfile, summarizer)
+ mock_api().jobs().get.assert_called_with(uuid=self.fake_job_uuid)
+ mock_cr.assert_called_with(self.fake_log_id)
mock_cr().open.assert_called_with('fake-logfile.txt')
'job': {
'uuid': 'zzzzz-8i9sb-000000000000000',
'log': 'fake-log-pdh-0',
+ 'runtime_constraints': {
+ 'min_ram_mb_per_node': 1024,
+ 'min_cores_per_node': 1,
+ },
},
}],
['bar', {
'job': {
'uuid': 'zzzzz-8i9sb-000000000000001',
'log': 'fake-log-pdh-1',
+ 'runtime_constraints': {
+ 'min_ram_mb_per_node': 1024,
+ 'min_cores_per_node': 1,
+ },
},
}],
['no-job-assigned', {}],
'job': {
'uuid': 'zzzzz-8i9sb-000000000000002',
'log': 'fake-log-pdh-2',
+ 'runtime_constraints': {
+ 'min_ram_mb_per_node': 1024,
+ 'min_cores_per_node': 1,
+ },
},
}]]),
}
summarizer = crunchstat_summary.command.Command(args).summarizer()
summarizer.run()
+ job_report = [
+ line for line in open(logfile+'.report').readlines()
+ if not line.startswith('#!! ')]
expect = (
['### Summary for foo (zzzzz-8i9sb-000000000000000)\n'] +
- open(logfile+'.report').readlines() + ['\n'] +
+ job_report + ['\n'] +
['### Summary for bar (zzzzz-8i9sb-000000000000001)\n'] +
- open(logfile+'.report').readlines() + ['\n'] +
+ job_report + ['\n'] +
['### Summary for baz (zzzzz-8i9sb-000000000000002)\n'] +
- open(logfile+'.report').readlines())
+ job_report)
self.diff_report(summarizer, expect)
mock_cr.assert_has_calls(
[