From 31242cbe64d5c58f54735a80b0db8820c4122bfb Mon Sep 17 00:00:00 2001
From: Tim Pierce
Date: Wed, 7 Jan 2015 09:45:06 -0500
Subject: [PATCH] 4598: actually rename this time

PEBCAK failure led to deleting the file without staging the new one.
d'oh.
---
 services/api/script/crunch_failure_report.py | 219 +++++++++++++++++++
 1 file changed, 219 insertions(+)
 create mode 100755 services/api/script/crunch_failure_report.py

diff --git a/services/api/script/crunch_failure_report.py b/services/api/script/crunch_failure_report.py
new file mode 100755
index 0000000000..31ad0fefed
--- /dev/null
+++ b/services/api/script/crunch_failure_report.py
@@ -0,0 +1,219 @@
+#! /usr/bin/env python
+
+import argparse
+import datetime
+import json
+import re
+import sys
+
+import arvados
+
+# Useful configuration variables:
+
+# Number of log lines to use as context in diagnosing failure.
+LOG_CONTEXT_LINES = 10
+
+# Regex that signifies a failed task.
+FAILED_TASK_REGEX = re.compile(r' \d+ failure (.*permanent)')
+
+# Regular expressions used to classify failure types.
+JOB_FAILURE_TYPES = {
+    'sys/docker': 'Cannot destroy container',
+    'crunch/node': 'User not found on host',
+    'slurm/comm': 'Communication connection failure'
+}
+
+def parse_arguments(arguments):
+    arg_parser = argparse.ArgumentParser(
+        description='Produce a report of Crunch failures within a specified time range')
+
+    arg_parser.add_argument(
+        '--start',
+        help='Start date and time')
+    arg_parser.add_argument(
+        '--end',
+        help='End date and time')
+
+    args = arg_parser.parse_args(arguments)
+
+    if args.start and not is_valid_timestamp(args.start):
+        raise ValueError("invalid timestamp: {}".format(args.start))
+    if args.end and not is_valid_timestamp(args.end):
+        raise ValueError("invalid timestamp: {}".format(args.end))
+
+    return args
+
+
+def api_timestamp(when=None):
+    """Returns a string representing the timestamp 'when' in a format
+    suitable for delivering to the API server. Defaults to the
+    current time.
+    """
+    if when is None:
+        when = datetime.datetime.utcnow()
+    return when.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def is_valid_timestamp(ts):
+    return re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z', ts)
+
+
+def jobs_created_between_dates(api, start, end):
+    return arvados.util.list_all(
+        api.jobs().list,
+        filters=json.dumps([ ['created_at', '>=', start],
+                             ['created_at', '<=', end] ]))
+
+
+def job_logs(api, job):
+    # Returns the contents of the log for this job (as an array of lines).
+    if job['log']:
+        log_collection = arvados.CollectionReader(job['log'], api)
+        log_filename = "{}.log.txt".format(job['uuid'])
+        return log_collection.open(log_filename).readlines()
+    return []
+
+
+user_names = {}
+def job_user_name(api, user_uuid):
+    def _lookup_user_name(api, user_uuid):
+        try:
+            return api.users().get(uuid=user_uuid).execute()['full_name']
+        except arvados.errors.ApiError:
+            return user_uuid
+
+    if user_uuid not in user_names:
+        user_names[user_uuid] = _lookup_user_name(api, user_uuid)
+    return user_names[user_uuid]
+
+
+job_pipeline_names = {}
+def job_pipeline_name(api, job_uuid):
+    def _lookup_pipeline_name(api, job_uuid):
+        try:
+            pipelines = api.pipeline_instances().list(
+                filters='[["components", "like", "%{}%"]]'.format(job_uuid)).execute()
+            pi = pipelines['items'][0]
+            if pi['name']:
+                return pi['name']
+            else:
+                # Use the pipeline template name
+                pt = api.pipeline_templates().get(uuid=pi['pipeline_template_uuid']).execute()
+                return pt['name']
+        except (TypeError, ValueError, IndexError):
+            return ""
+
+    if job_uuid not in job_pipeline_names:
+        job_pipeline_names[job_uuid] = _lookup_pipeline_name(api, job_uuid)
+    return job_pipeline_names[job_uuid]
+
+
+def is_failed_task(logline):
+    return FAILED_TASK_REGEX.search(logline) is not None
+
+
+def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
+    args = parse_arguments(arguments)
+
+    api = arvados.api('v1')
+
+    now = datetime.datetime.utcnow()
+    start_time = args.start or api_timestamp(now - datetime.timedelta(days=1))
+    end_time = args.end or api_timestamp(now)
+
+    # Find all jobs created within the specified window,
+    # and their corresponding job logs.
+    jobs_created = jobs_created_between_dates(api, start_time, end_time)
+    jobs_by_state = {}
+    for job in jobs_created:
+        jobs_by_state.setdefault(job['state'], [])
+        jobs_by_state[job['state']].append(job)
+
+    # Find failed jobs and record the job failure text.
+
+    # failure_stats maps failure types (e.g. "sys/docker") to
+    # a set of job UUIDs that failed for that reason.
+    failure_stats = {}
+    for job in jobs_by_state.get('Failed', []):
+        job_uuid = job['uuid']
+        logs = job_logs(api, job)
+        # Find the first permanent task failure, and collect the
+        # preceding log lines.
+        failure_type = None
+        for i, lg in enumerate(logs):
+            if is_failed_task(lg):
+                # Get preceding log record to provide context.
+                log_start = i - LOG_CONTEXT_LINES if i >= LOG_CONTEXT_LINES else 0
+                log_end = i + 1
+                lastlogs = ''.join(logs[log_start:log_end])
+                # Try to identify the type of failure.
+                for key, rgx in JOB_FAILURE_TYPES.iteritems():
+                    if re.search(rgx, lastlogs):
+                        failure_type = key
+                        break
+            if failure_type is not None:
+                break
+        if failure_type is None:
+            failure_type = 'unknown'
+        failure_stats.setdefault(failure_type, set())
+        failure_stats[failure_type].add(job_uuid)
+
+    # Report percentages of successful, failed and unfinished jobs.
+    print "Start: {:20s}".format(start_time)
+    print "End: {:20s}".format(end_time)
+    print ""
+
+    print "Overview"
+    print ""
+
+    job_start_count = len(jobs_created)
+    print " {: <25s} {:4d}".format('Started', job_start_count)
+    for state in ['Complete', 'Failed', 'Queued', 'Cancelled', 'Running']:
+        if state in jobs_by_state:
+            job_count = len(jobs_by_state[state])
+            job_percentage = job_count / float(job_start_count)
+            print " {: <25s} {:4d} ({: >4.0%})".format(state,
+                                                       job_count,
+                                                       job_percentage)
+    print ""
+
+    # Report failure types.
+    failure_summary = ""
+    failure_detail = ""
+
+    # Generate a mapping from failed job uuids to job records, to assist
+    # in generating detailed statistics for job failures.
+    jobs_failed_map = { job['uuid']: job for job in jobs_by_state.get('Failed', []) }
+
+    # sort the failure stats in descending order by occurrence.
+    sorted_failures = sorted(failure_stats,
+                             reverse=True,
+                             key=lambda failure_type: len(failure_stats[failure_type]))
+    for failtype in sorted_failures:
+        job_uuids = failure_stats[failtype]
+        failstat = " {: <25s} {:4d} ({: >4.0%})\n".format(
+            failtype,
+            len(job_uuids),
+            len(job_uuids) / float(len(jobs_by_state['Failed'])))
+        failure_summary = failure_summary + failstat
+        failure_detail = failure_detail + failstat
+        for j in job_uuids:
+            job_info = jobs_failed_map[j]
+            job_owner = job_user_name(api, job_info['modified_by_user_uuid'])
+            job_name = job_pipeline_name(api, job_info['uuid'])
+            failure_detail = failure_detail + " {} {: <15.15s} {:29.29s}\n".format(j, job_owner, job_name)
+        failure_detail = failure_detail + "\n"
+
+    print "Failures by class"
+    print ""
+    print failure_summary
+
+    print "Failures by class (detail)"
+    print ""
+    print failure_detail
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
-- 
2.39.5
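
Usage sketch (not part of the patch itself): one way the report might be driven, assuming the Arvados Python SDK is configured through the usual ARVADOS_API_HOST and ARVADOS_API_TOKEN environment variables and that the script's directory is on sys.path so it can be imported as a module. The timestamps are illustrative; when --start and --end are omitted the report covers the preceding 24 hours.

    # Hypothetical invocation from another Python 2 script. main() accepts an
    # argv-style list; passing None makes argparse fall back to sys.argv.
    import crunch_failure_report
    crunch_failure_report.main(['--start', '2015-01-06T00:00:00Z',
                                '--end', '2015-01-07T00:00:00Z'])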