#! /usr/bin/env python import argparse import datetime import json import re import sys import arvados # Useful configuration variables: # Number of log lines to use as context in diagnosing failure. LOG_CONTEXT_LINES = 10 # Regex that signifies a failed task. FAILED_TASK_REGEX = re.compile(' \d+ failure (.*permanent)') # Regex that signifies a successful job. SUCCESSFUL_JOB_REGEX = re.compile('finished') # List of regexes by which to classify failures. JOB_FAILURE_TYPES = [ 'User not found on host' ] def parse_arguments(arguments): arg_parser = argparse.ArgumentParser( description='Produce a report of Crunch failures within a specified time range') arg_parser.add_argument( '--start', help='Start date and time') arg_parser.add_argument( '--end', help='End date and time') arg_parser.add_argument( '--match', default='fail', help='Regular expression to match on Crunch error output lines.') args = arg_parser.parse_args(arguments) if args.start and not is_valid_timestamp(args.start): raise ValueError(args.start) if args.end and not is_valid_timestamp(args.end): raise ValueError(args.end) return args def api_timestamp(when=None): """Returns a string representing the timestamp 'when' in a format suitable for delivering to the API server. Defaults to the current time. """ if when is None: when = datetime.datetime.utcnow() return when.strftime("%Y-%m-%dT%H:%M:%SZ") def is_valid_timestamp(ts): return re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z', ts) def jobs_created_between_dates(api, start, end): return arvados.util.list_all( api.jobs().list, filters=json.dumps([ ['created_at', '>=', start], ['created_at', '<=', end] ])) def job_logs(api, job): # Returns the contents of the log for this job (as an array of lines). log_filename = "{}.log.txt".format(job['uuid']) log_collection = arvados.CollectionReader(job['log'], api) return log_collection.open(log_filename).readlines() def is_failed_task(logline): return FAILED_TASK_REGEX.search(logline) != None def is_successful_job(logline): return SUCCESSFUL_JOB_REGEX.search(logline) != None def log(s): print >>sys.stderr, "{}: {}".format(datetime.datetime.utcnow(), s) def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr): args = parse_arguments(arguments) api = arvados.api('v1') now = datetime.datetime.utcnow() start_time = args.start or api_timestamp(now - datetime.timedelta(days=1)) end_time = args.end or api_timestamp(now) # Find all jobs created within the specified window, # and their corresponding job logs. jobs_created = jobs_created_between_dates(api, start_time, end_time) jobs_failed = [job for job in jobs_created if job['state'] == 'Failed'] jobs_successful = [job for job in jobs_created if job['state'] == 'Complete'] log("jobs created: {}".format(len(jobs_created))) # Find failed jobs and record the job failure text. jobs_failed_types = {} for job in jobs_failed: job_uuid = job['uuid'] log("fetching job {} log from keep".format(job_uuid)) logs = job_logs(api, job) # Find the first permanent task failure, and collect the # preceding log lines. for i, lg in enumerate(logs): if is_failed_task(lg): # Get preceding log record to provide context. log_start = i - LOG_CONTEXT_LINES if i >= LOG_CONTEXT_LINES else 0 log_end = i + 1 lastlogs = ''.join(logs[log_start:log_end]) # try to identify the type of failure. fail_reason = 'unknown' for rgx in JOB_FAILURE_TYPES: if re.search(rgx, lastlogs): fail_reason = rgx log("job {} failed for reason {}".format(job_uuid, fail_reason)) break jobs_failed_types.setdefault(fail_reason, set()) jobs_failed_types[fail_reason].add(job_uuid) break # If we got here, the job is recorded as "failed" but we # could not find the failure of any given task. jobs_failed_types.setdefault('unknown', set()) jobs_failed_types['unknown'].add(job_uuid) # Report percentages of successful, failed and unfinished jobs. print "Start: {:20s}".format(start_time) print "End: {:20s}".format(end_time) print "" print "Overview" print "" job_start_count = len(jobs_created) job_success_count = len(jobs_successful) job_fail_count = len(jobs_failed) job_unfinished_count = job_start_count - job_success_count - job_fail_count print " Started: {:4d}".format(job_start_count) print " Successful: {:4d} ({:3.0%})".format( job_success_count, job_success_count / float(job_start_count)) print " Failed: {:4d} ({:3.0%})".format( job_fail_count, job_fail_count / float(job_start_count)) print " In progress: {:4d} ({:3.0%})".format( job_unfinished_count, job_unfinished_count / float(job_start_count)) print "" # Report failure types. failure_summary = "" failure_detail = "" for failtype, job_uuids in jobs_failed_types.iteritems(): failstat = " {:s} {:4d} ({:3.0%})\n".format( failtype, len(job_uuids), len(job_uuids) / float(job_fail_count)) failure_summary = failure_summary + failstat failure_detail = failure_detail + failstat for j in job_uuids: failure_detail = failure_detail + " http://crvr.se/{}\n".format(j) failure_detail = failure_detail + "\n" print "Failures by class" print "" print failure_summary print "Failures by class (detail):" print "" print failure_detail return 0 if __name__ == "__main__": sys.exit(main())