services/api/script/crunch-failure-report.py
#! /usr/bin/env python
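"""Report on Crunch job failures within a time window.

Queries the API server for jobs created between --start and --end
(UTC timestamps formatted as YYYY-MM-DDTHH:MM:SSZ; the default window
is the 24 hours ending now), scans the log of each failed job for the
first permanently failed task, classifies the failure using the
JOB_FAILURE_TYPES regexes, and prints a summary plus a per-job detail
listing.

Example (dates are illustrative):
    crunch-failure-report.py --start 2015-03-01T00:00:00Z --end 2015-03-02T00:00:00Z
"""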

import argparse
import datetime
import json
import re
import sys

import arvados

# Useful configuration variables:

# Number of log lines to use as context in diagnosing failure.
LOG_CONTEXT_LINES = 10

# Regex that signifies a failed task.
FAILED_TASK_REGEX = re.compile(r' \d+ failure (.*permanent)')

# List of regexes by which to classify failures.
JOB_FAILURE_TYPES = ['User not found on host']

def parse_arguments(arguments):
    arg_parser = argparse.ArgumentParser(
        description='Produce a report of Crunch failures within a specified time range')

    arg_parser.add_argument(
        '--start',
        help='Start date and time (UTC, formatted as YYYY-MM-DDTHH:MM:SSZ)')
    arg_parser.add_argument(
        '--end',
        help='End date and time (UTC, formatted as YYYY-MM-DDTHH:MM:SSZ)')

    args = arg_parser.parse_args(arguments)

    if args.start and not is_valid_timestamp(args.start):
        raise ValueError("invalid --start timestamp: {}".format(args.start))
    if args.end and not is_valid_timestamp(args.end):
        raise ValueError("invalid --end timestamp: {}".format(args.end))

    return args


def api_timestamp(when=None):
    """Returns a string representing the timestamp 'when' in a format
    suitable for delivering to the API server.  Defaults to the
    current time.
    """
    if when is None:
        when = datetime.datetime.utcnow()
    return when.strftime("%Y-%m-%dT%H:%M:%SZ")


def is_valid_timestamp(ts):
    return re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z', ts)

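# Note: arvados.util.list_all pages through the job list in batches, so
# this returns every matching job rather than just the first page of results.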
def jobs_created_between_dates(api, start, end):
    return arvados.util.list_all(
        api.jobs().list,
        filters=json.dumps([ ['created_at', '>=', start],
                             ['created_at', '<=', end] ]))


def job_logs(api, job):
    # Returns the contents of the log for this job (as an array of lines).
    log_filename = "{}.log.txt".format(job['uuid'])
    log_collection = arvados.CollectionReader(job['log'], api)
    return log_collection.open(log_filename).readlines()


def is_failed_task(logline):
    return FAILED_TASK_REGEX.search(logline) is not None


def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    args = parse_arguments(arguments)

    api = arvados.api('v1')

    # Default to the 24-hour window ending now.
    now = datetime.datetime.utcnow()
    start_time = args.start or api_timestamp(now - datetime.timedelta(days=1))
    end_time = args.end or api_timestamp(now)

    # Find all jobs created within the specified window,
    # and their corresponding job logs.
    jobs_created = jobs_created_between_dates(api, start_time, end_time)
    jobs_failed     = [job for job in jobs_created if job['state'] == 'Failed']
    jobs_successful = [job for job in jobs_created if job['state'] == 'Complete']

    # Find failed jobs and record the job failure text.
    jobs_failed_types = {}
    for job in jobs_failed:
        job_uuid = job['uuid']
        logs = job_logs(api, job)
        # Find the first permanent task failure, and collect the
        # preceding log lines.
        for i, lg in enumerate(logs):
            if is_failed_task(lg):
                # Get preceding log records to provide context.
                log_start = i - LOG_CONTEXT_LINES if i >= LOG_CONTEXT_LINES else 0
                log_end = i + 1
                lastlogs = ''.join(logs[log_start:log_end])
                # Try to identify the type of failure.
                fail_reason = 'unknown'
                for rgx in JOB_FAILURE_TYPES:
                    if re.search(rgx, lastlogs):
                        fail_reason = rgx
                        break
                jobs_failed_types.setdefault(fail_reason, set())
                jobs_failed_types[fail_reason].add(job_uuid)
                break
        else:
            # The job is recorded as "failed" but we could not find the
            # failure of any specific task; classify it as unknown.
            jobs_failed_types.setdefault('unknown', set())
            jobs_failed_types['unknown'].add(job_uuid)

    # Report percentages of successful, failed and unfinished jobs.
    print "Start: {:20s}".format(start_time)
    print "End:   {:20s}".format(end_time)
    print ""

    print "Overview"
    print ""

    job_start_count = len(jobs_created)
    job_success_count = len(jobs_successful)
    job_fail_count = len(jobs_failed)
    job_unfinished_count = job_start_count - job_success_count - job_fail_count
    # Guard against division by zero when no jobs were created in the window.
    job_start_total = float(job_start_count) if job_start_count else 1.0

    print "  Started:     {:4d}".format(job_start_count)
    print "  Successful:  {:4d} ({:3.0%})".format(
        job_success_count, job_success_count / job_start_total)
    print "  Failed:      {:4d} ({:3.0%})".format(
        job_fail_count, job_fail_count / job_start_total)
    print "  In progress: {:4d} ({:3.0%})".format(
        job_unfinished_count, job_unfinished_count / job_start_total)
    print ""

    # Report failure types.
    failure_summary = ""
    failure_detail = ""

    for failtype, job_uuids in jobs_failed_types.iteritems():
        failstat = "  {:s} {:4d} ({:3.0%})\n".format(
            failtype, len(job_uuids), len(job_uuids) / float(job_fail_count))
        failure_summary = failure_summary + failstat
        failure_detail = failure_detail + failstat
        for j in job_uuids:
            failure_detail = failure_detail + "    http://crvr.se/{}\n".format(j)
        failure_detail = failure_detail + "\n"

    print "Failures by class"
    print ""
    print failure_summary

    print "Failures by class (detail):"
    print ""
    print failure_detail

    return 0


if __name__ == "__main__":
    sys.exit(main())