services/api/script/crunch-failure-report.py
#! /usr/bin/env python
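"""Report on Crunch job failures within a time window.

Queries the API server for jobs created between --start and --end
(UTC timestamps formatted as YYYY-MM-DDTHH:MM:SSZ; the default window
is the 24 hours ending now), scans the log of each failed job for the
first permanently failed task, classifies the failure using the
JOB_FAILURE_TYPES regexes, and prints a summary plus a per-job detail
listing.

Example (dates are illustrative):
    crunch-failure-report.py --start 2015-03-01T00:00:00Z --end 2015-03-02T00:00:00Z
"""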

import argparse
import datetime
import json
import re
import sys

import arvados

# Useful configuration variables:

# Number of log lines to use as context in diagnosing failure.
LOG_CONTEXT_LINES = 10

# Regex that signifies a failed task.
FAILED_TASK_REGEX = re.compile(r' \d+ failure (.*permanent)')

# List of regexes by which to classify failures.
JOB_FAILURE_TYPES = ['User not found on host']

def parse_arguments(arguments):
    arg_parser = argparse.ArgumentParser(
        description='Produce a report of Crunch failures within a specified time range')

    arg_parser.add_argument(
        '--start',
        help='Start date and time (UTC, formatted as YYYY-MM-DDTHH:MM:SSZ)')
    arg_parser.add_argument(
        '--end',
        help='End date and time (UTC, formatted as YYYY-MM-DDTHH:MM:SSZ)')

    args = arg_parser.parse_args(arguments)

    if args.start and not is_valid_timestamp(args.start):
        raise ValueError("invalid --start timestamp: {}".format(args.start))
    if args.end and not is_valid_timestamp(args.end):
        raise ValueError("invalid --end timestamp: {}".format(args.end))

    return args


def api_timestamp(when=None):
    """Returns a string representing the timestamp 'when' in a format
    suitable for delivering to the API server.  Defaults to the
    current time.
    """
    if when is None:
        when = datetime.datetime.utcnow()
    return when.strftime("%Y-%m-%dT%H:%M:%SZ")


def is_valid_timestamp(ts):
    return re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z', ts)

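# Note: arvados.util.list_all pages through the job list in batches, so
# this returns every matching job rather than just the first page of results.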
def jobs_created_between_dates(api, start, end):
    return arvados.util.list_all(
        api.jobs().list,
        filters=json.dumps([ ['created_at', '>=', start],
                             ['created_at', '<=', end] ]))


def job_logs(api, job):
    # Returns the contents of the log for this job (as an array of lines).
    log_filename = "{}.log.txt".format(job['uuid'])
    log_collection = arvados.CollectionReader(job['log'], api)
    return log_collection.open(log_filename).readlines()


def is_failed_task(logline):
    return FAILED_TASK_REGEX.search(logline) is not None


def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    args = parse_arguments(arguments)

    api = arvados.api('v1')

    # Default to the 24-hour window ending now.
    now = datetime.datetime.utcnow()
    start_time = args.start or api_timestamp(now - datetime.timedelta(days=1))
    end_time = args.end or api_timestamp(now)

    # Find all jobs created within the specified window,
    # and their corresponding job logs.
    jobs_created = jobs_created_between_dates(api, start_time, end_time)
    jobs_failed     = [job for job in jobs_created if job['state'] == 'Failed']
    jobs_successful = [job for job in jobs_created if job['state'] == 'Complete']

    # Find failed jobs and record the job failure text.
    jobs_failed_types = {}
    for job in jobs_failed:
        job_uuid = job['uuid']
        logs = job_logs(api, job)
        # Find the first permanent task failure, and collect the
        # preceding log lines.
        for i, lg in enumerate(logs):
            if is_failed_task(lg):
                # Get preceding log records to provide context.
                log_start = i - LOG_CONTEXT_LINES if i >= LOG_CONTEXT_LINES else 0
                log_end = i + 1
                lastlogs = ''.join(logs[log_start:log_end])
                # Try to identify the type of failure.
                fail_reason = 'unknown'
                for rgx in JOB_FAILURE_TYPES:
                    if re.search(rgx, lastlogs):
                        fail_reason = rgx
                        break
                jobs_failed_types.setdefault(fail_reason, set())
                jobs_failed_types[fail_reason].add(job_uuid)
                break
        else:
            # The job is recorded as "failed" but we could not find the
            # failure of any specific task; classify it as unknown.
            jobs_failed_types.setdefault('unknown', set())
            jobs_failed_types['unknown'].add(job_uuid)

    # Report percentages of successful, failed and unfinished jobs.
    print "Start: {:20s}".format(start_time)
    print "End:   {:20s}".format(end_time)
    print ""

    print "Overview"
    print ""

    job_start_count = len(jobs_created)
    job_success_count = len(jobs_successful)
    job_fail_count = len(jobs_failed)
    job_unfinished_count = job_start_count - job_success_count - job_fail_count
    # Guard against division by zero when no jobs were created in the window.
    job_start_total = float(job_start_count) if job_start_count else 1.0

    print "  Started:     {:4d}".format(job_start_count)
    print "  Successful:  {:4d} ({:3.0%})".format(
        job_success_count, job_success_count / job_start_total)
    print "  Failed:      {:4d} ({:3.0%})".format(
        job_fail_count, job_fail_count / job_start_total)
    print "  In progress: {:4d} ({:3.0%})".format(
        job_unfinished_count, job_unfinished_count / job_start_total)
    print ""

    # Report failure types.
    failure_summary = ""
    failure_detail = ""

    for failtype, job_uuids in jobs_failed_types.iteritems():
        failstat = "  {:s} {:4d} ({:3.0%})\n".format(
            failtype, len(job_uuids), len(job_uuids) / float(job_fail_count))
        failure_summary = failure_summary + failstat
        failure_detail = failure_detail + failstat
        for j in job_uuids:
            failure_detail = failure_detail + "    http://crvr.se/{}\n".format(j)
        failure_detail = failure_detail + "\n"

    print "Failures by class"
    print ""
    print failure_summary

    print "Failures by class (detail):"
    print ""
    print failure_detail

    return 0


if __name__ == "__main__":
    sys.exit(main())