4598: account for queued and cancelled jobs, fix sorting
[arvados.git] / services / api / script / crunch-failure-report.py
1 #! /usr/bin/env python
2
3 import argparse
4 import datetime
5 import json
6 import re
7 import sys
8
9 import arvados
10
# Useful configuration variables:

# Number of log lines preceding a failed-task line to use as context
# in diagnosing failure.
LOG_CONTEXT_LINES = 10

# Regex that signifies a failed task.  Written as a raw string so that
# "\d" reaches the regex engine as-is instead of being treated as an
# (invalid) string escape sequence.  Note that the parentheses form a
# capture group; they do not match literal parentheses.
FAILED_TASK_REGEX = re.compile(r' \d+ failure (.*permanent)')

# Regular expressions used to classify failure types.  Keys are the
# failure class names shown in the report; values are patterns searched
# for in the log context around a failed task.
JOB_FAILURE_TYPES = {
    'sys/docker': 'Cannot destroy container',
    'crunch/node': 'User not found on host',
    'slurm/comm':  'Communication connection failure',
}
25
def parse_arguments(arguments):
    """Parse the command-line options for the failure report.

    Two optional flags are recognized, --start and --end.  If either
    one is supplied but rejected by is_valid_timestamp(), ValueError is
    raised with the offending value as its argument.  Otherwise the
    argparse namespace (attributes 'start' and 'end') is returned.
    """
    parser = argparse.ArgumentParser(
        description='Produce a report of Crunch failures within a specified time range')
    parser.add_argument('--start', help='Start date and time')
    parser.add_argument('--end', help='End date and time')

    parsed = parser.parse_args(arguments)

    # --start is checked before --end, so when both are malformed the
    # start value is the one reported.
    for timestamp in (parsed.start, parsed.end):
        if timestamp and not is_valid_timestamp(timestamp):
            raise ValueError(timestamp)

    return parsed
45
46
def api_timestamp(when=None):
    """Format the datetime 'when' as a "YYYY-MM-DDTHH:MM:SSZ" string,
    the form this script sends to the API server.  If 'when' is
    omitted (None), the current UTC time is formatted instead.
    """
    moment = when if when is not None else datetime.datetime.utcnow()
    api_format = "%Y-%m-%dT%H:%M:%SZ"
    return moment.strftime(api_format)
55
56
def is_valid_timestamp(ts):
    """Check that 'ts' has exactly the form YYYY-MM-DDTHH:MM:SSZ.

    Returns a (truthy) match object when the whole string matches and
    None otherwise.  Only the shape is checked: the fields must be
    digits, but they are not range-checked as a real date or time.
    """
    # re.match() anchors only the start of the string; the trailing \Z
    # rejects values with extra characters (including a newline) after
    # the final "Z".
    return re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z\Z', ts)
59
60
def jobs_created_between_dates(api, start, end):
    """List the jobs whose 'created_at' lies between 'start' and 'end'
    (both bounds inclusive).

    The bounds are passed to the API as a JSON-encoded filter list; the
    result is whatever arvados.util.list_all() returns for the jobs
    list call.
    """
    created_window = [
        ['created_at', '>=', start],
        ['created_at', '<=', end],
    ]
    encoded_filters = json.dumps(created_window)
    return arvados.util.list_all(api.jobs().list, filters=encoded_filters)
66
67
def job_logs(api, job):
    """Return the contents of this job's log as a list of lines.

    A job whose 'log' field is empty yields an empty list.  Otherwise
    the log collection named by job['log'] is opened and the file
    "<job uuid>.log.txt" inside it is read.
    """
    if not job['log']:
        return []
    reader = arvados.CollectionReader(job['log'], api)
    log_file = reader.open("{}.log.txt".format(job['uuid']))
    return log_file.readlines()
75
76
# Cache of user UUID -> display name, filled in by job_user_name().
user_names = {}
def job_user_name(api, user_uuid):
    """Return the full name of the user identified by 'user_uuid'.

    Results are remembered in the module-level 'user_names' dict, so
    the API is asked at most once per UUID.  If the API lookup raises
    arvados.errors.ApiError, the UUID itself is used (and cached) as
    the name.
    """
    if user_uuid in user_names:
        return user_names[user_uuid]

    try:
        display_name = api.users().get(uuid=user_uuid).execute()['full_name']
    except arvados.errors.ApiError:
        display_name = user_uuid

    user_names[user_uuid] = display_name
    return display_name
88
89
# Cache of job UUID -> pipeline name, filled in by job_pipeline_name().
job_pipeline_names = {}
def job_pipeline_name(api, job_uuid):
    """Return a display name for the pipeline that 'job_uuid' belongs to.

    The first pipeline instance whose 'components' mention the job UUID
    is used.  Its own name is preferred; if it has none, the name of
    its pipeline template is used instead.  An empty string is returned
    when no pipeline instance is found, or when the instance has
    neither a name nor a template.  Results are cached in the
    module-level 'job_pipeline_names' dict.
    """
    def _lookup_pipeline_name(api, job_uuid):
        # Build the filter with json.dumps (as jobs_created_between_dates
        # does) rather than by pasting the UUID into a JSON literal.
        filters = json.dumps([["components", "like", "%{}%".format(job_uuid)]])
        pipelines = api.pipeline_instances().list(filters=filters).execute()
        if not pipelines['items']:
            return ""

        pi = pipelines['items'][0]
        if pi['name']:
            return pi['name']

        # Fall back to the pipeline template name.  An instance with no
        # template has nothing to look up, so skip the API call instead
        # of requesting a template with an empty UUID.
        template_uuid = pi.get('pipeline_template_uuid')
        if not template_uuid:
            return ""
        pt = api.pipeline_templates().get(uuid=template_uuid).execute()
        if pt:
            return pt['name']
        return ""

    if job_uuid not in job_pipeline_names:
        job_pipeline_names[job_uuid] = _lookup_pipeline_name(api, job_uuid)
    return job_pipeline_names[job_uuid]
109
110
def is_failed_task(logline):
    """Return True if 'logline' matches FAILED_TASK_REGEX anywhere in
    the line, False otherwise.
    """
    # Identity comparison is the idiomatic (and override-proof) way to
    # test for None.
    return FAILED_TASK_REGEX.search(logline) is not None
113
114
def _classify_failure(logs):
    """Return the JOB_FAILURE_TYPES key matching the log context of a failed task, or 'unknown'."""
    for i, line in enumerate(logs):
        if not is_failed_task(line):
            continue
        # Use the failed-task line plus up to LOG_CONTEXT_LINES
        # preceding lines as context for classification.
        context = ''.join(logs[max(0, i - LOG_CONTEXT_LINES):i + 1])
        for failure_type, pattern in JOB_FAILURE_TYPES.items():
            if re.search(pattern, context):
                return failure_type
        # No known pattern near this failure; keep scanning later
        # failed-task lines.
    return 'unknown'


def _write_line(stream, text=""):
    """Write 'text' followed by a newline to 'stream'."""
    stream.write(text + "\n")


def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    """Produce the Crunch failure report.

    arguments -- command-line argument list handed to parse_arguments()
                 (None lets argparse read sys.argv).
    stdout    -- stream the report is written to.
    stderr    -- accepted for interface compatibility; not used.

    The reporting window is --start/--end when given; otherwise it
    defaults to the 24 hours ending now (UTC).  The report lists job
    counts by state, then failed jobs grouped by failure class, first
    as a summary and then with per-job owner and pipeline name.

    Returns 0.  Raises ValueError (from parse_arguments) for a
    malformed timestamp.
    """
    args = parse_arguments(arguments)

    api = arvados.api('v1')

    now = datetime.datetime.utcnow()
    start_time = args.start or api_timestamp(now - datetime.timedelta(days=1))
    end_time = args.end or api_timestamp(now)

    # Find all jobs created within the specified window and group them
    # by state.
    jobs_created = jobs_created_between_dates(api, start_time, end_time)
    jobs_by_state = {}
    for job in jobs_created:
        jobs_by_state.setdefault(job['state'], []).append(job)

    # A window with no failed jobs has no 'Failed' key at all; treat
    # that as an empty list rather than failing with KeyError.
    failed_jobs = jobs_by_state.get('Failed', [])

    # failure_stats maps failure types (e.g. "sys/docker") to
    # a set of job UUIDs that failed for that reason.
    failure_stats = {}
    for job in failed_jobs:
        failure_type = _classify_failure(job_logs(api, job))
        failure_stats.setdefault(failure_type, set()).add(job['uuid'])

    # Report percentages of successful, failed and unfinished jobs.
    _write_line(stdout, "Start: {:20s}".format(start_time))
    _write_line(stdout, "End:   {:20s}".format(end_time))
    _write_line(stdout)

    _write_line(stdout, "Overview")
    _write_line(stdout)

    job_start_count = len(jobs_created)
    _write_line(stdout, "  {: <25s} {:4d}".format('Started', job_start_count))
    for state in ['Complete', 'Failed', 'Queued', 'Cancelled', 'Running']:
        # A state can only be present if at least one job exists, so
        # job_start_count is non-zero whenever the division happens.
        if state in jobs_by_state:
            job_count = len(jobs_by_state[state])
            job_percentage = job_count / float(job_start_count)
            _write_line(stdout, "  {: <25s} {:4d} ({: >4.0%})".format(
                state, job_count, job_percentage))
    _write_line(stdout)

    # Report failure types.  Pieces are collected in lists and joined
    # once instead of growing strings by repeated concatenation.
    summary_parts = []
    detail_parts = []

    # Generate a mapping from failed job uuids to job records, to assist
    # in generating detailed statistics for job failures.
    jobs_failed_map = {job['uuid']: job for job in failed_jobs}

    # Sort the failure stats in descending order by occurrence.
    sorted_failures = sorted(
        failure_stats,
        reverse=True,
        key=lambda failure_type: len(failure_stats[failure_type]))
    for failtype in sorted_failures:
        job_uuids = failure_stats[failtype]
        failstat = "  {: <25s} {:4d} ({: >4.0%})\n".format(
            failtype,
            len(job_uuids),
            len(job_uuids) / float(len(failed_jobs)))
        summary_parts.append(failstat)
        detail_parts.append(failstat)
        for j in job_uuids:
            job_info = jobs_failed_map[j]
            job_owner = job_user_name(api, job_info['modified_by_user_uuid'])
            job_name = job_pipeline_name(api, job_info['uuid'])
            detail_parts.append(
                "    {}  {: <15.15s}  {:29.29s}\n".format(j, job_owner, job_name))
        detail_parts.append("\n")

    _write_line(stdout, "Failures by class")
    _write_line(stdout)
    _write_line(stdout, "".join(summary_parts))

    _write_line(stdout, "Failures by class (detail)")
    _write_line(stdout)
    _write_line(stdout, "".join(detail_parts))

    return 0
216
217
# When run as a script, produce the report and use main()'s return
# value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)