Merge branch '13593-async-perm-graph-update'
[arvados.git] / services / api / script / crunch_failure_report.py
1 #! /usr/bin/env python
2 # Copyright (C) The Arvados Authors. All rights reserved.
3 #
4 # SPDX-License-Identifier: AGPL-3.0
5
6 import argparse
7 import datetime
8 import json
9 import re
10 import sys
11
12 import arvados
13
# Useful configuration variables:

# Number of log lines to use as context in diagnosing failure.
LOG_CONTEXT_LINES = 10

# Regex that signifies a failed task.  Raw string so the \d escape
# reaches the regex engine instead of being parsed as a (deprecated)
# string escape.
FAILED_TASK_REGEX = re.compile(r' \d+ failure (.*permanent)')

# Regular expressions used to classify failure types.
JOB_FAILURE_TYPES = {
    'sys/docker': 'Cannot destroy container',
    'crunch/node': 'User not found on host',
    'slurm/comm':  'Communication connection failure'
}
28
def parse_arguments(arguments):
    """Parse and validate command-line arguments.

    Raises ValueError when --start or --end is supplied but is not a
    valid API-style timestamp.
    """
    parser = argparse.ArgumentParser(
        description='Produce a report of Crunch failures within a specified time range')
    parser.add_argument(
        '--start',
        help='Start date and time')
    parser.add_argument(
        '--end',
        help='End date and time')

    parsed = parser.parse_args(arguments)

    # Both bounds are optional, but any supplied value must be well-formed.
    for bound in (parsed.start, parsed.end):
        if bound and not is_valid_timestamp(bound):
            raise ValueError(bound)

    return parsed
48
49
def api_timestamp(when=None):
    """Format the datetime 'when' as a timestamp string suitable for
    delivering to the API server.  Defaults to the current UTC time.
    """
    target = datetime.datetime.utcnow() if when is None else when
    return target.strftime("%Y-%m-%dT%H:%M:%SZ")
58
59
def is_valid_timestamp(ts):
    """Return True when 'ts' is a complete API-style timestamp.

    The pattern is anchored at the end with '$': bare re.match only
    anchors the start, so strings with trailing garbage after the 'Z'
    (e.g. '2017-01-01T00:00:00Zjunk') used to validate incorrectly.
    """
    return re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', ts) is not None
62
63
def jobs_created_between_dates(api, start, end):
    """Return every job whose created_at timestamp falls in [start, end]."""
    created_window = [['created_at', '>=', start],
                      ['created_at', '<=', end]]
    return arvados.util.list_all(api.jobs().list,
                                 filters=json.dumps(created_window))
69
70
def job_logs(api, job):
    """Return the contents of the job's log as a list of lines.

    Jobs with no log collection yield an empty list.
    """
    if not job['log']:
        return []
    reader = arvados.CollectionReader(job['log'], api)
    return reader.open("{}.log.txt".format(job['uuid'])).readlines()
78
79
# Memoization cache: user UUID -> full name (or UUID on lookup failure).
user_names = {}
def job_user_name(api, user_uuid):
    """Return the full name of the given user, memoized per UUID.

    Falls back to the UUID itself when the API lookup fails, so the
    report can still be produced for deleted/inaccessible users.
    """
    if user_uuid in user_names:
        return user_names[user_uuid]
    try:
        full_name = api.users().get(uuid=user_uuid).execute()['full_name']
    except arvados.errors.ApiError:
        full_name = user_uuid
    user_names[user_uuid] = full_name
    return full_name
91
92
# Memoization cache: job UUID -> pipeline name (or "" when unknown).
job_pipeline_names = {}
def job_pipeline_name(api, job_uuid):
    """Return the name of the pipeline the given job belongs to.

    Finds the first pipeline instance whose components mention the job
    UUID.  Falls back to the pipeline template's name when the instance
    is unnamed, and to "" when no pipeline can be identified.  Results
    are memoized per job UUID.
    """
    def _lookup_pipeline_name(api, job_uuid):
        try:
            pipelines = api.pipeline_instances().list(
                filters='[["components", "like", "%{}%"]]'.format(job_uuid)).execute()
            pi = pipelines['items'][0]
            if pi['name']:
                return pi['name']
            else:
                # Use the pipeline template name
                pt = api.pipeline_templates().get(uuid=pi['pipeline_template_uuid']).execute()
                return pt['name']
        # KeyError added to the fallback set: an API response lacking
        # 'items', 'name', or 'pipeline_template_uuid' previously
        # crashed the report instead of returning "".
        except (KeyError, TypeError, ValueError, IndexError):
            return ""

    if job_uuid not in job_pipeline_names:
        job_pipeline_names[job_uuid] = _lookup_pipeline_name(api, job_uuid)
    return job_pipeline_names[job_uuid]
112
113
def is_failed_task(logline):
    """Return True when the log line records a permanent task failure.

    Uses 'is not None' rather than '!= None': identity comparison is
    the idiomatic (and PEP 8 mandated) way to test for None.
    """
    return FAILED_TASK_REGEX.search(logline) is not None
116
117
118 def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
119     args = parse_arguments(arguments)
120
121     api = arvados.api('v1')
122
123     now = datetime.datetime.utcnow()
124     start_time = args.start or api_timestamp(now - datetime.timedelta(days=1))
125     end_time = args.end or api_timestamp(now)
126
127     # Find all jobs created within the specified window,
128     # and their corresponding job logs.
129     jobs_created = jobs_created_between_dates(api, start_time, end_time)
130     jobs_by_state = {}
131     for job in jobs_created:
132         jobs_by_state.setdefault(job['state'], [])
133         jobs_by_state[job['state']].append(job)
134
135     # Find failed jobs and record the job failure text.
136
137     # failure_stats maps failure types (e.g. "sys/docker") to
138     # a set of job UUIDs that failed for that reason.
139     failure_stats = {}
140     for job in jobs_by_state['Failed']:
141         job_uuid = job['uuid']
142         logs = job_logs(api, job)
143         # Find the first permanent task failure, and collect the
144         # preceding log lines.
145         failure_type = None
146         for i, lg in enumerate(logs):
147             if is_failed_task(lg):
148                 # Get preceding log record to provide context.
149                 log_start = i - LOG_CONTEXT_LINES if i >= LOG_CONTEXT_LINES else 0
150                 log_end = i + 1
151                 lastlogs = ''.join(logs[log_start:log_end])
152                 # try to identify the type of failure.
153                 for key, rgx in JOB_FAILURE_TYPES.iteritems():
154                     if re.search(rgx, lastlogs):
155                         failure_type = key
156                         break
157             if failure_type is not None:
158                 break
159         if failure_type is None:
160             failure_type = 'unknown'
161         failure_stats.setdefault(failure_type, set())
162         failure_stats[failure_type].add(job_uuid)
163
164     # Report percentages of successful, failed and unfinished jobs.
165     print "Start: {:20s}".format(start_time)
166     print "End:   {:20s}".format(end_time)
167     print ""
168
169     print "Overview"
170     print ""
171
172     job_start_count = len(jobs_created)
173     print "  {: <25s} {:4d}".format('Started', job_start_count)
174     for state in ['Complete', 'Failed', 'Queued', 'Cancelled', 'Running']:
175         if state in jobs_by_state:
176             job_count = len(jobs_by_state[state])
177             job_percentage = job_count / float(job_start_count)
178             print "  {: <25s} {:4d} ({: >4.0%})".format(state,
179                                                         job_count,
180                                                         job_percentage)
181     print ""
182
183     # Report failure types.
184     failure_summary = ""
185     failure_detail = ""
186
187     # Generate a mapping from failed job uuids to job records, to assist
188     # in generating detailed statistics for job failures.
189     jobs_failed_map = { job['uuid']: job for job in jobs_by_state.get('Failed', []) }
190
191     # sort the failure stats in descending order by occurrence.
192     sorted_failures = sorted(failure_stats,
193                              reverse=True,
194                              key=lambda failure_type: len(failure_stats[failure_type]))
195     for failtype in sorted_failures:
196         job_uuids = failure_stats[failtype]
197         failstat = "  {: <25s} {:4d} ({: >4.0%})\n".format(
198             failtype,
199             len(job_uuids),
200             len(job_uuids) / float(len(jobs_by_state['Failed'])))
201         failure_summary = failure_summary + failstat
202         failure_detail = failure_detail + failstat
203         for j in job_uuids:
204             job_info = jobs_failed_map[j]
205             job_owner = job_user_name(api, job_info['modified_by_user_uuid'])
206             job_name = job_pipeline_name(api, job_info['uuid'])
207             failure_detail = failure_detail + "    {}  {: <15.15s}  {:29.29s}\n".format(j, job_owner, job_name)
208         failure_detail = failure_detail + "\n"
209
210     print "Failures by class"
211     print ""
212     print failure_summary
213
214     print "Failures by class (detail)"
215     print ""
216     print failure_detail
217
218     return 0
219
220
# Script entry point: propagate main()'s return value as the process
# exit status.
if __name__ == "__main__":
    sys.exit(main())