# Useful configuration variables:

# Number of log lines to use as context in diagnosing failure.
LOG_CONTEXT_LINES = 10

# Regex that signifies a failed task.  Written as a raw string so that
# "\d" reaches the regex engine verbatim instead of relying on Python
# passing an unrecognized string escape through unchanged.
# NOTE: the parentheses form a capture group; they do not match literal
# parentheses.  The pattern therefore matches " <digits> failure "
# followed by any text ending in "permanent".
FAILED_TASK_REGEX = re.compile(r' \d+ failure (.*permanent)')

# List of regexes by which to classify failures.
JOB_FAILURE_TYPES = ['User not found on host']
# Parse the command-line arguments for the Crunch failure report.
#
# Builds an argparse parser, parses 'arguments' with it, and then checks
# the start and end values: each one is validated only when it is truthy
# (i.e. was supplied), and ValueError(<offending value>) is raised when it
# fails is_valid_timestamp().
#
# NOTE(review): this listing omits some source lines -- the first argument
# of each add_argument() call and whatever follows the validation checks.
# Presumably the options are named to produce args.start / args.end and
# the parsed namespace is returned to the caller; confirm against the
# complete file.
22 def parse_arguments(arguments):
23 arg_parser = argparse.ArgumentParser(
24 description='Produce a report of Crunch failures within a specified time range')
26 arg_parser.add_argument(
28 help='Start date and time')
29 arg_parser.add_argument(
31 help='End date and time')
33 args = arg_parser.parse_args(arguments)
35 if args.start and not is_valid_timestamp(args.start):
36 raise ValueError(args.start)
37 if args.end and not is_valid_timestamp(args.end):
38 raise ValueError(args.end)
# Format the datetime 'when' as a "%Y-%m-%dT%H:%M:%SZ" string.
#
# The trailing "Z" is a literal character in the strftime() format; no
# timezone conversion happens here, so 'when' is presumably expected to
# already be expressed in UTC -- confirm with callers.
#
# NOTE(review): the lines between the start of the docstring and the
# utcnow() assignment are not visible in this listing.  Presumably the
# assignment is guarded so that utcnow() is used only when 'when' is None
# (matching the "Defaults to the ..." wording); confirm against the
# complete file.
43 def api_timestamp(when=None):
44 """Returns a string representing the timestamp 'when' in a format
45 suitable for delivering to the API server. Defaults to the
49 when = datetime.datetime.utcnow()
50 return when.strftime("%Y-%m-%dT%H:%M:%SZ")
def is_valid_timestamp(ts):
    """Check whether 'ts' has the form YYYY-MM-DDTHH:MM:SSZ.

    Only the shape is checked (digit counts and separators), not whether
    the fields form a real calendar date or time of day.  The pattern is
    anchored at both ends, so any text after the final 'Z' (including a
    trailing newline) causes the value to be rejected.

    Returns the match object (truthy) when 'ts' is well-formed, else None.
    """
    return re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z\Z', ts)
# Return whatever arvados.util.list_all() yields for the records whose
# 'created_at' value lies between 'start' and 'end', both bounds inclusive
# ('>=' and '<=').  The filter list is passed JSON-encoded via the
# 'filters' keyword argument.
#
# NOTE(review): the first argument to list_all() is not visible in this
# listing; presumably it is the jobs "list" method obtained from 'api' --
# confirm against the complete file.
56 def jobs_created_between_dates(api, start, end):
57 return arvados.util.list_all(
59 filters=json.dumps([ ['created_at', '>=', start],
60 ['created_at', '<=', end] ]))
def job_logs(api, job):
    """Return the contents of the log for this job, as a list of lines.

    The log is read from the file "<job uuid>.log.txt" inside the
    collection referenced by job['log'].
    """
    log_filename = "{}.log.txt".format(job['uuid'])
    log_collection = arvados.CollectionReader(job['log'], api)
    # Close the log file as soon as it has been read instead of leaving
    # the reader open until it happens to be garbage collected.
    with log_collection.open(log_filename) as log_file:
        return log_file.readlines()
def is_failed_task(logline):
    """Return True if FAILED_TASK_REGEX matches anywhere in 'logline'."""
    # Compare against None by identity, which is the idiomatic test and
    # cannot be affected by a custom __eq__/__ne__.
    return FAILED_TASK_REGEX.search(logline) is not None
# Entry point: report on Crunch jobs created within a time window.
#
# Steps visible in this listing:
#   1. Parse 'arguments' and create an Arvados 'v1' API client.
#   2. Choose the window: args.start / args.end when given, otherwise the
#      24 hours ending at the current utcnow() time.
#   3. Fetch the jobs created in that window and split out those whose
#      state is 'Failed' and those whose state is 'Complete'.
#   4. For each failed job, scan its log for a line accepted by
#      is_failed_task(), join nearby log lines into 'lastlogs', and
#      classify the failure by searching 'lastlogs' with the regexes in
#      JOB_FAILURE_TYPES ('unknown' being the initial classification).
#      Job UUIDs are collected per classification in 'jobs_failed_types'.
#   5. Print counts and percentages, then the failures grouped by class.
#
# NOTE(review): this listing has lost its indentation and omits a number
# of source lines (e.g. the definition of 'log_end', the body of the
# 'if re.search(...)' test, the initialisation of 'failure_summary' and
# 'failure_detail', the loop that binds 'j', and the end of the function),
# so the exact nesting of the "unknown" fallback and the function's return
# value cannot be determined from here.
# NOTE(review): the 'stdout' and 'stderr' parameters are not referenced by
# any visible line; output is produced with print statements.
# NOTE(review): the print statements and dict.iteritems() are Python 2
# only constructs.
74 def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
75 args = parse_arguments(arguments)
77 api = arvados.api('v1')
79 now = datetime.datetime.utcnow()
80 start_time = args.start or api_timestamp(now - datetime.timedelta(days=1))
81 end_time = args.end or api_timestamp(now)
83 # Find all jobs created within the specified window,
84 # and their corresponding job logs.
85 jobs_created = jobs_created_between_dates(api, start_time, end_time)
86 jobs_failed = [job for job in jobs_created if job['state'] == 'Failed']
87 jobs_successful = [job for job in jobs_created if job['state'] == 'Complete']
89 # Find failed jobs and record the job failure text.
# Maps a failure classification string to the set of job UUIDs in it.
90 jobs_failed_types = {}
91 for job in jobs_failed:
92 job_uuid = job['uuid']
93 logs = job_logs(api, job)
94 # Find the first permanent task failure, and collect the
95 # preceding log lines.
96 for i, lg in enumerate(logs):
97 if is_failed_task(lg):
98 # Get preceding log record to provide context.
# Clamp the start of the context window at the beginning of the log.
99 log_start = i - LOG_CONTEXT_LINES if i >= LOG_CONTEXT_LINES else 0
101 lastlogs = ''.join(logs[log_start:log_end])
102 # try to identify the type of failure.
103 fail_reason = 'unknown'
104 for rgx in JOB_FAILURE_TYPES:
105 if re.search(rgx, lastlogs):
108 jobs_failed_types.setdefault(fail_reason, set())
109 jobs_failed_types[fail_reason].add(job_uuid)
111 # If we got here, the job is recorded as "failed" but we
112 # could not find the failure of any specific task.
113 jobs_failed_types.setdefault('unknown', set())
114 jobs_failed_types['unknown'].add(job_uuid)
116 # Report percentages of successful, failed and unfinished jobs.
117 print "Start: {:20s}".format(start_time)
118 print "End: {:20s}".format(end_time)
124 job_start_count = len(jobs_created)
125 job_success_count = len(jobs_successful)
126 job_fail_count = len(jobs_failed)
# "Unfinished" is every job whose state is neither 'Complete' nor 'Failed'.
127 job_unfinished_count = job_start_count - job_success_count - job_fail_count
# NOTE(review): the three percentages below divide by job_start_count;
# this looks like it raises ZeroDivisionError when no jobs were created
# in the window -- confirm whether an elided line guards against that.
129 print " Started: {:4d}".format(job_start_count)
130 print " Successful: {:4d} ({:3.0%})".format(
131 job_success_count, job_success_count / float(job_start_count))
132 print " Failed: {:4d} ({:3.0%})".format(
133 job_fail_count, job_fail_count / float(job_start_count))
134 print " In progress: {:4d} ({:3.0%})".format(
135 job_unfinished_count, job_unfinished_count / float(job_start_count))
138 # Report failure types.
# Each class contributes one statistics line to both the summary and the
# detail text; the detail text additionally gets one URL line per 'j'.
142 for failtype, job_uuids in jobs_failed_types.iteritems():
143 failstat = " {:s} {:4d} ({:3.0%})\n".format(
144 failtype, len(job_uuids), len(job_uuids) / float(job_fail_count))
145 failure_summary = failure_summary + failstat
146 failure_detail = failure_detail + failstat
148 failure_detail = failure_detail + " http://crvr.se/{}\n".format(j)
149 failure_detail = failure_detail + "\n"
151 print "Failures by class"
153 print failure_summary
155 print "Failures by class (detail):"
162 if __name__ == "__main__":