4598: added failure types and short names
author Tim Pierce <twp@curoverse.com>
Tue, 30 Dec 2014 21:07:02 +0000 (16:07 -0500)
committer Tim Pierce <twp@curoverse.com>
Tue, 30 Dec 2014 21:07:02 +0000 (16:07 -0500)
Added the sys/docker failure type. Failures are now reported by short
failure name rather than by regex.

services/api/script/crunch-failure-report.py

index e57205383e13a718466da7978bd88fed79d5a7a7..63d729cfe51f7547192f33af4a9c6509abf85ade 100755 (executable)
@@ -16,8 +16,11 @@ LOG_CONTEXT_LINES = 10
 # Regex that signifies a failed task.
 FAILED_TASK_REGEX = re.compile(' \d+ failure (.*permanent)')
 
-# List of regexes by which to classify failures.
-JOB_FAILURE_TYPES = [ 'User not found on host' ]
+# Regular expressions used to classify failure types.
+JOB_FAILURE_TYPES = {
+    'sys/docker': 'Cannot destroy container',
+    'crunch/node': 'User not found on host'
+}
 
 def parse_arguments(arguments):
     arg_parser = argparse.ArgumentParser(
@@ -87,7 +90,10 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     jobs_successful = [job for job in jobs_created if job['state'] == 'Complete']
 
     # Find failed jobs and record the job failure text.
-    jobs_failed_types = {}
+
+    # failure_stats maps failure types (e.g. "sys/docker") to
+    # a set of job UUIDs that failed for that reason.
+    failure_stats = {}
     for job in jobs_failed:
         job_uuid = job['uuid']
         logs = job_logs(api, job)
@@ -100,18 +106,18 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
                 log_end = i + 1
                 lastlogs = ''.join(logs[log_start:log_end])
                 # try to identify the type of failure.
-                fail_reason = 'unknown'
-                for rgx in JOB_FAILURE_TYPES:
+                failure_type = 'unknown'
+                for key, rgx in JOB_FAILURE_TYPES.iteritems():
                     if re.search(rgx, lastlogs):
-                        fail_reason = rgx
+                        failure_type = key
                         break
-                jobs_failed_types.setdefault(fail_reason, set())
-                jobs_failed_types[fail_reason].add(job_uuid)
+                failure_stats.setdefault(failure_type, set())
+                failure_stats[failure_type].add(job_uuid)
                 break
             # If we got here, the job is recorded as "failed" but we
             # could not find the failure of any specific task.
-            jobs_failed_types.setdefault('unknown', set())
-            jobs_failed_types['unknown'].add(job_uuid)
+            failure_stats.setdefault('unknown', set())
+            failure_stats['unknown'].add(job_uuid)
 
     # Report percentages of successful, failed and unfinished jobs.
     print "Start: {:20s}".format(start_time)
@@ -139,7 +145,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     failure_summary = ""
     failure_detail = ""
 
-    for failtype, job_uuids in jobs_failed_types.iteritems():
+    for failtype, job_uuids in failure_stats.iteritems():
         failstat = "  {:s} {:4d} ({:3.0%})\n".format(
             failtype, len(job_uuids), len(job_uuids) / float(job_fail_count))
         failure_summary = failure_summary + failstat