13804: Smarter shutdown behavior WIP
[arvados.git] / services / nodemanager / tests / integration_test.py
index 24ae701a656b452a841e1f99612c95165a6870c8..284b361cea231be765e0f8fa38c41565807d2a61 100755 (executable)
@@ -25,9 +25,13 @@ from functools import partial
 import arvados
 import StringIO
 
+formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
+
+handler = logging.StreamHandler(sys.stderr)
+handler.setFormatter(formatter)
 logger = logging.getLogger("logger")
 logger.setLevel(logging.INFO)
-logger.addHandler(logging.StreamHandler(sys.stderr))
+logger.addHandler(handler)
 
 detail = logging.getLogger("detail")
 detail.setLevel(logging.INFO)
@@ -35,7 +39,9 @@ if os.environ.get("ANMTEST_LOGLEVEL"):
     detail_content = sys.stderr
 else:
     detail_content = StringIO.StringIO()
-detail.addHandler(logging.StreamHandler(detail_content))
+handler = logging.StreamHandler(detail_content)
+handler.setFormatter(formatter)
+detail.addHandler(handler)
 
 fake_slurm = None
 compute_nodes = None
@@ -52,14 +58,14 @@ def update_script(path, val):
 def set_squeue(g):
     global all_jobs
     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '1|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items()))
+                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
     return 0
 
 def set_queue_unsatisfiable(g):
     global all_jobs, unsatisfiable_job_scancelled
     # Simulate a job requesting a 99 core node.
     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '99|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items()))
+                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
     update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
                   "\ntouch %s" % unsatisfiable_job_scancelled)
     return 0
@@ -78,7 +84,7 @@ def job_cancelled(g):
             ['event_type', '=', 'stderr'],
         ]).execute()['items'][0]
     if not re.match(
-            r"Requirements for a single node exceed the available cloud node size",
+            r"Constraints cannot be satisfied",
             log_entry['properties']['text']):
         return 1
     return 0
@@ -88,7 +94,7 @@ def node_paired(g):
     compute_nodes[g.group(1)] = g.group(3)
 
     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
+                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))
 
     for k,v in all_jobs.items():
         if v == "ReqNodeNotAvail":
@@ -101,7 +107,7 @@ def node_paired(g):
 
 def remaining_jobs(g):
     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
+                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))
 
     for k,v in all_jobs.items():
         all_jobs[k] = "Running"
@@ -113,13 +119,16 @@ def remaining_jobs(g):
 
 def node_busy(g):
     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s idle'" % (v) for k,v in compute_nodes.items()))
+                  "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
     return 0
 
 def node_shutdown(g):
     global compute_nodes
-    del compute_nodes[g.group(1)]
-    return 0
+    if g.group(1) in compute_nodes:
+        del compute_nodes[g.group(1)]
+        return 0
+    else:
+        return 1
 
 def jobs_req(g):
     global all_jobs
@@ -278,21 +287,28 @@ def main():
             # Provider
             "azure"),
         "test_single_node_azure": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
+            ],
+            # Checks (things that shouldn't happen)
+            {
                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                 r".*Setting node quota.*": fail,
             },
+            # Driver class
             "arvnodeman.test.fake_driver.FakeDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
             "azure"),
         "test_multiple_nodes": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
@@ -305,46 +321,56 @@ def main():
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
+            ],
+            # Checks (things that shouldn't happen)
+            {
                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
                 r".*Setting node quota.*": fail,
             },
+            # Driver class
             "arvnodeman.test.fake_driver.FakeDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }, "azure"),
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_hit_quota": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
-            ], {
+            ],
+            # Checks (things that shouldn't happen)
+            {
                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
                 r".*Sending create_node request.*": partial(expect_count, 5)
             },
+            # Driver class
             "arvnodeman.test.fake_driver.QuotaDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }, "azure"),
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_probe_quota": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*sending request", jobs_req),
@@ -358,18 +384,24 @@ def main():
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
+            ],
+            # Checks (things that shouldn't happen)
+            {
                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
                 r".*Sending create_node request.*": partial(expect_count, 9)
             },
+            # Driver class
             "arvnodeman.test.fake_driver.QuotaDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }, "azure"),
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_no_hang_failing_node_create": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Client error: nope", noop),
@@ -377,53 +409,74 @@ def main():
                 (r".*Client error: nope", noop),
                 (r".*Client error: nope", noop),
             ],
+            # Checks (things that shouldn't happen)
             {},
+            # Driver class
             "arvnodeman.test.fake_driver.FailingDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }, "azure"),
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_retry_create": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
                 (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
             ],
+            # Checks (things that shouldn't happen)
             {},
+            # Driver class
             "arvnodeman.test.fake_driver.RetryDriver",
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
-         }, "azure"),
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_single_node_aws": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
+            ],
+            # Checks (things that shouldn't happen)
+            {
                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                 r".*Setting node quota.*": fail,
             },
+            # Driver class
             "arvnodeman.test.fake_driver.FakeAwsDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
             "ec2"),
         "test_single_node_gce": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
+            ],
+            # Checks (things that shouldn't happen)
+            {
                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                 r".*Setting node quota.*": fail,
             },
+            # Driver class
             "arvnodeman.test.fake_driver.FakeGceDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
             "gce")
     }