12538: Refactor crunch-run shutdown
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
index f024b0cffecb967a5c863a685d92895eb36e43ee..d5b55540f8e66eea3749d1828d3203ae89521f92 100755
@@ -1,4 +1,8 @@
 #!/usr/bin/env python
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
 """Integration test framework for node manager.
 
 Runs full node manager with an API server (needs ARVADOS_API_HOST and
@@ -36,6 +40,7 @@ detail.addHandler(logging.StreamHandler(detail_content))
 fake_slurm = None
 compute_nodes = None
 all_jobs = None
+unsatisfiable_job_scancelled = None
 
 def update_script(path, val):
     with open(path+"_", "w") as f:
@@ -50,6 +55,33 @@ def set_squeue(g):
                   "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
     return 0
 
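+# The fake squeue output below uses the same pipe-separated layout as
+# set_squeue above (presumably cpus|memory|disk|reason|job-uuid); the only
+# difference is the 99-cpu request, larger than any node size the test
+# cloud offers.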
+def set_queue_unsatisfiable(g):
+    global all_jobs, unsatisfiable_job_scancelled
+    # Simulate a job requesting a 99-core node.
+    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
+                  "\n".join("echo '99|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
+    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
+                  "\ntouch %s" % unsatisfiable_job_scancelled)
+    return 0
+
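+# Triggered by the "Cancelled unsatisfiable job '(uuid)'" log line; verifies
+# both expected side effects of the cancellation: the fake scancel script
+# ran, and a stderr log entry explaining the cancellation was recorded.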
+def job_cancelled(g):
+    global unsatisfiable_job_scancelled
+    cancelled_job = g.group(1)
+    api = arvados.api('v1')
+    # Check that 'scancel' was called
+    if not os.path.isfile(unsatisfiable_job_scancelled):
+        return 1
+    # Check for the log entry
+    log_items = api.logs().list(
+        filters=[
+            ['object_uuid', '=', cancelled_job],
+            ['event_type', '=', 'stderr'],
+        ]).execute()['items']
+    # Guard against an empty result so a missing log entry fails the check
+    # instead of raising IndexError.
+    if not log_items:
+        return 1
+    if not re.match(
+            r"Requirements for a single node exceed the available cloud node size",
+            log_items[0]['properties']['text']):
+        return 1
+    return 0
 
 def node_paired(g):
     global compute_nodes
@@ -109,8 +141,11 @@ def expect_count(count, checks, pattern, g):
         checks[pattern] = partial(expect_count, count-1)
         return 0
 
-def run_test(name, actions, checks, driver_class, jobs):
+def run_test(name, actions, checks, driver_class, jobs, provider):
     code = 0
+    global unsatisfiable_job_scancelled
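+    # Sentinel path in a fresh temp dir: the fake scancel script written by
+    # set_queue_unsatisfiable touches this file, which is how job_cancelled
+    # can tell that scancel was really invoked.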
+    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
+                                                "scancel_called")
 
     # Delete any stale node records
     api = arvados.api('v1')
@@ -137,7 +172,7 @@ def run_test(name, actions, checks, driver_class, jobs):
     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
 
     # Write configuration file for test
-    with open("tests/fake.cfg.template") as f:
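+    # Config templates are now selected per cloud provider, e.g.
+    # tests/fake_azure.cfg.template when provider is "azure".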
+    with open("tests/fake_%s.cfg.template" % provider) as f:
         open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
         with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
             cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
@@ -155,7 +190,7 @@ def run_test(name, actions, checks, driver_class, jobs):
 
     # Test main loop:
     # - Read line
-    # - Apply negative checks (thinks that are not supposed to happen)
+    # - Apply negative checks (things that are not supposed to happen)
     # - Check timeout
     # - Check if the next action should trigger
     # - If all actions are exhausted, terminate with test success
@@ -209,11 +244,12 @@ def run_test(name, actions, checks, driver_class, jobs):
         code = 1
 
     shutil.rmtree(fake_slurm)
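+    # Also clean up the temp dir holding the scancel sentinel.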
+    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
 
     if code == 0:
         logger.info("%s passed", name)
     else:
-        if isinstance(detail_content, StringIO()):
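+        # Compare against the StringIO.StringIO class; passing an instance
+        # as the second argument to isinstance() raises TypeError.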
+        if isinstance(detail_content, StringIO.StringIO):
             sys.stderr.write(detail_content.getvalue())
         logger.info("%s failed", name)
 
@@ -224,7 +260,24 @@ def main():
     # Test lifecycle.
 
     tests = {
-        "test_single_node": (
+        "test_unsatisfiable_jobs" : (
+            # Actions (pattern -> action)
+            [
+                (r".*Daemon started", set_queue_unsatisfiable),
+                (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
+            ],
+            # Checks (things that shouldn't happen)
+            {
+                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
+                r".*Trying to cancel job '(\S+)'": fail,
+            },
+            # Driver class
+            "arvnodeman.test.fake_driver.FakeDriver",
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
+        "test_single_node_azure": (
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
@@ -237,7 +290,8 @@ def main():
                 r".*Setting node quota.*": fail,
             },
             "arvnodeman.test.fake_driver.FakeDriver",
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}),
+            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            "azure"),
         "test_multiple_nodes": (
             [
                 (r".*Daemon started", set_squeue),
@@ -261,11 +315,10 @@ def main():
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }),
+         }, "azure"),
         "test_hit_quota": (
             [
                 (r".*Daemon started", set_squeue),
-                (r".*setting node quota to 3", noop),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
@@ -283,11 +336,10 @@ def main():
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }),
+         }, "azure"),
         "test_probe_quota": (
             [
                 (r".*Daemon started", set_squeue),
-                (r".*setting node quota to 3", noop),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
@@ -316,7 +368,7 @@ def main():
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }),
+         }, "azure"),
         "test_no_hang_failing_node_create": (
             [
                 (r".*Daemon started", set_squeue),
@@ -331,17 +383,48 @@ def main():
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }),
+         }, "azure"),
         "test_retry_create": (
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
+                (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
             ],
             {},
             "arvnodeman.test.fake_driver.RetryDriver",
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
-         })
+         }, "azure"),
+        "test_single_node_aws": (
+            [
+                (r".*Daemon started", set_squeue),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+            ], {
+                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+                r".*Setting node quota.*": fail,
+            },
+            "arvnodeman.test.fake_driver.FakeAwsDriver",
+            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            "ec2"),
+        "test_single_node_gce": (
+            [
+                (r".*Daemon started", set_squeue),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+            ], {
+                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+                r".*Setting node quota.*": fail,
+            },
+            "arvnodeman.test.fake_driver.FakeGceDriver",
+            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            "gce")
     }
 
     code = 0