Merge branch 'master' into 13804-no-shutdown-wanted-nodes
[arvados.git] / services / nodemanager / tests / integration_test.py
index 7b8ba391c9ced07de6a39d7b7952695d619b126e..1ba2957ee5544c9346bbb00bc3f0e2ad9e51276a 100755 (executable)
@@ -12,7 +12,7 @@ events or behaviors for each test.
 
 """
 
-import subprocess
+import subprocess32 as subprocess
 import os
 import sys
 import re
@@ -21,6 +21,7 @@ import logging
 import stat
 import tempfile
 import shutil
+import errno
 from functools import partial
 import arvados
 import StringIO
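The import swap above brings in subprocess32, the backport of the Python 3 subprocess module for POSIX builds of Python 2; among other things it adds timeout support and a fork path that is safe to use from threaded programs. The test imports it unconditionally, making it a hard dependency; for comparison, a minimal guarded-import sketch would degrade gracefully:

    try:
        import subprocess32 as subprocess  # backport: timeouts, thread-safe fork
    except ImportError:
        import subprocess  # stdlib fallback (plain Python 2 semantics)
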
@@ -58,14 +59,14 @@ def update_script(path, val):
 def set_squeue(g):
     global all_jobs
     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '1|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items()))
+                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
     return 0
 
 def set_queue_unsatisfiable(g):
     global all_jobs, unsatisfiable_job_scancelled
     # Simulate a job requesting a 99 core node.
     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '99|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items()))
+                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
     update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
                   "\ntouch %s" % unsatisfiable_job_scancelled)
     return 0
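For illustration, after this change the generated squeue stub for the unsatisfiable-job case prints one line per fake job, now with a seventh |-separated field (1234567890 here, an epoch-style timestamp; its exact meaning is assumed from its format rather than stated in the diff):

    #!/bin/sh
    echo '99|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf1|(null)|1234567890'
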
@@ -105,18 +106,6 @@ def node_paired(g):
 
     return 0
 
-def remaining_jobs(g):
-    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))
-
-    for k,v in all_jobs.items():
-        all_jobs[k] = "Running"
-
-    set_squeue(g)
-
-    return 0
-
-
 def node_busy(g):
     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                   "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
@@ -124,8 +113,12 @@ def node_busy(g):
 
 def node_shutdown(g):
     global compute_nodes
-    del compute_nodes[g.group(1)]
-    return 0
+    if g.group(1) in compute_nodes:
+        del compute_nodes[g.group(1)]
+        return 0
+    else:
+        return 1
+
 
 def jobs_req(g):
     global all_jobs
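The stricter node_shutdown above pays off because run_test (below) folds every action callback's return value into the test's exit code, so a shutdown reported for a cloud node that was never created, or was already removed, now fails the test rather than passing silently:

    code += v(g)            # node_shutdown returns 1 for an unknown node
    if code != 0:
        detail.error("Action failed")
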
@@ -186,8 +179,8 @@ def run_test(name, actions, checks, driver_class, jobs, provider):
                                       driver_class=driver_class,
                                       ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
 
-    # Tests must complete in less than 3 minutes.
-    timeout = time.time() + 180
+    # Tests must complete in less than 30 seconds.
+    timeout = time.time() + 30
     terminated = False
 
     # Now start node manager
@@ -215,7 +208,7 @@ def run_test(name, actions, checks, driver_class, jobs, provider):
                     if code != 0:
                         detail.error("Check failed")
                         if not terminated:
-                            p.terminate()
+                            p.kill()
                             terminated = True
 
             if terminated:
@@ -225,7 +218,7 @@ def run_test(name, actions, checks, driver_class, jobs, provider):
                 detail.error("Exceeded timeout with actions remaining: %s", actions)
                 code += 1
                 if not terminated:
-                    p.terminate()
+                    p.kill()
                     terminated = True
 
             k, v = actions[0]
@@ -236,11 +229,11 @@ def run_test(name, actions, checks, driver_class, jobs, provider):
                 code += v(g)
                 if code != 0:
                     detail.error("Action failed")
-                    p.terminate()
+                    p.kill()
                     terminated = True
 
             if not actions:
-                p.terminate()
+                p.kill()
                 terminated = True
     except KeyboardInterrupt:
         p.kill()
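For reference, the terminate-to-kill switch changes which signal the test sends the node manager process, with the standard subprocess semantics on POSIX:

    p.terminate()   # SIGTERM: a busy or wedged daemon may trap it and linger
    p.kill()        # SIGKILL: cannot be caught; the process dies immediately
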
@@ -256,7 +249,18 @@ def run_test(name, actions, checks, driver_class, jobs, provider):
         logger.info("%s passed", name)
     else:
         if isinstance(detail_content, StringIO.StringIO):
-            sys.stderr.write(detail_content.getvalue())
+            detail_content.seek(0)
+            chunk = detail_content.read(4096)
+            while chunk:
+                try:
+                    sys.stderr.write(chunk)
+                    chunk = detail_content.read(4096)
+                except IOError as e:
+                    if e.errno == errno.EAGAIN:
+                        # non-blocking stderr: pipe buffer is likely full; retry this chunk
+                        pass
+                    else:
+                        raise
         logger.info("%s failed", name)
 
     return code
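The chunked-write loop above retries the same 4096-byte chunk whenever a non-blocking stderr raises EAGAIN, spinning until the consumer drains the pipe. A variant sketch (not part of this commit; write_all and its parameters are illustrative names) that waits for writability instead of busy-looping:

    import errno
    import select

    def write_all(stream, data, chunk_size=4096):
        # Write data in chunks, tolerating EAGAIN from a non-blocking stream.
        pos = 0
        while pos < len(data):
            try:
                stream.write(data[pos:pos + chunk_size])
                pos += chunk_size
            except IOError as e:
                if e.errno != errno.EAGAIN:
                    raise
                select.select([], [stream], [])  # block until stream is writable
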
@@ -284,21 +288,28 @@ def main():
             # Provider
             "azure"),
         "test_single_node_azure": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
+            ],
+            # Checks (patterns that must not appear, or must match an expected count)
+            {
                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                 r".*Setting node quota.*": fail,
             },
+            # Driver class
             "arvnodeman.test.fake_driver.FakeDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
             "azure"),
         "test_multiple_nodes": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
@@ -311,46 +322,54 @@ def main():
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+            ],
+            # Checks (patterns that must not appear, or must match an expected count)
+            {
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
                 r".*Setting node quota.*": fail,
             },
+            # Driver class
             "arvnodeman.test.fake_driver.FakeDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }, "azure"),
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_hit_quota": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
-            ], {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+            ],
+            # Checks (patterns that must not appear, or must match an expected count)
+            {
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
                 r".*Sending create_node request.*": partial(expect_count, 5)
             },
+            # Driver class
             "arvnodeman.test.fake_driver.QuotaDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }, "azure"),
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_probe_quota": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*sending request", jobs_req),
@@ -364,18 +383,23 @@ def main():
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+            ],
+            # Checks (patterns that must not appear, or must match an expected count)
+            {
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
                 r".*Sending create_node request.*": partial(expect_count, 9)
             },
+            # Driver class
             "arvnodeman.test.fake_driver.QuotaDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }, "azure"),
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_no_hang_failing_node_create": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Client error: nope", noop),
@@ -383,53 +407,72 @@ def main():
                 (r".*Client error: nope", noop),
                 (r".*Client error: nope", noop),
             ],
+            # Checks (patterns that must not appear, or must match an expected count)
             {},
+            # Driver class
             "arvnodeman.test.fake_driver.FailingDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-         }, "azure"),
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_retry_create": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
-                (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
                 (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
+                (r".*Rate limit exceeded - scheduling retry in 1 seconds", noop),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
             ],
+            # Checks (patterns that must not appear, or must match an expected count)
             {},
+            # Driver class
             "arvnodeman.test.fake_driver.RetryDriver",
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
-         }, "azure"),
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_single_node_aws": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+            ],
+            # Checks (patterns that must not appear, or must match an expected count)
+            {
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                 r".*Setting node quota.*": fail,
             },
+            # Driver class
             "arvnodeman.test.fake_driver.FakeAwsDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
             "ec2"),
         "test_single_node_gce": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+            ],
+            # Checks (patterns that must not appear, or must match an expected count)
+            {
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                 r".*Setting node quota.*": fail,
             },
+            # Driver class
             "arvnodeman.test.fake_driver.FakeGceDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
             "gce")
     }
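Each entry above is a 5-tuple whose elements line up positionally with run_test's parameters, so the new per-element comments double as a reminder of the call signature. A sketch of how main() presumably drains the table (the local name tests is assumed; run_test itself appears earlier in this file):

    code = 0
    for name, (actions, checks, driver_class, jobs, provider) in tests.items():
        code += run_test(name, actions, checks, driver_class, jobs, provider)
    sys.exit(code)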