13399: If slurm refuses to renice past 10K, stop trying.
[arvados.git] / services / nodemanager / tests / integration_test.py
index 4aa8c9e85d3e144290c178f9de198fc833b2cf1a..69a29019e78cb1ab7de2a8fdc41de85c8abc645a 100755 (executable)
@@ -12,7 +12,7 @@ events or behaviors for each test.
 
 """
 
-import subprocess
+import subprocess32 as subprocess
 import os
 import sys
 import re
@@ -106,18 +106,6 @@ def node_paired(g):
 
     return 0
 
-def remaining_jobs(g):
-    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))
-
-    for k,v in all_jobs.items():
-        all_jobs[k] = "Running"
-
-    set_squeue(g)
-
-    return 0
-
-
 def node_busy(g):
     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                   "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
@@ -125,7 +113,8 @@ def node_busy(g):
 
 def node_shutdown(g):
     global compute_nodes
-    del compute_nodes[g.group(1)]
+    if g.group(1) in compute_nodes:
+        del compute_nodes[g.group(1)]
     return 0
 
 def jobs_req(g):
@@ -187,8 +176,8 @@ def run_test(name, actions, checks, driver_class, jobs, provider):
                                       driver_class=driver_class,
                                       ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
 
-    # Tests must complete in less than 3 minutes.
-    timeout = time.time() + 180
+    # Tests must complete in less than 30 seconds.
+    timeout = time.time() + 30
     terminated = False
 
     # Now start node manager
@@ -333,7 +322,6 @@ def main():
             ],
             # Checks (things that shouldn't happen)
             {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
                 r".*Setting node quota.*": fail,
             },
@@ -353,13 +341,12 @@ def main():
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
             ],
             # Checks (things that shouldn't happen)
             {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
                 r".*Sending create_node request.*": partial(expect_count, 5)
             },
@@ -379,7 +366,7 @@ def main():
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*sending request", jobs_req),
@@ -396,7 +383,6 @@ def main():
             ],
             # Checks (things that shouldn't happen)
             {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
                 r".*Sending create_node request.*": partial(expect_count, 9)
             },
@@ -456,7 +442,6 @@ def main():
             ],
             # Checks (things that shouldn't happen)
             {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                 r".*Setting node quota.*": fail,
             },
@@ -477,7 +462,6 @@ def main():
             ],
             # Checks (things that shouldn't happen)
             {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                 r".*Setting node quota.*": fail,
             },