X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/88a29cd091468feb98e5cd541c560f4d35bca716..f98e0188777b3e2d229c968824b3e64307dae4e6:/services/nodemanager/tests/integration_test.py diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py index 7b8ba391c9..508e626639 100755 --- a/services/nodemanager/tests/integration_test.py +++ b/services/nodemanager/tests/integration_test.py @@ -58,14 +58,14 @@ def update_script(path, val): def set_squeue(g): global all_jobs update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" + - "\n".join("echo '1|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items())) + "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items())) return 0 def set_queue_unsatisfiable(g): global all_jobs, unsatisfiable_job_scancelled # Simulate a job requesting a 99 core node. update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" + - "\n".join("echo '99|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items())) + "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items())) update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" + "\ntouch %s" % unsatisfiable_job_scancelled) return 0 @@ -284,21 +284,28 @@ def main(): # Provider "azure"), "test_single_node_azure": ( + # Actions (pattern -> action) [ (r".*Daemon started", set_squeue), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy), (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop), (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), - ], { + ], + # Checks (things that shouldn't happen) + { r".*Suggesting shutdown because node state is \('down', .*\)": fail, r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1), r".*Setting node quota.*": fail, }, + # Driver class "arvnodeman.test.fake_driver.FakeDriver", + # Jobs {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}, + # Provider "azure"), "test_multiple_nodes": ( + # Actions (pattern -> action) [ (r".*Daemon started", set_squeue), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), @@ -311,46 +318,56 @@ def main(): (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), - ], { + ], + # Checks (things that shouldn't happen) + { r".*Suggesting shutdown because node state is \('down', .*\)": fail, r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4), r".*Setting node quota.*": fail, }, + # Driver class "arvnodeman.test.fake_driver.FakeDriver", + # Jobs {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail", - "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail" - }, "azure"), + "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"}, + # Provider + "azure"), "test_hit_quota": ( + # Actions (pattern -> action) [ (r".*Daemon started", set_squeue), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy), (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs), - (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy), (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown) - ], { + ], + # Checks (things that shouldn't happen) + { r".*Suggesting shutdown because node state is \('down', .*\)": fail, r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2), r".*Sending create_node request.*": partial(expect_count, 5) }, + # Driver class "arvnodeman.test.fake_driver.QuotaDriver", + # Jobs {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail", - "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail" - }, "azure"), + "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"}, + # Provider + "azure"), "test_probe_quota": ( + # Actions (pattern -> action) [ (r".*Daemon started", set_squeue), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy), (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs), - (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy), (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), (r".*sending request", jobs_req), @@ -364,18 +381,24 @@ def main(): (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), - ], { + ], + # Checks (things that shouldn't happen) + { r".*Suggesting shutdown because node state is \('down', .*\)": fail, r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6), r".*Sending create_node request.*": partial(expect_count, 9) }, + # Driver class "arvnodeman.test.fake_driver.QuotaDriver", + # Jobs {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail", - "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail" - }, "azure"), + "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"}, + # Provider + "azure"), "test_no_hang_failing_node_create": ( + # Actions (pattern -> action) [ (r".*Daemon started", set_squeue), (r".*Client error: nope", noop), @@ -383,53 +406,74 @@ def main(): (r".*Client error: nope", noop), (r".*Client error: nope", noop), ], + # Checks (things that shouldn't happen) {}, + # Driver class "arvnodeman.test.fake_driver.FailingDriver", + # Jobs {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail", - "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail" - }, "azure"), + "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"}, + # Provider + "azure"), "test_retry_create": ( + # Actions (pattern -> action) [ (r".*Daemon started", set_squeue), (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop), (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop), ], + # Checks (things that shouldn't happen) {}, + # Driver class "arvnodeman.test.fake_driver.RetryDriver", - {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail" - }, "azure"), + # Jobs + {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"}, + # Provider + "azure"), "test_single_node_aws": ( + # Actions (pattern -> action) [ (r".*Daemon started", set_squeue), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy), (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop), (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), - ], { + ], + # Checks (things that shouldn't happen) + { r".*Suggesting shutdown because node state is \('down', .*\)": fail, r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1), r".*Setting node quota.*": fail, }, + # Driver class "arvnodeman.test.fake_driver.FakeAwsDriver", + # Jobs {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}, + # Provider "ec2"), "test_single_node_gce": ( + # Actions (pattern -> action) [ (r".*Daemon started", set_squeue), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy), (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop), (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), - ], { + ], + # Checks (things that shouldn't happen) + { r".*Suggesting shutdown because node state is \('down', .*\)": fail, r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1), r".*Setting node quota.*": fail, }, + # Driver class "arvnodeman.test.fake_driver.FakeGceDriver", + # Jobs {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}, + # Provider "gce") }