X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/590135864c4a977dddd3e42330d8b8608f916570..e2bf56f0a0fa1f6b4fb7b4efc4db5178b074b8ce:/services/nodemanager/tests/integration_test.py diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py index d4b7e0214a..d5b55540f8 100755 --- a/services/nodemanager/tests/integration_test.py +++ b/services/nodemanager/tests/integration_test.py @@ -1,4 +1,8 @@ #!/usr/bin/env python +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0 + """Integration test framework for node manager. Runs full node manager with an API server (needs ARVADOS_API_HOST and @@ -19,19 +23,31 @@ import tempfile import shutil from functools import partial import arvados +import StringIO + +logger = logging.getLogger("logger") +logger.setLevel(logging.INFO) +logger.addHandler(logging.StreamHandler(sys.stderr)) -logging.basicConfig(level=logging.INFO) +detail = logging.getLogger("detail") +detail.setLevel(logging.INFO) +if os.environ.get("ANMTEST_LOGLEVEL"): + detail_content = sys.stderr +else: + detail_content = StringIO.StringIO() +detail.addHandler(logging.StreamHandler(detail_content)) fake_slurm = None compute_nodes = None all_jobs = None +unsatisfiable_job_scancelled = None def update_script(path, val): with open(path+"_", "w") as f: f.write(val) os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR) os.rename(path+"_", path) - logging.info("Update script %s: %s", path, val) + detail.info("Update script %s: %s", path, val) def set_squeue(g): global all_jobs @@ -39,6 +55,33 @@ def set_squeue(g): "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items())) return 0 +def set_queue_unsatisfiable(g): + global all_jobs, unsatisfiable_job_scancelled + # Simulate a job requesting a 99 core node. + update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" + + "\n".join("echo '99|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items())) + update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" + + "\ntouch %s" % unsatisfiable_job_scancelled) + return 0 + +def job_cancelled(g): + global unsatisfiable_job_scancelled + cancelled_job = g.group(1) + api = arvados.api('v1') + # Check that 'scancel' was called + if not os.path.isfile(unsatisfiable_job_scancelled): + return 1 + # Check for the log entry + log_entry = api.logs().list( + filters=[ + ['object_uuid', '=', cancelled_job], + ['event_type', '=', 'stderr'], + ]).execute()['items'][0] + if not re.match( + r"Requirements for a single node exceed the available cloud node size", + log_entry['properties']['text']): + return 1 + return 0 def node_paired(g): global compute_nodes @@ -98,19 +141,22 @@ def expect_count(count, checks, pattern, g): checks[pattern] = partial(expect_count, count-1) return 0 -def run_test(name, actions, checks, driver_class, jobs): +def run_test(name, actions, checks, driver_class, jobs, provider): code = 0 + global unsatisfiable_job_scancelled + unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(), + "scancel_called") # Delete any stale node records api = arvados.api('v1') for n in api.nodes().list().execute()['items']: api.nodes().delete(uuid=n["uuid"]).execute() - logging.info("Start %s", name) + logger.info("Start %s", name) global fake_slurm fake_slurm = tempfile.mkdtemp() - logging.info("fake_slurm is %s", fake_slurm) + detail.info("fake_slurm is %s", fake_slurm) global compute_nodes compute_nodes = {} @@ -126,9 +172,8 @@ def run_test(name, actions, checks, driver_class, jobs): update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n") # Write configuration file for test - with open("tests/fake.cfg.template") as f: - with open(os.path.join(fake_slurm, "id_rsa.pub"), "w") as ssh: - pass + with open("tests/fake_%s.cfg.template" % provider) as f: + open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close() with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg: cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"], token=os.environ["ARVADOS_API_TOKEN"], @@ -145,7 +190,7 @@ def run_test(name, actions, checks, driver_class, jobs): # Test main loop: # - Read line - # - Apply negative checks (thinks that are not supposed to happen) + # - Apply negative checks (things that are not supposed to happen) # - Check timeout # - Check if the next action should trigger # - If all actions are exhausted, terminate with test success @@ -154,15 +199,15 @@ def run_test(name, actions, checks, driver_class, jobs): # naive line iteration over pipes gets buffered, which isn't what we want, # see https://bugs.python.org/issue3907 for line in iter(p.stderr.readline, ""): - sys.stdout.write(line) + detail_content.write(line) for k,v in checks.items(): g = re.match(k, line) if g: - logging.info("Matched check %s", k) + detail.info("Matched check %s", k) code += v(checks, k, g) if code != 0: - logging.error("Check failed") + detail.error("Check failed") if not terminated: p.terminate() terminated = True @@ -171,7 +216,7 @@ def run_test(name, actions, checks, driver_class, jobs): continue if time.time() > timeout: - logging.error("Exceeded timeout with actions remaining: %s", actions) + detail.error("Exceeded timeout with actions remaining: %s", actions) code += 1 if not terminated: p.terminate() @@ -180,11 +225,11 @@ def run_test(name, actions, checks, driver_class, jobs): k, v = actions[0] g = re.match(k, line) if g: - logging.info("Matched action %s", k) + detail.info("Matched action %s", k) actions.pop(0) code += v(g) if code != 0: - logging.error("Action failed") + detail.error("Action failed") p.terminate() terminated = True @@ -195,15 +240,18 @@ def run_test(name, actions, checks, driver_class, jobs): p.kill() if actions: - logging.error("Ended with remaining actions: %s", actions) + detail.error("Ended with remaining actions: %s", actions) code = 1 shutil.rmtree(fake_slurm) + shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled)) if code == 0: - logging.info("%s passed", name) + logger.info("%s passed", name) else: - logging.info("%s failed", name) + if isinstance(detail_content, StringIO.StringIO): + sys.stderr.write(detail_content.getvalue()) + logger.info("%s failed", name) return code @@ -212,7 +260,24 @@ def main(): # Test lifecycle. tests = { - "test_single_node": ( + "test_unsatisfiable_jobs" : ( + # Actions (pattern -> action) + [ + (r".*Daemon started", set_queue_unsatisfiable), + (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled), + ], + # Checks (things that shouldn't happen) + { + r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail, + r".*Trying to cancel job '(\S+)'": fail, + }, + # Driver class + "arvnodeman.test.fake_driver.FakeDriver", + # Jobs + {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}, + # Provider + "azure"), + "test_single_node_azure": ( [ (r".*Daemon started", set_squeue), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), @@ -225,7 +290,8 @@ def main(): r".*Setting node quota.*": fail, }, "arvnodeman.test.fake_driver.FakeDriver", - {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}), + {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}, + "azure"), "test_multiple_nodes": ( [ (r".*Daemon started", set_squeue), @@ -249,11 +315,10 @@ def main(): "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail" - }), + }, "azure"), "test_hit_quota": ( [ (r".*Daemon started", set_squeue), - (r".*setting node quota to 3", noop), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy), @@ -271,11 +336,10 @@ def main(): "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail" - }), + }, "azure"), "test_probe_quota": ( [ (r".*Daemon started", set_squeue), - (r".*setting node quota to 3", noop), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy), @@ -304,8 +368,8 @@ def main(): "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail" - }), - "test5": ( + }, "azure"), + "test_no_hang_failing_node_create": ( [ (r".*Daemon started", set_squeue), (r".*Client error: nope", noop), @@ -319,7 +383,48 @@ def main(): "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail", "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail" - }) + }, "azure"), + "test_retry_create": ( + [ + (r".*Daemon started", set_squeue), + (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop), + (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop), + (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop), + ], + {}, + "arvnodeman.test.fake_driver.RetryDriver", + {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail" + }, "azure"), + "test_single_node_aws": ( + [ + (r".*Daemon started", set_squeue), + (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), + (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy), + (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop), + (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), + ], { + r".*Suggesting shutdown because node state is \('down', .*\)": fail, + r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1), + r".*Setting node quota.*": fail, + }, + "arvnodeman.test.fake_driver.FakeAwsDriver", + {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}, + "ec2"), + "test_single_node_gce": ( + [ + (r".*Daemon started", set_squeue), + (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired), + (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy), + (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop), + (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown), + ], { + r".*Suggesting shutdown because node state is \('down', .*\)": fail, + r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1), + r".*Setting node quota.*": fail, + }, + "arvnodeman.test.fake_driver.FakeGceDriver", + {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}, + "gce") } code = 0 @@ -330,9 +435,9 @@ def main(): code += run_test(t, *tests[t]) if code == 0: - logging.info("Tests passed") + logger.info("Tests passed") else: - logging.info("Tests failed") + logger.info("Tests failed") exit(code)