X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/09794d996eca79b85d3ac0c21a4a43c65a51d0d7..4d7567b7b3577b561dd064d397c10a9331c3ee16:/services/nodemanager/tests/integration_test.py

diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
index 90bf237645..1ba2957ee5 100755
--- a/services/nodemanager/tests/integration_test.py
+++ b/services/nodemanager/tests/integration_test.py
@@ -1,5 +1,18 @@
 #!/usr/bin/env python
-import subprocess
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+"""Integration test framework for node manager.
+
+Runs full node manager with an API server (needs ARVADOS_API_HOST and
+ARVADOS_API_TOKEN). Stubs out the cloud driver and slurm commands to mock
+specific behaviors. Monitors the log output to verify an expected sequence of
+events or behaviors for each test.
+
+"""
+
+import subprocess32 as subprocess
 import os
 import sys
 import re
@@ -8,126 +21,474 @@ import logging
 import stat
 import tempfile
 import shutil
+import errno
+from functools import partial
+import arvados
+import StringIO
+
+formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
 
-logging.basicConfig(level=logging.INFO)
+handler = logging.StreamHandler(sys.stderr)
+handler.setFormatter(formatter)
+logger = logging.getLogger("logger")
+logger.setLevel(logging.INFO)
+logger.addHandler(handler)
+
+detail = logging.getLogger("detail")
+detail.setLevel(logging.INFO)
+if os.environ.get("ANMTEST_LOGLEVEL"):
+    detail_content = sys.stderr
+else:
+    detail_content = StringIO.StringIO()
+handler = logging.StreamHandler(detail_content)
+handler.setFormatter(formatter)
+detail.addHandler(handler)
 
 fake_slurm = None
 compute_nodes = None
+all_jobs = None
+unsatisfiable_job_scancelled = None
 
 def update_script(path, val):
     with open(path+"_", "w") as f:
         f.write(val)
     os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
     os.rename(path+"_", path)
+    detail.info("Update script %s: %s", path, val)
 
-
-def set_squeue(actions, checks, k, g):
-    update_script(os.path.join(fake_slurm, "squeue"), """#!/bin/sh
-echo '1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9'
-""")
+def set_squeue(g):
+    global all_jobs
+    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
+                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
     return 0
 
-def set_sinfo_alloc(actions, checks, k, g):
-    update_script(os.path.join(fake_slurm, "sinfo"), """#!/bin/sh
-echo '%s alloc'
-""" % (g.group(3)))
+def set_queue_unsatisfiable(g):
+    global all_jobs, unsatisfiable_job_scancelled
+    # Simulate a job requesting a 99 core node.
+    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
+                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
+    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
+                  "\ntouch %s" % unsatisfiable_job_scancelled)
+    return 0
 
-    update_script(os.path.join(fake_slurm, "squeue"), """#!/bin/sh
-echo '1|100|100|Running|34t0i-dz642-h42bg3hq4bdfpf9'
-""")
+def job_cancelled(g):
+    global unsatisfiable_job_scancelled
+    cancelled_job = g.group(1)
+    api = arvados.api('v1')
+    # Check that 'scancel' was called
+    if not os.path.isfile(unsatisfiable_job_scancelled):
+        return 1
+    # Check for the log entry
+    log_entry = api.logs().list(
+        filters=[
+            ['object_uuid', '=', cancelled_job],
+            ['event_type', '=', 'stderr'],
+        ]).execute()['items'][0]
+    if not re.match(
+            r"Constraints cannot be satisfied",
+            log_entry['properties']['text']):
+        return 1
+    return 0
 
+def node_paired(g):
     global compute_nodes
     compute_nodes[g.group(1)] = g.group(3)
+
+    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
+                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))
+
+    for k,v in all_jobs.items():
+        if v == "ReqNodeNotAvail":
+            all_jobs[k] = "Running"
+            break
+
+    set_squeue(g)
+
     return 0
 
-def set_sinfo_idle(actions, checks, k, g):
-    update_script(os.path.join(fake_slurm, "sinfo"), """#!/bin/sh
-echo '%s idle'
-""" % (compute_nodes[g.group(1)]))
+def node_busy(g):
+    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
+                  "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
     return 0
 
-def noop(actions, checks, k, g):
+def node_shutdown(g):
+    global compute_nodes
+    if g.group(1) in compute_nodes:
+        del compute_nodes[g.group(1)]
+        return 0
+    else:
+        return 1
+
+
+def jobs_req(g):
+    global all_jobs
+    for k,v in all_jobs.items():
+        all_jobs[k] = "ReqNodeNotAvail"
+    set_squeue(g)
     return 0
 
-def down_fail(actions, checks, k, g):
+def noop(g):
+    return 0
+
+def fail(checks, pattern, g):
     return 1
 
+def expect_count(count, checks, pattern, g):
+    if count == 0:
+        return 1
+    else:
+        checks[pattern] = partial(expect_count, count-1)
+        return 0
 
-def run_test(actions, checks, driver_class):
+def run_test(name, actions, checks, driver_class, jobs, provider):
     code = 0
+    global unsatisfiable_job_scancelled
+    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
+                                                "scancel_called")
+
+    # Delete any stale node records
+    api = arvados.api('v1')
+    for n in api.nodes().list().execute()['items']:
+        api.nodes().delete(uuid=n["uuid"]).execute()
+
+    logger.info("Start %s", name)
 
     global fake_slurm
     fake_slurm = tempfile.mkdtemp()
-    logging.info("fake_slurm is %s", fake_slurm)
+    detail.info("fake_slurm is %s", fake_slurm)
 
     global compute_nodes
    compute_nodes = {}
 
+    global all_jobs
+    all_jobs = jobs
+
     env = os.environ.copy()
     env["PATH"] = fake_slurm + ":" + env["PATH"]
 
+    # Reset fake squeue/sinfo to empty
     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
 
-    with open("tests/fake.cfg.template") as f:
-        with open(os.path.join(fake_slurm, "id_rsa.pub"), "w") as ssh:
-            pass
+    # Write configuration file for test
+    with open("tests/fake_%s.cfg.template" % provider) as f:
+        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
         with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
             cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                       token=os.environ["ARVADOS_API_TOKEN"],
                                       driver_class=driver_class,
                                       ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
 
-    timeout = time.time() + 300
+    # Tests must complete in less than 30 seconds.
+    timeout = time.time() + 30
+    terminated = False
 
+    # Now start node manager
     p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
-                         bufsize=1, stderr=subprocess.PIPE, env=env)
-
-    for line in p.stderr:
-        sys.stdout.write(line)
+                         bufsize=0, stderr=subprocess.PIPE, env=env)
 
-        if time.time() > timeout:
-            logging.error("Exceeded timeout")
-            code = 1
-            p.terminate()
+    # Test main loop:
+    # - Read line
+    # - Apply negative checks (things that are not supposed to happen)
+    # - Check timeout
+    # - Check if the next action should trigger
+    # - If all actions are exhausted, terminate with test success
+    # - If it hits timeout with actions remaining, terminate with test failed
+    try:
+        # naive line iteration over pipes gets buffered, which isn't what we want,
+        # see https://bugs.python.org/issue3907
+        for line in iter(p.stderr.readline, ""):
+            detail_content.write(line)
 
-        for k,v in actions.items():
-            g = re.match(k, line)
-            if g:
-                logging.info("Triggered action %s", k)
-                del actions[k]
-                code = v(actions, checks, k, g)
-                if code != 0:
-                    logging.error("Action failed")
-                    p.terminate()
+            for k,v in checks.items():
+                g = re.match(k, line)
+                if g:
+                    detail.info("Matched check %s", k)
+                    code += v(checks, k, g)
+                    if code != 0:
+                        detail.error("Check failed")
+                        if not terminated:
+                            p.kill()
+                            terminated = True
+
+            if terminated:
+                continue
 
-        for k,v in checks.items():
+            if time.time() > timeout:
+                detail.error("Exceeded timeout with actions remaining: %s", actions)
+                code += 1
+                if not terminated:
+                    p.kill()
+                    terminated = True
+
+            k, v = actions[0]
             g = re.match(k, line)
             if g:
-                logging.info("Triggered check %s", k)
-                code = v(actions, checks, k, g)
+                detail.info("Matched action %s", k)
+                actions.pop(0)
+                code += v(g)
                 if code != 0:
-                    logging.error("Check failed")
-                    p.terminate()
+                    detail.error("Action failed")
+                    p.kill()
+                    terminated = True
+
+            if not actions:
+                p.kill()
+                terminated = True
+    except KeyboardInterrupt:
+        p.kill()
 
-        if not actions:
-            p.terminate()
+    if actions:
+        detail.error("Ended with remaining actions: %s", actions)
+        code = 1
 
-    #shutil.rmtree(fake_slurm)
+    shutil.rmtree(fake_slurm)
+    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
+
+    if code == 0:
+        logger.info("%s passed", name)
+    else:
+        if isinstance(detail_content, StringIO.StringIO):
+            detail_content.seek(0)
+            chunk = detail_content.read(4096)
+            while chunk:
+                try:
+                    sys.stderr.write(chunk)
+                    chunk = detail_content.read(4096)
+                except IOError as e:
+                    if e.errno == errno.EAGAIN:
+                        # try again (probably pipe buffer full)
+                        pass
+                    else:
+                        raise
+        logger.info("%s failed", name)
 
     return code
 
 
 def main():
-    code = run_test({
-        r".*Daemon started": set_squeue,
-        r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": set_sinfo_alloc,
-        r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)": set_sinfo_idle,
-        r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)": noop,
-        r".*Shutdown success": noop,
-    }, {
-        r".*Suggesting shutdown because node state is \('down', .*\)": down_fail
-    },
-    "arvnodeman.test.fake_driver.FakeDriver")
+    # Test lifecycle.
+
+    tests = {
+        "test_unsatisfiable_jobs" : (
+            # Actions (pattern -> action)
+            [
+                (r".*Daemon started", set_queue_unsatisfiable),
+                (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
+            ],
+            # Checks (things that shouldn't happen)
+            {
+                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
+                r".*Trying to cancel job '(\S+)'": fail,
+            },
+            # Driver class
+            "arvnodeman.test.fake_driver.FakeDriver",
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
+        "test_single_node_azure": (
+            # Actions (pattern -> action)
+            [
+                (r".*Daemon started", set_squeue),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+            ],
+            # Checks (things that shouldn't happen)
+            {
+                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+                r".*Setting node quota.*": fail,
+            },
+            # Driver class
+            "arvnodeman.test.fake_driver.FakeDriver",
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
+        "test_multiple_nodes": (
+            # Actions (pattern -> action)
+            [
+                (r".*Daemon started", set_squeue),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+            ],
+            # Checks (things that shouldn't happen)
+            {
+                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
+                r".*Setting node quota.*": fail,
+            },
+            # Driver class
+            "arvnodeman.test.fake_driver.FakeDriver",
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
+             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
+             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
+        "test_hit_quota": (
+            # Actions (pattern -> action)
+            [
+                (r".*Daemon started", set_squeue),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
+            ],
+            # Checks (things that shouldn't happen)
+            {
+                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
+                r".*Sending create_node request.*": partial(expect_count, 5)
+            },
+            # Driver class
+            "arvnodeman.test.fake_driver.QuotaDriver",
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
+             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
+             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
+        "test_probe_quota": (
+            # Actions (pattern -> action)
+            [
+                (r".*Daemon started", set_squeue),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+                (r".*sending request", jobs_req),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+            ],
+            # Checks (things that shouldn't happen)
+            {
+                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
+                r".*Sending create_node request.*": partial(expect_count, 9)
+            },
+            # Driver class
+            "arvnodeman.test.fake_driver.QuotaDriver",
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
+             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
+             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
+        "test_no_hang_failing_node_create": (
+            # Actions (pattern -> action)
+            [
+                (r".*Daemon started", set_squeue),
+                (r".*Client error: nope", noop),
+                (r".*Client error: nope", noop),
+                (r".*Client error: nope", noop),
+                (r".*Client error: nope", noop),
+            ],
+            # Checks (things that shouldn't happen)
+            {},
+            # Driver class
+            "arvnodeman.test.fake_driver.FailingDriver",
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
+             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
+             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
+        "test_retry_create": (
+            # Actions (pattern -> action)
+            [
+                (r".*Daemon started", set_squeue),
+                (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
+                (r".*Rate limit exceeded - scheduling retry in 1 seconds", noop),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
+            ],
+            # Checks (things that shouldn't happen)
+            {},
+            # Driver class
+            "arvnodeman.test.fake_driver.RetryDriver",
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
+        "test_single_node_aws": (
+            # Actions (pattern -> action)
+            [
+                (r".*Daemon started", set_squeue),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+            ],
+            # Checks (things that shouldn't happen)
+            {
+                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+                r".*Setting node quota.*": fail,
+            },
+            # Driver class
+            "arvnodeman.test.fake_driver.FakeAwsDriver",
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
+            "ec2"),
+        "test_single_node_gce": (
+            # Actions (pattern -> action)
+            [
+                (r".*Daemon started", set_squeue),
+                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+            ],
+            # Checks (things that shouldn't happen)
+            {
+                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+                r".*Setting node quota.*": fail,
+            },
+            # Driver class
+            "arvnodeman.test.fake_driver.FakeGceDriver",
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
+            "gce")
+    }
+
+    code = 0
+    if len(sys.argv) > 1:
+        code = run_test(sys.argv[1], *tests[sys.argv[1]])
+    else:
+        for t in sorted(tests.keys()):
+            code += run_test(t, *tests[t])
+
+    if code == 0:
+        logger.info("Tests passed")
+    else:
+        logger.info("Tests failed")
+
     exit(code)
 
-main()
+if __name__ == '__main__':
+    main()