X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/bb91f86ea02e3535e3953ee0916dd6877bf585f7..080c940d7a8134a6e277a53b7e45eb27e2b2c87f:/services/nodemanager/tests/integration_test.py

diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
index bdd3ffdcb7..1ba2957ee5 100755
--- a/services/nodemanager/tests/integration_test.py
+++ b/services/nodemanager/tests/integration_test.py
@@ -12,7 +12,7 @@ events or behaviors for each test.
 
 """
 
-import subprocess
+import subprocess32 as subprocess
 import os
 import sys
 import re
@@ -21,13 +21,18 @@ import logging
 import stat
 import tempfile
 import shutil
+import errno
 from functools import partial
 import arvados
 import StringIO
 
+formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
+
+handler = logging.StreamHandler(sys.stderr)
+handler.setFormatter(formatter)
 logger = logging.getLogger("logger")
 logger.setLevel(logging.INFO)
-logger.addHandler(logging.StreamHandler(sys.stderr))
+logger.addHandler(handler)
 
 detail = logging.getLogger("detail")
 detail.setLevel(logging.INFO)
@@ -35,7 +40,9 @@ if os.environ.get("ANMTEST_LOGLEVEL"):
     detail_content = sys.stderr
 else:
     detail_content = StringIO.StringIO()
-detail.addHandler(logging.StreamHandler(detail_content))
+handler = logging.StreamHandler(detail_content)
+handler.setFormatter(formatter)
+detail.addHandler(handler)
 
 fake_slurm = None
 compute_nodes = None
@@ -52,14 +59,14 @@ def update_script(path, val):
 def set_squeue(g):
     global all_jobs
     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
+                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
     return 0
 
 def set_queue_unsatisfiable(g):
     global all_jobs, unsatisfiable_job_scancelled
     # Simulate a job requesting a 99 core node.
     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '99|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
+                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
     update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
                   "\ntouch %s" % unsatisfiable_job_scancelled)
     return 0
@@ -78,7 +85,7 @@ def job_cancelled(g):
         ['event_type', '=', 'stderr'],
     ]).execute()['items'][0]
     if not re.match(
-            r"Requirements for a single node exceed the available cloud node size",
+            r"Constraints cannot be satisfied",
             log_entry['properties']['text']):
         return 1
     return 0
@@ -88,7 +95,7 @@ def node_paired(g):
     compute_nodes[g.group(1)] = g.group(3)
 
     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
+                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))
 
     for k,v in all_jobs.items():
         if v == "ReqNodeNotAvail":
@@ -99,27 +106,19 @@ def node_paired(g):
 
     return 0
 
-def remaining_jobs(g):
-    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
-
-    for k,v in all_jobs.items():
-        all_jobs[k] = "Running"
-
-    set_squeue(g)
-
-    return 0
-
-
 def node_busy(g):
     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s idle'" % (v) for k,v in compute_nodes.items()))
+                  "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
     return 0
 
 def node_shutdown(g):
     global compute_nodes
-    del compute_nodes[g.group(1)]
-    return 0
+    if g.group(1) in compute_nodes:
+        del compute_nodes[g.group(1)]
+        return 0
+    else:
+        return 1
+
 
 def jobs_req(g):
     global all_jobs
@@ -180,8 +179,8 @@ def run_test(name, actions, checks, driver_class, jobs, provider):
                                       driver_class=driver_class,
                                       ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
 
-    # Tests must complete in less than 3 minutes.
-    timeout = time.time() + 180
+    # Tests must complete in less than 30 seconds.
+    timeout = time.time() + 30
     terminated = False
 
     # Now start node manager
@@ -209,7 +208,7 @@ def run_test(name, actions, checks, driver_class, jobs, provider):
                     if code != 0:
                         detail.error("Check failed")
                         if not terminated:
-                            p.terminate()
+                            p.kill()
                             terminated = True
 
             if terminated:
@@ -219,7 +218,7 @@ def run_test(name, actions, checks, driver_class, jobs, provider):
                 detail.error("Exceeded timeout with actions remaining: %s", actions)
                 code += 1
                 if not terminated:
-                    p.terminate()
+                    p.kill()
                     terminated = True
 
             k, v = actions[0]
@@ -230,11 +229,11 @@ def run_test(name, actions, checks, driver_class, jobs, provider):
                     code += v(g)
                     if code != 0:
                         detail.error("Action failed")
-                        p.terminate()
+                        p.kill()
                         terminated = True
 
             if not actions:
-                p.terminate()
+                p.kill()
                 terminated = True
     except KeyboardInterrupt:
         p.kill()
@@ -250,7 +249,18 @@ def run_test(name, actions, checks, driver_class, jobs, provider):
         logger.info("%s passed", name)
     else:
         if isinstance(detail_content, StringIO.StringIO):
-            sys.stderr.write(detail_content.getvalue())
+            detail_content.seek(0)
+            chunk = detail_content.read(4096)
+            while chunk:
+                try:
+                    sys.stderr.write(chunk)
+                    chunk = detail_content.read(4096)
+                except IOError as e:
+                    if e.errno == errno.EAGAIN:
+                        # try again (probably pipe buffer full)
+                        pass
+                    else:
+                        raise
         logger.info("%s failed", name)
 
     return code
@@ -278,21 +288,28 @@ def main():
             # Provider
             "azure"),
         "test_single_node_azure": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
+            ],
+            # Checks (things that shouldn't happen)
+            {
                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                 r".*Setting node quota.*": fail,
             },
+            # Driver class
             "arvnodeman.test.fake_driver.FakeDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
             "azure"),
         "test_multiple_nodes": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
@@ -305,46 +322,54 @@ def main():
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+            ],
+            # Checks (things that shouldn't happen)
+            {
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
                 r".*Setting node quota.*": fail,
             },
+            # Driver class
             "arvnodeman.test.fake_driver.FakeDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-            }, "azure"),
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_hit_quota": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
-            ], {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+            ],
+            # Checks (things that shouldn't happen)
+            {
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
                 r".*Sending create_node request.*": partial(expect_count, 5)
             },
+            # Driver class
             "arvnodeman.test.fake_driver.QuotaDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-            }, "azure"),
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_probe_quota": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*sending request", jobs_req),
@@ -358,18 +383,23 @@ def main():
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+            ],
+            # Checks (things that shouldn't happen)
+            {
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
                 r".*Sending create_node request.*": partial(expect_count, 9)
             },
+            # Driver class
             "arvnodeman.test.fake_driver.QuotaDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-            }, "azure"),
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_no_hang_failing_node_create": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Client error: nope", noop),
@@ -377,52 +407,72 @@ def main():
                 (r".*Client error: nope", noop),
                 (r".*Client error: nope", noop),
             ],
+            # Checks (things that shouldn't happen)
             {},
+            # Driver class
             "arvnodeman.test.fake_driver.FailingDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
-            }, "azure"),
+             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_retry_create": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
-                (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
+                (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
+                (r".*Rate limit exceeded - scheduling retry in 1 seconds", noop),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
             ],
+            # Checks (things that shouldn't happen)
             {},
+            # Driver class
             "arvnodeman.test.fake_driver.RetryDriver",
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
-            }, "azure"),
+            # Jobs
+            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
+            # Provider
+            "azure"),
         "test_single_node_aws": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+            ],
+            # Checks (things that shouldn't happen)
+            {
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                 r".*Setting node quota.*": fail,
             },
+            # Driver class
             "arvnodeman.test.fake_driver.FakeAwsDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
             "ec2"),
         "test_single_node_gce": (
+            # Actions (pattern -> action)
             [
                 (r".*Daemon started", set_squeue),
                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ], {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+            ],
+            # Checks (things that shouldn't happen)
+            {
                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                 r".*Setting node quota.*": fail,
             },
+            # Driver class
             "arvnodeman.test.fake_driver.FakeGceDriver",
+            # Jobs
             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+            # Provider
             "gce")
     }
 