16580: remove arvados-node-manager
[arvados.git] / services / nodemanager / tests / integration_test.py
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
deleted file mode 100755 (executable)
index 1ba2957..0000000
+++ /dev/null
@@ -1,494 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-"""Integration test framework for node manager.
-
-Runs full node manager with an API server (needs ARVADOS_API_HOST and
-ARVADOS_API_TOKEN).  Stubs out the cloud driver and slurm commands to mock
-specific behaviors.  Monitors the log output to verify an expected sequence of
-events or behaviors for each test.
-
-"""
-
-import subprocess32 as subprocess
-import os
-import sys
-import re
-import time
-import logging
-import stat
-import tempfile
-import shutil
-import errno
-from functools import partial
-import arvados
-import StringIO
-
-formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
-
-handler = logging.StreamHandler(sys.stderr)
-handler.setFormatter(formatter)
-logger = logging.getLogger("logger")
-logger.setLevel(logging.INFO)
-logger.addHandler(handler)
-
-detail = logging.getLogger("detail")
-detail.setLevel(logging.INFO)
-if os.environ.get("ANMTEST_LOGLEVEL"):
-    detail_content = sys.stderr
-else:
-    detail_content = StringIO.StringIO()
-handler = logging.StreamHandler(detail_content)
-handler.setFormatter(formatter)
-detail.addHandler(handler)
-
-fake_slurm = None
-compute_nodes = None
-all_jobs = None
-unsatisfiable_job_scancelled = None
-
-def update_script(path, val):
-    with open(path+"_", "w") as f:
-        f.write(val)
-    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
-    os.rename(path+"_", path)
-    detail.info("Update script %s: %s", path, val)
-
-def set_squeue(g):
-    global all_jobs
-    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
-    return 0
-
-def set_queue_unsatisfiable(g):
-    global all_jobs, unsatisfiable_job_scancelled
-    # Simulate a job requesting a 99 core node.
-    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
-    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
-                  "\ntouch %s" % unsatisfiable_job_scancelled)
-    return 0
-
-def job_cancelled(g):
-    global unsatisfiable_job_scancelled
-    cancelled_job = g.group(1)
-    api = arvados.api('v1')
-    # Check that 'scancel' was called
-    if not os.path.isfile(unsatisfiable_job_scancelled):
-        return 1
-    # Check for the log entry
-    log_entry = api.logs().list(
-        filters=[
-            ['object_uuid', '=', cancelled_job],
-            ['event_type', '=', 'stderr'],
-        ]).execute()['items'][0]
-    if not re.match(
-            r"Constraints cannot be satisfied",
-            log_entry['properties']['text']):
-        return 1
-    return 0
-
-def node_paired(g):
-    global compute_nodes
-    compute_nodes[g.group(1)] = g.group(3)
-
-    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))
-
-    for k,v in all_jobs.items():
-        if v == "ReqNodeNotAvail":
-            all_jobs[k] = "Running"
-            break
-
-    set_squeue(g)
-
-    return 0
-
-def node_busy(g):
-    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
-    return 0
-
-def node_shutdown(g):
-    global compute_nodes
-    if g.group(1) in compute_nodes:
-        del compute_nodes[g.group(1)]
-        return 0
-    else:
-        return 1
-
-
-def jobs_req(g):
-    global all_jobs
-    for k,v in all_jobs.items():
-        all_jobs[k] = "ReqNodeNotAvail"
-    set_squeue(g)
-    return 0
-
-def noop(g):
-    return 0
-
-def fail(checks, pattern, g):
-    return 1
-
-def expect_count(count, checks, pattern, g):
-    if count == 0:
-        return 1
-    else:
-        checks[pattern] = partial(expect_count, count-1)
-        return 0
-
-def run_test(name, actions, checks, driver_class, jobs, provider):
-    code = 0
-    global unsatisfiable_job_scancelled
-    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
-                                                "scancel_called")
-
-    # Delete any stale node records
-    api = arvados.api('v1')
-    for n in api.nodes().list().execute()['items']:
-        api.nodes().delete(uuid=n["uuid"]).execute()
-
-    logger.info("Start %s", name)
-
-    global fake_slurm
-    fake_slurm = tempfile.mkdtemp()
-    detail.info("fake_slurm is %s", fake_slurm)
-
-    global compute_nodes
-    compute_nodes = {}
-
-    global all_jobs
-    all_jobs = jobs
-
-    env = os.environ.copy()
-    env["PATH"] = fake_slurm + ":" + env["PATH"]
-
-    # Reset fake squeue/sinfo to empty
-    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
-    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
-
-    # Write configuration file for test
-    with open("tests/fake_%s.cfg.template" % provider) as f:
-        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
-        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
-            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
-                                      token=os.environ["ARVADOS_API_TOKEN"],
-                                      driver_class=driver_class,
-                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
-
-    # Tests must complete in less than 30 seconds.
-    timeout = time.time() + 30
-    terminated = False
-
-    # Now start node manager
-    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
-                         bufsize=0, stderr=subprocess.PIPE, env=env)
-
-    # Test main loop:
-    # - Read line
-    # - Apply negative checks (things that are not supposed to happen)
-    # - Check timeout
-    # - Check if the next action should trigger
-    # - If all actions are exhausted, terminate with test success
-    # - If it hits timeout with actions remaining, terminate with test failed
-    try:
-        # naive line iteration over pipes gets buffered, which isn't what we want,
-        # see https://bugs.python.org/issue3907
-        for line in iter(p.stderr.readline, ""):
-            detail_content.write(line)
-
-            for k,v in checks.items():
-                g = re.match(k, line)
-                if g:
-                    detail.info("Matched check %s", k)
-                    code += v(checks, k, g)
-                    if code != 0:
-                        detail.error("Check failed")
-                        if not terminated:
-                            p.kill()
-                            terminated = True
-
-            if terminated:
-                continue
-
-            if time.time() > timeout:
-                detail.error("Exceeded timeout with actions remaining: %s", actions)
-                code += 1
-                if not terminated:
-                    p.kill()
-                    terminated = True
-
-            k, v = actions[0]
-            g = re.match(k, line)
-            if g:
-                detail.info("Matched action %s", k)
-                actions.pop(0)
-                code += v(g)
-                if code != 0:
-                    detail.error("Action failed")
-                    p.kill()
-                    terminated = True
-
-            if not actions:
-                p.kill()
-                terminated = True
-    except KeyboardInterrupt:
-        p.kill()
-
-    if actions:
-        detail.error("Ended with remaining actions: %s", actions)
-        code = 1
-
-    shutil.rmtree(fake_slurm)
-    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
-
-    if code == 0:
-        logger.info("%s passed", name)
-    else:
-        if isinstance(detail_content, StringIO.StringIO):
-            detail_content.seek(0)
-            chunk = detail_content.read(4096)
-            while chunk:
-                try:
-                    sys.stderr.write(chunk)
-                    chunk = detail_content.read(4096)
-                except IOError as e:
-                    if e.errno == errno.EAGAIN:
-                        # try again (probably pipe buffer full)
-                        pass
-                    else:
-                        raise
-        logger.info("%s failed", name)
-
-    return code
-
-
-def main():
-    # Test lifecycle.
-
-    tests = {
-        "test_unsatisfiable_jobs" : (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_queue_unsatisfiable),
-                (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
-                r".*Trying to cancel job '(\S+)'": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_single_node_azure": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
-                r".*Setting node quota.*": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_multiple_nodes": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
-                r".*Setting node quota.*": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_hit_quota": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
-                r".*Sending create_node request.*": partial(expect_count, 5)
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.QuotaDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_probe_quota": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*sending request", jobs_req),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
-                r".*Sending create_node request.*": partial(expect_count, 9)
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.QuotaDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_no_hang_failing_node_create": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Client error: nope", noop),
-                (r".*Client error: nope", noop),
-                (r".*Client error: nope", noop),
-                (r".*Client error: nope", noop),
-            ],
-            # Checks (things that shouldn't happen)
-            {},
-            # Driver class
-            "arvnodeman.test.fake_driver.FailingDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_retry_create": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
-                (r".*Rate limit exceeded - scheduling retry in 1 seconds", noop),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
-            ],
-            # Checks (things that shouldn't happen)
-            {},
-            # Driver class
-            "arvnodeman.test.fake_driver.RetryDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_single_node_aws": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
-                r".*Setting node quota.*": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeAwsDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
-            # Provider
-            "ec2"),
-        "test_single_node_gce": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
-                r".*Setting node quota.*": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeGceDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
-            # Provider
-            "gce")
-    }
-
-    code = 0
-    if len(sys.argv) > 1:
-        code = run_test(sys.argv[1], *tests[sys.argv[1]])
-    else:
-        for t in sorted(tests.keys()):
-            code += run_test(t, *tests[t])
-
-    if code == 0:
-        logger.info("Tests passed")
-    else:
-        logger.info("Tests failed")
-
-    exit(code)
-
-if __name__ == '__main__':
-    main()