From 09794d996eca79b85d3ac0c21a4a43c65a51d0d7 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Fri, 2 Jun 2017 11:58:55 -0400 Subject: [PATCH] 10312: Integration test framework for node manager, runs full node manager with fake cloud driver and monitors logging output. Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- services/nodemanager/arvnodeman/daemon.py | 4 +- services/nodemanager/arvnodeman/launcher.py | 6 +- .../nodemanager/arvnodeman/test/__init__.py | 1 + .../arvnodeman/{ => test}/fake_driver.py | 7 +- services/nodemanager/fake_slurm/sinfo | 2 - services/nodemanager/fake_slurm/squeue | 2 - .../{fake.azure.cfg => fake.cfg.template} | 18 +-- .../nodemanager/tests/integration_test.py | 133 ++++++++++++++++++ 8 files changed, 153 insertions(+), 20 deletions(-) create mode 100644 services/nodemanager/arvnodeman/test/__init__.py rename services/nodemanager/arvnodeman/{ => test}/fake_driver.py (91%) delete mode 100755 services/nodemanager/fake_slurm/sinfo delete mode 100755 services/nodemanager/fake_slurm/squeue rename services/nodemanager/tests/{fake.azure.cfg => fake.cfg.template} (95%) create mode 100755 services/nodemanager/tests/integration_test.py diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py index 8f9207e3ba..029d8180e3 100644 --- a/services/nodemanager/arvnodeman/daemon.py +++ b/services/nodemanager/arvnodeman/daemon.py @@ -146,8 +146,8 @@ class NodeManagerDaemonActor(actor_class): self.last_polls[poll_key] = time.time() def _pair_nodes(self, node_record, arvados_node): - self._logger.info("Cloud node %s is now paired with Arvados node %s", - node_record.cloud_node.name, arvados_node['uuid']) + self._logger.info("Cloud node %s is now paired with Arvados node %s with hostname %s", + node_record.cloud_node.name, arvados_node['uuid'], arvados_node['hostname']) self._arvados_nodes_actor.subscribe_to( arvados_node['uuid'], node_record.actor.update_arvados_node) node_record.arvados_node = arvados_node diff 
--git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py index 11d38ecb76..cb80bbf293 100644 --- a/services/nodemanager/arvnodeman/launcher.py +++ b/services/nodemanager/arvnodeman/launcher.py @@ -22,6 +22,7 @@ from .timedcallback import TimedCallBackActor from ._version import __version__ node_daemon = None +watchdog = None def abort(msg, code=1): print("arvados-node-manager: " + msg) @@ -97,6 +98,7 @@ def shutdown_signal(signal_code, frame): pykka.ActorRegistry.stop_all() sys.exit(-signal_code) elif current_count == 0: + watchdog.stop() node_daemon.shutdown() elif current_count == 1: pykka.ActorRegistry.stop_all() @@ -104,7 +106,7 @@ def shutdown_signal(signal_code, frame): sys.exit(-signal_code) def main(args=None): - global node_daemon + global node_daemon, watchdog args = parse_cli(args) config = load_config(args.config) @@ -138,7 +140,7 @@ def main(args=None): node_setup, node_shutdown, node_monitor, max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy() - WatchdogActor.start(config.getint('Daemon', 'watchdog'), + watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'), cloud_node_poller.actor_ref, arvados_node_poller.actor_ref, job_queue_poller.actor_ref, diff --git a/services/nodemanager/arvnodeman/test/__init__.py b/services/nodemanager/arvnodeman/test/__init__.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/services/nodemanager/arvnodeman/test/__init__.py @@ -0,0 +1 @@ + diff --git a/services/nodemanager/arvnodeman/fake_driver.py b/services/nodemanager/arvnodeman/test/fake_driver.py similarity index 91% rename from services/nodemanager/arvnodeman/fake_driver.py rename to services/nodemanager/arvnodeman/test/fake_driver.py index 89a3dbb6b7..be0789e84f 100644 --- a/services/nodemanager/arvnodeman/fake_driver.py +++ b/services/nodemanager/arvnodeman/test/fake_driver.py @@ -27,18 +27,19 @@ class FakeDriver(NodeDriver): ex_user_name=None, ex_tags=None, 
ex_network=None): + global all_nodes all_nodes.append(Node(name, name, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags})) ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0] + "&instance_id=" + name - print(ping_url) ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23) ctx.verify_mode = ssl.CERT_NONE f = urllib.urlopen(ping_url, "", context=ctx) - print(f.read()) f.close() return all_nodes[-1] def destroy_node(self, cloud_node): - return None + global all_nodes + all_nodes = [n for n in all_nodes if n.id != cloud_node.id] + return True def get_image(self, img): pass diff --git a/services/nodemanager/fake_slurm/sinfo b/services/nodemanager/fake_slurm/sinfo deleted file mode 100755 index e57d0d3a70..0000000000 --- a/services/nodemanager/fake_slurm/sinfo +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -echo \ No newline at end of file diff --git a/services/nodemanager/fake_slurm/squeue b/services/nodemanager/fake_slurm/squeue deleted file mode 100755 index dd114a00b9..0000000000 --- a/services/nodemanager/fake_slurm/squeue +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -echo '1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9' diff --git a/services/nodemanager/tests/fake.azure.cfg b/services/nodemanager/tests/fake.cfg.template similarity index 95% rename from services/nodemanager/tests/fake.azure.cfg rename to services/nodemanager/tests/fake.cfg.template index 7f7629fe4d..631745a433 100644 --- a/services/nodemanager/tests/fake.azure.cfg +++ b/services/nodemanager/tests/fake.cfg.template @@ -34,7 +34,7 @@ max_nodes = 8 max_total_price = 0 # Poll Azure nodes and Arvados for new information every N seconds. -poll_time = 15 +poll_time = 5 # Polls have exponential backoff when services fail to respond. # This is the longest time to wait between polls. @@ -49,7 +49,7 @@ poll_stale_after = 600 # node before this long, assume that there was a cloud bootstrap failure and # shut it down. 
Note that normal shutdown windows apply (see the Cloud # section), so this should be shorter than the first shutdown window value. -boot_fail_after = 1800 +boot_fail_after = 20 # "Node stale time" affects two related behaviors. # 1. If a compute node has been running for at least this long, but it @@ -90,8 +90,8 @@ pykka = WARNING apiclient = WARNING [Arvados] -host = 192.168.5.2:8000 -token = 2tnmn9ou33o3vk3bynzyzrc7aedhijo7ufa11j9kyv7509cygx +host = {host} +token = {token} timeout = 15 # Accept an untrusted SSL certificate from the API server? @@ -99,7 +99,7 @@ insecure = yes [Cloud] provider = azure -driver_class = arvnodeman.fake_driver.FakeDriver +driver_class = {driver_class} # Shutdown windows define periods of time when a node may and may not be shut # down. These are windows in full minutes, separated by commas. Counting from @@ -110,7 +110,7 @@ driver_class = arvnodeman.fake_driver.FakeDriver # Azure bills by the minute, so it makes sense to agressively shut down idle # nodes. Specify at least two windows. You can add as many as you need beyond # that. -shutdown_windows = 5, 999999 +shutdown_windows = 1, 999999 [Cloud Credentials] # Use "azure account list" with the azure CLI to get these values. @@ -123,7 +123,7 @@ subscription_id = 00000000-0000-0000-0000-000000000000 # azure config mode arm # azure ad app create --name "" --home-page "" --identifier-uris "" --password # azure ad sp create "" -# azure role assignment create --objectId "" -o Owner -c /subscriptions/{subscriptionId}/ +# azure role assignment create --objectId "" -o Owner -c /subscriptions// # # Use for "key" and the for "secret" # @@ -142,7 +142,7 @@ ex_resource_group = ArvadosResourceGroup image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050 # Path to a local ssh key file that will be used to provision new nodes. -ssh_key = /home/peter/.ssh/id_rsa.pub +ssh_key = {ssh_key} # The account name for the admin user that will be provisioned on new nodes. 
ex_user_name = arvadosuser
@@ -161,7 +161,7 @@ tag_arvados-class = dynamic-compute
 tag_cluster = zyxwv
 
 # the API server to ping
-ping_host = 192.168.5.2:8000
+ping_host = {host}
 
 # You can define any number of Size sections to list Azure sizes you're willing
 # to use. The Node Manager should boot the cheapest size(s) that can run jobs
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
new file mode 100755
index 0000000000..90bf237645
--- /dev/null
+++ b/services/nodemanager/tests/integration_test.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python
+"""Integration test for Node Manager.
+
+Runs the real arvados-node-manager with a fake cloud driver and fake slurm
+commands, and steps through a scenario by matching regular expressions
+against the log lines the daemon writes to stderr.
+"""
+import subprocess
+import os
+import sys
+import re
+import time
+import logging
+import stat
+import tempfile
+import shutil
+
+logging.basicConfig(level=logging.INFO)
+
+# Temporary directory holding the fake squeue/sinfo scripts and the generated
+# config file.  Set by run_test().
+fake_slurm = None
+# Maps cloud node name -> hostname of the paired Arvados node.  Reset by
+# run_test() and filled in by set_sinfo_alloc().
+compute_nodes = None
+
+
+def update_script(path, val):
+    """Replace the script at `path` with an owner-executable file holding `val`.
+
+    The text is written to `path` + "_" and renamed over `path`, so `path`
+    is never observed half-written.
+    """
+    tmp_path = path + "_"
+    with open(tmp_path, "w") as f:
+        f.write(val)
+    os.chmod(tmp_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
+    os.rename(tmp_path, path)
+
+
+def set_squeue(actions, checks, k, g):
+    """Make the fake squeue report one job waiting with ReqNodeNotAvail."""
+    update_script(os.path.join(fake_slurm, "squeue"), """#!/bin/sh
+echo '1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9'
+""")
+    return 0
+
+
+def set_sinfo_alloc(actions, checks, k, g):
+    """Report the newly paired node as "alloc" and the job as "Running".
+
+    `g` is the match for the "now paired" log line: group 1 is the cloud
+    node name and group 3 is the Arvados hostname.
+    """
+    update_script(os.path.join(fake_slurm, "sinfo"), """#!/bin/sh
+echo '%s alloc'
+""" % (g.group(3)))
+
+    update_script(os.path.join(fake_slurm, "squeue"), """#!/bin/sh
+echo '1|100|100|Running|34t0i-dz642-h42bg3hq4bdfpf9'
+""")
+
+    # Remember the hostname so set_sinfo_idle() can look it up by node name.
+    compute_nodes[g.group(1)] = g.group(3)
+    return 0
+
+
+def set_sinfo_idle(actions, checks, k, g):
+    """Report the node named by group 1 of `g` as "idle" in the fake sinfo."""
+    update_script(os.path.join(fake_slurm, "sinfo"), """#!/bin/sh
+echo '%s idle'
+""" % (compute_nodes[g.group(1)]))
+    return 0
+
+
+def noop(actions, checks, k, g):
+    """Succeed without changing anything."""
+    return 0
+
+
+def down_fail(actions, checks, k, g):
+    """Fail the test unconditionally."""
+    return 1
+
+
+def _dispatch(line, actions, checks):
+    """Run every action and check whose pattern matches `line`.
+
+    A matching action is removed from `actions` before it runs, so it fires
+    at most once.  Returns 0 if everything that ran succeeded, otherwise the
+    first non-zero result; nothing further is run for this line after that.
+    """
+    # Walk snapshots: `actions` is modified while it is being processed.
+    for k, v in list(actions.items()):
+        if k not in actions:
+            # Removed by a callback earlier in this pass.
+            continue
+        g = re.match(k, line)
+        if g:
+            logging.info("Triggered action %s", k)
+            del actions[k]
+            code = v(actions, checks, k, g)
+            if code != 0:
+                logging.error("Action failed")
+                return code
+
+    for k, v in list(checks.items()):
+        g = re.match(k, line)
+        if g:
+            logging.info("Triggered check %s", k)
+            code = v(actions, checks, k, g)
+            if code != 0:
+                logging.error("Check failed")
+                return code
+
+    return 0
+
+
+def run_test(actions, checks, driver_class):
+    """Run node manager with `driver_class` and react to its log output.
+
+    actions -- dict of regex -> callable.  Each action runs once, on the
+               first log line its regex matches, and is then removed from
+               the dict.  The run ends once no actions remain.
+    checks -- dict of regex -> callable, run on every matching log line.
+    driver_class -- dotted path of the cloud driver class to put in the
+                    generated config.
+
+    Callables are invoked as fn(actions, checks, regex, match) and return 0
+    for success or non-zero to fail the test.
+
+    Reads ARVADOS_API_HOST and ARVADOS_API_TOKEN from the environment, and
+    uses the relative paths bin/arvados-node-manager and
+    tests/fake.cfg.template.
+
+    Returns 0 on success and non-zero on failure.
+    """
+    global fake_slurm, compute_nodes
+
+    code = 0
+
+    fake_slurm = tempfile.mkdtemp()
+    logging.info("fake_slurm is %s", fake_slurm)
+    compute_nodes = {}
+
+    # Put the fake slurm commands ahead of any real ones on PATH.
+    env = os.environ.copy()
+    env["PATH"] = fake_slurm + ":" + env["PATH"]
+
+    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
+    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
+
+    # Empty placeholder file for the ssh_key setting in the config.
+    ssh_key = os.path.join(fake_slurm, "id_rsa.pub")
+    open(ssh_key, "w").close()
+
+    config_path = os.path.join(fake_slurm, "fake.cfg")
+    with open("tests/fake.cfg.template") as f:
+        template = f.read()
+    with open(config_path, "w") as cfg:
+        cfg.write(template.format(host=os.environ["ARVADOS_API_HOST"],
+                                  token=os.environ["ARVADOS_API_TOKEN"],
+                                  driver_class=driver_class,
+                                  ssh_key=ssh_key))
+
+    # Note: the deadline is only examined when a log line arrives.
+    timeout = time.time() + 300
+
+    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground",
+                          "--config", config_path],
+                         bufsize=1, stderr=subprocess.PIPE, env=env,
+                         universal_newlines=True)
+    terminated = False
+    try:
+        # Use readline() rather than iterating over the pipe: Python 2 file
+        # iteration reads ahead, which can hold lines back until a buffer
+        # fills.
+        for line in iter(p.stderr.readline, ""):
+            sys.stdout.write(line)
+
+            if terminated:
+                # The outcome is decided; keep draining output until exit so
+                # a later match cannot overwrite the result.
+                continue
+
+            if time.time() > timeout:
+                logging.error("Exceeded timeout")
+                code = 1
+            else:
+                code = _dispatch(line, actions, checks)
+
+            if code != 0 or not actions:
+                p.terminate()
+                terminated = True
+    finally:
+        if not terminated and p.poll() is None:
+            # The loop ended through an exception while node manager is
+            # still running; make sure nothing is left behind.
+            p.kill()
+        p.stderr.close()
+        p.wait()
+
+    if code == 0 and actions:
+        logging.error("Node manager exited with actions remaining: %s",
+                      list(actions))
+        code = 1
+
+    if code == 0:
+        shutil.rmtree(fake_slurm)
+    else:
+        logging.info("Keeping %s for inspection", fake_slurm)
+
+    return code
+
+
+def main():
+    """Run the scenario: queue a job, pair a node, go busy, go idle, shut down.
+
+    Exits with the result of run_test(); the run fails if a node is ever
+    suggested for shutdown while in the 'down' state.
+    """
+    code = run_test({
+        r".*Daemon started": set_squeue,
+        r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": set_sinfo_alloc,
+        r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)": set_sinfo_idle,
+        r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)": noop,
+        r".*Shutdown success": noop,
+    }, {
+        r".*Suggesting shutdown because node state is \('down', .*\)": down_fail
+    },
+    "arvnodeman.test.fake_driver.FakeDriver")
+    sys.exit(code)
+
+
+if __name__ == "__main__":
+    main()
-- 
2.30.2