#!/usr/bin/env python
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
"""Integration test framework for node manager.
Runs full node manager with an API server (needs ARVADOS_API_HOST and
fake_slurm = None
compute_nodes = None
all_jobs = None
+unsatisfiable_job_scancelled = None
def update_script(path, val):
with open(path+"_", "w") as f:
"\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
return 0
+def set_queue_unsatisfiable(g):
+    # Log-pattern action callback (g is the regex match object; unused here).
+    # Rewrites the fake squeue script so every queued job reports a 99-core
+    # requirement -- presumably larger than any configured cloud node size
+    # (see the "Requirements ... exceed" check in job_cancelled) -- and
+    # rewrites the fake scancel script to record that it was invoked by
+    # touching the unsatisfiable_job_scancelled sentinel file.
+    # Returns 0 so the test framework treats the action as successful.
+    global all_jobs, unsatisfiable_job_scancelled
+    # Simulate a job requesting a 99 core node.
+    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
+        "\n".join("echo '99|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
+    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
+        "\ntouch %s" % unsatisfiable_job_scancelled)
+    return 0
+
+def job_cancelled(g):
+    # Log-pattern action callback: g.group(1) holds the job UUID captured
+    # from the "Cancelled unsatisfiable job '(...)'" log line.
+    # Verifies two things: (1) the fake scancel script actually ran (it
+    # touches the sentinel file set up by set_queue_unsatisfiable), and
+    # (2) a stderr log entry explaining the cancellation was written for
+    # the job via the Arvados API.
+    # Returns 0 on success, 1 on failure (test framework convention).
+    global unsatisfiable_job_scancelled
+    cancelled_job = g.group(1)
+    api = arvados.api('v1')
+    # Check that 'scancel' was called
+    if not os.path.isfile(unsatisfiable_job_scancelled):
+        return 1
+    # Check for the log entry
+    # NOTE(review): assumes at least one matching log record exists by the
+    # time this pattern fires; [0] raises IndexError otherwise -- which
+    # would fail the test anyway, so it is an acceptable hard stop here.
+    log_entry = api.logs().list(
+        filters=[
+            ['object_uuid', '=', cancelled_job],
+            ['event_type', '=', 'stderr'],
+        ]).execute()['items'][0]
+    if not re.match(
+            r"Requirements for a single node exceed the available cloud node size",
+            log_entry['properties']['text']):
+        return 1
+    return 0
def node_paired(g):
global compute_nodes
checks[pattern] = partial(expect_count, count-1)
return 0
-def run_test(name, actions, checks, driver_class, jobs):
+def run_test(name, actions, checks, driver_class, jobs, provider):
code = 0
+ global unsatisfiable_job_scancelled
+ unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
+ "scancel_called")
# Delete any stale node records
api = arvados.api('v1')
update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
# Write configuration file for test
- with open("tests/fake.cfg.template") as f:
+ with open("tests/fake_%s.cfg.template" % provider) as f:
open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
# Test main loop:
# - Read line
- # - Apply negative checks (thinks that are not supposed to happen)
+ # - Apply negative checks (things that are not supposed to happen)
# - Check timeout
# - Check if the next action should trigger
# - If all actions are exhausted, terminate with test success
code = 1
shutil.rmtree(fake_slurm)
+ shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
if code == 0:
logger.info("%s passed", name)
# Test lifecycle.
tests = {
- "test_single_node": (
+ "test_unsatisfiable_jobs" : (
+ # Actions (pattern -> action)
+ [
+ (r".*Daemon started", set_queue_unsatisfiable),
+ (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
+ ],
+ # Checks (things that shouldn't happen)
+ {
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
+ r".*Trying to cancel job '(\S+)'": fail,
+ },
+ # Driver class
+ "arvnodeman.test.fake_driver.FakeDriver",
+ # Jobs
+ {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+ # Provider
+ "azure"),
+ "test_single_node_azure": (
[
(r".*Daemon started", set_squeue),
(r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
r".*Setting node quota.*": fail,
},
"arvnodeman.test.fake_driver.FakeDriver",
- {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}),
+ {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+ "azure"),
"test_multiple_nodes": (
[
(r".*Daemon started", set_squeue),
"34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
- }),
+ }, "azure"),
"test_hit_quota": (
[
(r".*Daemon started", set_squeue),
- (r".*setting node quota to 3", noop),
(r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
(r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
(r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
"34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
- }),
+ }, "azure"),
"test_probe_quota": (
[
(r".*Daemon started", set_squeue),
- (r".*setting node quota to 3", noop),
(r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
(r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
(r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
"34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
- }),
+ }, "azure"),
"test_no_hang_failing_node_create": (
[
(r".*Daemon started", set_squeue),
"34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
- }),
+ }, "azure"),
"test_retry_create": (
[
(r".*Daemon started", set_squeue),
(r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
+ (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
(r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
],
{},
"arvnodeman.test.fake_driver.RetryDriver",
{"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
- })
+ }, "azure"),
+ "test_single_node_aws": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+ r".*Setting node quota.*": fail,
+ },
+ "arvnodeman.test.fake_driver.FakeAwsDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+ "ec2"),
+ "test_single_node_gce": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+ r".*Setting node quota.*": fail,
+ },
+ "arvnodeman.test.fake_driver.FakeGceDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+ "gce")
}
code = 0