2 # Copyright (C) The Arvados Authors. All rights reserved.
4 # SPDX-License-Identifier: AGPL-3.0
6 """Integration test framework for node manager.
8 Runs full node manager with an API server (needs ARVADOS_API_HOST and
9 ARVADOS_API_TOKEN). Stubs out the cloud driver and slurm commands to mock
10 specific behaviors. Monitors the log output to verify an expected sequence of
11 events or behaviors for each test.
from functools import partial

# Progress logger: always streams to stderr so the operator can follow
# which test is currently running.
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stderr))

# Detail logger: verbose output captured from the node manager under test.
# With ANMTEST_LOGLEVEL set in the environment it streams straight to
# stderr; otherwise it is buffered in memory and only dumped when a test
# fails (run_test checks isinstance(detail_content, StringIO.StringIO)).
detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
if os.environ.get("ANMTEST_LOGLEVEL"):
    detail_content = sys.stderr
else:
    # Restored missing "else:" branch: without it detail_content was
    # unconditionally rebound to a StringIO and the conditional above had
    # no effect (and the buffered-on-failure path could never be chosen).
    detail_content = StringIO.StringIO()
detail.addHandler(logging.StreamHandler(detail_content))
def update_script(path, val):
    """Atomically (re)write the executable stub script at `path`.

    Writes `val` to a temporary sibling file ("<path>_"), marks it
    executable by the owner, then rename()s it into place so a concurrent
    reader (the node manager shelling out to squeue/sinfo) never sees a
    partially written script.
    """
    with open(path+"_", "w") as f:
        # Restored missing write: without it the stub was created empty.
        f.write(val)
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)
def set_squeue(g):
    """Regenerate the fake `squeue` stub to report every job in all_jobs.

    Each entry of the module-global all_jobs dict (jobid -> state) is
    echoed as a '1|100|100|<state>|<jobid>' line, the pipe-delimited
    format the node manager's squeue parser expects.  Returns 0 so it can
    be used directly as an action callback in run_test.

    NOTE(review): the def header and return were missing from this
    truncated copy and are reconstructed from the visible callers.
    """
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
    return 0
def node_paired(g):
    """Action callback: a cloud node has paired with an Arvados node.

    `g` is the regex match from run_test; group(1) is the cloud node id
    and group(3) the assigned hostname.  Records the node in the
    module-global compute_nodes map, reports it as 'alloc' via the fake
    sinfo, flips one pending job to Running, and refreshes squeue.
    Returns 0 on success.

    NOTE(review): def header, loop break and tail were missing from this
    truncated copy and are reconstructed -- confirm against the original.
    """
    compute_nodes[g.group(1)] = g.group(3)

    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        if v == "ReqNodeNotAvail":
            all_jobs[k] = "Running"
            # One job starts per paired node (reconstructed; verify).
            break

    set_squeue(g)

    return 0
def remaining_jobs(g):
    """Action callback: mark every node 'alloc' and every job Running.

    Used by the quota scenarios: whatever nodes exist keep running the
    remaining jobs.  Regenerates the fake sinfo from the module-global
    compute_nodes map, sets all entries of all_jobs to Running, then
    refreshes the fake squeue.  Returns 0 on success.

    NOTE(review): the trailing set_squeue/return were missing from this
    truncated copy and are reconstructed from the sibling callbacks.
    """
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        all_jobs[k] = "Running"

    set_squeue(g)

    return 0
def node_busy(g):
    """Action callback: switch the fake sinfo to report all nodes 'idle'.

    Fired after the 'busy' log line is seen; flipping sinfo to idle makes
    the nodes eligible for shutdown on the node manager's next poll.
    Returns 0 on success.

    NOTE(review): the def header and return were missing from this
    truncated copy and are reconstructed from the visible callers.
    """
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s idle'" % (v) for k,v in compute_nodes.items()))
    return 0
93 del compute_nodes[g.group(1)]
def jobs_req(g):
    """Action callback: reset every job to the pending state.

    Sets all entries of the module-global all_jobs dict back to
    'ReqNodeNotAvail' and refreshes the fake squeue, so the node manager
    sees a fresh batch of unsatisfied jobs.  Returns 0 on success.

    NOTE(review): the def header and tail were missing from this
    truncated copy and are reconstructed from the sibling callbacks.
    """
    for k,v in all_jobs.items():
        all_jobs[k] = "ReqNodeNotAvail"
    set_squeue(g)
    return 0
106 def fail(checks, pattern, g):
def expect_count(count, checks, pattern, g):
    """Check handler: `pattern` may match at most `count` times.

    Each match re-arms the check with count-1 via functools.partial;
    once the budget is exhausted (count == 0) a further match returns 1,
    which run_test treats as a test failure.

    NOTE(review): the branch lines were missing from this truncated copy
    and are reconstructed around the visible re-arming statement.
    """
    if count == 0:
        # Budget exhausted: one match too many.
        return 1
    else:
        checks[pattern] = partial(expect_count, count-1)
        return 0
116 def run_test(name, actions, checks, driver_class, jobs, provider):
# Drive one scenario: start arvados-node-manager against stubbed
# slurm/cloud commands, then pattern-match its stderr against `actions`
# (an ordered list of (regex, callback)) and `checks` (a dict of
# regex -> callback that may fire at any time).  Callback return values
# accumulate into the failure code; 0 means the test passed.
# NOTE(review): this copy of the file is truncated -- the embedded
# original line numbers jump, so some statements of this function are
# missing here.
119 # Delete any stale node records
120 api = arvados.api('v1')
121 for n in api.nodes().list().execute()['items']:
122 api.nodes().delete(uuid=n["uuid"]).execute()
124 logger.info("Start %s", name)
# Per-test scratch directory holding the fake squeue/sinfo stubs and the
# generated configuration file.
127 fake_slurm = tempfile.mkdtemp()
128 detail.info("fake_slurm is %s", fake_slurm)
# Put the stub directory first on PATH so the fake squeue/sinfo shadow
# any real slurm commands for the node manager subprocess.
136 env = os.environ.copy()
137 env["PATH"] = fake_slurm + ":" + env["PATH"]
139 # Reset fake squeue/sinfo to empty
140 update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
141 update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
143 # Write configuration file for test
144 with open("tests/fake_%s.cfg.template" % provider) as f:
145 open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
146 with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
147 cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
148 token=os.environ["ARVADOS_API_TOKEN"],
149 driver_class=driver_class,
150 ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
152 # Tests must complete in less than 3 minutes.
153 timeout = time.time() + 180
156 # Now start node manager
157 p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
158 bufsize=0, stderr=subprocess.PIPE, env=env)
# Test main loop over the subprocess's stderr, line by line:
162 # - Apply negative checks (things that are not supposed to happen)
164 # - Check if the next action should trigger
165 # - If all actions are exhausted, terminate with test success
166 # - If it hits timeout with actions remaining, terminate with test failed
168 # naive line iteration over pipes gets buffered, which isn't what we want,
169 # see https://bugs.python.org/issue3907
170 for line in iter(p.stderr.readline, ""):
# Mirror all subprocess output into the detail log (stderr or buffer).
171 detail_content.write(line)
# Unordered checks: every output line is tested against every check regex.
173 for k,v in checks.items():
174 g = re.match(k, line)
176 detail.info("Matched check %s", k)
177 code += v(checks, k, g)
179 detail.error("Check failed")
# Give up once the 3-minute deadline passes with actions still pending.
187 if time.time() > timeout:
188 detail.error("Exceeded timeout with actions remaining: %s", actions)
# Ordered actions: match the next expected pattern against this line
# (surrounding statements are missing from this truncated copy).
195 g = re.match(k, line)
197 detail.info("Matched action %s", k)
201 detail.error("Action failed")
208 except KeyboardInterrupt:
# Actions left over at exit mean the expected sequence never completed.
212 detail.error("Ended with remaining actions: %s", actions)
# Remove the per-test stub directory.
215 shutil.rmtree(fake_slurm)
218 logger.info("%s passed", name)
# On failure, dump the buffered detail log so the operator can see it.
220 if isinstance(detail_content, StringIO.StringIO):
221 sys.stderr.write(detail_content.getvalue())
222 logger.info("%s failed", name)
# Test scenarios, keyed by name.  Each value is the argument tuple for
# run_test: (actions, checks, driver_class, jobs, provider) -- see the
# run_test(..., *tests[...]) invocations at the bottom of the file.
# `actions` entries fire in order; `checks` entries may fire at any time.
# NOTE(review): this copy is truncated (the embedded original line
# numbers jump), so some delimiters and entries are missing below.
231 "test_single_node_azure": (
233 (r".*Daemon started", set_squeue),
234 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
235 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
236 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
237 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
239 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
240 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
241 r".*Setting node quota.*": fail,
243 "arvnodeman.test.fake_driver.FakeDriver",
244 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
# Four pending jobs: expect exactly four pairings and four shutdowns.
246 "test_multiple_nodes": (
248 (r".*Daemon started", set_squeue),
249 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
250 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
251 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
252 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
253 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
254 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
255 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
256 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
257 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
258 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
260 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
261 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
262 r".*Setting node quota.*": fail,
264 "arvnodeman.test.fake_driver.FakeDriver",
265 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
266 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
267 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
268 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
# NOTE(review): the key line for this scenario is missing from this copy
# (numbering jumps 268 -> 272); it exercises the QuotaDriver below.
272 (r".*Daemon started", set_squeue),
273 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
274 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
275 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
276 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
277 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
278 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
279 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
281 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
282 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
283 r".*Sending create_node request.*": partial(expect_count, 5)
285 "arvnodeman.test.fake_driver.QuotaDriver",
286 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
287 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
288 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
289 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
# Quota scenario with a second wave of jobs (jobs_req) after the quota
# probe, against the QuotaDriver.
291 "test_probe_quota": (
293 (r".*Daemon started", set_squeue),
294 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
295 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
296 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
297 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
298 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
299 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
300 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
301 (r".*sending request", jobs_req),
302 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
303 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
304 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
305 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
306 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
307 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
308 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
309 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
310 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
311 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
313 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
314 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
315 r".*Sending create_node request.*": partial(expect_count, 9)
317 "arvnodeman.test.fake_driver.QuotaDriver",
318 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
319 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
320 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
321 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
# FailingDriver: create_node always errors; daemon must keep retrying
# rather than hang.
323 "test_no_hang_failing_node_create": (
325 (r".*Daemon started", set_squeue),
326 (r".*Client error: nope", noop),
327 (r".*Client error: nope", noop),
328 (r".*Client error: nope", noop),
329 (r".*Client error: nope", noop),
332 "arvnodeman.test.fake_driver.FailingDriver",
333 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
334 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
335 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
336 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
# RetryDriver: first create is rate-limited; expect a scheduled retry
# followed by a successful pairing.
338 "test_retry_create": (
340 (r".*Daemon started", set_squeue),
341 (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
342 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
345 "arvnodeman.test.fake_driver.RetryDriver",
346 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
# Same single-node lifecycle as the Azure case, on the AWS fake driver.
348 "test_single_node_aws": (
350 (r".*Daemon started", set_squeue),
351 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
352 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
353 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
354 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
356 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
357 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
358 r".*Setting node quota.*": fail,
360 "arvnodeman.test.fake_driver.FakeAwsDriver",
361 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
# Same single-node lifecycle, on the GCE fake driver.
363 "test_single_node_gce": (
365 (r".*Daemon started", set_squeue),
366 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
367 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
368 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
369 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
371 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
372 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
373 r".*Setting node quota.*": fail,
375 "arvnodeman.test.fake_driver.FakeGceDriver",
376 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
# Command line: with an argument, run only that named scenario; with no
# argument, run every scenario in sorted order, summing the failure codes.
381 if len(sys.argv) > 1:
382 code = run_test(sys.argv[1], *tests[sys.argv[1]])
384 for t in sorted(tests.keys()):
385 code += run_test(t, *tests[t])
# A zero accumulated code means every scenario passed.
388 logger.info("Tests passed")
390 logger.info("Tests failed")
# NOTE(review): truncated copy -- the guarded entry-point call that should
# follow this line is missing here.
394 if __name__ == '__main__':