2 # Copyright (C) The Arvados Authors. All rights reserved.
4 # SPDX-License-Identifier: AGPL-3.0
6 """Integration test framework for node manager.
8 Runs full node manager with an API server (needs ARVADOS_API_HOST and
9 ARVADOS_API_TOKEN). Stubs out the cloud driver and slurm commands to mock
10 specific behaviors. Monitors the log output to verify an expected sequence of
11 events or behaviors for each test.
15 import subprocess32 as subprocess
25 from functools import partial
# Logging setup: "logger" (test progress) always writes to stderr.
# "detail" (verbose per-line output) writes to stderr only when
# ANMTEST_LOGLEVEL is set; otherwise it is buffered in memory so it can be
# replayed only when a test fails.
formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')

handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(handler)

detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
if os.environ.get("ANMTEST_LOGLEVEL"):
    detail_content = sys.stderr
else:
    # Without this else the branch above was dead code: detail_content was
    # unconditionally overwritten with the in-memory buffer.
    detail_content = StringIO.StringIO()
handler = logging.StreamHandler(detail_content)
handler.setFormatter(formatter)
detail.addHandler(handler)
50 unsatisfiable_job_scancelled = None
def update_script(path, val):
    """Atomically install an executable script at `path` with contents `val`.

    Writes to a sibling temp file (`path` + "_"), makes it owner-executable,
    then renames over the target so a concurrently-running node manager never
    sees a partially written script.
    """
    with open(path+"_", "w") as f:
        # Write the script body; without this the installed script is empty.
        f.write(val)
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)
61 update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
62 "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
def set_queue_unsatisfiable(g):
    """Make every queued job unsatisfiable and arm the scancel marker.

    Rewrites the fake squeue so each job in `all_jobs` demands a 99-core
    node (which no configured size provides), and rewrites the fake scancel
    to record that it was invoked by touching the file named by
    `unsatisfiable_job_scancelled`.
    """
    global all_jobs, unsatisfiable_job_scancelled
    # Simulate a job requesting a 99 core node.
    job_lines = ["echo '99|100|100|%s|%s|(null)|1234567890'" % (state, uuid)
                 for uuid, state in all_jobs.items()]
    squeue_body = "#!/bin/sh\n" + "\n".join(job_lines)
    update_script(os.path.join(fake_slurm, "squeue"), squeue_body)
    scancel_body = "#!/bin/sh\n" + "\ntouch %s" % unsatisfiable_job_scancelled
    update_script(os.path.join(fake_slurm, "scancel"), scancel_body)
75 global unsatisfiable_job_scancelled
76 cancelled_job = g.group(1)
77 api = arvados.api('v1')
78 # Check that 'scancel' was called
79 if not os.path.isfile(unsatisfiable_job_scancelled):
81 # Check for the log entry
82 log_entry = api.logs().list(
84 ['object_uuid', '=', cancelled_job],
85 ['event_type', '=', 'stderr'],
86 ]).execute()['items'][0]
88 r"Constraints cannot be satisfied",
89 log_entry['properties']['text']):
95 compute_nodes[g.group(1)] = g.group(3)
97 update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
98 "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))
100 for k,v in all_jobs.items():
101 if v == "ReqNodeNotAvail":
102 all_jobs[k] = "Running"
110 update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
111 "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
def node_shutdown(g):
    """Action: a ComputeNodeShutdownActor logged "Shutdown success".

    Forgets the cloud node captured in g.group(1) so later sinfo rewrites
    stop listing it.  Returns 0 on a known node, 1 (test failure) if an
    unknown node reports shutdown — the test driver accumulates these
    return codes, so an implicit None return would break it.
    """
    global compute_nodes
    if g.group(1) in compute_nodes:
        del compute_nodes[g.group(1)]
        return 0
    else:
        return 1
125 for k,v in all_jobs.items():
126 all_jobs[k] = "ReqNodeNotAvail"
def fail(checks, pattern, g):
    """Negative check: `pattern` must never appear in the log.

    Registered under "things that shouldn't happen"; any match returns 1,
    which the test driver accumulates as a failure.
    """
    return 1
def expect_count(count, checks, pattern, g):
    """Check that `pattern` matches at most `count` times.

    Each match re-registers this check with the allowance decremented
    (via functools.partial) and returns 0; a match after the allowance is
    exhausted returns 1, failing the test.
    """
    if count == 0:
        return 1
    else:
        checks[pattern] = partial(expect_count, count-1)
        return 0
def run_test(name, actions, checks, driver_class, jobs, provider):
    # Run one integration scenario: start arvados-node-manager with a fake
    # slurm on PATH and a fake cloud driver, then watch its stderr for an
    # expected sequence of log patterns.
    #
    # name: scenario name, used for logging only.
    # actions: ordered (regex, callback) pairs -- the expected event sequence.
    # checks: dict of regex -> callback applied to every log line (negative
    #     checks and occurrence counters).
    # driver_class: dotted name of the fake driver written into the config.
    # jobs: job uuid -> initial squeue state for the fake squeue.
    # provider: selects tests/fake_<provider>.cfg.template.
    #
    # NOTE(review): several lines of this function are missing from this
    # view (e.g. the try:, if g:, and if/else framing); comments below
    # annotate only the visible lines -- confirm against the full file.
    global unsatisfiable_job_scancelled
    # Marker file the fake scancel touches, placed in a fresh temp dir.
    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()
    logger.info("Start %s", name)
    # Directory holding the fake squeue/sinfo/scancel; prepended to PATH so
    # node manager runs them instead of the real slurm commands.
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)
    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]
    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
    # Write configuration file for test
    with open("tests/fake_%s.cfg.template" % provider) as f:
        # Empty ssh key file just has to exist for the config to validate.
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
    # Tests must complete in less than 30 seconds.
    timeout = time.time() + 30
    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)
        # Main loop over node manager's stderr:
        # - Apply negative checks (things that are not supposed to happen)
        # - Check if the next action should trigger
        # - If all actions are exhausted, terminate with test success
        # - If it hits timeout with actions remaining, terminate with test failed
        # naive line iteration over pipes gets buffered, which isn't what we want,
        # see https://bugs.python.org/issue3907
        for line in iter(p.stderr.readline, ""):
            # Mirror every line into the detail log (stderr or StringIO buffer).
            detail_content.write(line)
            # Negative checks run against every line; callbacks return
            # nonzero to signal test failure.
            for k,v in checks.items():
                g = re.match(k, line)
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                        detail.error("Check failed")
            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
            # Match the next expected action (presumably k, v come from
            # actions[0] just above -- missing from this view).
            g = re.match(k, line)
                detail.info("Matched action %s", k)
                    detail.error("Action failed")
    except KeyboardInterrupt:
        detail.error("Ended with remaining actions: %s", actions)
    # Clean up the fake slurm dir and the scancel marker's temp dir.
    shutil.rmtree(fake_slurm)
    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
        logger.info("%s passed", name)
        # On failure, replay the buffered detail log to stderr so the
        # operator can see what happened.
        if isinstance(detail_content, StringIO.StringIO):
            detail_content.seek(0)
            chunk = detail_content.read(4096)
                sys.stderr.write(chunk)
                chunk = detail_content.read(4096)
            if e.errno == errno.EAGAIN:
                # try again (probably pipe buffer full)
        logger.info("%s failed", name)
273 "test_unsatisfiable_jobs" : (
274 # Actions (pattern -> action)
276 (r".*Daemon started", set_queue_unsatisfiable),
277 (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
279 # Checks (things that shouldn't happen)
281 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
282 r".*Trying to cancel job '(\S+)'": fail,
285 "arvnodeman.test.fake_driver.FakeDriver",
287 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
290 "test_single_node_azure": (
291 # Actions (pattern -> action)
293 (r".*Daemon started", set_squeue),
294 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
295 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
296 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
297 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
299 # Checks (things that shouldn't happen)
301 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
302 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
303 r".*Setting node quota.*": fail,
306 "arvnodeman.test.fake_driver.FakeDriver",
308 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
311 "test_multiple_nodes": (
312 # Actions (pattern -> action)
314 (r".*Daemon started", set_squeue),
315 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
316 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
317 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
318 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
319 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
320 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
321 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
322 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
323 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
324 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
326 # Checks (things that shouldn't happen)
328 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
329 r".*Setting node quota.*": fail,
332 "arvnodeman.test.fake_driver.FakeDriver",
334 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
335 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
336 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
337 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
341 # Actions (pattern -> action)
343 (r".*Daemon started", set_squeue),
344 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
345 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
346 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
347 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
348 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
349 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
351 # Checks (things that shouldn't happen)
353 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
354 r".*Sending create_node request.*": partial(expect_count, 5)
357 "arvnodeman.test.fake_driver.QuotaDriver",
359 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
360 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
361 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
362 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
365 "test_probe_quota": (
366 # Actions (pattern -> action)
368 (r".*Daemon started", set_squeue),
369 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
370 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
371 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
372 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
373 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
374 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
375 (r".*sending request", jobs_req),
376 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
377 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
378 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
379 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
380 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
381 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
382 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
383 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
384 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
385 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
387 # Checks (things that shouldn't happen)
389 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
390 r".*Sending create_node request.*": partial(expect_count, 9)
393 "arvnodeman.test.fake_driver.QuotaDriver",
395 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
396 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
397 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
398 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
401 "test_no_hang_failing_node_create": (
402 # Actions (pattern -> action)
404 (r".*Daemon started", set_squeue),
405 (r".*Client error: nope", noop),
406 (r".*Client error: nope", noop),
407 (r".*Client error: nope", noop),
408 (r".*Client error: nope", noop),
410 # Checks (things that shouldn't happen)
413 "arvnodeman.test.fake_driver.FailingDriver",
415 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
416 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
417 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
418 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
421 "test_retry_create": (
422 # Actions (pattern -> action)
424 (r".*Daemon started", set_squeue),
425 (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
426 (r".*Rate limit exceeded - scheduling retry in 1 seconds", noop),
427 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
429 # Checks (things that shouldn't happen)
432 "arvnodeman.test.fake_driver.RetryDriver",
434 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
437 "test_single_node_aws": (
438 # Actions (pattern -> action)
440 (r".*Daemon started", set_squeue),
441 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
442 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
443 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
444 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
446 # Checks (things that shouldn't happen)
448 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
449 r".*Setting node quota.*": fail,
452 "arvnodeman.test.fake_driver.FakeAwsDriver",
454 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
457 "test_single_node_gce": (
458 # Actions (pattern -> action)
460 (r".*Daemon started", set_squeue),
461 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
462 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
463 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
464 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
466 # Checks (things that shouldn't happen)
468 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
469 r".*Setting node quota.*": fail,
472 "arvnodeman.test.fake_driver.FakeGceDriver",
474 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
480 if len(sys.argv) > 1:
481 code = run_test(sys.argv[1], *tests[sys.argv[1]])
483 for t in sorted(tests.keys()):
484 code += run_test(t, *tests[t])
487 logger.info("Tests passed")
489 logger.info("Tests failed")
493 if __name__ == '__main__':