# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0
6 """Integration test framework for node manager.
8 Runs full node manager with an API server (needs ARVADOS_API_HOST and
9 ARVADOS_API_TOKEN). Stubs out the cloud driver and slurm commands to mock
10 specific behaviors. Monitors the log output to verify an expected sequence of
11 events or behaviors for each test.
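
# A minimal usage sketch (assuming this file is saved as integration_test.py
# and run from the node manager source directory, which the relative config
# template paths below require): run every test, or pass one test name from
# the `tests` dict as the first argument.
#
#   python integration_test.py
#   python integration_test.py test_single_node_azure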

import errno
import logging
import os
import re
import shutil
import stat
import StringIO
import subprocess
import sys
import tempfile
import time

from functools import partial

import arvados

formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')

handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(handler)

detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
if os.environ.get("ANMTEST_LOGLEVEL"):
    detail_content = sys.stderr
else:
    detail_content = StringIO.StringIO()
handler = logging.StreamHandler(detail_content)
handler.setFormatter(formatter)
detail.addHandler(handler)
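
# Two loggers: "logger" always writes to stderr, while "detail" goes to
# stderr only when ANMTEST_LOGLEVEL is set; otherwise it accumulates in a
# StringIO buffer that is replayed to stderr only if a test fails.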

fake_slurm = None
compute_nodes = None
all_jobs = None
unsatisfiable_job_scancelled = None

def update_script(path, val):
    # Write to a temporary file and rename it into place so the node manager
    # child process never sees a partially written script.
    with open(path+"_", "w") as f:
        f.write(val)
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)

def set_squeue(g):
    global all_jobs
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    return 0

def set_queue_unsatisfiable(g):
    global all_jobs, unsatisfiable_job_scancelled
    # Simulate a job requesting a 99 core node.
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    # The scancel stub records that it was called by touching a marker file.
    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
                  "\ntouch %s" % unsatisfiable_job_scancelled)
    return 0

def job_cancelled(g):
    global unsatisfiable_job_scancelled
    cancelled_job = g.group(1)
    api = arvados.api('v1')
    # Check that 'scancel' was called
    if not os.path.isfile(unsatisfiable_job_scancelled):
        return 1
    # Check for the log entry
    log_entry = api.logs().list(
        filters=[
            ['object_uuid', '=', cancelled_job],
            ['event_type', '=', 'stderr'],
        ]).execute()['items'][0]
    if not re.match(
            r"Constraints cannot be satisfied",
            log_entry['properties']['text']):
        return 1
    return 0

def node_paired(g):
    global compute_nodes
    compute_nodes[g.group(1)] = g.group(3)

    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))

    # Move one pending job to the newly paired node.
    for k,v in all_jobs.items():
        if v == "ReqNodeNotAvail":
            all_jobs[k] = "Running"
            break

    set_squeue(g)

    return 0

def node_busy(g):
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
    return 0

def node_shutdown(g):
    global compute_nodes
    # Fail if a shutdown is reported for a node we never saw paired.
    if g.group(1) in compute_nodes:
        del compute_nodes[g.group(1)]
        return 0
    else:
        return 1

def jobs_req(g):
    global all_jobs
    for k,v in all_jobs.items():
        all_jobs[k] = "ReqNodeNotAvail"
    set_squeue(g)
    return 0

def noop(g):
    return 0

def fail(checks, pattern, g):
    return 1

def expect_count(count, checks, pattern, g):
    if count == 0:
        return 1
    else:
        checks[pattern] = partial(expect_count, count-1)
        return 0
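
# Example: registering a check as partial(expect_count, 4) lets its pattern
# match at most four times; each match re-arms the check with count-1, and a
# fifth match returns 1, failing the test.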

def run_test(name, actions, checks, driver_class, jobs, provider):
    code = 0
    global unsatisfiable_job_scancelled
    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
                                                "scancel_called")

    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()

    logger.info("Start %s", name)

    global fake_slurm
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)

    global compute_nodes
    compute_nodes = {}

    global all_jobs
    all_jobs = jobs

    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]
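
    # Because fake_slurm comes first on PATH, the stub squeue/sinfo/scancel
    # scripts shadow any real slurm commands for the node manager child
    # process.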

    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")

    # Write configuration file for test
    with open("tests/fake_%s.cfg.template" % provider) as f:
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))

    # Tests must complete in less than 30 seconds.
    timeout = time.time() + 30
    terminated = False

    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)
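
    # bufsize=0 keeps the child's stderr pipe unbuffered so log lines reach
    # the loop below as soon as the daemon emits them.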

    # Test main loop:
    # - Read line
    # - Apply negative checks (things that are not supposed to happen)
    # - Check timeout
    # - Check if the next action should trigger
    # - If all actions are exhausted, terminate with test success
    # - If it hits timeout with actions remaining, terminate with test failed
    try:
        # naive line iteration over pipes gets buffered, which isn't what we want,
        # see https://bugs.python.org/issue3907
        for line in iter(p.stderr.readline, ""):
            detail_content.write(line)

            for k,v in checks.items():
                g = re.match(k, line)
                if g:
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                    if code != 0:
                        detail.error("Check failed")
                        if not terminated:
                            p.terminate()
                            terminated = True

            if terminated:
                continue

            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
                code += 1
                if not terminated:
                    p.terminate()
                    terminated = True

            k, v = actions[0]
            g = re.match(k, line)
            if g:
                detail.info("Matched action %s", k)
                actions.pop(0)
                code += v(g)
                if code != 0:
                    detail.error("Action failed")
                    p.terminate()
                    terminated = True

            if not actions:
                p.terminate()
                terminated = True
    except KeyboardInterrupt:
        p.kill()

    if actions:
        detail.error("Ended with remaining actions: %s", actions)
        code = 1

    shutil.rmtree(fake_slurm)
    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))

    if code == 0:
        logger.info("%s passed", name)
    else:
        # On failure, replay the buffered detail log to stderr.
        if isinstance(detail_content, StringIO.StringIO):
            detail_content.seek(0)
            chunk = detail_content.read(4096)
            while chunk:
                try:
                    sys.stderr.write(chunk)
                    chunk = detail_content.read(4096)
                except IOError as e:
                    if e.errno == errno.EAGAIN:
                        # try again (probably pipe buffer full)
                        pass
                    else:
                        raise
        logger.info("%s failed", name)

    return code


def main():
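    # Each entry in `tests` maps a test name to a 5-tuple matching
    # run_test's signature: (actions, checks, driver_class, jobs, provider).
    # Actions are (regex, callback) pairs that must match the daemon log in
    # order; checks are negative patterns applied to every log line.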
270 "test_unsatisfiable_jobs" : (
271 # Actions (pattern -> action)
273 (r".*Daemon started", set_queue_unsatisfiable),
274 (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
276 # Checks (things that shouldn't happen)
278 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
279 r".*Trying to cancel job '(\S+)'": fail,
282 "arvnodeman.test.fake_driver.FakeDriver",
284 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
287 "test_single_node_azure": (
288 # Actions (pattern -> action)
290 (r".*Daemon started", set_squeue),
291 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
292 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
293 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
294 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
296 # Checks (things that shouldn't happen)
298 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
299 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
300 r".*Setting node quota.*": fail,
303 "arvnodeman.test.fake_driver.FakeDriver",
305 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
308 "test_multiple_nodes": (
309 # Actions (pattern -> action)
311 (r".*Daemon started", set_squeue),
312 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
313 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
314 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
315 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
316 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
317 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
318 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
319 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
320 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
321 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
323 # Checks (things that shouldn't happen)
325 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
326 r".*Setting node quota.*": fail,
329 "arvnodeman.test.fake_driver.FakeDriver",
331 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
332 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
333 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
334 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
        "test_hit_quota": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
                r".*Sending create_node request.*": partial(expect_count, 5),
            },
            # Driver class
            "arvnodeman.test.fake_driver.QuotaDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
362 "test_probe_quota": (
363 # Actions (pattern -> action)
365 (r".*Daemon started", set_squeue),
366 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
367 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
368 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
369 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
370 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
371 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
372 (r".*sending request", jobs_req),
373 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
374 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
375 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
376 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
377 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
378 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
379 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
380 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
381 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
382 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
384 # Checks (things that shouldn't happen)
386 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
387 r".*Sending create_node request.*": partial(expect_count, 9)
390 "arvnodeman.test.fake_driver.QuotaDriver",
392 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
393 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
394 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
395 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
398 "test_no_hang_failing_node_create": (
399 # Actions (pattern -> action)
401 (r".*Daemon started", set_squeue),
402 (r".*Client error: nope", noop),
403 (r".*Client error: nope", noop),
404 (r".*Client error: nope", noop),
405 (r".*Client error: nope", noop),
407 # Checks (things that shouldn't happen)
410 "arvnodeman.test.fake_driver.FailingDriver",
412 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
413 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
414 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
415 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
418 "test_retry_create": (
419 # Actions (pattern -> action)
421 (r".*Daemon started", set_squeue),
422 (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
423 (r".*Rate limit exceeded - scheduling retry in 1 seconds", noop),
424 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
426 # Checks (things that shouldn't happen)
429 "arvnodeman.test.fake_driver.RetryDriver",
431 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
434 "test_single_node_aws": (
435 # Actions (pattern -> action)
437 (r".*Daemon started", set_squeue),
438 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
439 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
440 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
441 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
443 # Checks (things that shouldn't happen)
445 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
446 r".*Setting node quota.*": fail,
449 "arvnodeman.test.fake_driver.FakeAwsDriver",
451 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
454 "test_single_node_gce": (
455 # Actions (pattern -> action)
457 (r".*Daemon started", set_squeue),
458 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
459 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
460 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
461 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
463 # Checks (things that shouldn't happen)
465 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
466 r".*Setting node quota.*": fail,
469 "arvnodeman.test.fake_driver.FakeGceDriver",
471 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},

    code = 0
    if len(sys.argv) > 1:
        code = run_test(sys.argv[1], *tests[sys.argv[1]])
    else:
        for t in sorted(tests.keys()):
            code += run_test(t, *tests[t])

    if code == 0:
        logger.info("Tests passed")
    else:
        logger.info("Tests failed")

    exit(code)

if __name__ == '__main__':
    main()