# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0
6 """Integration test framework for node manager.
8 Runs full node manager with an API server (needs ARVADOS_API_HOST and
9 ARVADOS_API_TOKEN). Stubs out the cloud driver and slurm commands to mock
10 specific behaviors. Monitors the log output to verify an expected sequence of
11 events or behaviors for each test.
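
# Typical invocation, a sketch based on main() below (the file name
# integration_test.py is assumed here): run one scenario by name, or every
# scenario when no name is given. Run it from the node manager source tree,
# since the script opens tests/fake_*.cfg.template and
# bin/arvados-node-manager by relative path.
#
#     python integration_test.py test_single_node_azure   # one test
#     python integration_test.py                          # all tests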

import errno
import logging
import os
import re
import shutil
import stat
import subprocess
import sys
import tempfile
import time
from functools import partial

import arvados
import StringIO

formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')

handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(handler)

detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
if os.environ.get("ANMTEST_LOGLEVEL"):
    detail_content = sys.stderr
else:
    detail_content = StringIO.StringIO()
handler = logging.StreamHandler(detail_content)
handler.setFormatter(formatter)
detail.addHandler(handler)

fake_slurm = None
compute_nodes = None
all_jobs = None
unsatisfiable_job_scancelled = None

def update_script(path, val):
    with open(path+"_", "w") as f:
        f.write(val)
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)
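
# Note: update_script() writes "<path>_" and renames it over <path>, so the
# daemon, which picks these stubs up from PATH, never runs a half-written
# script (rename is atomic on POSIX filesystems).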

def set_squeue(g):
    global all_jobs
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    return 0
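
# A sketch of the squeue stub generated for one fake job:
#
#   #!/bin/sh
#   echo '1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9|(null)|1234567890'
#
# The first field is the requested core count (set_queue_unsatisfiable below
# raises it to 99) and the fourth and fifth are job state and job UUID; the
# meaning of the remaining fields is inferred from the stub data here.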

def set_queue_unsatisfiable(g):
    global all_jobs, unsatisfiable_job_scancelled
    # Simulate a job requesting a 99 core node.
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
                  "\ntouch %s" % unsatisfiable_job_scancelled)
    return 0

def job_cancelled(g):
    global unsatisfiable_job_scancelled
    cancelled_job = g.group(1)
    api = arvados.api('v1')
    # Check that 'scancel' was called
    if not os.path.isfile(unsatisfiable_job_scancelled):
        return 1
    # Check for the log entry
    log_entry = api.logs().list(
        filters=[
            ['object_uuid', '=', cancelled_job],
            ['event_type', '=', 'stderr'],
        ]).execute()['items'][0]
    if not re.match(
            r"Constraints cannot be satisfied",
            log_entry['properties']['text']):
        return 1
    return 0

def node_paired(g):
    global compute_nodes
    # Map the cloud node id (group 1) to its hostname (group 3).
    compute_nodes[g.group(1)] = g.group(3)

    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        if v == "ReqNodeNotAvail":
            all_jobs[k] = "Running"
            break

    set_squeue(g)

    return 0

def remaining_jobs(g):
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        all_jobs[k] = "Running"

    set_squeue(g)

    return 0

def node_busy(g):
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
    return 0

def node_shutdown(g):
    global compute_nodes
    del compute_nodes[g.group(1)]
    return 0

def jobs_req(g):
    global all_jobs
    for k,v in all_jobs.items():
        all_jobs[k] = "ReqNodeNotAvail"
    set_squeue(g)
    return 0

def noop(g):
    return 0

def fail(checks, pattern, g):
    return 1

def expect_count(count, checks, pattern, g):
    if count == 0:
        return 1
    else:
        checks[pattern] = partial(expect_count, count-1)
        return 0
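
# Convention for the callables used in the tests below: actions take the regex
# match and return 0 on success, nonzero to fail the test; checks take
# (checks, pattern, match). expect_count lets a pattern match up to `count`
# times by rebinding its entry to expect_count(count-1, ...); one match past
# that budget returns 1 and fails the run.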

def run_test(name, actions, checks, driver_class, jobs, provider):
    code = 0
    global unsatisfiable_job_scancelled
    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
                                                "scancel_called")  # flag file the scancel stub touches

    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()

    logger.info("Start %s", name)

    global fake_slurm
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)

    global compute_nodes
    compute_nodes = {}

    global all_jobs
    all_jobs = jobs

    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]

    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")

    # Write configuration file for test
    with open("tests/fake_%s.cfg.template" % provider) as f:
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
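
    # The per-provider template presumably contains {host}, {token},
    # {driver_class} and {ssh_key} placeholders for str.format() to fill in;
    # see tests/fake_<provider>.cfg.template for the actual layout.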

    # Tests must complete in less than 3 minutes.
    timeout = time.time() + 180
    terminated = False

    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)

    # Test main loop:
    # - Read a line from the daemon's stderr
    # - Apply negative checks (things that are not supposed to happen)
    # - Check timeout
    # - Check if the next action should trigger
    # - If all actions are exhausted, terminate with test success
    # - If it hits timeout with actions remaining, terminate with test failed
    try:
        # naive line iteration over pipes gets buffered, which isn't what we want,
        # see https://bugs.python.org/issue3907
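        # iter(callable, sentinel) form: keep calling p.stderr.readline()
        # until it returns "" at EOF, yielding one line at a time without
        # the extra read-ahead buffering of normal file iteration.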
        for line in iter(p.stderr.readline, ""):
            detail_content.write(line)

            for k,v in checks.items():
                g = re.match(k, line)
                if g:
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                    if code != 0:
                        detail.error("Check failed")
                        if not terminated:
                            p.terminate()
                            terminated = True

            if terminated:
                continue
            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
                code += 1
                if not terminated:
                    p.terminate()
                    terminated = True

            k, v = actions[0]
            g = re.match(k, line)
            if g:
                detail.info("Matched action %s", k)
                actions.pop(0)
                code += v(g)
                if code != 0:
                    detail.error("Action failed")
                    p.terminate()
                    terminated = True

            if not actions:
                p.terminate()
                terminated = True
    except KeyboardInterrupt:
        p.kill()

    if actions:
        detail.error("Ended with remaining actions: %s", actions)
        code = 1

    shutil.rmtree(fake_slurm)
    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))

    if code == 0:
        logger.info("%s passed", name)
    else:
        if isinstance(detail_content, StringIO.StringIO):
            detail_content.seek(0)
            chunk = detail_content.read(4096)
            while chunk:
                try:
                    sys.stderr.write(chunk)
                    chunk = detail_content.read(4096)
                except IOError as e:
                    if e.errno == errno.EAGAIN:
                        # try again (probably pipe buffer full)
                        pass
                    else:
                        raise
        logger.info("%s failed", name)

    return code


def main():
    tests = {
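        # Each test is a 5-tuple handed to run_test() as (actions, checks,
        # driver_class, jobs, provider): actions is an ordered list of
        # (log pattern, callback) pairs that must match in sequence; checks
        # maps patterns that should match never (fail) or at most N times
        # (expect_count) to their handlers.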
        "test_unsatisfiable_jobs" : (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_queue_unsatisfiable),
                (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
                r".*Trying to cancel job '(\S+)'": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_single_node_azure": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_multiple_nodes": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_hit_quota": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
                r".*Sending create_node request.*": partial(expect_count, 5)
            },
            # Driver class
            "arvnodeman.test.fake_driver.QuotaDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_probe_quota": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*sending request", jobs_req),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
                r".*Sending create_node request.*": partial(expect_count, 9)
            },
            # Driver class
            "arvnodeman.test.fake_driver.QuotaDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_no_hang_failing_node_create": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
            ],
            # Checks (things that shouldn't happen)
            {},
            # Driver class
            "arvnodeman.test.fake_driver.FailingDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_retry_create": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
                (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
            ],
            # Checks (things that shouldn't happen)
            {},
            # Driver class
            "arvnodeman.test.fake_driver.RetryDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_single_node_aws": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeAwsDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "ec2"),
        "test_single_node_gce": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeGceDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "gce"),
    }

    code = 0
    if len(sys.argv) > 1:
        code = run_test(sys.argv[1], *tests[sys.argv[1]])
    else:
        for t in sorted(tests.keys()):
            code += run_test(t, *tests[t])

    if code == 0:
        logger.info("Tests passed")
    else:
        logger.info("Tests failed")

    exit(code)


if __name__ == '__main__':
    main()