# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0
6 """Integration test framework for node manager.
8 Runs full node manager with an API server (needs ARVADOS_API_HOST and
9 ARVADOS_API_TOKEN). Stubs out the cloud driver and slurm commands to mock
10 specific behaviors. Monitors the log output to verify an expected sequence of
11 events or behaviors for each test.
import logging
import os
import re
import shutil
import stat
import StringIO
import subprocess
import sys
import tempfile
import time

from functools import partial

import arvados
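# Hypothetical invocation sketch -- the host, token, and script path below are
# placeholders, not values taken from this repo. Run from the node manager
# source directory so bin/arvados-node-manager and tests/fake_*.cfg.template
# resolve:
#
#   ARVADOS_API_HOST=zzzzz.example.org ARVADOS_API_TOKEN=xyz \
#       python tests/integration_test.py test_single_node_azure
#
# With no test name argument, every test in the tests dict below is run.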
formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')

handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(handler)

detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
if os.environ.get("ANMTEST_LOGLEVEL"):
    detail_content = sys.stderr
else:
    detail_content = StringIO.StringIO()
handler = logging.StreamHandler(detail_content)
handler.setFormatter(formatter)
detail.addHandler(handler)

fake_slurm = None
compute_nodes = None
all_jobs = None
unsatisfiable_job_scancelled = None
def update_script(path, val):
    with open(path+"_", "w") as f:
        f.write(val)
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    # Write-then-rename replaces the stub atomically; node manager may
    # invoke it at any moment.
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)
def set_squeue(g):
    global all_jobs
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    return 0
def set_queue_unsatisfiable(g):
    global all_jobs, unsatisfiable_job_scancelled
    # Simulate a job requesting a 99 core node.
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
                  "\ntouch %s" % unsatisfiable_job_scancelled)
    return 0
def job_cancelled(g):
    global unsatisfiable_job_scancelled
    cancelled_job = g.group(1)
    api = arvados.api('v1')
    # Check that 'scancel' was called
    if not os.path.isfile(unsatisfiable_job_scancelled):
        return 1
    # Check for the log entry
    log_entry = api.logs().list(
        filters=[
            ['object_uuid', '=', cancelled_job],
            ['event_type', '=', 'stderr'],
        ]).execute()['items'][0]
    if not re.match(
            r"Constraints cannot be satisfied",
            log_entry['properties']['text']):
        return 1
    return 0
def node_paired(g):
    global compute_nodes
    compute_nodes[g.group(1)] = g.group(3)

    # The sinfo stub prints one 'nodename|state|reason' line per booted node.
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))

    # Each pairing lets exactly one pending job start running.
    for k,v in all_jobs.items():
        if v == "ReqNodeNotAvail":
            all_jobs[k] = "Running"
            break

    set_squeue(g)

    return 0
def remaining_jobs(g):
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        all_jobs[k] = "Running"

    set_squeue(g)

    return 0

def node_busy(g):
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
    return 0
def node_shutdown(g):
    global compute_nodes
    del compute_nodes[g.group(1)]
    return 0

def jobs_req(g):
    global all_jobs
    for k,v in all_jobs.items():
        all_jobs[k] = "ReqNodeNotAvail"
    set_squeue(g)
    return 0

def noop(g):
    return 0

def fail(checks, pattern, g):
    return 1

def expect_count(count, checks, pattern, g):
    if count == 0:
        return 1
    else:
        checks[pattern] = partial(expect_count, count-1)
        return 0
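# For example, a check registered as partial(expect_count, 4) tolerates at
# most four matches: each match rebinds the pattern with count-1, so a fifth
# match hits count == 0, returns 1, and fails the test.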
def run_test(name, actions, checks, driver_class, jobs, provider):
    code = 0
    global unsatisfiable_job_scancelled
    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
                                                "scancel_called")

    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()

    logger.info("Start %s", name)

    global fake_slurm
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)

    global compute_nodes
    compute_nodes = {}

    global all_jobs
    all_jobs = jobs
    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]

    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")

    # Write configuration file for test
    with open("tests/fake_%s.cfg.template" % provider) as f:
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
    # Tests must complete in less than 3 minutes.
    timeout = time.time() + 180

    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)

    # Test main loop:
    # - Read a log line
    # - Apply negative checks (things that are not supposed to happen)
    # - Check timeout
    # - Check if the next action should trigger
    # - If all actions are exhausted, terminate with test success
    # - If it hits timeout with actions remaining, terminate with test failed
    try:
        # naive line iteration over pipes gets buffered, which isn't what we want,
        # see https://bugs.python.org/issue3907
        for line in iter(p.stderr.readline, ""):
            detail_content.write(line)

            # Negative checks first: these patterns should never appear.
            for k,v in checks.items():
                g = re.match(k, line)
                if g:
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                    if code != 0:
                        detail.error("Check failed")
                        p.terminate()

            if code != 0:
                break

            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
                code += 1
                p.terminate()
                break

            # Only the next expected action is eligible to match.
            k, v = actions[0]
            g = re.match(k, line)
            if g:
                detail.info("Matched action %s", k)
                actions.pop(0)
                code += v(g)
                if code != 0:
                    detail.error("Action failed")
                    p.terminate()
                    break

            # All actions consumed: success, shut the daemon down.
            if not actions:
                p.terminate()
                break
    except KeyboardInterrupt:
        p.kill()
    if actions:
        detail.error("Ended with remaining actions: %s", actions)
        code = 1

    shutil.rmtree(fake_slurm)
    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))

    if code == 0:
        logger.info("%s passed", name)
    else:
        # On failure, replay the captured detail log to stderr.
        if isinstance(detail_content, StringIO.StringIO):
            sys.stderr.write(detail_content.getvalue())
        logger.info("%s failed", name)

    return code


def main():
    tests = {
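        # Each entry is a 5-tuple matching run_test()'s parameters after
        # name: (actions, checks, driver_class, jobs, provider).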
269 "test_unsatisfiable_jobs" : (
270 # Actions (pattern -> action)
272 (r".*Daemon started", set_queue_unsatisfiable),
273 (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
275 # Checks (things that shouldn't happen)
277 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
278 r".*Trying to cancel job '(\S+)'": fail,
281 "arvnodeman.test.fake_driver.FakeDriver",
283 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
286 "test_single_node_azure": (
287 # Actions (pattern -> action)
289 (r".*Daemon started", set_squeue),
290 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
291 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
292 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
293 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
295 # Checks (things that shouldn't happen)
297 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
298 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
299 r".*Setting node quota.*": fail,
302 "arvnodeman.test.fake_driver.FakeDriver",
304 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
307 "test_multiple_nodes": (
308 # Actions (pattern -> action)
310 (r".*Daemon started", set_squeue),
311 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
312 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
313 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
314 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
315 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
316 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
317 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
318 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
319 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
320 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
322 # Checks (things that shouldn't happen)
324 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
325 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
326 r".*Setting node quota.*": fail,
329 "arvnodeman.test.fake_driver.FakeDriver",
331 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
332 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
333 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
334 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
        "test_hit_quota": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
                r".*Sending create_node request.*": partial(expect_count, 5)
            },
            # Driver class
            "arvnodeman.test.fake_driver.QuotaDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
363 "test_probe_quota": (
364 # Actions (pattern -> action)
366 (r".*Daemon started", set_squeue),
367 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
368 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
369 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
370 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
371 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
372 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
373 (r".*sending request", jobs_req),
374 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
375 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
376 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
377 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
378 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
379 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
380 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
381 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
382 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
383 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
385 # Checks (things that shouldn't happen)
387 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
388 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
389 r".*Sending create_node request.*": partial(expect_count, 9)
392 "arvnodeman.test.fake_driver.QuotaDriver",
394 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
395 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
396 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
397 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
400 "test_no_hang_failing_node_create": (
401 # Actions (pattern -> action)
403 (r".*Daemon started", set_squeue),
404 (r".*Client error: nope", noop),
405 (r".*Client error: nope", noop),
406 (r".*Client error: nope", noop),
407 (r".*Client error: nope", noop),
409 # Checks (things that shouldn't happen)
412 "arvnodeman.test.fake_driver.FailingDriver",
414 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
415 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
416 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
417 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
420 "test_retry_create": (
421 # Actions (pattern -> action)
423 (r".*Daemon started", set_squeue),
424 (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
425 (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
426 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
428 # Checks (things that shouldn't happen)
431 "arvnodeman.test.fake_driver.RetryDriver",
433 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
436 "test_single_node_aws": (
437 # Actions (pattern -> action)
439 (r".*Daemon started", set_squeue),
440 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
441 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
442 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
443 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
445 # Checks (things that shouldn't happen)
447 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
448 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
449 r".*Setting node quota.*": fail,
452 "arvnodeman.test.fake_driver.FakeAwsDriver",
454 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
457 "test_single_node_gce": (
458 # Actions (pattern -> action)
460 (r".*Daemon started", set_squeue),
461 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
462 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
463 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
464 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
466 # Checks (things that shouldn't happen)
468 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
469 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
470 r".*Setting node quota.*": fail,
473 "arvnodeman.test.fake_driver.FakeGceDriver",
475 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
    code = 0
    if len(sys.argv) > 1:
        code = run_test(sys.argv[1], *tests[sys.argv[1]])
    else:
        for t in sorted(tests.keys()):
            code += run_test(t, *tests[t])

    if code == 0:
        logger.info("Tests passed")
    else:
        logger.info("Tests failed")
    exit(code)


if __name__ == '__main__':
    main()