# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0

"""Integration test framework for node manager.

Runs full node manager with an API server (needs ARVADOS_API_HOST and
ARVADOS_API_TOKEN). Stubs out the cloud driver and slurm commands to mock
specific behaviors. Monitors the log output to verify an expected sequence of
events or behaviors for each test.
"""
24 from functools import partial
# Primary logger: test progress and pass/fail results, always on stderr.
formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')

handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(handler)

# Detail logger: the full node manager output.  When ANMTEST_LOGLEVEL is set
# it streams straight to stderr; otherwise it is buffered in memory and only
# dumped if a test fails.  (The original fragment lost the `else:` line, which
# made the StringIO assignment unconditional and the env var a no-op.)
detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
if os.environ.get("ANMTEST_LOGLEVEL"):
    detail_content = sys.stderr
else:
    detail_content = StringIO.StringIO()
handler = logging.StreamHandler(detail_content)
handler.setFormatter(formatter)
detail.addHandler(handler)

# Globals shared between run_test() and the action/check callbacks below.
# NOTE(review): the first three assignments are restored from usage (the
# callbacks declare them `global`); confirm against the full source.
fake_slurm = None                    # temp dir holding fake squeue/sinfo/scancel
compute_nodes = None                 # cloud node id -> hostname for "up" nodes
all_jobs = None                      # job uuid -> slurm job state
unsatisfiable_job_scancelled = None  # sentinel file touched by fake scancel
def update_script(path, val):
    """Atomically (re)write an executable script at `path` with contents `val`.

    Writes to `path_` first, then renames over `path`, so the concurrently
    running node manager never executes a half-written script.  (The original
    fragment lost the write itself, leaving the script empty.)
    """
    with open(path+"_", "w") as f:
        f.write(val)
    # Owner rwx: the fake slurm commands must be executable from PATH.
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)
def set_squeue(g):
    """Action: make the fake squeue report every job in all_jobs.

    Each line requests 1 core (first field), so any node can satisfy it.
    Returns 0 (success) per the action-callback convention.
    """
    global all_jobs
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    return 0
def set_queue_unsatisfiable(g):
    """Action: make every queued job unsatisfiable and instrument scancel.

    Rewrites the fake squeue so each job requests 99 cores (more than any
    node type offers), and makes the fake scancel touch a sentinel file so
    job_cancelled() can verify it was invoked.  Returns 0.
    """
    global all_jobs, unsatisfiable_job_scancelled
    # Simulate a job requesting a 99 core node.
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
                  "\ntouch %s" % unsatisfiable_job_scancelled)
    return 0
def job_cancelled(g):
    """Action: verify an unsatisfiable job was really cancelled.

    `g.group(1)` is the cancelled job's UUID from the matched log line.
    Checks that the fake scancel ran (sentinel file exists) and that a
    "Constraints cannot be satisfied" stderr log entry was recorded for the
    job via the API.  Returns 0 on success, 1 on failure.
    """
    global unsatisfiable_job_scancelled
    cancelled_job = g.group(1)
    api = arvados.api('v1')
    # Check that 'scancel' was called
    if not os.path.isfile(unsatisfiable_job_scancelled):
        return 1
    # Check for the log entry
    log_entry = api.logs().list(
        filters=[
            ['object_uuid', '=', cancelled_job],
            ['event_type', '=', 'stderr'],
        ]).execute()['items'][0]
    if not re.match(
            r"Constraints cannot be satisfied",
            log_entry['properties']['text']):
        return 1
    return 0
def node_paired(g):
    """Action: record a newly paired cloud node and start one job on it.

    `g` matches "Cloud node (id) is now paired ... with hostname (name)":
    group(1) is the cloud node id, group(3) the hostname.  Returns 0.
    """
    global compute_nodes
    compute_nodes[g.group(1)] = g.group(3)

    # The node now shows as allocated in the fake sinfo.
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))

    # Promote one waiting job to Running (break reconstructed from the lost
    # loop tail -- one node pairs with one job at a time).
    for k,v in all_jobs.items():
        if v == "ReqNodeNotAvail":
            all_jobs[k] = "Running"
            break

    set_squeue(g)

    return 0
def remaining_jobs(g):
    """Action: mark all compute nodes allocated and every job Running.

    Used in the quota tests, where fewer nodes than jobs come up.  Returns 0.
    """
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        all_jobs[k] = "Running"

    set_squeue(g)

    return 0
def node_busy(g):
    """Action: flip every node to idle in the fake sinfo.

    Fires on "Not eligible for shut down because ... busy"; reporting the
    nodes idle lets the daemon suggest shutdown next.  Returns 0.
    """
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
    return 0
def node_shutdown(g):
    """Action: record shutdown of cloud node g.group(1).

    Returns 0 if the node was known (and forgets it), 1 if the daemon shut
    down a node we never saw pair -- which fails the test.
    """
    global compute_nodes
    if g.group(1) in compute_nodes:
        del compute_nodes[g.group(1)]
        return 0
    else:
        return 1
def jobs_req(g):
    """Action: requeue every job as ReqNodeNotAvail and refresh the fake squeue.

    Used by test_probe_quota to re-trigger node creation after the first
    round of shutdowns.  Returns 0.
    """
    global all_jobs
    for k,v in all_jobs.items():
        all_jobs[k] = "ReqNodeNotAvail"
    set_squeue(g)
    return 0
def fail(checks, pattern, g):
    """Negative check: `pattern` must never appear in the daemon log.

    Always returns 1; any match is added to the failure code in run_test.
    """
    return 1
def expect_count(count, checks, pattern, g):
    """Check that `pattern` matches at most `count` times.

    Each match spends one unit of allowance by re-registering itself under
    the same pattern with count-1; a match with the allowance exhausted
    returns 1 and fails the test.  Otherwise returns 0.
    """
    if count == 0:
        return 1
    else:
        checks[pattern] = partial(expect_count, count-1)
        return 0
153 def run_test(name, actions, checks, driver_class, jobs, provider):
    # NOTE(review): this view is a line-sampled fragment -- each line keeps
    # its original line number and several statements in between are missing.
    # Comments below describe only what the visible code shows.
    #
    # Runs one integration test: starts arvados-node-manager with a generated
    # config (stub cloud `driver_class`, fake slurm scripts first on PATH),
    # then scans its stderr line by line, applying negative `checks` and the
    # ordered expected `actions`; `jobs` seeds the fake queue and `provider`
    # selects the config template.  Presumably returns a nonzero code on
    # failure -- TODO confirm against the full source.
155 global unsatisfiable_job_scancelled
    # Sentinel path the fake scancel touches (see set_queue_unsatisfiable).
156 unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
159 # Delete any stale node records
160 api = arvados.api('v1')
161 for n in api.nodes().list().execute()['items']:
162 api.nodes().delete(uuid=n["uuid"]).execute()
164 logger.info("Start %s", name)
167 fake_slurm = tempfile.mkdtemp()
168 detail.info("fake_slurm is %s", fake_slurm)
    # Prepend the fake slurm dir so node manager runs our stub commands.
176 env = os.environ.copy()
177 env["PATH"] = fake_slurm + ":" + env["PATH"]
179 # Reset fake squeue/sinfo to empty
180 update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
181 update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
183 # Write configuration file for test
184 with open("tests/fake_%s.cfg.template" % provider) as f:
185 open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
186 with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
187 cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
188 token=os.environ["ARVADOS_API_TOKEN"],
189 driver_class=driver_class,
190 ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
192 # Tests must complete in less than 3 minutes.
193 timeout = time.time() + 180
196 # Now start node manager
197 p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
198 bufsize=0, stderr=subprocess.PIPE, env=env)
    # Main loop (per the surviving comments): read a stderr line, apply the
    # negative checks, then try the next expected action.
202 # - Apply negative checks (things that are not supposed to happen)
204 # - Check if the next action should trigger
205 # - If all actions are exhausted, terminate with test success
206 # - If it hits timeout with actions remaining, terminate with test failed
208 # naive line iteration over pipes gets buffered, which isn't what we want,
209 # see https://bugs.python.org/issue3907
210 for line in iter(p.stderr.readline, ""):
211 detail_content.write(line)
    # Negative checks: a matching check callback adds to `code`; nonzero
    # means failure.
213 for k,v in checks.items():
214 g = re.match(k, line)
216 detail.info("Matched check %s", k)
217 code += v(checks, k, g)
219 detail.error("Check failed")
227 if time.time() > timeout:
228 detail.error("Exceeded timeout with actions remaining: %s", actions)
    # Expected action: `k` is the pattern, `v` the callback run on match
    # (the action-dequeue statements are lost from this view).
235 g = re.match(k, line)
237 detail.info("Matched action %s", k)
241 detail.error("Action failed")
248 except KeyboardInterrupt:
252 detail.error("Ended with remaining actions: %s", actions)
    # Clean up the fake slurm dir and the scancel sentinel's temp dir.
255 shutil.rmtree(fake_slurm)
256 shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
259 logger.info("%s passed", name)
    # On failure, dump the buffered detail log (it is only an in-memory
    # buffer when ANMTEST_LOGLEVEL was unset).
261 if isinstance(detail_content, StringIO.StringIO):
262 sys.stderr.write(detail_content.getvalue())
263 logger.info("%s failed", name)
# NOTE(review): sampled fragment of the test table and entry point.  Each
# entry is (actions, checks, driver_class, jobs, provider); the enclosing
# `tests = {` / list brackets / provider strings fall in the sampling gaps.
# Actions are (regex, callback) pairs consumed in order; checks map a regex
# to a callback(checks, pattern, match) that returns nonzero on failure.
272 "test_unsatisfiable_jobs" : (
273 # Actions (pattern -> action)
275 (r".*Daemon started", set_queue_unsatisfiable),
276 (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
278 # Checks (things that shouldn't happen)
280 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
281 r".*Trying to cancel job '(\S+)'": fail,
284 "arvnodeman.test.fake_driver.FakeDriver",
286 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
289 "test_single_node_azure": (
290 # Actions (pattern -> action)
292 (r".*Daemon started", set_squeue),
293 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
294 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
295 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
296 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
298 # Checks (things that shouldn't happen)
300 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
301 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
302 r".*Setting node quota.*": fail,
305 "arvnodeman.test.fake_driver.FakeDriver",
307 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
# Four jobs -> four nodes paired, then four shutdowns once idle.
310 "test_multiple_nodes": (
311 # Actions (pattern -> action)
313 (r".*Daemon started", set_squeue),
314 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
315 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
316 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
317 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
318 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
319 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
320 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
321 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
322 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
323 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
325 # Checks (things that shouldn't happen)
327 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
328 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
329 r".*Setting node quota.*": fail,
332 "arvnodeman.test.fake_driver.FakeDriver",
334 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
335 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
336 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
337 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
# NOTE(review): this entry's test-name line fell in a sampling gap; the
# QuotaDriver and remaining_jobs usage suggest a quota-limit test.
341 # Actions (pattern -> action)
343 (r".*Daemon started", set_squeue),
344 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
345 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
346 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
347 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
348 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
349 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
351 # Checks (things that shouldn't happen)
353 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
354 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
355 r".*Sending create_node request.*": partial(expect_count, 5)
358 "arvnodeman.test.fake_driver.QuotaDriver",
360 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
361 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
362 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
363 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
# Quota probing: after the first shutdowns, jobs_req requeues everything and
# the remaining four nodes come up and are shut down.
366 "test_probe_quota": (
367 # Actions (pattern -> action)
369 (r".*Daemon started", set_squeue),
370 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
371 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
372 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
373 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
374 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
375 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
376 (r".*sending request", jobs_req),
377 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
378 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
379 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
380 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
381 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
382 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
383 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
384 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
385 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
386 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
388 # Checks (things that shouldn't happen)
390 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
391 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
392 r".*Sending create_node request.*": partial(expect_count, 9)
395 "arvnodeman.test.fake_driver.QuotaDriver",
397 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
398 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
399 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
400 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
# FailingDriver: create_node always errors; daemon must keep retrying
# without hanging.
403 "test_no_hang_failing_node_create": (
404 # Actions (pattern -> action)
406 (r".*Daemon started", set_squeue),
407 (r".*Client error: nope", noop),
408 (r".*Client error: nope", noop),
409 (r".*Client error: nope", noop),
410 (r".*Client error: nope", noop),
412 # Checks (things that shouldn't happen)
415 "arvnodeman.test.fake_driver.FailingDriver",
417 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
418 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
419 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
420 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
# RetryDriver: rate-limit responses must back off and eventually succeed.
423 "test_retry_create": (
424 # Actions (pattern -> action)
426 (r".*Daemon started", set_squeue),
427 (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
428 (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
429 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
431 # Checks (things that shouldn't happen)
434 "arvnodeman.test.fake_driver.RetryDriver",
436 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
439 "test_single_node_aws": (
440 # Actions (pattern -> action)
442 (r".*Daemon started", set_squeue),
443 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
444 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
445 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
446 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
448 # Checks (things that shouldn't happen)
450 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
451 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
452 r".*Setting node quota.*": fail,
455 "arvnodeman.test.fake_driver.FakeAwsDriver",
457 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
460 "test_single_node_gce": (
461 # Actions (pattern -> action)
463 (r".*Daemon started", set_squeue),
464 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
465 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
466 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
467 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
469 # Checks (things that shouldn't happen)
471 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
472 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
473 r".*Setting node quota.*": fail,
476 "arvnodeman.test.fake_driver.FakeGceDriver",
478 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
# Entry point: run the single test named on the command line, otherwise run
# every test in sorted order, accumulating the failure code.  The else/exit
# lines fell in sampling gaps -- TODO confirm against the full source.
484 if len(sys.argv) > 1:
485 code = run_test(sys.argv[1], *tests[sys.argv[1]])
487 for t in sorted(tests.keys()):
488 code += run_test(t, *tests[t])
491 logger.info("Tests passed")
493 logger.info("Tests failed")
497 if __name__ == '__main__':