2 # Copyright (C) The Arvados Authors. All rights reserved.
4 # SPDX-License-Identifier: AGPL-3.0
6 """Integration test framework for node manager.
8 Runs full node manager with an API server (needs ARVADOS_API_HOST and
9 ARVADOS_API_TOKEN). Stubs out the cloud driver and slurm commands to mock
10 specific behaviors. Monitors the log output to verify an expected sequence of
11 events or behaviors for each test.
24 from functools import partial
# NOTE(review): this dump interleaves original line numbers into the text and
# drops many lines (numbering gaps) -- comments below describe only what is
# visible; confirm details against the upstream source.
# "logger" reports per-test pass/fail at INFO, always to stderr.
28 logger = logging.getLogger("logger")
29 logger.setLevel(logging.INFO)
30 logger.addHandler(logging.StreamHandler(sys.stderr))
# "detail" carries the node manager subprocess's own log output.
32 detail = logging.getLogger("detail")
33 detail.setLevel(logging.INFO)
34 if os.environ.get("ANMTEST_LOGLEVEL"):
# With ANMTEST_LOGLEVEL set, detail logs stream straight to stderr ...
35 detail_content = sys.stderr
# ... otherwise they are buffered in memory and dumped only if the test fails
# (see run_test). NOTE(review): the "else:" line (original 36) is missing
# from this excerpt.
37 detail_content = StringIO.StringIO()
38 detail.addHandler(logging.StreamHandler(detail_content))
# Sentinel path: the fake `scancel` stub touches this file so tests can
# verify scancel was actually invoked (see set_queue_unsatisfiable).
43 unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(), "scancel_called")
# Atomically install a fake executable shell script.
# Writes `val` to `path + "_"`, makes it owner-executable, then renames it
# over `path` -- the rename is atomic, so the node manager subprocess never
# sees a half-written stub.
45 def update_script(path, val):
46 with open(path+"_", "w") as f:
# NOTE(review): original line 47 (presumably `f.write(val)`) is missing from
# this excerpt -- confirm against the upstream source.
48 os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
49 os.rename(path+"_", path)
50 detail.info("Update script %s: %s", path, val)
# Fragment: body of a squeue-updating action (its `def` line, original 53,
# is missing from this excerpt -- presumably `set_squeue`; confirm).
# Rewrites the fake `squeue` stub to print one '1|100|100|<state>|<uuid>'
# line per entry in the global `all_jobs` dict.
54 update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
55 "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
# Action: make the fake queue unsatisfiable and instrument scancel.
# Replaces the squeue stub so every job requests 99 cores (more than any
# fake cloud node size offers), and replaces the scancel stub so running it
# touches the `unsatisfiable_job_scancelled` sentinel file, which
# job_cancelled later checks for.
58 def set_queue_unsatisfiable(g):
59 global all_jobs, unsatisfiable_job_scancelled
60 # Simulate a job requesting a 99 core node.
61 update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
62 "\n".join("echo '99|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
63 update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
64 "\ntouch %s" % unsatisfiable_job_scancelled)
# Fragment: body of the job-cancelled action (its `def` line, original
# ~66-67, is missing from this excerpt). Verifies the node manager really
# cancelled the unsatisfiable job: the scancel stub must have touched the
# sentinel file, and the job's stderr log in the API server must mention
# that its requirements exceed the available cloud node size.
68 global unsatisfiable_job_scancelled
69 cancelled_job = g.group(1)
70 api = arvados.api('v1')
71 # Check that 'scancel' was called
72 if not os.path.isfile(unsatisfiable_job_scancelled):
# NOTE(review): original line 73 (presumably the failure return when the
# sentinel is absent) is missing from this excerpt.
74 # Check for the log entry
75 log_entry = api.logs().list(
# NOTE(review): original line 76 (presumably `filters=[`) is missing here.
77 ['object_uuid', '=', cancelled_job],
78 ['event_type', '=', 'stderr'],
79 ]).execute()['items'][0]
# NOTE(review): original line 80 (presumably the start of an `if not
# re.search(` check) and the lines after 82 (return values) are missing;
# the two lines below are that call's pattern and subject arguments.
81 r"Requirements for a single node exceed the available cloud node size",
82 log_entry['properties']['text']):
# Fragment: body of the node-paired action (its `def` line is missing from
# this excerpt -- presumably `node_paired`; confirm). Records the newly
# paired cloud node's hostname keyed by cloud node id, rewrites the fake
# `sinfo` stub to report every known compute node as 'alloc', and flips
# any job still in ReqNodeNotAvail to Running.
88 compute_nodes[g.group(1)] = g.group(3)
90 update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
91 "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
93 for k,v in all_jobs.items():
94 if v == "ReqNodeNotAvail":
95 all_jobs[k] = "Running"
# Action: simulate the remaining queue being scheduled -- mark every known
# compute node 'alloc' in the sinfo stub and set every job's state to
# Running. (Lines after original 107, likely a squeue update, are missing
# from this excerpt.)
102 def remaining_jobs(g):
103 update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
104 "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
106 for k,v in all_jobs.items():
107 all_jobs[k] = "Running"
# Fragment: body of an action whose `def` line (original ~114) is missing
# from this excerpt -- presumably `node_busy`, given the test tables pair
# it with the "Not eligible for shut down ... busy" pattern; confirm.
# Rewrites the sinfo stub so every compute node reports 'idle', making the
# nodes eligible for shutdown on the next status poll.
115 update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
116 "\n".join("echo '%s idle'" % (v) for k,v in compute_nodes.items()))
# Action: on "Shutdown success", forget the cloud node named by the first
# regex capture group. (Original line 120, likely a `global compute_nodes`
# declaration, is missing from this excerpt.)
119 def node_shutdown(g):
121 del compute_nodes[g.group(1)]
# Fragment: loop from an action whose `def` line is missing from this
# excerpt (presumably `jobs_req`, referenced by the test_probe_quota
# table; confirm). Resets every job back to ReqNodeNotAvail, re-queueing
# all work so the node manager must create nodes again.
126 for k,v in all_jobs.items():
127 all_jobs[k] = "ReqNodeNotAvail"
# Check callback for patterns that must NEVER appear in the log: matching
# one is an immediate test failure. (The body, original lines 135-136, is
# missing from this excerpt -- presumably it returns a nonzero code.)
134 def fail(checks, pattern, g):
# Check callback allowing `pattern` to match at most `count` times: each
# match rebinds the check in `checks` with count-1 via functools.partial.
# (Original lines 138-140, presumably the count-exhausted failure branch
# and its return, are missing from this excerpt.)
137 def expect_count(count, checks, pattern, g):
141 checks[pattern] = partial(expect_count, count-1)
# Run one integration test end-to-end.
#   name         -- test name, used in pass/fail log lines
#   actions      -- ordered (regex, callback) pairs driven by log output
#   checks       -- regex -> callback applied to every log line (negative
#                   checks and match-count limits)
#   driver_class -- dotted name of the fake cloud driver to put in the config
#   jobs         -- initial fake slurm queue (job uuid -> state)
#   provider     -- selects tests/fake_<provider>.cfg.template
# Starts a real arvados-node-manager subprocess with fake slurm commands
# on PATH, then reads its stderr line by line, firing checks and actions.
# NOTE(review): the numbering gaps show many original lines (try/except
# scaffolding, action dispatch, returns) are missing from this excerpt;
# comments describe only what is visible.
144 def run_test(name, actions, checks, driver_class, jobs, provider):
# Clean up node records left behind by earlier runs so pairing counts are
# deterministic.
147 # Delete any stale node records
148 api = arvados.api('v1')
149 for n in api.nodes().list().execute()['items']:
150 api.nodes().delete(uuid=n["uuid"]).execute()
152 logger.info("Start %s", name)
# Temp dir holding the fake squeue/sinfo/scancel stubs and the config.
155 fake_slurm = tempfile.mkdtemp()
156 detail.info("fake_slurm is %s", fake_slurm)
# Prepend the stub dir to PATH so the subprocess picks up our fakes
# instead of real slurm commands.
164 env = os.environ.copy()
165 env["PATH"] = fake_slurm + ":" + env["PATH"]
167 # Reset fake squeue/sinfo to empty
168 update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
169 update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
171 # Write configuration file for test
172 with open("tests/fake_%s.cfg.template" % provider) as f:
# Empty pubkey file just to satisfy the config's ssh_key path.
173 open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
174 with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
175 cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
176 token=os.environ["ARVADOS_API_TOKEN"],
177 driver_class=driver_class,
178 ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
180 # Tests must complete in less than 3 minutes.
181 timeout = time.time() + 180
184 # Now start node manager
185 p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
# bufsize=0 keeps the pipe unbuffered so log lines arrive promptly.
186 bufsize=0, stderr=subprocess.PIPE, env=env)
# Main loop contract (per visible comments; surrounding code partly
# missing from this excerpt):
190 # - Apply negative checks (things that are not supposed to happen)
192 # - Check if the next action should trigger
193 # - If all actions are exhausted, terminate with test success
194 # - If it hits timeout with actions remaining, terminate with test failed
196 # naive line iteration over pipes gets buffered, which isn't what we want,
197 # see https://bugs.python.org/issue3907
198 for line in iter(p.stderr.readline, ""):
# Mirror subprocess output into the detail buffer/stream.
199 detail_content.write(line)
# Negative checks run against every line; callbacks add to `code` on
# violation. NOTE(review): `code`'s initialization is not visible here.
201 for k,v in checks.items():
202 g = re.match(k, line)
204 detail.info("Matched check %s", k)
205 code += v(checks, k, g)
207 detail.error("Check failed")
# Give up if the 3-minute budget is exhausted with actions pending.
215 if time.time() > timeout:
216 detail.error("Exceeded timeout with actions remaining: %s", actions)
# Try the next expected action's pattern against this line.
# NOTE(review): the line binding `k` (popping the next action) is
# missing from this excerpt.
223 g = re.match(k, line)
225 detail.info("Matched action %s", k)
229 detail.error("Action failed")
236 except KeyboardInterrupt:
240 detail.error("Ended with remaining actions: %s", actions)
# Clean up stub dir and the scancel-sentinel's temp dir.
243 shutil.rmtree(fake_slurm)
244 shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
247 logger.info("%s passed", name)
# On failure with buffered detail logs, dump them so the operator can
# see the subprocess output.
249 if isinstance(detail_content, StringIO.StringIO):
250 sys.stderr.write(detail_content.getvalue())
251 logger.info("%s failed", name)
# Test table fragment: name -> (actions, checks, driver_class, jobs[, ...]).
# NOTE(review): the `tests = {` opener (original ~258) and several
# list/dict closers are missing from this excerpt; structure shown is
# partial.
# Unsatisfiable job must be cancelled, never paired to a node.
260 "test_unsatisfiable_jobs" : (
261 # Actions (pattern -> action)
263 (r".*Daemon started", set_queue_unsatisfiable),
264 (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
266 # Checks (things that shouldn't happen)
268 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
269 r".*Trying to cancel job '(\S+)'": fail,
272 "arvnodeman.test.fake_driver.FakeDriver",
274 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
# One node lifecycle: pair -> busy -> idle -> shutdown; exactly one pairing.
277 "test_single_node_azure": (
279 (r".*Daemon started", set_squeue),
280 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
281 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
282 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
283 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
285 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
286 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
287 r".*Setting node quota.*": fail,
289 "arvnodeman.test.fake_driver.FakeDriver",
290 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
# Four jobs -> four pairings -> four shutdowns; at most four pairings.
292 "test_multiple_nodes": (
294 (r".*Daemon started", set_squeue),
295 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
296 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
297 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
298 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
299 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
300 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
301 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
302 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
303 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
304 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
306 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
307 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
308 r".*Setting node quota.*": fail,
310 "arvnodeman.test.fake_driver.FakeDriver",
311 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
312 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
313 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
314 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
# NOTE(review): the name line for this quota test (original ~316-317) is
# missing from this excerpt. QuotaDriver limits node creation; at most 2
# pairings and 5 create requests expected.
318 (r".*Daemon started", set_squeue),
319 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
320 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
321 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
322 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
323 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
324 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
325 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
327 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
328 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
329 r".*Sending create_node request.*": partial(expect_count, 5)
331 "arvnodeman.test.fake_driver.QuotaDriver",
332 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
333 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
334 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
335 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
# Quota probing: after hitting quota and draining, re-queue jobs
# (jobs_req) and confirm the quota is probed upward again.
337 "test_probe_quota": (
339 (r".*Daemon started", set_squeue),
340 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
341 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
342 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
343 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
344 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
345 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
346 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
347 (r".*sending request", jobs_req),
348 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
349 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
350 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
351 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
352 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
353 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
354 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
355 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
356 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
357 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
359 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
360 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
361 r".*Sending create_node request.*": partial(expect_count, 9)
363 "arvnodeman.test.fake_driver.QuotaDriver",
364 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
365 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
366 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
367 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
# Driver whose create_node always fails: daemon must keep retrying (four
# client errors observed) without hanging.
369 "test_no_hang_failing_node_create": (
371 (r".*Daemon started", set_squeue),
372 (r".*Client error: nope", noop),
373 (r".*Client error: nope", noop),
374 (r".*Client error: nope", noop),
375 (r".*Client error: nope", noop),
378 "arvnodeman.test.fake_driver.FailingDriver",
379 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
380 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
381 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
382 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
# Rate-limited create must be retried after the advertised backoff.
384 "test_retry_create": (
386 (r".*Daemon started", set_squeue),
387 (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
388 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
391 "arvnodeman.test.fake_driver.RetryDriver",
392 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
# Same single-node lifecycle against the fake AWS driver.
394 "test_single_node_aws": (
396 (r".*Daemon started", set_squeue),
397 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
398 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
399 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
400 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
402 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
403 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
404 r".*Setting node quota.*": fail,
406 "arvnodeman.test.fake_driver.FakeAwsDriver",
407 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
# Same single-node lifecycle against the fake GCE driver.
409 "test_single_node_gce": (
411 (r".*Daemon started", set_squeue),
412 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
413 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
414 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
415 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
417 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
418 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
419 r".*Setting node quota.*": fail,
421 "arvnodeman.test.fake_driver.FakeGceDriver",
422 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
# Entry-point fragment: run the single test named on the command line, or
# every test in sorted order. NOTE(review): the enclosing `def main():`
# line, the `else:` branches, the code accumulation into `code`, and the
# final return/exit are missing from this excerpt -- confirm against the
# upstream source.
427 if len(sys.argv) > 1:
428 code = run_test(sys.argv[1], *tests[sys.argv[1]])
430 for t in sorted(tests.keys()):
431 code += run_test(t, *tests[t])
434 logger.info("Tests passed")
436 logger.info("Tests failed")
440 if __name__ == '__main__':