2 # Copyright (C) The Arvados Authors. All rights reserved.
4 # SPDX-License-Identifier: AGPL-3.0
6 """Integration test framework for node manager.
8 Runs full node manager with an API server (needs ARVADOS_API_HOST and
9 ARVADOS_API_TOKEN). Stubs out the cloud driver and slurm commands to mock
10 specific behaviors. Monitors the log output to verify an expected sequence of
11 events or behaviors for each test.
24 from functools import partial
28 logger = logging.getLogger("logger")
29 logger.setLevel(logging.INFO)
30 logger.addHandler(logging.StreamHandler(sys.stderr))
32 detail = logging.getLogger("detail")
33 detail.setLevel(logging.INFO)
34 if os.environ.get("ANMTEST_LOGLEVEL"):
35 detail_content = sys.stderr
37 detail_content = StringIO.StringIO()
38 detail.addHandler(logging.StreamHandler(detail_content))
43 unsatisfiable_job_scancelled = None
def update_script(path, val):
    """Atomically install an executable stub script at *path* with body *val*.

    Writes to a temporary name, marks it executable, then renames into place
    so a concurrently-running node manager never sees a partial script.
    """
    # NOTE(review): the f.write(val) statement inside the 'with' block is
    # elided from this view of the file.
    with open(path+"_", "w") as f:
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)
    # Body of set_squeue (its 'def' line is elided from this view): regenerate
    # the fake 'squeue' stub with one echo line per job — each requests a
    # single core ('1|...') and prints the job's state and uuid.
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
def set_queue_unsatisfiable(g):
    """Make every queued job impossible to satisfy.

    Rewrites the squeue stub so each job claims to need a 99-core node
    (larger than any cloud size on offer), and rewrites the scancel stub so
    that invoking it leaves a marker file we can check for later.
    """
    global all_jobs, unsatisfiable_job_scancelled
    shebang = "#!/bin/sh\n"
    squeue_body = "\n".join("echo '99|100|100|%s|%s'" % (state, uuid)
                            for uuid, state in all_jobs.items())
    update_script(os.path.join(fake_slurm, "squeue"), shebang + squeue_body)
    scancel_body = "\ntouch %s" % unsatisfiable_job_scancelled
    update_script(os.path.join(fake_slurm, "scancel"), shebang + scancel_body)
    # NOTE(review): this is the interior of job_cancelled(g); its 'def' line
    # and several statements (early 'return's, the 'filters=[' opener, and the
    # 'if not re.match(' line) are elided from this view of the file.
    global unsatisfiable_job_scancelled
    # g.group(1) is the job uuid captured from the "Cancelled unsatisfiable
    # job" log line.
    cancelled_job = g.group(1)
    api = arvados.api('v1')
    # Check that 'scancel' was called
    if not os.path.isfile(unsatisfiable_job_scancelled):
    # Check for the log entry
    log_entry = api.logs().list(
            ['object_uuid', '=', cancelled_job],
            ['event_type', '=', 'stderr'],
        ]).execute()['items'][0]
    # Verify the logged explanation for why the job could not be scheduled.
            r"Requirements for a single node exceed the available cloud node size",
            log_entry['properties']['text']):
    # Body of node_paired(g) (its 'def' line is elided from this view).
    # Remember the pairing: cloud node id (group 1) -> assigned hostname
    # (group 3).
    compute_nodes[g.group(1)] = g.group(3)

    # Report every known compute node as 'alloc' through the sinfo stub.
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))

    # Jobs that were waiting on a node can now run.
    for k,v in all_jobs.items():
        if v == "ReqNodeNotAvail":
            all_jobs[k] = "Running"
def remaining_jobs(g):
    # Mark every compute node 'alloc' and every job 'Running' — used when
    # some jobs must keep running while others were shed (e.g. quota tests).
    # NOTE(review): trailing lines of this function are elided from this view.
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        all_jobs[k] = "Running"
    # Interior of a monitor callback (its 'def' line is elided from this
    # view): flip the sinfo stub so every compute node now reports 'idle',
    # which makes nodes eligible for shutdown.
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s idle'" % (v) for k,v in compute_nodes.items()))
def node_shutdown(g):
    # Forget the cloud node (id in group 1) that just logged shutdown
    # success.  NOTE(review): surrounding lines of this function are elided
    # from this view.
    del compute_nodes[g.group(1)]
    # Interior of a requeue callback (its 'def' line is elided from this
    # view): put every job back in the queued 'ReqNodeNotAvail' state so the
    # daemon must bring nodes up again.
    for k,v in all_jobs.items():
        all_jobs[k] = "ReqNodeNotAvail"
def fail(checks, pattern, g):
    # Negative check: this pattern matching the log at all is a test failure.
    # NOTE(review): the body ('return 1') is elided from this view.

def expect_count(count, checks, pattern, g):
    # Allow `pattern` to match at most `count` times: every match re-arms the
    # check with count-1 via partial, so the (count+1)-th match fails.
    # NOTE(review): the 'if count == 0: return 1 / else:' guard lines are
    # elided from this view.
        checks[pattern] = partial(expect_count, count-1)
def run_test(name, actions, checks, driver_class, jobs, provider):
    """Run one scenario end-to-end and return nonzero on failure.

    Starts arvados-node-manager with fake slurm commands on PATH and a fake
    cloud driver, then reads its stderr line by line: every `checks` pattern
    is applied to each line (negative/count checks), and `actions` fire in
    order as their patterns match.  NOTE(review): many guard lines ('try:',
    'if g:', 'if code != 0:', 'else:', 'return') are elided from this view;
    comments below flag the gaps.
    """
    global unsatisfiable_job_scancelled
    # Fresh marker file path for this test's scancel stub (continuation line
    # with the file name is elided from this view).
    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()
    logger.info("Start %s", name)
    # Scratch dir prepended to PATH so the squeue/sinfo/scancel stubs shadow
    # the real slurm commands.
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)
    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]
    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
    # Write configuration file for test
    with open("tests/fake_%s.cfg.template" % provider) as f:
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
    # Tests must complete in less than 3 minutes.
    timeout = time.time() + 180
    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)
    # Logging strategy for each stderr line:
    # - Apply negative checks (things that are not supposed to happen)
    # - Check if the next action should trigger
    # - If all actions are exhausted, terminate with test success
    # - If it hits timeout with actions remaining, terminate with test failed
    # naive line iteration over pipes gets buffered, which isn't what we want,
    # see https://bugs.python.org/issue3907
    # NOTE(review): the enclosing 'try:' line is elided from this view (the
    # matching 'except KeyboardInterrupt:' appears below).
        for line in iter(p.stderr.readline, ""):
            detail_content.write(line)
            for k,v in checks.items():
                g = re.match(k, line)
                # ('if g:' guard elided from this view)
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                    # (failure guard elided from this view)
                        detail.error("Check failed")
            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
            # Match the head of the `actions` queue (surrounding lines elided
            # from this view).
                g = re.match(k, line)
                    detail.info("Matched action %s", k)
                        detail.error("Action failed")
    except KeyboardInterrupt:
    # Cleanup: always remove the scratch directories.
    shutil.rmtree(fake_slurm)
    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
        logger.info("%s passed", name)
        # On failure, dump the buffered detail log so it appears in test
        # output (only when it was buffered rather than streamed).
        if isinstance(detail_content, StringIO.StringIO):
            sys.stderr.write(detail_content.getvalue())
        logger.info("%s failed", name)
    # Test table (the 'tests = {' opener and various bracket/blank lines are
    # elided from this view).  Each entry maps a test name to the run_test()
    # argument tuple: (actions, checks, driver_class, jobs, provider).
    "test_unsatisfiable_jobs" : (
        # Actions (pattern -> action)
        (r".*Daemon started", set_queue_unsatisfiable),
        (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
        # Checks (things that shouldn't happen)
        r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
        r".*Trying to cancel job '(\S+)'": fail,
        "arvnodeman.test.fake_driver.FakeDriver",
        {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
    # One node up, goes idle, shuts down; exactly one pairing allowed.
    "test_single_node_azure": (
        (r".*Daemon started", set_squeue),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        r".*Suggesting shutdown because node state is \('down', .*\)": fail,
        r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
        r".*Setting node quota.*": fail,
        "arvnodeman.test.fake_driver.FakeDriver",
        {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
    # Four jobs -> four pairings -> four shutdowns.
    "test_multiple_nodes": (
        (r".*Daemon started", set_squeue),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        r".*Suggesting shutdown because node state is \('down', .*\)": fail,
        r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
        r".*Setting node quota.*": fail,
        "arvnodeman.test.fake_driver.FakeDriver",
        {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
         "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
         "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
         "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
    # NOTE(review): this test's name line is elided from this view.  It is a
    # QuotaDriver scenario: only 2 of 4 jobs get paired nodes and at most 5
    # create_node requests are expected.
        (r".*Daemon started", set_squeue),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
        r".*Suggesting shutdown because node state is \('down', .*\)": fail,
        r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
        r".*Sending create_node request.*": partial(expect_count, 5)
        "arvnodeman.test.fake_driver.QuotaDriver",
        {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
         "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
         "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
         "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
    # Quota recovery: after shedding nodes, jobs requeue (jobs_req) and all
    # four eventually pair (6 pairings, up to 9 create_node requests total).
    "test_probe_quota": (
        (r".*Daemon started", set_squeue),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        (r".*sending request", jobs_req),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        r".*Suggesting shutdown because node state is \('down', .*\)": fail,
        r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
        r".*Sending create_node request.*": partial(expect_count, 9)
        "arvnodeman.test.fake_driver.QuotaDriver",
        {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
         "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
         "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
         "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
    # Driver whose create_node always errors; daemon must keep running.
    "test_no_hang_failing_node_create": (
        (r".*Daemon started", set_squeue),
        (r".*Client error: nope", noop),
        (r".*Client error: nope", noop),
        (r".*Client error: nope", noop),
        (r".*Client error: nope", noop),
        "arvnodeman.test.fake_driver.FailingDriver",
        {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
         "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
         "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
         "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
    # First create_node is rate-limited; a retry must still pair the node.
    "test_retry_create": (
        (r".*Daemon started", set_squeue),
        (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
        "arvnodeman.test.fake_driver.RetryDriver",
        {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
    # Same single-node lifecycle as azure, with the AWS fake driver.
    "test_single_node_aws": (
        (r".*Daemon started", set_squeue),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        r".*Suggesting shutdown because node state is \('down', .*\)": fail,
        r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
        r".*Setting node quota.*": fail,
        "arvnodeman.test.fake_driver.FakeAwsDriver",
        {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
    # Same single-node lifecycle, with the GCE fake driver.
    "test_single_node_gce": (
        (r".*Daemon started", set_squeue),
        (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
        (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
        (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
        r".*Suggesting shutdown because node state is \('down', .*\)": fail,
        r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
        r".*Setting node quota.*": fail,
        "arvnodeman.test.fake_driver.FakeGceDriver",
        {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
    # NOTE(review): the enclosing 'def main():' line and the 'else:' /
    # 'if code == 0:' branch lines are elided from this view.
    # With a test name on the command line, run just that test; otherwise run
    # every test in sorted order, accumulating the failure count.
    if len(sys.argv) > 1:
        code = run_test(sys.argv[1], *tests[sys.argv[1]])
        for t in sorted(tests.keys()):
            code += run_test(t, *tests[t])
        logger.info("Tests passed")
        logger.info("Tests failed")
# Script entry point (the guarded call into main() is elided from this view).
if __name__ == '__main__':