2 """Integration test framework for node manager.
4 Runs full node manager with an API server (needs ARVADOS_API_HOST and
5 ARVADOS_API_TOKEN). Stubs out the cloud driver and slurm commands to mock
6 specific behaviors. Monitors the log output to verify an expected sequence of
7 events or behaviors for each test.
20 from functools import partial
# Logging setup: "logger" reports overall test progress to stderr.
# "detail" carries the node manager's own output; it goes straight to
# stderr when ANMTEST_LOGLEVEL is set, otherwise it accumulates in an
# in-memory StringIO buffer (replayed to stderr only on test failure —
# see the failure path in run_test).
24 logger = logging.getLogger("logger")
25 logger.setLevel(logging.INFO)
26 logger.addHandler(logging.StreamHandler(sys.stderr))
28 detail = logging.getLogger("detail")
29 detail.setLevel(logging.INFO)
30 if os.environ.get("ANMTEST_LOGLEVEL"):
31 detail_content = sys.stderr
# (else branch — elided in this listing — falls through to the buffer:)
33 detail_content = StringIO.StringIO()
34 detail.addHandler(logging.StreamHandler(detail_content))
40 def update_script(path, val):
# Rewrite the stub script at `path` with contents `val`.
# Writes to a sibling temp file ("<path>_"), makes it executable
# (owner rwx only), then renames over the destination: os.rename is
# atomic on POSIX, so the node manager subprocess never observes a
# half-written script.  (The actual f.write(val) line is elided from
# this listing at original line 42.)
41 with open(path+"_", "w") as f:
43 os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
44 os.rename(path+"_", path)
45 detail.info("Update script %s: %s", path, val)
# Fragment of the squeue-population callback (its def line is elided):
# regenerate the fake `squeue` stub so it echoes one pending-job line
# per entry in all_jobs, in slurm's pipe-separated format
# "jobid|time|time|state|jobname".
49 update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
50 "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
# Fragment of the node-paired callback (def line elided): g comes from
# the "Cloud node ... paired with Arvados node ... hostname ..." regex,
# so group(1) is presumably the cloud node id and group(3) the hostname
# — TODO confirm against the action patterns below.
56 compute_nodes[g.group(1)] = g.group(3)
# Regenerate `sinfo` to report every known compute node as allocated.
58 update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
59 "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
# Flip any job still waiting on a node ("ReqNodeNotAvail") to Running.
61 for k,v in all_jobs.items():
62 if v == "ReqNodeNotAvail":
63 all_jobs[k] = "Running"
70 def remaining_jobs(g):
# Log-event callback: mark every known compute node allocated in the
# fake `sinfo` output and every job Running.  (One body line, original
# line 73, is elided from this listing.)
71 update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
72 "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
74 for k,v in all_jobs.items():
75 all_jobs[k] = "Running"
# Fragment of an idle-transition callback (def line elided): report
# every compute node as idle so the node manager starts suggesting
# shutdowns.
83 update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
84 "\n".join("echo '%s idle'" % (v) for k,v in compute_nodes.items()))
# Fragment of the shutdown-success callback: forget the node that was
# just shut down (g.group(1) captured from the Shutdown-success regex).
89 del compute_nodes[g.group(1)]
# Fragment of the job-resubmit callback: put every job back in the
# "waiting for a node" state.
94 for k,v in all_jobs.items():
95 all_jobs[k] = "ReqNodeNotAvail"
102 def fail(checks, pattern, g):
# Negative check: matching this pattern at all means the test failed.
# (Body elided from this listing at original line 103.)
105 def expect_count(count, checks, pattern, g):
# Check that allows `pattern` to match at most `count` times: each
# match re-registers itself with the counter decremented via partial.
# (The branch handling count exhaustion is elided from this listing.)
109 checks[pattern] = partial(expect_count, count-1)
112 def run_test(name, actions, checks, driver_class, jobs, provider):
# Run one integration scenario: set up a fake slurm PATH and config,
# launch arvados-node-manager as a subprocess, and scan its stderr
# line by line, firing `actions` in order and applying `checks` on
# every line.  Returns a nonzero code on failure (accumulated below).
115 # Delete any stale node records
116 api = arvados.api('v1')
117 for n in api.nodes().list().execute()['items']:
118 api.nodes().delete(uuid=n["uuid"]).execute()
120 logger.info("Start %s", name)
# Temp dir holding the fake squeue/sinfo stubs; prepended to PATH so
# the node manager picks them up instead of real slurm commands.
123 fake_slurm = tempfile.mkdtemp()
124 detail.info("fake_slurm is %s", fake_slurm)
132 env = os.environ.copy()
133 env["PATH"] = fake_slurm + ":" + env["PATH"]
135 # Reset fake squeue/sinfo to empty
136 update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
137 update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
139 # Write configuration file for test
140 with open("tests/fake_%s.cfg.template" % provider) as f:
# Empty placeholder key file referenced by the generated config.
141 open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
142 with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
143 cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
144 token=os.environ["ARVADOS_API_TOKEN"],
145 driver_class=driver_class,
146 ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
148 # Tests must complete in less than 3 minutes.
149 timeout = time.time() + 180
152 # Now start node manager
# bufsize=0 keeps the pipe unbuffered so log lines arrive promptly.
153 p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
154 bufsize=0, stderr=subprocess.PIPE, env=env)
158 # - Apply negative checks (things that are not supposed to happen)
160 # - Check if the next action should trigger
161 # - If all actions are exhausted, terminate with test success
162 # - If it hits timeout with actions remaining, terminate with test failed
164 # naive line iteration over pipes gets buffered, which isn't what we want,
165 # see https://bugs.python.org/issue3907
166 for line in iter(p.stderr.readline, ""):
167 detail_content.write(line)
# Apply every registered check to this line; a check returning
# nonzero marks the test failed.
169 for k,v in checks.items():
170 g = re.match(k, line)
172 detail.info("Matched check %s", k)
173 code += v(checks, k, g)
175 detail.error("Check failed")
183 if time.time() > timeout:
184 detail.error("Exceeded timeout with actions remaining: %s", actions)
# Try the next expected action pattern against this line.
191 g = re.match(k, line)
193 detail.info("Matched action %s", k)
197 detail.error("Action failed")
204 except KeyboardInterrupt:
208 detail.error("Ended with remaining actions: %s", actions)
# Cleanup: remove the fake slurm dir regardless of outcome.
211 shutil.rmtree(fake_slurm)
214 logger.info("%s passed", name)
# On failure, replay the buffered detail log (only exists when
# ANMTEST_LOGLEVEL was unset) so the full output is visible.
216 if isinstance(detail_content, StringIO.StringIO):
217 sys.stderr.write(detail_content.getvalue())
218 logger.info("%s failed", name)
# Interior of the `tests` dict (opening line elided from this listing).
# Each entry maps a test name to (actions, checks, driver_class, jobs,
# provider) as consumed by run_test: `actions` is an ordered list of
# (regex, callback) pairs expected to match in sequence; `checks` maps
# regexes to callbacks applied to every line (fail / expect_count).

# One node boots, pairs, goes busy then idle, and is shut down.
227 "test_single_node_azure": (
229 (r".*Daemon started", set_squeue),
230 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
231 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
232 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
233 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
235 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
236 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
237 r".*Setting node quota.*": fail,
239 "arvnodeman.test.fake_driver.FakeDriver",
240 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
# Four jobs -> four nodes paired and later shut down; pairing more
# than 4 times is a failure.
242 "test_multiple_nodes": (
244 (r".*Daemon started", set_squeue),
245 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
246 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
247 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
248 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
249 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
250 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
251 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
252 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
253 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
254 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
256 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
257 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
258 r".*Setting node quota.*": fail,
260 "arvnodeman.test.fake_driver.FakeDriver",
261 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
262 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
263 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
264 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
# Next test's name line is elided from this listing; it uses the
# QuotaDriver and expects only 2 of 4 jobs to get nodes — presumably
# a quota-limit scenario; confirm against the full source.
268 (r".*Daemon started", set_squeue),
269 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
270 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
271 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
272 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
273 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
274 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
275 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
277 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
278 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
279 r".*Sending create_node request.*": partial(expect_count, 5)
281 "arvnodeman.test.fake_driver.QuotaDriver",
282 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
283 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
284 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
285 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
# Quota recovery: after the initial 2-node round, jobs are re-queued
# (jobs_req) and the remaining 4 nodes come up, for 6 pairings total.
287 "test_probe_quota": (
289 (r".*Daemon started", set_squeue),
290 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
291 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
292 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
293 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
294 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
295 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
296 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
297 (r".*sending request", jobs_req),
298 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
299 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
300 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
301 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
302 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
303 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
304 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
305 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
306 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
307 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
309 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
310 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
311 r".*Sending create_node request.*": partial(expect_count, 9)
313 "arvnodeman.test.fake_driver.QuotaDriver",
314 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
315 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
316 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
317 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
# Repeated create_node failures must keep producing client errors
# instead of wedging the daemon.
319 "test_no_hang_failing_node_create": (
321 (r".*Daemon started", set_squeue),
322 (r".*Client error: nope", noop),
323 (r".*Client error: nope", noop),
324 (r".*Client error: nope", noop),
325 (r".*Client error: nope", noop),
328 "arvnodeman.test.fake_driver.FailingDriver",
329 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
330 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
331 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
332 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
# A rate-limited create is retried and eventually succeeds.
334 "test_retry_create": (
336 (r".*Daemon started", set_squeue),
337 (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
338 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
341 "arvnodeman.test.fake_driver.RetryDriver",
342 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
# Same single-node lifecycle as the Azure case, on the AWS fake driver.
344 "test_single_node_aws": (
346 (r".*Daemon started", set_squeue),
347 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
348 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
349 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
350 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
352 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
353 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
354 r".*Setting node quota.*": fail,
356 "arvnodeman.test.fake_driver.FakeAwsDriver",
357 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
# Same single-node lifecycle, on the GCE fake driver.
359 "test_single_node_gce": (
361 (r".*Daemon started", set_squeue),
362 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
363 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
364 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
365 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
367 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
368 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
369 r".*Setting node quota.*": fail,
371 "arvnodeman.test.fake_driver.FakeGceDriver",
372 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
# Fragment of main() (def line elided): with an argument, run that one
# named test; otherwise run every test in sorted order, accumulating a
# nonzero exit code on any failure.
377 if len(sys.argv) > 1:
378 code = run_test(sys.argv[1], *tests[sys.argv[1]])
380 for t in sorted(tests.keys()):
381 code += run_test(t, *tests[t])
384 logger.info("Tests passed")
386 logger.info("Tests failed")
390 if __name__ == '__main__':