2 """Integration test framework for node manager.
4 Runs full node manager with an API server (needs ARVADOS_API_HOST and
5 ARVADOS_API_TOKEN). Stubs out the cloud driver and slurm commands to mock
6 specific behaviors. Monitors the log output to verify an expected sequence of
7 events or behaviors for each test.
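
# Typical invocation (script name assumed; the argument handling is in main()
# at the bottom of this file):
#   python integration_test.py                   # run every test, in sorted order
#   python integration_test.py test_single_node  # run one test by name
# ARVADOS_API_HOST and ARVADOS_API_TOKEN must be set in the environment.
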
import subprocess
import os
import sys
import re
import time
import logging
import stat
import tempfile
import shutil
from functools import partial
import arvados
import StringIO

logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stderr))

detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
if os.environ.get("ANMTEST_LOGLEVEL"):
    # Stream the detailed node manager log straight to stderr as it arrives.
    detail_content = sys.stderr
else:
    # Otherwise buffer it, and only dump it to stderr when a test fails.
    detail_content = StringIO.StringIO()
detail.addHandler(logging.StreamHandler(detail_content))
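
# Shared state: set up by run_test() for each test and mutated by the
# log-triggered callbacks below.
fake_slurm = None
compute_nodes = None
all_jobs = None
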
def update_script(path, val):
    # Write to a temporary file and rename it into place so the fake slurm
    # command is replaced atomically.
    with open(path+"_", "w") as f:
        f.write(val)
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)
def set_squeue(g):
    global all_jobs
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
    return 0

def node_paired(g):
    global compute_nodes
    compute_nodes[g.group(1)] = g.group(3)

    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        if v == "ReqNodeNotAvail":
            all_jobs[k] = "Running"
            break

    set_squeue(g)
    return 0

def remaining_jobs(g):
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        all_jobs[k] = "Running"

    set_squeue(g)
    return 0

def node_busy(g):
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s idle'" % (v) for k,v in compute_nodes.items()))
    return 0

def node_shutdown(g):
    global compute_nodes
    del compute_nodes[g.group(1)]
    return 0

def jobs_req(g):
    global all_jobs
    for k,v in all_jobs.items():
        all_jobs[k] = "ReqNodeNotAvail"
    set_squeue(g)
    return 0

def noop(g):
    return 0

def fail(checks, pattern, g):
    return 1

def expect_count(count, checks, pattern, g):
    if count == 0:
        return 1
    else:
        checks[pattern] = partial(expect_count, count-1)
        return 0

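# Action and check callbacks return 0 on success and a nonzero failure count
# otherwise.  A check callback receives (checks, pattern, match) and may
# rewrite its own entry; expect_count uses this to tolerate a pattern only a
# fixed number of times, e.g. (from test_multiple_nodes below):
#
#   r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
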
def run_test(name, actions, checks, driver_class, jobs):
    code = 0

    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()

    logger.info("Start %s", name)

    global fake_slurm, compute_nodes, all_jobs
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)
    compute_nodes = {}
    all_jobs = jobs

    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]

    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")

    # Write configuration file for test
    with open("tests/fake.cfg.template") as f:
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))

    # Tests must complete in less than 3 minutes.
    timeout = time.time() + 180

    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)

    # Test main loop:
    # - Read the log output of node manager
    # - Apply negative checks (things that are not supposed to happen)
    # - Check if the next action should trigger
    # - If all actions are exhausted, terminate with test success
    # - If it hits timeout with actions remaining, terminate with test failed
    try:
        # naive line iteration over pipes gets buffered, which isn't what we want,
        # see https://bugs.python.org/issue3907
        for line in iter(p.stderr.readline, ""):
            detail_content.write(line)

            for k,v in checks.items():
                g = re.match(k, line)
                if g:
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                    if code != 0:
                        detail.error("Check failed")
                        p.terminate()

            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
                code += 1
                p.terminate()

            # Next expected action in the test sequence.
            k, v = actions[0]
            g = re.match(k, line)
            if g:
                detail.info("Matched action %s", k)
                actions.pop(0)
                code += v(g)
                if code != 0:
                    detail.error("Action failed")
                    p.terminate()

            if not actions:
                # All expected actions have happened; shut node manager down.
                p.terminate()
    except KeyboardInterrupt:
        p.kill()

    if actions:
        detail.error("Ended with remaining actions: %s", actions)
        code = 1

    shutil.rmtree(fake_slurm)

    if code == 0:
        logger.info("%s passed", name)
    else:
        # On failure, dump the buffered detail log to help diagnose the problem.
        if isinstance(detail_content, StringIO.StringIO):
            sys.stderr.write(detail_content.getvalue())
        logger.info("%s failed", name)

    return code
227 "test_single_node": (
229 (r".*Daemon started", set_squeue),
230 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
231 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
232 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
233 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
235 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
236 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
237 r".*Setting node quota.*": fail,
239 "arvnodeman.test.fake_driver.FakeDriver",
240 {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}),
241 "test_multiple_nodes": (
243 (r".*Daemon started", set_squeue),
244 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
245 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
246 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
247 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
248 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
249 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
250 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
251 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
252 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
253 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
255 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
256 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
257 r".*Setting node quota.*": fail,
259 "arvnodeman.test.fake_driver.FakeDriver",
260 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
261 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
262 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
263 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
267 (r".*Daemon started", set_squeue),
268 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
269 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
270 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
271 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
272 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
273 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
274 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
276 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
277 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
278 r".*Sending create_node request.*": partial(expect_count, 5)
280 "arvnodeman.test.fake_driver.QuotaDriver",
281 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
282 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
283 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
284 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
286 "test_probe_quota": (
288 (r".*Daemon started", set_squeue),
289 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
290 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
291 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
292 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
293 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
294 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
295 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
296 (r".*sending request", jobs_req),
297 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
298 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
299 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
300 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
301 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
302 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
303 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
304 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
305 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
306 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
308 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
309 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
310 r".*Sending create_node request.*": partial(expect_count, 9)
312 "arvnodeman.test.fake_driver.QuotaDriver",
313 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
314 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
315 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
316 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
318 "test_no_hang_failing_node_create": (
320 (r".*Daemon started", set_squeue),
321 (r".*Client error: nope", noop),
322 (r".*Client error: nope", noop),
323 (r".*Client error: nope", noop),
324 (r".*Client error: nope", noop),
327 "arvnodeman.test.fake_driver.FailingDriver",
328 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
329 "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
330 "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
331 "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
333 "test_retry_create": (
335 (r".*Daemon started", set_squeue),
336 (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
337 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
340 "arvnodeman.test.fake_driver.RetryDriver",
341 {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"

    code = 0
    if len(sys.argv) > 1:
        code = run_test(sys.argv[1], *tests[sys.argv[1]])
    else:
        for t in sorted(tests.keys()):
            code += run_test(t, *tests[t])

    if code == 0:
        logger.info("Tests passed")
    else:
        logger.info("Tests failed")
    exit(code)


if __name__ == '__main__':
    main()