#!/usr/bin/env python
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0

"""Integration test framework for node manager.

Runs the full node manager against a real API server (needs ARVADOS_API_HOST
and ARVADOS_API_TOKEN).  Stubs out the cloud driver and the slurm commands to
mock specific behaviors.  Monitors the log output to verify an expected
sequence of events or behaviors for each test.
"""

import subprocess
import os
import sys
import re
import time
import logging
import stat
import tempfile
import shutil
from functools import partial
import arvados
import StringIO

logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stderr))

detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
if os.environ.get("ANMTEST_LOGLEVEL"):
    detail_content = sys.stderr
else:
    detail_content = StringIO.StringIO()
detail.addHandler(logging.StreamHandler(detail_content))
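# With ANMTEST_LOGLEVEL unset, detailed output accumulates in the StringIO
# buffer above and is replayed to stderr only if a test fails (see the end of
# run_test); when it is set, the detail log goes straight to stderr instead.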

fake_slurm = None
compute_nodes = None
all_jobs = None

def update_script(path, val):
    # Write to a temporary name, then rename over the old script: the rename
    # is atomic, so node manager never runs a half-written script.
    with open(path+"_", "w") as f:
        f.write(val)
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)

def set_squeue(g):
    # Regenerate the fake squeue script so it reports every job in all_jobs
    # with its current state.
    global all_jobs
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
    return 0


def node_paired(g):
    global compute_nodes
    compute_nodes[g.group(1)] = g.group(3)

    # Report every known compute node as allocated.
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))

    # Move one pending job into the running state.
    for k,v in all_jobs.items():
        if v == "ReqNodeNotAvail":
            all_jobs[k] = "Running"
            break

    set_squeue(g)

    return 0

def remaining_jobs(g):
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))

    # Mark every job as running.
    for k,v in all_jobs.items():
        all_jobs[k] = "Running"

    set_squeue(g)

    return 0


def node_busy(g):
    # Triggered by the "busy" log line: report every node as idle so that it
    # becomes eligible for shutdown.
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s idle'" % (v) for k,v in compute_nodes.items()))
    return 0

def node_shutdown(g):
    global compute_nodes
    del compute_nodes[g.group(1)]
    return 0

def jobs_req(g):
    # Put every job back in the queue, waiting for a node.
    global all_jobs
    for k,v in all_jobs.items():
        all_jobs[k] = "ReqNodeNotAvail"
    set_squeue(g)
    return 0

def noop(g):
    return 0

def fail(checks, pattern, g):
    return 1

def expect_count(count, checks, pattern, g):
    # Allow "pattern" to match at most "count" times; one match beyond that
    # fails the test.
    if count == 0:
        return 1
    else:
        checks[pattern] = partial(expect_count, count-1)
        return 0

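# run_test drives a single scenario:
#   actions      -- ordered list of (regex, callback) pairs; each must match
#                   a line of node manager's log, in sequence
#   checks       -- dict of regex -> callback that may match any log line at
#                   any time; a nonzero return from the callback fails the test
#   driver_class -- stub cloud driver class to load in place of a real one
#   jobs         -- fake job UUIDs mapped to their initial slurm job state
#   provider     -- selects tests/fake_<provider>.cfg.template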
def run_test(name, actions, checks, driver_class, jobs, provider):
    code = 0

    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()

    logger.info("Start %s", name)

    global fake_slurm
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)

    global compute_nodes
    compute_nodes = {}

    global all_jobs
    all_jobs = jobs

    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]

    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")

    # Write configuration file for test
    with open("tests/fake_%s.cfg.template" % provider) as f:
        # Create an empty stand-in for the SSH public key the config refers to.
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))

    # Tests must complete in less than 3 minutes.
    timeout = time.time() + 180
    terminated = False

    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)

    # Test main loop:
    # - Read a line of log output
    # - Apply negative checks (things that are not supposed to happen)
    # - Check the timeout
    # - Check whether the next action should trigger
    # - If all actions are exhausted, terminate with test success
    # - If the timeout is hit with actions remaining, terminate with test failure
    try:
        # Naive line iteration over pipes gets buffered, which isn't what we
        # want; see https://bugs.python.org/issue3907
        for line in iter(p.stderr.readline, ""):
            detail_content.write(line)

            # Negative checks may match any line, in any order.
            for k,v in checks.items():
                g = re.match(k, line)
                if g:
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                    if code != 0:
                        detail.error("Check failed")
                        if not terminated:
                            p.terminate()
                            terminated = True

            if terminated:
                continue

            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
                code += 1
                if not terminated:
                    p.terminate()
                    terminated = True

            # Only the next pending action is eligible to match.
            k, v = actions[0]
            g = re.match(k, line)
            if g:
                detail.info("Matched action %s", k)
                actions.pop(0)
                code += v(g)
                if code != 0:
                    detail.error("Action failed")
                    p.terminate()
                    terminated = True

            if not actions:
                p.terminate()
                terminated = True
    except KeyboardInterrupt:
        p.kill()

    if actions:
        detail.error("Ended with remaining actions: %s", actions)
        code = 1

    shutil.rmtree(fake_slurm)

    if code == 0:
        logger.info("%s passed", name)
    else:
        # On failure, replay the buffered detail log to stderr.
        if isinstance(detail_content, StringIO.StringIO):
            sys.stderr.write(detail_content.getvalue())
        logger.info("%s failed", name)

    return code


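# For reference, once set_squeue has run, the generated fake "squeue" script
# looks like the following sketch (one echo per job; reading the fields as
# cpus|memory|disk|reason|job name is an assumption, not confirmed here):
#
#   #!/bin/sh
#   echo '1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9'
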
def main():
    # Test lifecycle.

    tests = {
        "test_single_node_azure": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            "arvnodeman.test.fake_driver.FakeDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            "azure"),
        "test_multiple_nodes": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
                r".*Setting node quota.*": fail,
            },
            "arvnodeman.test.fake_driver.FakeDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
         }, "azure"),
        "test_hit_quota": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
                r".*Sending create_node request.*": partial(expect_count, 5)
            },
            "arvnodeman.test.fake_driver.QuotaDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
         }, "azure"),
        "test_probe_quota": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*sending request", jobs_req),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
                r".*Sending create_node request.*": partial(expect_count, 9)
            },
            "arvnodeman.test.fake_driver.QuotaDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
         }, "azure"),
        "test_no_hang_failing_node_create": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
            ],
            {},
            "arvnodeman.test.fake_driver.FailingDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
         }, "azure"),
        "test_retry_create": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
            ],
            {},
            "arvnodeman.test.fake_driver.RetryDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
         }, "azure"),
        "test_single_node_aws": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            "arvnodeman.test.fake_driver.FakeAwsDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            "ec2"),
        "test_single_node_gce": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            "arvnodeman.test.fake_driver.FakeGceDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            "gce")
    }

    code = 0
    if len(sys.argv) > 1:
        code = run_test(sys.argv[1], *tests[sys.argv[1]])
    else:
        for t in sorted(tests.keys()):
            code += run_test(t, *tests[t])

    if code == 0:
        logger.info("Tests passed")
    else:
        logger.info("Tests failed")

    sys.exit(code)

if __name__ == '__main__':
    main()