12055: Merge branch 'master' into 12055-nodemanager-ec2-tags
[arvados.git] / services / nodemanager / tests / integration_test.py
1 #!/usr/bin/env python
2 # Copyright (C) The Arvados Authors. All rights reserved.
3 #
4 # SPDX-License-Identifier: AGPL-3.0
5
6 """Integration test framework for node manager.
7
8 Runs full node manager with an API server (needs ARVADOS_API_HOST and
9 ARVADOS_API_TOKEN).  Stubs out the cloud driver and slurm commands to mock
10 specific behaviors.  Monitors the log output to verify an expected sequence of
11 events or behaviors for each test.
12
13 """
14
15 import subprocess
16 import os
17 import sys
18 import re
19 import time
20 import logging
21 import stat
22 import tempfile
23 import shutil
24 from functools import partial
25 import arvados
26 import StringIO
27
28 logger = logging.getLogger("logger")
29 logger.setLevel(logging.INFO)
30 logger.addHandler(logging.StreamHandler(sys.stderr))
31
32 detail = logging.getLogger("detail")
33 detail.setLevel(logging.INFO)
34 if os.environ.get("ANMTEST_LOGLEVEL"):
35     detail_content = sys.stderr
36 else:
37     detail_content = StringIO.StringIO()
38 detail.addHandler(logging.StreamHandler(detail_content))
39
40 fake_slurm = None
41 compute_nodes = None
42 all_jobs = None
43 unsatisfiable_job_scancelled = None
44
45 def update_script(path, val):
46     with open(path+"_", "w") as f:
47         f.write(val)
48     os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
49     os.rename(path+"_", path)
50     detail.info("Update script %s: %s", path, val)
51
52 def set_squeue(g):
53     global all_jobs
54     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
55                   "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
56     return 0
57
58 def set_queue_unsatisfiable(g):
59     global all_jobs, unsatisfiable_job_scancelled
60     # Simulate a job requesting a 99 core node.
61     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
62                   "\n".join("echo '99|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
63     update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
64                   "\ntouch %s" % unsatisfiable_job_scancelled)
65     return 0
66
67 def job_cancelled(g):
68     global unsatisfiable_job_scancelled
69     cancelled_job = g.group(1)
70     api = arvados.api('v1')
71     # Check that 'scancel' was called
72     if not os.path.isfile(unsatisfiable_job_scancelled):
73         return 1
74     # Check for the log entry
75     log_entry = api.logs().list(
76         filters=[
77             ['object_uuid', '=', cancelled_job],
78             ['event_type', '=', 'stderr'],
79         ]).execute()['items'][0]
80     if not re.match(
81             r"Requirements for a single node exceed the available cloud node size",
82             log_entry['properties']['text']):
83         return 1
84     return 0
85
86 def node_paired(g):
87     global compute_nodes
88     compute_nodes[g.group(1)] = g.group(3)
89
90     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
91                   "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
92
93     for k,v in all_jobs.items():
94         if v == "ReqNodeNotAvail":
95             all_jobs[k] = "Running"
96             break
97
98     set_squeue(g)
99
100     return 0
101
102 def remaining_jobs(g):
103     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
104                   "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))
105
106     for k,v in all_jobs.items():
107         all_jobs[k] = "Running"
108
109     set_squeue(g)
110
111     return 0
112
113
114 def node_busy(g):
115     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
116                   "\n".join("echo '%s idle'" % (v) for k,v in compute_nodes.items()))
117     return 0
118
119 def node_shutdown(g):
120     global compute_nodes
121     del compute_nodes[g.group(1)]
122     return 0
123
124 def jobs_req(g):
125     global all_jobs
126     for k,v in all_jobs.items():
127         all_jobs[k] = "ReqNodeNotAvail"
128     set_squeue(g)
129     return 0
130
131 def noop(g):
132     return 0
133
134 def fail(checks, pattern, g):
135     return 1
136
137 def expect_count(count, checks, pattern, g):
138     if count == 0:
139         return 1
140     else:
141         checks[pattern] = partial(expect_count, count-1)
142         return 0
143
144 def run_test(name, actions, checks, driver_class, jobs, provider):
145     code = 0
146     global unsatisfiable_job_scancelled
147     unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
148                                                 "scancel_called")
149
150     # Delete any stale node records
151     api = arvados.api('v1')
152     for n in api.nodes().list().execute()['items']:
153         api.nodes().delete(uuid=n["uuid"]).execute()
154
155     logger.info("Start %s", name)
156
157     global fake_slurm
158     fake_slurm = tempfile.mkdtemp()
159     detail.info("fake_slurm is %s", fake_slurm)
160
161     global compute_nodes
162     compute_nodes = {}
163
164     global all_jobs
165     all_jobs = jobs
166
167     env = os.environ.copy()
168     env["PATH"] = fake_slurm + ":" + env["PATH"]
169
170     # Reset fake squeue/sinfo to empty
171     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
172     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
173
174     # Write configuration file for test
175     with open("tests/fake_%s.cfg.template" % provider) as f:
176         open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
177         with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
178             cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
179                                       token=os.environ["ARVADOS_API_TOKEN"],
180                                       driver_class=driver_class,
181                                       ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
182
183     # Tests must complete in less than 3 minutes.
184     timeout = time.time() + 180
185     terminated = False
186
187     # Now start node manager
188     p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
189                          bufsize=0, stderr=subprocess.PIPE, env=env)
190
191     # Test main loop:
192     # - Read line
193     # - Apply negative checks (things that are not supposed to happen)
194     # - Check timeout
195     # - Check if the next action should trigger
196     # - If all actions are exhausted, terminate with test success
197     # - If it hits timeout with actions remaining, terminate with test failed
198     try:
199         # naive line iteration over pipes gets buffered, which isn't what we want,
200         # see https://bugs.python.org/issue3907
201         for line in iter(p.stderr.readline, ""):
202             detail_content.write(line)
203
204             for k,v in checks.items():
205                 g = re.match(k, line)
206                 if g:
207                     detail.info("Matched check %s", k)
208                     code += v(checks, k, g)
209                     if code != 0:
210                         detail.error("Check failed")
211                         if not terminated:
212                             p.terminate()
213                             terminated = True
214
215             if terminated:
216                 continue
217
218             if time.time() > timeout:
219                 detail.error("Exceeded timeout with actions remaining: %s", actions)
220                 code += 1
221                 if not terminated:
222                     p.terminate()
223                     terminated = True
224
225             k, v = actions[0]
226             g = re.match(k, line)
227             if g:
228                 detail.info("Matched action %s", k)
229                 actions.pop(0)
230                 code += v(g)
231                 if code != 0:
232                     detail.error("Action failed")
233                     p.terminate()
234                     terminated = True
235
236             if not actions:
237                 p.terminate()
238                 terminated = True
239     except KeyboardInterrupt:
240         p.kill()
241
242     if actions:
243         detail.error("Ended with remaining actions: %s", actions)
244         code = 1
245
246     shutil.rmtree(fake_slurm)
247     shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
248
249     if code == 0:
250         logger.info("%s passed", name)
251     else:
252         if isinstance(detail_content, StringIO.StringIO):
253             sys.stderr.write(detail_content.getvalue())
254         logger.info("%s failed", name)
255
256     return code
257
258
259 def main():
260     # Test lifecycle.
261
262     tests = {
263         "test_unsatisfiable_jobs" : (
264             # Actions (pattern -> action)
265             [
266                 (r".*Daemon started", set_queue_unsatisfiable),
267                 (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
268             ],
269             # Checks (things that shouldn't happen)
270             {
271                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
272                 r".*Trying to cancel job '(\S+)'": fail,
273             },
274             # Driver class
275             "arvnodeman.test.fake_driver.FakeDriver",
276             # Jobs
277             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
278             # Provider
279             "azure"),
280         "test_single_node_azure": (
281             [
282                 (r".*Daemon started", set_squeue),
283                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
284                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
285                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
286                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
287             ], {
288                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
289                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
290                 r".*Setting node quota.*": fail,
291             },
292             "arvnodeman.test.fake_driver.FakeDriver",
293             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
294             "azure"),
295         "test_multiple_nodes": (
296             [
297                 (r".*Daemon started", set_squeue),
298                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
299                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
300                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
301                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
302                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
303                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
304                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
305                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
306                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
307                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
308             ], {
309                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
310                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
311                 r".*Setting node quota.*": fail,
312             },
313             "arvnodeman.test.fake_driver.FakeDriver",
314             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
315              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
316              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
317              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
318          }, "azure"),
319         "test_hit_quota": (
320             [
321                 (r".*Daemon started", set_squeue),
322                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
323                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
324                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
325                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
326                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
327                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
328                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
329             ], {
330                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
331                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
332                 r".*Sending create_node request.*": partial(expect_count, 5)
333             },
334             "arvnodeman.test.fake_driver.QuotaDriver",
335             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
336              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
337              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
338              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
339          }, "azure"),
340         "test_probe_quota": (
341             [
342                 (r".*Daemon started", set_squeue),
343                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
344                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
345                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
346                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
347                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
348                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
349                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
350                 (r".*sending request", jobs_req),
351                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
352                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
353                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
354                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
355                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
356                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
357                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
358                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
359                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
360                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
361             ], {
362                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
363                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
364                 r".*Sending create_node request.*": partial(expect_count, 9)
365             },
366             "arvnodeman.test.fake_driver.QuotaDriver",
367             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
368              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
369              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
370              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
371          }, "azure"),
372         "test_no_hang_failing_node_create": (
373             [
374                 (r".*Daemon started", set_squeue),
375                 (r".*Client error: nope", noop),
376                 (r".*Client error: nope", noop),
377                 (r".*Client error: nope", noop),
378                 (r".*Client error: nope", noop),
379             ],
380             {},
381             "arvnodeman.test.fake_driver.FailingDriver",
382             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
383              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
384              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
385              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
386          }, "azure"),
387         "test_retry_create": (
388             [
389                 (r".*Daemon started", set_squeue),
390                 (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
391                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
392             ],
393             {},
394             "arvnodeman.test.fake_driver.RetryDriver",
395             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
396          }, "azure"),
397         "test_single_node_aws": (
398             [
399                 (r".*Daemon started", set_squeue),
400                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
401                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
402                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
403                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
404             ], {
405                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
406                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
407                 r".*Setting node quota.*": fail,
408             },
409             "arvnodeman.test.fake_driver.FakeAwsDriver",
410             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
411             "ec2"),
412         "test_single_node_gce": (
413             [
414                 (r".*Daemon started", set_squeue),
415                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
416                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
417                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
418                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
419             ], {
420                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
421                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
422                 r".*Setting node quota.*": fail,
423             },
424             "arvnodeman.test.fake_driver.FakeGceDriver",
425             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
426             "gce")
427     }
428
429     code = 0
430     if len(sys.argv) > 1:
431         code = run_test(sys.argv[1], *tests[sys.argv[1]])
432     else:
433         for t in sorted(tests.keys()):
434             code += run_test(t, *tests[t])
435
436     if code == 0:
437         logger.info("Tests passed")
438     else:
439         logger.info("Tests failed")
440
441     exit(code)
442
443 if __name__ == '__main__':
444     main()