12199: Update integration tests.
[arvados.git] / services / nodemanager / tests / integration_test.py
1 #!/usr/bin/env python
2 # Copyright (C) The Arvados Authors. All rights reserved.
3 #
4 # SPDX-License-Identifier: AGPL-3.0
5
6 """Integration test framework for node manager.
7
8 Runs full node manager with an API server (needs ARVADOS_API_HOST and
9 ARVADOS_API_TOKEN).  Stubs out the cloud driver and slurm commands to mock
10 specific behaviors.  Monitors the log output to verify an expected sequence of
11 events or behaviors for each test.
12
13 """
14
15 import subprocess
16 import os
17 import sys
18 import re
19 import time
20 import logging
21 import stat
22 import tempfile
23 import shutil
24 from functools import partial
25 import arvados
26 import StringIO
27
28 formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
29
30 handler = logging.StreamHandler(sys.stderr)
31 handler.setFormatter(formatter)
32 logger = logging.getLogger("logger")
33 logger.setLevel(logging.INFO)
34 logger.addHandler(handler)
35
36 detail = logging.getLogger("detail")
37 detail.setLevel(logging.INFO)
38 if os.environ.get("ANMTEST_LOGLEVEL"):
39     detail_content = sys.stderr
40 else:
41     detail_content = StringIO.StringIO()
42 handler = logging.StreamHandler(detail_content)
43 handler.setFormatter(formatter)
44 detail.addHandler(handler)
45
46 fake_slurm = None
47 compute_nodes = None
48 all_jobs = None
49 unsatisfiable_job_scancelled = None
50
51 def update_script(path, val):
52     with open(path+"_", "w") as f:
53         f.write(val)
54     os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
55     os.rename(path+"_", path)
56     detail.info("Update script %s: %s", path, val)
57
58 def set_squeue(g):
59     global all_jobs
60     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
61                   "\n".join("echo '1|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items()))
62     return 0
63
64 def set_queue_unsatisfiable(g):
65     global all_jobs, unsatisfiable_job_scancelled
66     # Simulate a job requesting a 99 core node.
67     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
68                   "\n".join("echo '99|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items()))
69     update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
70                   "\ntouch %s" % unsatisfiable_job_scancelled)
71     return 0
72
73 def job_cancelled(g):
74     global unsatisfiable_job_scancelled
75     cancelled_job = g.group(1)
76     api = arvados.api('v1')
77     # Check that 'scancel' was called
78     if not os.path.isfile(unsatisfiable_job_scancelled):
79         return 1
80     # Check for the log entry
81     log_entry = api.logs().list(
82         filters=[
83             ['object_uuid', '=', cancelled_job],
84             ['event_type', '=', 'stderr'],
85         ]).execute()['items'][0]
86     if not re.match(
87             r"Constraints cannot be satisfied",
88             log_entry['properties']['text']):
89         return 1
90     return 0
91
92 def node_paired(g):
93     global compute_nodes
94     compute_nodes[g.group(1)] = g.group(3)
95
96     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
97                   "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))
98
99     for k,v in all_jobs.items():
100         if v == "ReqNodeNotAvail":
101             all_jobs[k] = "Running"
102             break
103
104     set_squeue(g)
105
106     return 0
107
108 def remaining_jobs(g):
109     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
110                   "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))
111
112     for k,v in all_jobs.items():
113         all_jobs[k] = "Running"
114
115     set_squeue(g)
116
117     return 0
118
119
120 def node_busy(g):
121     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
122                   "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
123     return 0
124
125 def node_shutdown(g):
126     global compute_nodes
127     del compute_nodes[g.group(1)]
128     return 0
129
130 def jobs_req(g):
131     global all_jobs
132     for k,v in all_jobs.items():
133         all_jobs[k] = "ReqNodeNotAvail"
134     set_squeue(g)
135     return 0
136
137 def noop(g):
138     return 0
139
140 def fail(checks, pattern, g):
141     return 1
142
143 def expect_count(count, checks, pattern, g):
144     if count == 0:
145         return 1
146     else:
147         checks[pattern] = partial(expect_count, count-1)
148         return 0
149
150 def run_test(name, actions, checks, driver_class, jobs, provider):
151     code = 0
152     global unsatisfiable_job_scancelled
153     unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
154                                                 "scancel_called")
155
156     # Delete any stale node records
157     api = arvados.api('v1')
158     for n in api.nodes().list().execute()['items']:
159         api.nodes().delete(uuid=n["uuid"]).execute()
160
161     logger.info("Start %s", name)
162
163     global fake_slurm
164     fake_slurm = tempfile.mkdtemp()
165     detail.info("fake_slurm is %s", fake_slurm)
166
167     global compute_nodes
168     compute_nodes = {}
169
170     global all_jobs
171     all_jobs = jobs
172
173     env = os.environ.copy()
174     env["PATH"] = fake_slurm + ":" + env["PATH"]
175
176     # Reset fake squeue/sinfo to empty
177     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
178     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
179
180     # Write configuration file for test
181     with open("tests/fake_%s.cfg.template" % provider) as f:
182         open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
183         with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
184             cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
185                                       token=os.environ["ARVADOS_API_TOKEN"],
186                                       driver_class=driver_class,
187                                       ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
188
189     # Tests must complete in less than 3 minutes.
190     timeout = time.time() + 180
191     terminated = False
192
193     # Now start node manager
194     p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
195                          bufsize=0, stderr=subprocess.PIPE, env=env)
196
197     # Test main loop:
198     # - Read line
199     # - Apply negative checks (things that are not supposed to happen)
200     # - Check timeout
201     # - Check if the next action should trigger
202     # - If all actions are exhausted, terminate with test success
203     # - If it hits timeout with actions remaining, terminate with test failed
204     try:
205         # naive line iteration over pipes gets buffered, which isn't what we want,
206         # see https://bugs.python.org/issue3907
207         for line in iter(p.stderr.readline, ""):
208             detail_content.write(line)
209
210             for k,v in checks.items():
211                 g = re.match(k, line)
212                 if g:
213                     detail.info("Matched check %s", k)
214                     code += v(checks, k, g)
215                     if code != 0:
216                         detail.error("Check failed")
217                         if not terminated:
218                             p.terminate()
219                             terminated = True
220
221             if terminated:
222                 continue
223
224             if time.time() > timeout:
225                 detail.error("Exceeded timeout with actions remaining: %s", actions)
226                 code += 1
227                 if not terminated:
228                     p.terminate()
229                     terminated = True
230
231             k, v = actions[0]
232             g = re.match(k, line)
233             if g:
234                 detail.info("Matched action %s", k)
235                 actions.pop(0)
236                 code += v(g)
237                 if code != 0:
238                     detail.error("Action failed")
239                     p.terminate()
240                     terminated = True
241
242             if not actions:
243                 p.terminate()
244                 terminated = True
245     except KeyboardInterrupt:
246         p.kill()
247
248     if actions:
249         detail.error("Ended with remaining actions: %s", actions)
250         code = 1
251
252     shutil.rmtree(fake_slurm)
253     shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
254
255     if code == 0:
256         logger.info("%s passed", name)
257     else:
258         if isinstance(detail_content, StringIO.StringIO):
259             sys.stderr.write(detail_content.getvalue())
260         logger.info("%s failed", name)
261
262     return code
263
264
265 def main():
266     # Test lifecycle.
267
268     tests = {
269         "test_unsatisfiable_jobs" : (
270             # Actions (pattern -> action)
271             [
272                 (r".*Daemon started", set_queue_unsatisfiable),
273                 (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
274             ],
275             # Checks (things that shouldn't happen)
276             {
277                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
278                 r".*Trying to cancel job '(\S+)'": fail,
279             },
280             # Driver class
281             "arvnodeman.test.fake_driver.FakeDriver",
282             # Jobs
283             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
284             # Provider
285             "azure"),
286         "test_single_node_azure": (
287             [
288                 (r".*Daemon started", set_squeue),
289                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
290                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
291                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
292                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
293             ], {
294                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
295                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
296                 r".*Setting node quota.*": fail,
297             },
298             "arvnodeman.test.fake_driver.FakeDriver",
299             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
300             "azure"),
301         "test_multiple_nodes": (
302             [
303                 (r".*Daemon started", set_squeue),
304                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
305                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
306                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
307                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
308                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
309                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
310                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
311                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
312                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
313                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
314             ], {
315                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
316                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
317                 r".*Setting node quota.*": fail,
318             },
319             "arvnodeman.test.fake_driver.FakeDriver",
320             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
321              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
322              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
323              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
324          }, "azure"),
325         "test_hit_quota": (
326             [
327                 (r".*Daemon started", set_squeue),
328                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
329                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
330                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
331                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
332                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
333                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
334                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
335             ], {
336                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
337                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
338                 r".*Sending create_node request.*": partial(expect_count, 5)
339             },
340             "arvnodeman.test.fake_driver.QuotaDriver",
341             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
342              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
343              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
344              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
345          }, "azure"),
346         "test_probe_quota": (
347             [
348                 (r".*Daemon started", set_squeue),
349                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
350                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
351                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
352                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
353                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
354                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
355                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
356                 (r".*sending request", jobs_req),
357                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
358                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
359                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
360                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
361                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
362                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
363                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
364                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
365                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
366                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
367             ], {
368                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
369                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
370                 r".*Sending create_node request.*": partial(expect_count, 9)
371             },
372             "arvnodeman.test.fake_driver.QuotaDriver",
373             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
374              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
375              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
376              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
377          }, "azure"),
378         "test_no_hang_failing_node_create": (
379             [
380                 (r".*Daemon started", set_squeue),
381                 (r".*Client error: nope", noop),
382                 (r".*Client error: nope", noop),
383                 (r".*Client error: nope", noop),
384                 (r".*Client error: nope", noop),
385             ],
386             {},
387             "arvnodeman.test.fake_driver.FailingDriver",
388             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
389              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
390              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
391              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
392          }, "azure"),
393         "test_retry_create": (
394             [
395                 (r".*Daemon started", set_squeue),
396                 (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
397                 (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
398                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
399             ],
400             {},
401             "arvnodeman.test.fake_driver.RetryDriver",
402             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
403          }, "azure"),
404         "test_single_node_aws": (
405             [
406                 (r".*Daemon started", set_squeue),
407                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
408                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
409                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
410                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
411             ], {
412                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
413                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
414                 r".*Setting node quota.*": fail,
415             },
416             "arvnodeman.test.fake_driver.FakeAwsDriver",
417             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
418             "ec2"),
419         "test_single_node_gce": (
420             [
421                 (r".*Daemon started", set_squeue),
422                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
423                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
424                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
425                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
426             ], {
427                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
428                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
429                 r".*Setting node quota.*": fail,
430             },
431             "arvnodeman.test.fake_driver.FakeGceDriver",
432             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
433             "gce")
434     }
435
436     code = 0
437     if len(sys.argv) > 1:
438         code = run_test(sys.argv[1], *tests[sys.argv[1]])
439     else:
440         for t in sorted(tests.keys()):
441             code += run_test(t, *tests[t])
442
443     if code == 0:
444         logger.info("Tests passed")
445     else:
446         logger.info("Tests failed")
447
448     exit(code)
449
450 if __name__ == '__main__':
451     main()