#!/usr/bin/env python
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0

"""Integration test framework for node manager.

Runs full node manager with an API server (needs ARVADOS_API_HOST and
ARVADOS_API_TOKEN).  Stubs out the cloud driver and slurm commands to mock
specific behaviors.  Monitors the log output to verify an expected sequence of
events or behaviors for each test.

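Run all tests with no arguments, or run a single test by passing its name on
the command line, e.g. "python integration_test.py test_single_node_azure".
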
13 """

import subprocess
import os
import sys
import re
import time
import logging
import stat
import tempfile
import shutil
from functools import partial
import arvados
import StringIO

logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stderr))

detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
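# When ANMTEST_LOGLEVEL is set in the environment, stream detailed output to
# stderr while the test runs; otherwise buffer it and only dump it to stderr
# when a test fails (see run_test).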
if os.environ.get("ANMTEST_LOGLEVEL"):
    detail_content = sys.stderr
else:
    detail_content = StringIO.StringIO()
detail.addHandler(logging.StreamHandler(detail_content))

fake_slurm = None
compute_nodes = None
all_jobs = None
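# Flag file the fake scancel stub touches, so job_cancelled() can verify that
# scancel was actually invoked.  A fresh path is assigned per test in
# run_test().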
unsatisfiable_job_scancelled = None

def update_script(path, val):
    with open(path+"_", "w") as f:
        f.write(val)
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
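    # Write to a temporary name and rename into place so the daemon never
    # executes a partially written script.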
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)

def set_squeue(g):
    global all_jobs
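    # One fake squeue line per job: 'cores|memory|disk|state|job name'.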
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
    return 0

def set_queue_unsatisfiable(g):
    global all_jobs, unsatisfiable_job_scancelled
    # Simulate a job requesting a 99 core node.
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '99|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
                  "\ntouch %s" % unsatisfiable_job_scancelled)
    return 0

def job_cancelled(g):
    global unsatisfiable_job_scancelled
    cancelled_job = g.group(1)
    api = arvados.api('v1')
    # Check that 'scancel' was called
    if not os.path.isfile(unsatisfiable_job_scancelled):
        return 1
    # Check for the log entry
    log_entry = api.logs().list(
        filters=[
            ['object_uuid', '=', cancelled_job],
            ['event_type', '=', 'stderr'],
        ]).execute()['items'][0]
    if not re.match(
            r"Requirements for a single node exceed the available cloud node size",
            log_entry['properties']['text']):
        return 1
    return 0

def node_paired(g):
    global compute_nodes
    compute_nodes[g.group(1)] = g.group(3)

    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))

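    # Move one pending job into the running state on the newly paired node.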
    for k,v in all_jobs.items():
        if v == "ReqNodeNotAvail":
            all_jobs[k] = "Running"
            break

    set_squeue(g)

    return 0

def remaining_jobs(g):
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        all_jobs[k] = "Running"

    set_squeue(g)

    return 0


def node_busy(g):
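    # Report all nodes as idle so the daemon becomes willing to shut them down.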
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s idle'" % (v) for k,v in compute_nodes.items()))
    return 0

def node_shutdown(g):
    global compute_nodes
    del compute_nodes[g.group(1)]
    return 0

def jobs_req(g):
    global all_jobs
    for k,v in all_jobs.items():
        all_jobs[k] = "ReqNodeNotAvail"
    set_squeue(g)
    return 0

def noop(g):
    return 0

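# Negative check: any match of the associated pattern fails the test.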
def fail(checks, pattern, g):
    return 1

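# Allow the associated pattern to match at most 'count' times; one more match
# fails the test.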
def expect_count(count, checks, pattern, g):
    if count == 0:
        return 1
    else:
        checks[pattern] = partial(expect_count, count-1)
        return 0

def run_test(name, actions, checks, driver_class, jobs, provider):
    code = 0

    # Allocate a fresh scancel flag file for this test; allocating it once at
    # import time breaks, because the rmtree at the end of each test removes
    # the directory before the next test runs.
    global unsatisfiable_job_scancelled
    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(), "scancel_called")

    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()

    logger.info("Start %s", name)

    global fake_slurm
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)

    global compute_nodes
    compute_nodes = {}

    global all_jobs
    all_jobs = jobs

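    # Put the fake slurm commands first on PATH so the daemon runs them
    # instead of the real squeue/sinfo/scancel.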
    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]

    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")

    # Write configuration file for test
    with open("tests/fake_%s.cfg.template" % provider) as f:
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))

    # Tests must complete in less than 3 minutes.
    timeout = time.time() + 180
    terminated = False

    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)

    # Test main loop:
    # - Read line
    # - Apply negative checks (things that are not supposed to happen)
    # - Check timeout
    # - Check if the next action should trigger
    # - If all actions are exhausted, terminate with test success
    # - If it hits timeout with actions remaining, terminate with test failed
    try:
        # naive line iteration over pipes gets buffered, which isn't what we want,
        # see https://bugs.python.org/issue3907
        for line in iter(p.stderr.readline, ""):
            detail_content.write(line)

            for k,v in checks.items():
                g = re.match(k, line)
                if g:
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                    if code != 0:
                        detail.error("Check failed")
                        if not terminated:
                            p.terminate()
                            terminated = True

            if terminated:
                continue

            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
                code += 1
                if not terminated:
                    p.terminate()
                    terminated = True

            k, v = actions[0]
            g = re.match(k, line)
            if g:
                detail.info("Matched action %s", k)
                actions.pop(0)
                code += v(g)
                if code != 0:
                    detail.error("Action failed")
                    p.terminate()
                    terminated = True

            if not actions:
                p.terminate()
                terminated = True
    except KeyboardInterrupt:
        p.kill()

    if actions:
        detail.error("Ended with remaining actions: %s", actions)
        code = 1

    shutil.rmtree(fake_slurm)
    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))

    if code == 0:
        logger.info("%s passed", name)
    else:
        if isinstance(detail_content, StringIO.StringIO):
            sys.stderr.write(detail_content.getvalue())
        logger.info("%s failed", name)

    return code


def main():
    # Test lifecycle.

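    # Each entry maps a test name to (actions, checks, driver_class, jobs,
    # provider), matching run_test()'s argument order.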
    tests = {
        "test_unsatisfiable_jobs" : (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_queue_unsatisfiable),
                (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
                r".*Trying to cancel job '(\S+)'": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_single_node_azure": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            "arvnodeman.test.fake_driver.FakeDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            "azure"),
        "test_multiple_nodes": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
                r".*Setting node quota.*": fail,
            },
            "arvnodeman.test.fake_driver.FakeDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
         }, "azure"),
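        # QuotaDriver simulates hitting the cloud provider quota: with four
        # jobs queued, only two nodes pair before node creation stops.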
        "test_hit_quota": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
                r".*Sending create_node request.*": partial(expect_count, 5)
            },
            "arvnodeman.test.fake_driver.QuotaDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
         }, "azure"),
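        # Like test_hit_quota, but continues after the first shutdowns: once
        # the quota probe allows it, the remaining jobs are re-queued and get
        # nodes, which are then shut down in turn.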
        "test_probe_quota": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*sending request", jobs_req),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
                r".*Sending create_node request.*": partial(expect_count, 9)
            },
            "arvnodeman.test.fake_driver.QuotaDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
         }, "azure"),
        "test_no_hang_failing_node_create": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
            ],
            {},
            "arvnodeman.test.fake_driver.FailingDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
         }, "azure"),
        "test_retry_create": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
            ],
            {},
            "arvnodeman.test.fake_driver.RetryDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
         }, "azure"),
        "test_single_node_aws": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            "arvnodeman.test.fake_driver.FakeAwsDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            "ec2"),
        "test_single_node_gce": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            "arvnodeman.test.fake_driver.FakeGceDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            "gce")
    }

    code = 0
    if len(sys.argv) > 1:
        code = run_test(sys.argv[1], *tests[sys.argv[1]])
    else:
        for t in sorted(tests.keys()):
            code += run_test(t, *tests[t])

    if code == 0:
        logger.info("Tests passed")
    else:
        logger.info("Tests failed")

    sys.exit(code)

if __name__ == '__main__':
    main()