12061: Don't wait around for test daemon to shutdown gracefully.
[arvados.git] / services / nodemanager / tests / integration_test.py
1 #!/usr/bin/env python
2 # Copyright (C) The Arvados Authors. All rights reserved.
3 #
4 # SPDX-License-Identifier: AGPL-3.0
5
6 """Integration test framework for node manager.
7
8 Runs full node manager with an API server (needs ARVADOS_API_HOST and
9 ARVADOS_API_TOKEN).  Stubs out the cloud driver and slurm commands to mock
10 specific behaviors.  Monitors the log output to verify an expected sequence of
11 events or behaviors for each test.
12
13 """
14
15 import subprocess
16 import os
17 import sys
18 import re
19 import time
20 import logging
21 import stat
22 import tempfile
23 import shutil
24 import errno
25 from functools import partial
26 import arvados
27 import StringIO
28
# Primary test-runner logger: always writes to stderr.
formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')

handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(handler)

# Detail logger: verbose per-line output.  Goes straight to stderr when
# ANMTEST_LOGLEVEL is set; otherwise it is buffered in memory and only
# replayed to stderr when a test fails (see run_test).
detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
if os.environ.get("ANMTEST_LOGLEVEL"):
    detail_content = sys.stderr
else:
    detail_content = StringIO.StringIO()
handler = logging.StreamHandler(detail_content)
handler.setFormatter(formatter)
detail.addHandler(handler)

# Shared mutable state used by the action/check callbacks below; run_test
# (re)initializes these before each test.
fake_slurm = None                    # temp dir holding the fake slurm commands
compute_nodes = None                 # dict: cloud node id -> hostname
all_jobs = None                      # dict: job uuid -> squeue state string
unsatisfiable_job_scancelled = None  # path touched by the fake scancel stub
51
def update_script(path, val):
    """Atomically replace the script at *path* with contents *val*.

    Writes to a sibling temp name, marks it executable for the owner,
    then renames over the destination so readers never see a partial file.
    """
    staging = path + "_"
    with open(staging, "w") as fp:
        fp.write(val)
    os.chmod(staging, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    os.rename(staging, path)
    detail.info("Update script %s: %s", path, val)
58
def set_squeue(g):
    """Regenerate the fake squeue script from the current all_jobs table."""
    global all_jobs
    rows = ["echo '1|100|100|%s|%s|(null)|1234567890'" % (state, uuid)
            for uuid, state in all_jobs.items()]
    update_script(os.path.join(fake_slurm, "squeue"),
                  "#!/bin/sh\n" + "\n".join(rows))
    return 0
64
def set_queue_unsatisfiable(g):
    """Make every queued job demand a 99 core node and stub out scancel."""
    global all_jobs, unsatisfiable_job_scancelled
    # Simulate a job requesting a 99 core node.
    rows = ["echo '99|100|100|%s|%s|(null)|1234567890'" % (state, uuid)
            for uuid, state in all_jobs.items()]
    update_script(os.path.join(fake_slurm, "squeue"),
                  "#!/bin/sh\n" + "\n".join(rows))
    # The fake scancel just records that it was invoked.
    update_script(os.path.join(fake_slurm, "scancel"),
                  "#!/bin/sh\n" + "\ntouch %s" % unsatisfiable_job_scancelled)
    return 0
73
def job_cancelled(g):
    """Verify the unsatisfiable job named in the log line was cancelled.

    Returns 0 when the fake scancel ran and the API server holds a
    stderr log entry explaining the constraint failure; 1 otherwise.
    """
    global unsatisfiable_job_scancelled
    cancelled_job = g.group(1)
    api = arvados.api('v1')
    # The scancel stub touches this sentinel file when invoked.
    if not os.path.isfile(unsatisfiable_job_scancelled):
        return 1
    # Fetch the stderr log entry recorded against the cancelled job.
    log_entry = api.logs().list(
        filters=[
            ['object_uuid', '=', cancelled_job],
            ['event_type', '=', 'stderr'],
        ]).execute()['items'][0]
    if re.match(r"Constraints cannot be satisfied",
                log_entry['properties']['text']):
        return 0
    return 1
92
def node_paired(g):
    """Record a newly paired cloud node and promote one pending job.

    Adds the node to compute_nodes, regenerates the fake sinfo output
    with every node allocated, and flips the first ReqNodeNotAvail job
    to Running before refreshing squeue.
    """
    global compute_nodes
    compute_nodes[g.group(1)] = g.group(3)

    rows = ["echo '%s|alloc|(null)'" % name for name in compute_nodes.values()]
    update_script(os.path.join(fake_slurm, "sinfo"),
                  "#!/bin/sh\n" + "\n".join(rows))

    pending = next((uuid for uuid, state in all_jobs.items()
                    if state == "ReqNodeNotAvail"), None)
    if pending is not None:
        all_jobs[pending] = "Running"

    set_squeue(g)

    return 0
108
def remaining_jobs(g):
    """Mark every job as running and refresh the fake sinfo/squeue output."""
    rows = ["echo '%s|alloc|(null)'" % name for name in compute_nodes.values()]
    update_script(os.path.join(fake_slurm, "sinfo"),
                  "#!/bin/sh\n" + "\n".join(rows))

    for uuid in all_jobs.keys():
        all_jobs[uuid] = "Running"

    set_squeue(g)

    return 0
119
120
def node_busy(g):
    """Regenerate the fake sinfo output reporting every node as idle."""
    rows = ["echo '%s|idle|(null)'" % name for name in compute_nodes.values()]
    update_script(os.path.join(fake_slurm, "sinfo"),
                  "#!/bin/sh\n" + "\n".join(rows))
    return 0
125
def node_shutdown(g):
    """Forget the cloud node named in the shutdown-success log line."""
    global compute_nodes
    # Raises KeyError if the node was never paired, failing the test loudly.
    compute_nodes.pop(g.group(1))
    return 0
130
def jobs_req(g):
    """Flip every job back to ReqNodeNotAvail and refresh the fake squeue."""
    global all_jobs
    for uuid in all_jobs.keys():
        all_jobs[uuid] = "ReqNodeNotAvail"
    set_squeue(g)
    return 0
137
def noop(g):
    """Action callback that consumes a matched line without doing anything."""
    return 0
140
def fail(checks, pattern, g):
    """Check callback: any match of *pattern* is an immediate test failure."""
    return 1
143
def expect_count(count, checks, pattern, g):
    """Check callback allowing *pattern* to match at most *count* times.

    While matches remain, re-arm the check in *checks* with the budget
    decremented and report success; once the budget is exhausted, any
    further match fails the test.
    """
    if count != 0:
        checks[pattern] = partial(expect_count, count - 1)
        return 0
    return 1
150
def run_test(name, actions, checks, driver_class, jobs, provider):
    """Run one node manager integration test and return 0 on success.

    name: test label used in log messages.
    actions: ordered list of (regex, callback) pairs.  Each daemon log
        line is matched against the head of the list only; a match pops
        the pair and runs the callback.  The test succeeds when the list
        is exhausted before the timeout.
    checks: dict of regex -> callback for events that should NOT happen;
        a callback returning nonzero fails the test.
    driver_class: dotted name of the fake cloud driver to put in the config.
    jobs: dict of job uuid -> initial squeue state, installed as all_jobs.
    provider: selects tests/fake_<provider>.cfg.template.
    """
    code = 0
    global unsatisfiable_job_scancelled
    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
                                                "scancel_called")

    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()

    logger.info("Start %s", name)

    global fake_slurm
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)

    global compute_nodes
    compute_nodes = {}

    global all_jobs
    all_jobs = jobs

    # Put the fake slurm commands first on PATH so the daemon runs them
    # instead of any real slurm installation.
    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]

    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")

    # Write configuration file for test
    with open("tests/fake_%s.cfg.template" % provider) as f:
        # Create an empty ssh public key file for the config to point at.
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))

    # Tests must complete in less than 3 minutes.
    timeout = time.time() + 180
    terminated = False

    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)

    # Test main loop:
    # - Read line
    # - Apply negative checks (things that are not supposed to happen)
    # - Check timeout
    # - Check if the next action should trigger
    # - If all actions are exhausted, terminate with test success
    # - If it hits timeout with actions remaining, terminate with test failed
    try:
        # naive line iteration over pipes gets buffered, which isn't what we want,
        # see https://bugs.python.org/issue3907
        for line in iter(p.stderr.readline, ""):
            detail_content.write(line)

            for k,v in checks.items():
                g = re.match(k, line)
                if g:
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                    if code != 0:
                        detail.error("Check failed")
                        if not terminated:
                            p.kill()
                            terminated = True

            if terminated:
                # Daemon already killed; keep draining its output until EOF.
                continue

            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
                code += 1
                if not terminated:
                    p.kill()
                    terminated = True

            # Only the head of the action list is eligible to match.
            k, v = actions[0]
            g = re.match(k, line)
            if g:
                detail.info("Matched action %s", k)
                actions.pop(0)
                code += v(g)
                if code != 0:
                    detail.error("Action failed")
                    p.kill()
                    terminated = True

            if not actions:
                # All expected events observed; stop the daemon.
                p.kill()
                terminated = True
    except KeyboardInterrupt:
        p.kill()

    if actions:
        detail.error("Ended with remaining actions: %s", actions)
        code = 1

    shutil.rmtree(fake_slurm)
    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))

    if code == 0:
        logger.info("%s passed", name)
    else:
        if isinstance(detail_content, StringIO.StringIO):
            # Detail log was buffered in memory; replay it to stderr in
            # chunks so the failure is diagnosable.
            detail_content.seek(0)
            chunk = detail_content.read(4096)
            while chunk:
                try:
                    sys.stderr.write(chunk)
                    chunk = detail_content.read(4096)
                except IOError as e:
                    if e.errno == errno.EAGAIN:
                        # try again (probably pipe buffer full)
                        pass
                    else:
                        raise
        logger.info("%s failed", name)

    return code
275
276
def main():
    """Run the node manager lifecycle tests.

    With a command-line argument, runs just that named test; otherwise
    runs every test in sorted order.  Exits with 0 when all selected
    tests pass, nonzero otherwise.
    """
    # Test lifecycle.

    tests = {
        "test_unsatisfiable_jobs" : (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_queue_unsatisfiable),
                (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
                r".*Trying to cancel job '(\S+)'": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_single_node_azure": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_multiple_nodes": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_hit_quota": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
                r".*Sending create_node request.*": partial(expect_count, 5)
            },
            # Driver class
            "arvnodeman.test.fake_driver.QuotaDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_probe_quota": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*sending request", jobs_req),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
                r".*Sending create_node request.*": partial(expect_count, 9)
            },
            # Driver class
            "arvnodeman.test.fake_driver.QuotaDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_no_hang_failing_node_create": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
            ],
            # Checks (things that shouldn't happen)
            {},
            # Driver class
            "arvnodeman.test.fake_driver.FailingDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_retry_create": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
                (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
            ],
            # Checks (things that shouldn't happen)
            {},
            # Driver class
            "arvnodeman.test.fake_driver.RetryDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_single_node_aws": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeAwsDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "ec2"),
        "test_single_node_gce": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeGceDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "gce")
    }

    code = 0
    if len(sys.argv) > 1:
        code = run_test(sys.argv[1], *tests[sys.argv[1]])
    else:
        for t in sorted(tests.keys()):
            code += run_test(t, *tests[t])

    if code == 0:
        logger.info("Tests passed")
    else:
        logger.info("Tests failed")

    # Use sys.exit rather than the interactive-only exit() helper, which is
    # installed by the site module and absent when run with python -S.
    sys.exit(code)
505
# Script entry point: run the test suite when executed directly.
if __name__ == '__main__':
    main()