Merge branch '13822-nm-delayed-daemon' refs #13822
[arvados.git] / services / nodemanager / tests / integration_test.py
1 #!/usr/bin/env python
2 # Copyright (C) The Arvados Authors. All rights reserved.
3 #
4 # SPDX-License-Identifier: AGPL-3.0
5
6 """Integration test framework for node manager.
7
8 Runs full node manager with an API server (needs ARVADOS_API_HOST and
9 ARVADOS_API_TOKEN).  Stubs out the cloud driver and slurm commands to mock
10 specific behaviors.  Monitors the log output to verify an expected sequence of
11 events or behaviors for each test.
12
13 """
14
15 import subprocess32 as subprocess
16 import os
17 import sys
18 import re
19 import time
20 import logging
21 import stat
22 import tempfile
23 import shutil
24 import errno
25 from functools import partial
26 import arvados
27 import StringIO
28
29 formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
30
31 handler = logging.StreamHandler(sys.stderr)
32 handler.setFormatter(formatter)
33 logger = logging.getLogger("logger")
34 logger.setLevel(logging.INFO)
35 logger.addHandler(handler)
36
37 detail = logging.getLogger("detail")
38 detail.setLevel(logging.INFO)
39 if os.environ.get("ANMTEST_LOGLEVEL"):
40     detail_content = sys.stderr
41 else:
42     detail_content = StringIO.StringIO()
43 handler = logging.StreamHandler(detail_content)
44 handler.setFormatter(formatter)
45 detail.addHandler(handler)
46
47 fake_slurm = None
48 compute_nodes = None
49 all_jobs = None
50 unsatisfiable_job_scancelled = None
51
52 def update_script(path, val):
53     with open(path+"_", "w") as f:
54         f.write(val)
55     os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
56     os.rename(path+"_", path)
57     detail.info("Update script %s: %s", path, val)
58
59 def set_squeue(g):
60     global all_jobs
61     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
62                   "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
63     return 0
64
65 def set_queue_unsatisfiable(g):
66     global all_jobs, unsatisfiable_job_scancelled
67     # Simulate a job requesting a 99 core node.
68     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
69                   "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
70     update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
71                   "\ntouch %s" % unsatisfiable_job_scancelled)
72     return 0
73
74 def job_cancelled(g):
75     global unsatisfiable_job_scancelled
76     cancelled_job = g.group(1)
77     api = arvados.api('v1')
78     # Check that 'scancel' was called
79     if not os.path.isfile(unsatisfiable_job_scancelled):
80         return 1
81     # Check for the log entry
82     log_entry = api.logs().list(
83         filters=[
84             ['object_uuid', '=', cancelled_job],
85             ['event_type', '=', 'stderr'],
86         ]).execute()['items'][0]
87     if not re.match(
88             r"Constraints cannot be satisfied",
89             log_entry['properties']['text']):
90         return 1
91     return 0
92
93 def node_paired(g):
94     global compute_nodes
95     compute_nodes[g.group(1)] = g.group(3)
96
97     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
98                   "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))
99
100     for k,v in all_jobs.items():
101         if v == "ReqNodeNotAvail":
102             all_jobs[k] = "Running"
103             break
104
105     set_squeue(g)
106
107     return 0
108
109 def node_busy(g):
110     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
111                   "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
112     return 0
113
114 def node_shutdown(g):
115     global compute_nodes
116     if g.group(1) in compute_nodes:
117         del compute_nodes[g.group(1)]
118     return 0
119
120 def jobs_req(g):
121     global all_jobs
122     for k,v in all_jobs.items():
123         all_jobs[k] = "ReqNodeNotAvail"
124     set_squeue(g)
125     return 0
126
127 def noop(g):
128     return 0
129
130 def fail(checks, pattern, g):
131     return 1
132
133 def expect_count(count, checks, pattern, g):
134     if count == 0:
135         return 1
136     else:
137         checks[pattern] = partial(expect_count, count-1)
138         return 0
139
140 def run_test(name, actions, checks, driver_class, jobs, provider):
141     code = 0
142     global unsatisfiable_job_scancelled
143     unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
144                                                 "scancel_called")
145
146     # Delete any stale node records
147     api = arvados.api('v1')
148     for n in api.nodes().list().execute()['items']:
149         api.nodes().delete(uuid=n["uuid"]).execute()
150
151     logger.info("Start %s", name)
152
153     global fake_slurm
154     fake_slurm = tempfile.mkdtemp()
155     detail.info("fake_slurm is %s", fake_slurm)
156
157     global compute_nodes
158     compute_nodes = {}
159
160     global all_jobs
161     all_jobs = jobs
162
163     env = os.environ.copy()
164     env["PATH"] = fake_slurm + ":" + env["PATH"]
165
166     # Reset fake squeue/sinfo to empty
167     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
168     update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
169
170     # Write configuration file for test
171     with open("tests/fake_%s.cfg.template" % provider) as f:
172         open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
173         with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
174             cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
175                                       token=os.environ["ARVADOS_API_TOKEN"],
176                                       driver_class=driver_class,
177                                       ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
178
179     # Tests must complete in less than 30 seconds.
180     timeout = time.time() + 30
181     terminated = False
182
183     # Now start node manager
184     p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
185                          bufsize=0, stderr=subprocess.PIPE, env=env)
186
187     # Test main loop:
188     # - Read line
189     # - Apply negative checks (things that are not supposed to happen)
190     # - Check timeout
191     # - Check if the next action should trigger
192     # - If all actions are exhausted, terminate with test success
193     # - If it hits timeout with actions remaining, terminate with test failed
194     try:
195         # naive line iteration over pipes gets buffered, which isn't what we want,
196         # see https://bugs.python.org/issue3907
197         for line in iter(p.stderr.readline, ""):
198             detail_content.write(line)
199
200             for k,v in checks.items():
201                 g = re.match(k, line)
202                 if g:
203                     detail.info("Matched check %s", k)
204                     code += v(checks, k, g)
205                     if code != 0:
206                         detail.error("Check failed")
207                         if not terminated:
208                             p.kill()
209                             terminated = True
210
211             if terminated:
212                 continue
213
214             if time.time() > timeout:
215                 detail.error("Exceeded timeout with actions remaining: %s", actions)
216                 code += 1
217                 if not terminated:
218                     p.kill()
219                     terminated = True
220
221             k, v = actions[0]
222             g = re.match(k, line)
223             if g:
224                 detail.info("Matched action %s", k)
225                 actions.pop(0)
226                 code += v(g)
227                 if code != 0:
228                     detail.error("Action failed")
229                     p.kill()
230                     terminated = True
231
232             if not actions:
233                 p.kill()
234                 terminated = True
235     except KeyboardInterrupt:
236         p.kill()
237
238     if actions:
239         detail.error("Ended with remaining actions: %s", actions)
240         code = 1
241
242     shutil.rmtree(fake_slurm)
243     shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
244
245     if code == 0:
246         logger.info("%s passed", name)
247     else:
248         if isinstance(detail_content, StringIO.StringIO):
249             detail_content.seek(0)
250             chunk = detail_content.read(4096)
251             while chunk:
252                 try:
253                     sys.stderr.write(chunk)
254                     chunk = detail_content.read(4096)
255                 except IOError as e:
256                     if e.errno == errno.EAGAIN:
257                         # try again (probably pipe buffer full)
258                         pass
259                     else:
260                         raise
261         logger.info("%s failed", name)
262
263     return code
264
265
266 def main():
267     # Test lifecycle.
268
269     tests = {
270         "test_unsatisfiable_jobs" : (
271             # Actions (pattern -> action)
272             [
273                 (r".*Daemon started", set_queue_unsatisfiable),
274                 (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
275             ],
276             # Checks (things that shouldn't happen)
277             {
278                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
279                 r".*Trying to cancel job '(\S+)'": fail,
280             },
281             # Driver class
282             "arvnodeman.test.fake_driver.FakeDriver",
283             # Jobs
284             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
285             # Provider
286             "azure"),
287         "test_single_node_azure": (
288             # Actions (pattern -> action)
289             [
290                 (r".*Daemon started", set_squeue),
291                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
292                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
293                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
294                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
295             ],
296             # Checks (things that shouldn't happen)
297             {
298                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
299                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
300                 r".*Setting node quota.*": fail,
301             },
302             # Driver class
303             "arvnodeman.test.fake_driver.FakeDriver",
304             # Jobs
305             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
306             # Provider
307             "azure"),
308         "test_multiple_nodes": (
309             # Actions (pattern -> action)
310             [
311                 (r".*Daemon started", set_squeue),
312                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
313                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
314                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
315                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
316                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
317                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
318                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
319                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
320                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
321                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
322             ],
323             # Checks (things that shouldn't happen)
324             {
325                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
326                 r".*Setting node quota.*": fail,
327             },
328             # Driver class
329             "arvnodeman.test.fake_driver.FakeDriver",
330             # Jobs
331             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
332              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
333              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
334              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
335             # Provider
336             "azure"),
337         "test_hit_quota": (
338             # Actions (pattern -> action)
339             [
340                 (r".*Daemon started", set_squeue),
341                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
342                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
343                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
344                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
345                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
346                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
347             ],
348             # Checks (things that shouldn't happen)
349             {
350                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
351                 r".*Sending create_node request.*": partial(expect_count, 5)
352             },
353             # Driver class
354             "arvnodeman.test.fake_driver.QuotaDriver",
355             # Jobs
356             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
357              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
358              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
359              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
360             # Provider
361             "azure"),
362         "test_probe_quota": (
363             # Actions (pattern -> action)
364             [
365                 (r".*Daemon started", set_squeue),
366                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
367                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
368                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
369                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
370                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
371                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
372                 (r".*sending request", jobs_req),
373                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
374                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
375                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
376                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
377                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
378                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
379                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
380                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
381                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
382                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
383             ],
384             # Checks (things that shouldn't happen)
385             {
386                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
387                 r".*Sending create_node request.*": partial(expect_count, 9)
388             },
389             # Driver class
390             "arvnodeman.test.fake_driver.QuotaDriver",
391             # Jobs
392             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
393              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
394              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
395              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
396             # Provider
397             "azure"),
398         "test_no_hang_failing_node_create": (
399             # Actions (pattern -> action)
400             [
401                 (r".*Daemon started", set_squeue),
402                 (r".*Client error: nope", noop),
403                 (r".*Client error: nope", noop),
404                 (r".*Client error: nope", noop),
405                 (r".*Client error: nope", noop),
406             ],
407             # Checks (things that shouldn't happen)
408             {},
409             # Driver class
410             "arvnodeman.test.fake_driver.FailingDriver",
411             # Jobs
412             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
413              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
414              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
415              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
416             # Provider
417             "azure"),
418         "test_retry_create": (
419             # Actions (pattern -> action)
420             [
421                 (r".*Daemon started", set_squeue),
422                 (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
423                 (r".*Rate limit exceeded - scheduling retry in 1 seconds", noop),
424                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
425             ],
426             # Checks (things that shouldn't happen)
427             {},
428             # Driver class
429             "arvnodeman.test.fake_driver.RetryDriver",
430             # Jobs
431             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
432             # Provider
433             "azure"),
434         "test_single_node_aws": (
435             # Actions (pattern -> action)
436             [
437                 (r".*Daemon started", set_squeue),
438                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
439                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
440                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
441                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
442             ],
443             # Checks (things that shouldn't happen)
444             {
445                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
446                 r".*Setting node quota.*": fail,
447             },
448             # Driver class
449             "arvnodeman.test.fake_driver.FakeAwsDriver",
450             # Jobs
451             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
452             # Provider
453             "ec2"),
454         "test_single_node_gce": (
455             # Actions (pattern -> action)
456             [
457                 (r".*Daemon started", set_squeue),
458                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
459                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
460                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
461                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
462             ],
463             # Checks (things that shouldn't happen)
464             {
465                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
466                 r".*Setting node quota.*": fail,
467             },
468             # Driver class
469             "arvnodeman.test.fake_driver.FakeGceDriver",
470             # Jobs
471             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
472             # Provider
473             "gce")
474     }
475
476     code = 0
477     if len(sys.argv) > 1:
478         code = run_test(sys.argv[1], *tests[sys.argv[1]])
479     else:
480         for t in sorted(tests.keys()):
481             code += run_test(t, *tests[t])
482
483     if code == 0:
484         logger.info("Tests passed")
485     else:
486         logger.info("Tests failed")
487
488     exit(code)
489
490 if __name__ == '__main__':
491     main()