#!/usr/bin/env python
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0

"""Integration test framework for node manager.

Runs full node manager with an API server (needs ARVADOS_API_HOST and
ARVADOS_API_TOKEN).  Stubs out the cloud driver and slurm commands to mock
specific behaviors.  Monitors the log output to verify an expected sequence of
events or behaviors for each test.
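
Run all tests with "python integration_test.py", or run a single test by
passing its name, e.g. "python integration_test.py test_single_node_azure".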

"""

import subprocess
import os
import sys
import re
import time
import logging
import stat
import tempfile
import shutil
from functools import partial
import arvados
import StringIO

formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')

handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(handler)

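# "detail" collects verbose per-test output, including the node manager's own
# stderr.  With ANMTEST_LOGLEVEL set it streams straight to stderr; otherwise
# it is buffered in memory and dumped only when a test fails.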
detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
if os.environ.get("ANMTEST_LOGLEVEL"):
    detail_content = sys.stderr
else:
    detail_content = StringIO.StringIO()
handler = logging.StreamHandler(detail_content)
handler.setFormatter(formatter)
detail.addHandler(handler)

fake_slurm = None
compute_nodes = None
all_jobs = None
unsatisfiable_job_scancelled = None

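# Write the new script under a temporary name, then rename it into place, so
# the node manager running concurrently never executes a partially written
# script.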
def update_script(path, val):
    with open(path+"_", "w") as f:
        f.write(val)
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)

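# Regenerate the fake "squeue" script to print one line per entry in all_jobs,
# in the form '1|100|100|<state>|<job uuid>|(null)|1234567890'.  The leading
# field is the job's requested CPU count (compare set_queue_unsatisfiable,
# which requests 99 cores).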
def set_squeue(g):
    global all_jobs
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    return 0

def set_queue_unsatisfiable(g):
    global all_jobs, unsatisfiable_job_scancelled
    # Simulate a job requesting a 99 core node.
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
                  "\ntouch %s" % unsatisfiable_job_scancelled)
    return 0

def job_cancelled(g):
    global unsatisfiable_job_scancelled
    cancelled_job = g.group(1)
    api = arvados.api('v1')
    # Check that 'scancel' was called
    if not os.path.isfile(unsatisfiable_job_scancelled):
        return 1
    # Check for the log entry
    log_entry = api.logs().list(
        filters=[
            ['object_uuid', '=', cancelled_job],
            ['event_type', '=', 'stderr'],
        ]).execute()['items'][0]
    if not re.match(
            r"Constraints cannot be satisfied",
            log_entry['properties']['text']):
        return 1
    return 0

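# A cloud node was paired with an Arvados node: record its hostname, report it
# as allocated in the fake "sinfo" output, and flip one pending job to Running.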
def node_paired(g):
    global compute_nodes
    compute_nodes[g.group(1)] = g.group(3)

    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        if v == "ReqNodeNotAvail":
            all_jobs[k] = "Running"
            break

    set_squeue(g)

    return 0

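# Flip all compute nodes to 'alloc' in the fake "sinfo" and mark every job
# Running, simulating the remaining jobs being scheduled onto existing nodes.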
def remaining_jobs(g):
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        all_jobs[k] = "Running"

    set_squeue(g)

    return 0


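# Triggered when a node is logged as busy: rewrite the fake "sinfo" to report
# every node idle, so the node becomes eligible for shutdown on a later poll.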
def node_busy(g):
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
    return 0

def node_shutdown(g):
    global compute_nodes
    if g.group(1) in compute_nodes:
        del compute_nodes[g.group(1)]
        return 0
    else:
        return 1

def jobs_req(g):
    global all_jobs
    for k,v in all_jobs.items():
        all_jobs[k] = "ReqNodeNotAvail"
    set_squeue(g)
    return 0

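# Accept a matched line without any side effects.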
def noop(g):
    return 0

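# Negative check: any match of this pattern is itself a test failure.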
def fail(checks, pattern, g):
    return 1

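# Allow a pattern to match at most 'count' times; one more match fails the
# test.  Each match re-registers the check with the count decremented.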
def expect_count(count, checks, pattern, g):
    if count == 0:
        return 1
    else:
        checks[pattern] = partial(expect_count, count-1)
        return 0

def run_test(name, actions, checks, driver_class, jobs, provider):
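    """Run a single test.

    name: test name, used for logging.
    actions: ordered list of (regex, callback) pairs; each log line is matched
      against the head of the list only, and the callback runs on a match.
    checks: dict of regex -> callback applied to every log line.
    driver_class: fake cloud driver class to load.
    jobs: dict of job uuid -> slurm state used to prime the fake squeue.
    provider: suffix selecting the tests/fake_*.cfg.template config template.

    Returns 0 on success, nonzero on failure.
    """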
    code = 0
    global unsatisfiable_job_scancelled
    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
                                                "scancel_called")

    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()

    logger.info("Start %s", name)

    global fake_slurm
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)

    global compute_nodes
    compute_nodes = {}

    global all_jobs
    all_jobs = jobs

    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]

    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")

    # Write configuration file for test
    with open("tests/fake_%s.cfg.template" % provider) as f:
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))

    # Tests must complete in less than 3 minutes.
    timeout = time.time() + 180
    terminated = False

    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)

    # Test main loop:
    # - Read line
    # - Apply negative checks (things that are not supposed to happen)
    # - Check timeout
    # - Check if the next action should trigger
    # - If all actions are exhausted, terminate with test success
    # - If it hits timeout with actions remaining, terminate with test failed
    try:
        # naive line iteration over pipes gets buffered, which isn't what we want,
        # see https://bugs.python.org/issue3907
        for line in iter(p.stderr.readline, ""):
            detail_content.write(line)

            for k,v in checks.items():
                g = re.match(k, line)
                if g:
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                    if code != 0:
                        detail.error("Check failed")
                        if not terminated:
                            p.terminate()
                            terminated = True

            if terminated:
                continue

            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
                code += 1
                if not terminated:
                    p.terminate()
                    terminated = True

            k, v = actions[0]
            g = re.match(k, line)
            if g:
                detail.info("Matched action %s", k)
                actions.pop(0)
                code += v(g)
                if code != 0:
                    detail.error("Action failed")
                    p.terminate()
                    terminated = True

            if not actions:
                p.terminate()
                terminated = True
    except KeyboardInterrupt:
        p.kill()

    if actions:
        detail.error("Ended with remaining actions: %s", actions)
        code = 1

    shutil.rmtree(fake_slurm)
    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))

    if code == 0:
        logger.info("%s passed", name)
    else:
        if isinstance(detail_content, StringIO.StringIO):
            sys.stderr.write(detail_content.getvalue())
        logger.info("%s failed", name)

    return code


def main():
    # Test lifecycle.

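    # Each entry maps a test name to the arguments run_test() takes after the
    # name: (actions, checks, driver_class, jobs, provider).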
    tests = {
        "test_unsatisfiable_jobs": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_queue_unsatisfiable),
                (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
                r".*Trying to cancel job '(\S+)'": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_single_node_azure": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_multiple_nodes": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_hit_quota": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
                r".*Sending create_node request.*": partial(expect_count, 5)
            },
            # Driver class
            "arvnodeman.test.fake_driver.QuotaDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_probe_quota": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*sending request", jobs_req),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
                r".*Sending create_node request.*": partial(expect_count, 9)
            },
            # Driver class
            "arvnodeman.test.fake_driver.QuotaDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_no_hang_failing_node_create": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
            ],
            # Checks (things that shouldn't happen)
            {},
            # Driver class
            "arvnodeman.test.fake_driver.FailingDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_retry_create": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
                (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
            ],
            # Checks (things that shouldn't happen)
            {},
            # Driver class
            "arvnodeman.test.fake_driver.RetryDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_single_node_aws": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeAwsDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "ec2"),
        "test_single_node_gce": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeGceDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "gce")
    }

    code = 0
    if len(sys.argv) > 1:
        code = run_test(sys.argv[1], *tests[sys.argv[1]])
    else:
        for t in sorted(tests.keys()):
            code += run_test(t, *tests[t])

    if code == 0:
        logger.info("Tests passed")
    else:
        logger.info("Tests failed")

    sys.exit(code)

if __name__ == '__main__':
    main()