services/nodemanager/tests/integration_test.py
#!/usr/bin/env python
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0

"""Integration test framework for node manager.

Runs the full node manager against a real API server (needs ARVADOS_API_HOST
and ARVADOS_API_TOKEN).  Stubs out the cloud driver and slurm commands to mock
specific behaviors.  Monitors the log output to verify an expected sequence of
events or behaviors for each test.
"""
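
# Run from the node manager source directory (the relative paths used below
# assume it), with API credentials in the environment -- a usage sketch:
#
#     ARVADOS_API_HOST=... ARVADOS_API_TOKEN=... \
#         python tests/integration_test.py [test_name]
#
# With no test name, every test in main()'s `tests` dict runs in sorted
# order.  Per-test detail logs are buffered and dumped to stderr only when a
# test fails; set ANMTEST_LOGLEVEL to any value to stream them live instead.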

import subprocess
import os
import sys
import re
import time
import logging
import stat
import tempfile
import shutil
from functools import partial
import arvados
import StringIO

formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')

handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(handler)

detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
if os.environ.get("ANMTEST_LOGLEVEL"):
    detail_content = sys.stderr
else:
    detail_content = StringIO.StringIO()
handler = logging.StreamHandler(detail_content)
handler.setFormatter(formatter)
detail.addHandler(handler)

# Shared state between run_test() and the action/check callbacks below.
fake_slurm = None                     # temp dir of stub slurm commands, prepended to PATH
compute_nodes = None                  # cloud node id -> hostname for currently paired nodes
all_jobs = None                       # job uuid -> simulated slurm job state
unsatisfiable_job_scancelled = None   # sentinel file touched by the scancel stub

def update_script(path, val):
    # Write to a temp file and rename over the target, so the stub is
    # replaced atomically and is never observed half-written.
    with open(path+"_", "w") as f:
        f.write(val)
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)
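
# For example, run_test() resets the queue stubs with
#     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
# leaving an executable fake_slurm/squeue that prints nothing, i.e. an
# empty slurm queue.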

def set_squeue(g):
    global all_jobs
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    return 0
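
# With a typical jobs dict the generated stub prints one line per job, e.g.:
#     1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9|(null)|1234567890
# Reading the fields off their use in this file: CPUs, two fixed resource
# columns, job state, job uuid, features, and a priority-like value.  (This
# is inferred from usage here, not from slurm documentation.)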

def set_queue_unsatisfiable(g):
    global all_jobs, unsatisfiable_job_scancelled
    # Simulate a job requesting a 99 core node.
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
                  "\ntouch %s" % unsatisfiable_job_scancelled)
    return 0

def job_cancelled(g):
    global unsatisfiable_job_scancelled
    cancelled_job = g.group(1)
    api = arvados.api('v1')
    # Check that 'scancel' was called
    if not os.path.isfile(unsatisfiable_job_scancelled):
        return 1
    # Check for the log entry
    log_entry = api.logs().list(
        filters=[
            ['object_uuid', '=', cancelled_job],
            ['event_type', '=', 'stderr'],
        ]).execute()['items'][0]
    if not re.match(
            r"Constraints cannot be satisfied",
            log_entry['properties']['text']):
        return 1
    return 0

def node_paired(g):
    # A cloud node paired with an Arvados node: record it, report it as
    # allocated in sinfo, and move one waiting job to Running.
    global compute_nodes
    compute_nodes[g.group(1)] = g.group(3)

    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        if v == "ReqNodeNotAvail":
            all_jobs[k] = "Running"
            break

    set_squeue(g)

    return 0

def remaining_jobs(g):
    # Report every node as allocated and flip every job to Running.
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))

    for k in all_jobs:
        all_jobs[k] = "Running"

    set_squeue(g)

    return 0


def node_busy(g):
    # Once a node has been seen busy, report everything idle so the daemon
    # starts suggesting shutdowns.
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
    return 0

def node_shutdown(g):
    # Forget a node once its shutdown succeeds.
    global compute_nodes
    del compute_nodes[g.group(1)]
    return 0

def jobs_req(g):
    # Put every job back in the queue as ReqNodeNotAvail.
    global all_jobs
    for k in all_jobs:
        all_jobs[k] = "ReqNodeNotAvail"
    set_squeue(g)
    return 0

def noop(g):
    return 0

def fail(checks, pattern, g):
    # A check that must never match; any hit fails the test.
    return 1

def expect_count(count, checks, pattern, g):
    # Allow a pattern to match at most `count` times; the next match beyond
    # that fails the test.
    if count == 0:
        return 1
    else:
        checks[pattern] = partial(expect_count, count-1)
        return 0
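
# So a checks entry such as
#     r".*Sending create_node request.*": partial(expect_count, 5)
# (as used in test_hit_quota below) tolerates five create_node requests
# during a run and fails the test on the sixth.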

def run_test(name, actions, checks, driver_class, jobs, provider):
    code = 0
    global unsatisfiable_job_scancelled
    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
                                                "scancel_called")

    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()

    logger.info("Start %s", name)

    global fake_slurm
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)

    global compute_nodes
    compute_nodes = {}

    global all_jobs
    all_jobs = jobs

    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]

    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")

    # Write configuration file for test
    with open("tests/fake_%s.cfg.template" % provider) as f:
        # Create an empty stand-in SSH public key for the config to reference.
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))

    # Tests must complete in less than 3 minutes.
    timeout = time.time() + 180
    terminated = False

    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)

    # Test main loop:
    # - Read a line of node manager log output
    # - Apply negative checks (things that are not supposed to happen)
    # - Check timeout
    # - Check if the next action should trigger
    # - If all actions are exhausted, terminate with test success
    # - If it hits timeout with actions remaining, terminate with test failed
    try:
        # Naive line iteration over pipes gets buffered, which isn't what we
        # want; see https://bugs.python.org/issue3907
        for line in iter(p.stderr.readline, ""):
            detail_content.write(line)

            for k,v in checks.items():
                g = re.match(k, line)
                if g:
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                    if code != 0:
                        detail.error("Check failed")
                        if not terminated:
                            p.terminate()
                            terminated = True

            if terminated:
                continue

            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
                code += 1
                if not terminated:
                    p.terminate()
                    terminated = True

            k, v = actions[0]
            g = re.match(k, line)
            if g:
                detail.info("Matched action %s", k)
                actions.pop(0)
                code += v(g)
                if code != 0:
                    detail.error("Action failed")
                    p.terminate()
                    terminated = True

            if not actions:
                p.terminate()
                terminated = True
    except KeyboardInterrupt:
        p.kill()

    if actions:
        detail.error("Ended with remaining actions: %s", actions)
        code = 1

    shutil.rmtree(fake_slurm)
    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))

    if code == 0:
        logger.info("%s passed", name)
    else:
        # Dump the buffered detail log so the failing run is visible.
        if isinstance(detail_content, StringIO.StringIO):
            sys.stderr.write(detail_content.getvalue())
        logger.info("%s failed", name)

    return code
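
# run_test() can also be called directly.  A minimal, hypothetical smoke
# test that only waits for the daemon to start might look like:
#
#     run_test("smoke",
#              [(r".*Daemon started", noop)],
#              {},
#              "arvnodeman.test.fake_driver.FakeDriver",
#              {"zzzzz-dz642-000000000000000": "ReqNodeNotAvail"},
#              "azure")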


def main():
    # Each entry in `tests` maps a test name to the remaining run_test()
    # arguments: (actions, checks, driver_class, jobs, provider).  Actions
    # are an ordered list of (log pattern, callback) pairs that must all
    # fire, in order, for the test to pass; checks are patterns that must
    # not appear at all (fail) or only a limited number of times
    # (expect_count).

    tests = {
        "test_unsatisfiable_jobs": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_queue_unsatisfiable),
                (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
                r".*Trying to cancel job '(\S+)'": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_single_node_azure": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_multiple_nodes": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_hit_quota": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
                r".*Sending create_node request.*": partial(expect_count, 5)
            },
            # Driver class
            "arvnodeman.test.fake_driver.QuotaDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_probe_quota": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*sending request", jobs_req),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
                r".*Sending create_node request.*": partial(expect_count, 9)
            },
            # Driver class
            "arvnodeman.test.fake_driver.QuotaDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_no_hang_failing_node_create": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
            ],
            # Checks (things that shouldn't happen)
            {},
            # Driver class
            "arvnodeman.test.fake_driver.FailingDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_retry_create": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
                (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
            ],
            # Checks (things that shouldn't happen)
            {},
            # Driver class
            "arvnodeman.test.fake_driver.RetryDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
            # Provider
            "azure"),
        "test_single_node_aws": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeAwsDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "ec2"),
        "test_single_node_gce": (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeGceDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "gce")
    }

    code = 0
    if len(sys.argv) > 1:
        code = run_test(sys.argv[1], *tests[sys.argv[1]])
    else:
        for t in sorted(tests.keys()):
            code += run_test(t, *tests[t])

    if code == 0:
        logger.info("Tests passed")
    else:
        logger.info("Tests failed")

    sys.exit(code)

if __name__ == '__main__':
    main()