#!/usr/bin/env python
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0

"""Integration test framework for node manager.

Runs the full node manager against a real API server (requires
ARVADOS_API_HOST and ARVADOS_API_TOKEN).  Stubs out the cloud driver and slurm
commands to mock specific behaviors.  Monitors the log output to verify an
expected sequence of events or behaviors for each test.

"""

import subprocess32 as subprocess
import os
import sys
import re
import time
import logging
import stat
import tempfile
import shutil
import errno
from functools import partial
import arvados
import StringIO

formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')

handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(handler)

detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
# With ANMTEST_LOGLEVEL set, detailed logs stream straight to stderr as the
# test runs; otherwise they are buffered in memory and dumped to stderr only
# when a test fails.
if os.environ.get("ANMTEST_LOGLEVEL"):
    detail_content = sys.stderr
else:
    detail_content = StringIO.StringIO()
handler = logging.StreamHandler(detail_content)
handler.setFormatter(formatter)
detail.addHandler(handler)

fake_slurm = None
compute_nodes = None
all_jobs = None
unsatisfiable_job_scancelled = None

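# The stub is written to a temporary name and renamed into place so a node
# manager invocation never executes a partially written script.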
def update_script(path, val):
    with open(path+"_", "w") as f:
        f.write(val)
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)

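# Regenerate the fake squeue stub: one output line per fake job, e.g.
#   1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9|(null)|1234567890
# The leading field is the requested CPU count (compare
# set_queue_unsatisfiable, which asks for 99 cores).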
def set_squeue(g):
    global all_jobs
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    return 0

def set_queue_unsatisfiable(g):
    global all_jobs, unsatisfiable_job_scancelled
    # Simulate a job requesting a 99 core node.
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
                  "\ntouch %s" % unsatisfiable_job_scancelled)
    return 0

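# Succeed only if the scancel stub was actually run and the cancelled job's
# Arvados log records that its constraints cannot be satisfied.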
def job_cancelled(g):
    global unsatisfiable_job_scancelled
    cancelled_job = g.group(1)
    api = arvados.api('v1')
    # Check that 'scancel' was called
    if not os.path.isfile(unsatisfiable_job_scancelled):
        return 1
    # Check for the log entry
    log_entry = api.logs().list(
        filters=[
            ['object_uuid', '=', cancelled_job],
            ['event_type', '=', 'stderr'],
        ]).execute()['items'][0]
    if not re.match(
            r"Constraints cannot be satisfied",
            log_entry['properties']['text']):
        return 1
    return 0

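# A cloud node has paired with an Arvados node: remember its assigned slurm
# hostname, report it as allocated in the fake sinfo output, and promote one
# pending job to Running.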
def node_paired(g):
    global compute_nodes
    compute_nodes[g.group(1)] = g.group(3)

    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        if v == "ReqNodeNotAvail":
            all_jobs[k] = "Running"
            break

    set_squeue(g)

    return 0

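# Fires once node manager has seen the node busy; flip every fake node to
# idle so the shutdown logic can take over.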
def node_busy(g):
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
    return 0

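# A shutdown succeeded: forget the node.  Fails if the node was already gone
# (a duplicate or spurious shutdown).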
def node_shutdown(g):
    global compute_nodes
    if g.group(1) in compute_nodes:
        del compute_nodes[g.group(1)]
        return 0
    else:
        return 1


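# Push every fake job back to ReqNodeNotAvail so node manager wants a fresh
# set of nodes.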
def jobs_req(g):
    global all_jobs
    for k,v in all_jobs.items():
        all_jobs[k] = "ReqNodeNotAvail"
    set_squeue(g)
    return 0

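# Callbacks return 0 on success and nonzero to fail the test.  Action
# callbacks receive the regexp match; check callbacks also receive the checks
# table and the pattern so they can rebind themselves, as expect_count does to
# allow a pattern at most `count` matches.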
def noop(g):
    return 0

def fail(checks, pattern, g):
    return 1

def expect_count(count, checks, pattern, g):
    if count == 0:
        return 1
    else:
        checks[pattern] = partial(expect_count, count-1)
        return 0

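# Run one test.  `actions` is an ordered list of (pattern, callback) pairs
# that must match the log in sequence; `checks` maps patterns that must not
# match (or may match only a limited number of times) to their callbacks;
# `driver_class`, `jobs`, and `provider` parameterize the node manager
# configuration.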
def run_test(name, actions, checks, driver_class, jobs, provider):
    code = 0
    global unsatisfiable_job_scancelled
    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
                                                "scancel_called")

    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()

    logger.info("Start %s", name)

    global fake_slurm
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)

    global compute_nodes
    compute_nodes = {}

    global all_jobs
    all_jobs = jobs

    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]

    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")

    # Write configuration file for test
    with open("tests/fake_%s.cfg.template" % provider) as f:
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))

    # Tests must complete in less than 30 seconds.
    timeout = time.time() + 30
    terminated = False

    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)

    # Test main loop:
    # - Read line
    # - Apply negative checks (things that are not supposed to happen)
    # - Check timeout
    # - Check if the next action should trigger
    # - If all actions are exhausted, terminate with test success
    # - If it hits timeout with actions remaining, terminate with test failed
    try:
        # naive line iteration over pipes gets buffered, which isn't what we want,
        # see https://bugs.python.org/issue3907
        for line in iter(p.stderr.readline, ""):
            detail_content.write(line)

            for k,v in checks.items():
                g = re.match(k, line)
                if g:
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                    if code != 0:
                        detail.error("Check failed")
                        if not terminated:
                            p.kill()
                            terminated = True

            if terminated:
                continue

            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
                code += 1
                if not terminated:
                    p.kill()
                    terminated = True

            k, v = actions[0]
            g = re.match(k, line)
            if g:
                detail.info("Matched action %s", k)
                actions.pop(0)
                code += v(g)
                if code != 0:
                    detail.error("Action failed")
                    p.kill()
                    terminated = True

            if not actions:
                p.kill()
                terminated = True
    except KeyboardInterrupt:
        p.kill()

    if actions:
        detail.error("Ended with remaining actions: %s", actions)
        code = 1

    shutil.rmtree(fake_slurm)
    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))

    if code == 0:
        logger.info("%s passed", name)
    else:
        if isinstance(detail_content, StringIO.StringIO):
            detail_content.seek(0)
            chunk = detail_content.read(4096)
            while chunk:
                try:
                    sys.stderr.write(chunk)
                    chunk = detail_content.read(4096)
                except IOError as e:
                    if e.errno == errno.EAGAIN:
                        # try again (probably pipe buffer full)
                        pass
                    else:
                        raise
        logger.info("%s failed", name)

    return code


def main():
    # Test lifecycle.

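    # Each test is a tuple of (actions, checks, driver class, initial job
    # states, cloud provider); see run_test above.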
    tests = {
        "test_unsatisfiable_jobs" : (
            # Actions (pattern -> action)
            [
                (r".*Daemon started", set_queue_unsatisfiable),
                (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
            ],
            # Checks (things that shouldn't happen)
            {
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
                r".*Trying to cancel job '(\S+)'": fail,
            },
            # Driver class
            "arvnodeman.test.fake_driver.FakeDriver",
            # Jobs
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
            # Provider
            "azure"),
290         "test_single_node_azure": (
291             # Actions (pattern -> action)
292             [
293                 (r".*Daemon started", set_squeue),
294                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
295                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
296                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
297                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
298             ],
299             # Checks (things that shouldn't happen)
300             {
301                 r".*Suggesting shutdown because node state is \('down', .*\)": fail,
302                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
303                 r".*Setting node quota.*": fail,
304             },
305             # Driver class
306             "arvnodeman.test.fake_driver.FakeDriver",
307             # Jobs
308             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
309             # Provider
310             "azure"),
311         "test_multiple_nodes": (
312             # Actions (pattern -> action)
313             [
314                 (r".*Daemon started", set_squeue),
315                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
316                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
317                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
318                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
319                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
320                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
321                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
322                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
323                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
324                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
325             ],
326             # Checks (things that shouldn't happen)
327             {
328                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
329                 r".*Setting node quota.*": fail,
330             },
331             # Driver class
332             "arvnodeman.test.fake_driver.FakeDriver",
333             # Jobs
334             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
335              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
336              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
337              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
338             # Provider
339             "azure"),
340         "test_hit_quota": (
341             # Actions (pattern -> action)
342             [
343                 (r".*Daemon started", set_squeue),
344                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
345                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
346                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
347                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
348                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
349                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
350             ],
351             # Checks (things that shouldn't happen)
352             {
353                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
354                 r".*Sending create_node request.*": partial(expect_count, 5)
355             },
356             # Driver class
357             "arvnodeman.test.fake_driver.QuotaDriver",
358             # Jobs
359             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
360              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
361              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
362              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
363             # Provider
364             "azure"),
365         "test_probe_quota": (
366             # Actions (pattern -> action)
367             [
368                 (r".*Daemon started", set_squeue),
369                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
370                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
371                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
372                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
373                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
374                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
375                 (r".*sending request", jobs_req),
376                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
377                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
378                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
379                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
380                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
381                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
382                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
383                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
384                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
385                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
386             ],
387             # Checks (things that shouldn't happen)
388             {
389                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
390                 r".*Sending create_node request.*": partial(expect_count, 9)
391             },
392             # Driver class
393             "arvnodeman.test.fake_driver.QuotaDriver",
394             # Jobs
395             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
396              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
397              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
398              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
399             # Provider
400             "azure"),
401         "test_no_hang_failing_node_create": (
402             # Actions (pattern -> action)
403             [
404                 (r".*Daemon started", set_squeue),
405                 (r".*Client error: nope", noop),
406                 (r".*Client error: nope", noop),
407                 (r".*Client error: nope", noop),
408                 (r".*Client error: nope", noop),
409             ],
410             # Checks (things that shouldn't happen)
411             {},
412             # Driver class
413             "arvnodeman.test.fake_driver.FailingDriver",
414             # Jobs
415             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
416              "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
417              "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
418              "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
419             # Provider
420             "azure"),
421         "test_retry_create": (
422             # Actions (pattern -> action)
423             [
424                 (r".*Daemon started", set_squeue),
425                 (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
426                 (r".*Rate limit exceeded - scheduling retry in 1 seconds", noop),
427                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
428             ],
429             # Checks (things that shouldn't happen)
430             {},
431             # Driver class
432             "arvnodeman.test.fake_driver.RetryDriver",
433             # Jobs
434             {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
435             # Provider
436             "azure"),
437         "test_single_node_aws": (
438             # Actions (pattern -> action)
439             [
440                 (r".*Daemon started", set_squeue),
441                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
442                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
443                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
444                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
445             ],
446             # Checks (things that shouldn't happen)
447             {
448                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
449                 r".*Setting node quota.*": fail,
450             },
451             # Driver class
452             "arvnodeman.test.fake_driver.FakeAwsDriver",
453             # Jobs
454             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
455             # Provider
456             "ec2"),
457         "test_single_node_gce": (
458             # Actions (pattern -> action)
459             [
460                 (r".*Daemon started", set_squeue),
461                 (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
462                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
463                 (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
464                 (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
465             ],
466             # Checks (things that shouldn't happen)
467             {
468                 r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
469                 r".*Setting node quota.*": fail,
470             },
471             # Driver class
472             "arvnodeman.test.fake_driver.FakeGceDriver",
473             # Jobs
474             {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
475             # Provider
476             "gce")
477     }
478
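    # Run the single test named on the command line, or every test in sorted
    # order.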
    code = 0
    if len(sys.argv) > 1:
        code = run_test(sys.argv[1], *tests[sys.argv[1]])
    else:
        for t in sorted(tests.keys()):
            code += run_test(t, *tests[t])

    if code == 0:
        logger.info("Tests passed")
    else:
        logger.info("Tests failed")

    sys.exit(code)

if __name__ == '__main__':
    main()