#!/usr/bin/env python
"""Integration test framework for node manager.

Runs full node manager with an API server (needs ARVADOS_API_HOST and
ARVADOS_API_TOKEN).  Stubs out the cloud driver and slurm commands to mock
specific behaviors.  Monitors the log output to verify an expected sequence of
events or behaviors for each test.

"""

import subprocess
import os
import sys
import re
import time
import logging
import stat
import tempfile
import shutil
from functools import partial
import arvados
import StringIO

logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stderr))

detail = logging.getLogger("detail")
detail.setLevel(logging.INFO)
if os.environ.get("ANMTEST_LOGLEVEL"):
    detail_content = sys.stderr
else:
    detail_content = StringIO.StringIO()
detail.addHandler(logging.StreamHandler(detail_content))

fake_slurm = None
compute_nodes = None
all_jobs = None

def update_script(path, val):
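    """Atomically install the script at `path` with contents `val`.

    Writes to a temporary name, marks it executable, then renames it over
    the destination so callers never execute a partially written script.
    """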
    with open(path+"_", "w") as f:
        f.write(val)
    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    os.rename(path+"_", path)
    detail.info("Update script %s: %s", path, val)

def set_squeue(g):
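    """Regenerate the fake `squeue` script from the current job states.

    Each entry in all_jobs becomes one line of output, e.g.
    "1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9".
    """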
    global all_jobs
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
                  "\n".join("echo '1|100|100|%s|%s'" % (v, k) for k,v in all_jobs.items()))
    return 0


def node_paired(g):
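    """Handle a cloud node pairing with an Arvados node.

    Records the node's hostname, reports every known node as 'alloc' in the
    fake `sinfo`, and flips one waiting job to Running.
    """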
    global compute_nodes
    compute_nodes[g.group(1)] = g.group(3)

    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        if v == "ReqNodeNotAvail":
            all_jobs[k] = "Running"
            break

    set_squeue(g)

    return 0

def remaining_jobs(g):
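    """Report every known node as 'alloc' and every job as Running."""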
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s alloc'" % (v) for k,v in compute_nodes.items()))

    for k,v in all_jobs.items():
        all_jobs[k] = "Running"

    set_squeue(g)

    return 0


def node_busy(g):
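    """Report every known node as 'idle' so it becomes a shutdown candidate."""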
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
                  "\n".join("echo '%s idle'" % (v) for k,v in compute_nodes.items()))
    return 0

def node_shutdown(g):
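    """Forget a node that has shut down so it drops out of later sinfo updates."""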
    global compute_nodes
    del compute_nodes[g.group(1)]
    return 0

def jobs_req(g):
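    """Return every job to ReqNodeNotAvail and refresh the fake `squeue`."""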
    global all_jobs
    for k,v in all_jobs.items():
        all_jobs[k] = "ReqNodeNotAvail"
    set_squeue(g)
    return 0

def noop(g):
    return 0

def fail(checks, pattern, g):
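    """Negative check: any line matching `pattern` fails the test."""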
    return 1

def expect_count(count, checks, pattern, g):
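    """Allow `pattern` to match at most `count` times.

    Each match re-registers the check with a decremented budget; a match
    after the budget is exhausted fails the test.
    """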
    if count == 0:
        return 1
    else:
        checks[pattern] = partial(expect_count, count-1)
        return 0

def run_test(name, actions, checks, driver_class, jobs):
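    """Run a single test case and return 0 on success, nonzero on failure.

    name: test name, for logging.
    actions: ordered list of (regex, callback) pairs; the log output must
      produce a line matching each regex, in order.
    checks: dict of regex -> callback, applied to every log line.
    driver_class: dotted name of the fake cloud driver to put in the config.
    jobs: dict of job uuid -> initial squeue state.
    """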
    code = 0

    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()

    logger.info("Start %s", name)

    global fake_slurm
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)

    global compute_nodes
    compute_nodes = {}

    global all_jobs
    all_jobs = jobs

    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]

    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")

    # Write configuration file for test
    with open("tests/fake.cfg.template") as f:
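        # An empty file stands in for the SSH public key referenced by the config.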
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))

    # Tests must complete in less than 3 minutes.
    timeout = time.time() + 180
    terminated = False

    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)

    # Test main loop:
    # - Read a line
    # - Apply negative checks (things that are not supposed to happen)
    # - Check timeout
    # - Check if the next action should trigger
    # - If all actions are exhausted, terminate with test success
    # - If it hits the timeout with actions remaining, terminate with test failure
    try:
        # naive line iteration over pipes gets buffered, which isn't what we want,
        # see https://bugs.python.org/issue3907
        for line in iter(p.stderr.readline, ""):
            detail_content.write(line)

            for k,v in checks.items():
                g = re.match(k, line)
                if g:
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                    if code != 0:
                        detail.error("Check failed")
                        if not terminated:
                            p.terminate()
                            terminated = True

            if terminated:
                continue

            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
                code += 1
                if not terminated:
                    p.terminate()
                    terminated = True

            k, v = actions[0]
            g = re.match(k, line)
            if g:
                detail.info("Matched action %s", k)
                actions.pop(0)
                code += v(g)
                if code != 0:
                    detail.error("Action failed")
                    p.terminate()
                    terminated = True

            if not actions:
                p.terminate()
                terminated = True
    except KeyboardInterrupt:
        p.kill()

    if actions:
        detail.error("Ended with remaining actions: %s", actions)
        code = 1

    shutil.rmtree(fake_slurm)

    if code == 0:
        logger.info("%s passed", name)
    else:
        if isinstance(detail_content, StringIO.StringIO):
            sys.stderr.write(detail_content.getvalue())
        logger.info("%s failed", name)

    return code


def main():
    # Test lifecycle.

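    # Each test is a tuple of:
    #   actions: ordered list of (regex, callback) pairs; only the head of
    #     the list is tried against each log line
    #   checks: dict of regex -> callback applied to every log line
    #   driver_class: fake cloud driver to substitute into the config
    #   jobs: dict of job uuid -> initial squeue state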
    tests = {
        "test_single_node": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
                r".*Setting node quota.*": fail,
            },
            "arvnodeman.test.fake_driver.FakeDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}),
        "test_multiple_nodes": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
                r".*Setting node quota.*": fail,
            },
            "arvnodeman.test.fake_driver.FakeDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
         }),
        "test_hit_quota": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
                r".*Sending create_node request.*": partial(expect_count, 5)
            },
            "arvnodeman.test.fake_driver.QuotaDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
         }),
        "test_probe_quota": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", remaining_jobs),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*sending request", jobs_req),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
            ], {
                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
                r".*Sending create_node request.*": partial(expect_count, 9)
            },
            "arvnodeman.test.fake_driver.QuotaDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
         }),
        "test_no_hang_failing_node_create": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
                (r".*Client error: nope", noop),
            ],
            {},
            "arvnodeman.test.fake_driver.FailingDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
         }),
        "test_retry_create": (
            [
                (r".*Daemon started", set_squeue),
                (r".*Rate limit exceeded - scheduling retry in 12 seconds", noop),
                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
            ],
            {},
            "arvnodeman.test.fake_driver.RetryDriver",
            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
         })
    }

    code = 0
    if len(sys.argv) > 1:
        code = run_test(sys.argv[1], *tests[sys.argv[1]])
    else:
        for t in sorted(tests.keys()):
            code += run_test(t, *tests[t])

    if code == 0:
        logger.info("Tests passed")
    else:
        logger.info("Tests failed")

    exit(code)

if __name__ == '__main__':
    main()