10312: Integration test framework for node manager, runs full node manager with
[arvados.git] / services / nodemanager / tests / integration_test.py
1 #!/usr/bin/env python
2 import subprocess
3 import os
4 import sys
5 import re
6 import time
7 import logging
8 import stat
9 import tempfile
10 import shutil
11
# Configure the root logger so INFO-level messages and above are emitted.
logging.basicConfig(level=logging.INFO)

# Module-wide state, both assigned by run_test():
#   fake_slurm    -- path of the temporary directory holding the stub
#                    "squeue" and "sinfo" scripts.
#   compute_nodes -- dict filled in by set_sinfo_alloc() and read back by
#                    set_sinfo_idle().
fake_slurm = compute_nodes = None
16
def update_script(path, val):
    """Install ``val`` as an owner-only executable file at ``path``.

    The text is first written to a sibling file named ``path + "_"``.  That
    file is given read, write and execute permission for the owner only and
    is then renamed onto ``path``, so ``path`` is replaced in a single rename
    step rather than being rewritten in place.
    """
    staging_path = path + "_"
    with open(staging_path, "w") as script:
        script.write(val)
    # S_IRWXU is the owner read | write | execute mask (0o700).
    os.chmod(staging_path, stat.S_IRWXU)
    os.rename(staging_path, path)
22
23
def set_squeue(actions, checks, k, g):
    """Action handler: replace the stub ``squeue`` script.

    The new script prints one fixed line whose fourth ``|``-separated field
    is ``ReqNodeNotAvail``.  None of the handler arguments are used.

    Returns 0.
    """
    squeue_script = (
        "#!/bin/sh\n"
        "echo '1|100|100|ReqNodeNotAvail|34t0i-dz642-h42bg3hq4bdfpf9'\n"
    )
    update_script(os.path.join(fake_slurm, "squeue"), squeue_script)
    return 0
29
def set_sinfo_alloc(actions, checks, k, g):
    """Action handler: report the matched hostname as ``alloc``.

    ``g`` is the regex match for the triggering log line; group 3 is used as
    the hostname and group 1 as the key under which that hostname is
    remembered in ``compute_nodes``.  The stub ``sinfo`` script is rewritten
    to print ``<hostname> alloc`` and the stub ``squeue`` script to print a
    fixed line whose fourth field is ``Running``.

    Returns 0.
    """
    hostname = g.group(3)

    sinfo_script = "#!/bin/sh\necho '%s alloc'\n" % hostname
    update_script(os.path.join(fake_slurm, "sinfo"), sinfo_script)

    squeue_script = (
        "#!/bin/sh\n"
        "echo '1|100|100|Running|34t0i-dz642-h42bg3hq4bdfpf9'\n"
    )
    update_script(os.path.join(fake_slurm, "squeue"), squeue_script)

    # Only an item is stored in the module-level dict, so no ``global``
    # declaration is required.
    compute_nodes[g.group(1)] = hostname
    return 0
42
def set_sinfo_idle(actions, checks, k, g):
    """Action handler: report a previously recorded hostname as ``idle``.

    Group 1 of the match ``g`` is looked up in ``compute_nodes`` and the
    stub ``sinfo`` script is rewritten to print ``<hostname> idle``.  A
    ``KeyError`` is raised if nothing was recorded under that key.

    Returns 0.
    """
    sinfo_path = os.path.join(fake_slurm, "sinfo")
    hostname = compute_nodes[g.group(1)]
    update_script(sinfo_path, "#!/bin/sh\necho '%s idle'\n" % hostname)
    return 0
48
def noop(actions, checks, k, g):
    """Handler that does nothing and reports success by returning 0."""
    success = 0
    return success
51
def down_fail(actions, checks, k, g):
    """Handler that always reports failure by returning 1."""
    failure = 1
    return failure
54
55
def _stop_node_manager(proc):
    """Ask ``proc`` to terminate unless it has already exited."""
    if proc.poll() is None:
        proc.terminate()


def run_test(actions, checks, driver_class):
    """Run node manager against stub slurm commands, driven by its own log.

    A temporary directory is created containing stub ``squeue`` and ``sinfo``
    scripts, an empty ``id_rsa.pub`` and a ``fake.cfg`` rendered from
    ``tests/fake.cfg.template``.  The directory is put first on ``PATH`` and
    ``bin/arvados-node-manager`` is started in the foreground with that
    config (both relative paths are resolved against the current working
    directory).  Every line the process writes to stderr is echoed to stdout
    and matched against the handlers.

    Args:
        actions: dict mapping a regex to a handler called as
            ``handler(actions, checks, pattern, match)``.  An action is
            removed from the dict the first time its regex matches, and the
            process is asked to stop once the dict is empty.  The dict is
            modified in place.
        checks: dict of the same shape; checks are never removed and run on
            every line.
        driver_class: value substituted for ``{driver_class}`` in the config
            template.

    Returns:
        0 if every action fired and no handler reported failure; otherwise
        the first nonzero handler result, or 1 on timeout or when the process
        ends with actions still pending.

    Raises:
        KeyError: if ``ARVADOS_API_HOST``, ``ARVADOS_API_TOKEN`` or ``PATH``
            is missing from the environment.
    """
    global fake_slurm, compute_nodes

    code = 0

    fake_slurm = tempfile.mkdtemp()
    logging.info("fake_slurm is %s", fake_slurm)
    compute_nodes = {}

    # Put the stub commands ahead of any real slurm binaries.
    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]

    # Start with stubs that print nothing.
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")

    ssh_key = os.path.join(fake_slurm, "id_rsa.pub")
    config_path = os.path.join(fake_slurm, "fake.cfg")
    with open("tests/fake.cfg.template") as template:
        # The key file only needs to exist; it is left empty.
        with open(ssh_key, "w"):
            pass
        with open(config_path, "w") as cfg:
            cfg.write(template.read().format(
                host=os.environ["ARVADOS_API_HOST"],
                token=os.environ["ARVADOS_API_TOKEN"],
                driver_class=driver_class,
                ssh_key=ssh_key))

    # NOTE: the deadline is only evaluated when a log line arrives, so a
    # completely silent process is not interrupted by it.
    timeout = time.time() + 300

    # universal_newlines=True yields text lines on both Python 2 and 3, so
    # the "" sentinel below and the str regexes work on either.
    p = subprocess.Popen(
        ["bin/arvados-node-manager", "--foreground",
         "--config", config_path],
        bufsize=1, stderr=subprocess.PIPE, universal_newlines=True, env=env)
    try:
        # readline() delivers each line as soon as it is written; iterating
        # the pipe object directly goes through a read-ahead buffer on
        # Python 2 and delays the lines the handlers react to.
        for line in iter(p.stderr.readline, ""):
            sys.stdout.write(line)

            if time.time() > timeout:
                logging.error("Exceeded timeout")
                code = code or 1
                _stop_node_manager(p)

            # Once a failure is recorded no further actions are fired, so a
            # later successful handler cannot reset the result to 0.
            if code == 0:
                # Iterate over a snapshot: entries are deleted as they fire.
                for k, v in list(actions.items()):
                    g = re.match(k, line)
                    if not g:
                        continue
                    logging.info("Triggered action %s", k)
                    del actions[k]
                    result = v(actions, checks, k, g)
                    if result != 0:
                        logging.error("Action failed")
                        code = result
                        _stop_node_manager(p)
                        break

            for k, v in list(checks.items()):
                g = re.match(k, line)
                if not g:
                    continue
                logging.info("Triggered check %s", k)
                result = v(actions, checks, k, g)
                if result != 0:
                    logging.error("Check failed")
                    # Keep the first failure code.
                    code = code or result
                    _stop_node_manager(p)

            if not actions:
                # Every expected event was seen; shut the process down.
                _stop_node_manager(p)
    finally:
        # Never leave the child running or unreaped, even if a handler raised.
        _stop_node_manager(p)
        p.stderr.close()
        p.wait()

    if actions and code == 0:
        # The process ended before every expected event was observed.
        logging.error("Node manager exited with actions still pending: %s",
                      list(actions))
        code = 1

    # NOTE(review): removal of the temporary directory is disabled, so it is
    # left behind after each run -- presumably to allow inspecting the
    # generated config and scripts; confirm before enabling.
    #shutil.rmtree(fake_slurm)

    return code
118
119
def main():
    """Run the FakeDriver scenario and exit with its result code.

    The scenario is driven by node manager log lines: each regex below is
    matched against every line and its handler runs on the first match.
    The process exit status is the value returned by ``run_test``.
    """
    code = run_test({
        # Once the daemon is up, make squeue report a job.
        r".*Daemon started": set_squeue,
        # When a cloud node is paired, report its hostname as allocated.
        r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": set_sinfo_alloc,
        # When the node is seen busy, switch it to idle.
        r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)": set_sinfo_idle,
        # The remaining lines only need to appear.
        r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)": noop,
        r".*Shutdown success": noop,
    }, {
        # A shutdown suggested because the node is 'down' fails the test.
        r".*Suggesting shutdown because node state is \('down', .*\)": down_fail
    },
    "arvnodeman.test.fake_driver.FakeDriver")
    # sys.exit is used instead of the builtin exit(), which is installed by
    # the site module for interactive use and is missing under "python -S".
    sys.exit(code)
132
# Run the scenario only when executed as a script, so that importing this
# module (for example by a test collector) does not start node manager or
# exit the interpreter.
if __name__ == "__main__":
    main()