Merge branch 'master' into 9587-trash-page
[arvados.git] / services / api / test / unit / crunch_dispatch_test.rb
1 require 'test_helper'
2 require 'crunch_dispatch'
3 require 'helpers/git_test_helper'
4
5 class CrunchDispatchTest < ActiveSupport::TestCase
6   include GitTestHelper
7
8   test 'choose cheaper nodes first' do
9     act_as_system_user do
10       # Replace test fixtures with a set suitable for testing dispatch
11       Node.destroy_all
12
13       # Idle nodes with different prices
14       [['compute1', 3.20, 32],
15        ['compute2', 1.60, 16],
16        ['compute3', 0.80, 8]].each do |hostname, price, cores|
17         Node.create!(hostname: hostname,
18                      info: {
19                        'slurm_state' => 'idle',
20                      },
21                      properties: {
22                        'cloud_node' => {
23                          'price' => price,
24                        },
25                        'total_cpu_cores' => cores,
26                        'total_ram_mb' => cores*1024,
27                        'total_scratch_mb' => cores*10000,
28                      })
29       end
30
31       # Node with no price information
32       Node.create!(hostname: 'compute4',
33                    info: {
34                      'slurm_state' => 'idle',
35                    },
36                    properties: {
37                      'total_cpu_cores' => 8,
38                      'total_ram_mb' => 8192,
39                      'total_scratch_mb' => 80000,
40                    })
41
42       # Cheap but busy node
43       Node.create!(hostname: 'compute5',
44                    info: {
45                      'slurm_state' => 'alloc',
46                    },
47                    properties: {
48                      'cloud_node' => {
49                        'price' => 0.10,
50                      },
51                      'total_cpu_cores' => 32,
52                      'total_ram_mb' => 32768,
53                      'total_scratch_mb' => 320000,
54                    })
55     end
56
57     dispatch = CrunchDispatch.new
58     [[1, 16384, ['compute2']],
59      [2, 16384, ['compute2', 'compute1']],
60      [2, 8000, ['compute4', 'compute3']],
61     ].each do |min_nodes, min_ram, expect_nodes|
62       job = Job.new(uuid: 'zzzzz-8i9sb-382lhiizavzhqlp',
63                     runtime_constraints: {
64                       'min_nodes' => min_nodes,
65                       'min_ram_mb_per_node' => min_ram,
66                     })
67       nodes = dispatch.nodes_available_for_job_now job
68       assert_equal expect_nodes, nodes
69     end
70   end
71
72   test 'respond to TERM' do
73     lockfile = Rails.root.join 'tmp', 'dispatch.lock'
74     ENV['CRUNCH_DISPATCH_LOCKFILE'] = lockfile.to_s
75     begin
76       pid = Process.fork do
77         begin
78           # Abandon database connections inherited from parent
79           # process.  Credit to
80           # https://github.com/kstephens/rails_is_forked
81           ActiveRecord::Base.connection_handler.connection_pools.each_value do |pool|
82             pool.instance_eval do
83               @reserved_connections = {}
84               @connections = []
85             end
86           end
87           ActiveRecord::Base.establish_connection
88
89           dispatch = CrunchDispatch.new
90           dispatch.stubs(:did_recently).returns true
91           dispatch.run []
92         ensure
93           Process.exit!
94         end
95       end
96       assert_with_timeout 5, "Dispatch did not lock #{lockfile}" do
97         !can_lock(lockfile)
98       end
99     ensure
100       Process.kill("TERM", pid)
101     end
102     assert_with_timeout 20, "Dispatch did not unlock #{lockfile}" do
103       can_lock(lockfile)
104     end
105   end
106
107   test 'override --cgroup-root with CRUNCH_CGROUP_ROOT' do
108     ENV['CRUNCH_CGROUP_ROOT'] = '/path/to/cgroup'
109     Rails.configuration.crunch_job_wrapper = :none
110     act_as_system_user do
111       j = Job.create(repository: 'active/foo',
112                      script: 'hash',
113                      script_version: '4fe459abe02d9b365932b8f5dc419439ab4e2577',
114                      script_parameters: {})
115       ok = false
116       Open3.expects(:popen3).at_least_once.with do |*args|
117         if args.index(j.uuid)
118           ok = ((i = args.index '--cgroup-root') and
119                 (args[i+1] == '/path/to/cgroup'))
120         end
121         true
122       end.raises(StandardError.new('all is well'))
123       dispatch = CrunchDispatch.new
124       dispatch.parse_argv ['--jobs']
125       dispatch.refresh_todo
126       dispatch.start_jobs
127       assert ok
128     end
129   end
130
131   def assert_with_timeout timeout, message
132     t = 0
133     while (t += 0.1) < timeout
134       if yield
135         return
136       end
137       sleep 0.1
138     end
139     assert false, message + " (waited #{timeout} seconds)"
140   end
141
142   def can_lock lockfile
143     lockfile.open(File::RDWR|File::CREAT, 0644) do |f|
144       return f.flock(File::LOCK_EX|File::LOCK_NB)
145     end
146   end
147
148   test 'rate limit of partial line segments' do
149     act_as_system_user do
150       Rails.configuration.crunch_log_partial_line_throttle_period = 1
151
152       job = {}
153       job[:bytes_logged] = 0
154       job[:log_throttle_bytes_so_far] = 0
155       job[:log_throttle_lines_so_far] = 0
156       job[:log_throttle_bytes_skipped] = 0
157       job[:log_throttle_is_open] = true
158       job[:log_throttle_partial_line_last_at] = Time.new(0)
159       job[:log_throttle_first_partial_line] = true
160
161       dispatch = CrunchDispatch.new
162
163       line = "first log line"
164       limit = dispatch.rate_limit(job, line)
165       assert_equal true, limit
166       assert_equal "first log line", line
167       assert_equal 1, job[:log_throttle_lines_so_far]
168
169       # first partial line segment is skipped and counted towards skipped lines
170       now = Time.now.strftime('%Y-%m-%d-%H:%M:%S')
171       line = "#{now} localhost 100 0 stderr [...] this is first partial line segment [...]"
172       limit = dispatch.rate_limit(job, line)
173       assert_equal true, limit
174       assert_includes line, "Rate-limiting partial segments of long lines", line
175       assert_equal 2, job[:log_throttle_lines_so_far]
176
177       # next partial line segment within throttle interval is skipped but not counted towards skipped lines
178       line = "#{now} localhost 100 0 stderr [...] second partial line segment within the interval [...]"
179       limit = dispatch.rate_limit(job, line)
180       assert_equal false, limit
181       assert_equal 2, job[:log_throttle_lines_so_far]
182
183       # next partial line after interval is counted towards skipped lines
184       sleep(1)
185       line = "#{now} localhost 100 0 stderr [...] third partial line segment after the interval [...]"
186       limit = dispatch.rate_limit(job, line)
187       assert_equal false, limit
188       assert_equal 3, job[:log_throttle_lines_so_far]
189
190       # this is not a valid line segment
191       line = "#{now} localhost 100 0 stderr [...] does not end with [...] and is not a partial segment"
192       limit = dispatch.rate_limit(job, line)
193       assert_equal true, limit
194       assert_equal "#{now} localhost 100 0 stderr [...] does not end with [...] and is not a partial segment", line
195       assert_equal 4, job[:log_throttle_lines_so_far]
196
197       # this also is not a valid line segment
198       line = "#{now} localhost 100 0 stderr does not start correctly but ends with [...]"
199       limit = dispatch.rate_limit(job, line)
200       assert_equal true, limit
201       assert_equal "#{now} localhost 100 0 stderr does not start correctly but ends with [...]", line
202       assert_equal 5, job[:log_throttle_lines_so_far]
203     end
204   end
205
206   test 'scancel orphaned job nodes' do
207     Rails.configuration.crunch_job_wrapper = :slurm_immediate
208     act_as_system_user do
209       dispatch = CrunchDispatch.new
210
211       squeue_resp = IO.popen("echo zzzzz-8i9sb-pshmckwoma9plh7\necho thisisnotvalidjobuuid\necho zzzzz-8i9sb-4cf0abc123e809j\necho zzzzz-dz642-o04e3r651turtdr\n")
212       scancel_resp = IO.popen("true")
213
214       IO.expects(:popen).
215         with(['squeue', '-a', '-h', '-o', '%j']).
216         returns(squeue_resp)
217
218       IO.expects(:popen).
219         with(dispatch.sudo_preface + ['scancel', '-n', 'zzzzz-8i9sb-4cf0abc123e809j']).
220         returns(scancel_resp)
221
222       dispatch.check_orphaned_slurm_jobs
223     end
224   end
225 end