12298: Allow non-null log when cancelling an unrunnable container.
[arvados.git] / services / api / test / unit / crunch_dispatch_test.rb
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: AGPL-3.0
4
5 require 'test_helper'
6 require 'crunch_dispatch'
7 require 'helpers/git_test_helper'
8
# Unit tests for CrunchDispatch: node selection by price, reaction to
# SIGTERM, construction of the crunch-job command line, log rate
# limiting, and cleanup of orphaned slurm jobs.
class CrunchDispatchTest < ActiveSupport::TestCase
  include GitTestHelper

  test 'choose cheaper nodes first' do
    act_as_system_user do
      # Replace test fixtures with a set suitable for testing dispatch
      Node.destroy_all

      # Idle nodes with different prices
      [['compute1', 3.20, 32],
       ['compute2', 1.60, 16],
       ['compute3', 0.80, 8]].each do |hostname, price, cores|
        Node.create!(hostname: hostname,
                     info: {
                       'slurm_state' => 'idle',
                     },
                     properties: {
                       'cloud_node' => {
                         'price' => price,
                       },
                       'total_cpu_cores' => cores,
                       'total_ram_mb' => cores*1024,
                       'total_scratch_mb' => cores*10000,
                     })
      end

      # Node with no price information
      Node.create!(hostname: 'compute4',
                   info: {
                     'slurm_state' => 'idle',
                   },
                   properties: {
                     'total_cpu_cores' => 8,
                     'total_ram_mb' => 8192,
                     'total_scratch_mb' => 80000,
                   })

      # Cheap but busy node
      Node.create!(hostname: 'compute5',
                   info: {
                     'slurm_state' => 'alloc',
                   },
                   properties: {
                     'cloud_node' => {
                       'price' => 0.10,
                     },
                     'total_cpu_cores' => 32,
                     'total_ram_mb' => 32768,
                     'total_scratch_mb' => 320000,
                   })
    end

    dispatch = CrunchDispatch.new
    # Each row: [min_nodes, min_ram_mb_per_node, expected hostnames in order].
    # The busy node (compute5) must never be offered even though it is the
    # cheapest, and the unpriced node (compute4) is expected ahead of the
    # priced ones when it satisfies the RAM constraint.
    [[1, 16384, ['compute2']],
     [2, 16384, ['compute2', 'compute1']],
     [2, 8000, ['compute4', 'compute3']],
    ].each do |min_nodes, min_ram, expect_nodes|
      job = Job.new(uuid: 'zzzzz-8i9sb-382lhiizavzhqlp',
                    runtime_constraints: {
                      'min_nodes' => min_nodes,
                      'min_ram_mb_per_node' => min_ram,
                    })
      nodes = dispatch.nodes_available_for_job_now job
      assert_equal expect_nodes, nodes
    end
  end

  test 'respond to TERM' do
    lockfile = Rails.root.join 'tmp', 'dispatch.lock'
    # Remember the previous value so this test does not leak its lockfile
    # setting into tests that run later in the same process.
    saved_lockfile_env = ENV['CRUNCH_DISPATCH_LOCKFILE']
    ENV['CRUNCH_DISPATCH_LOCKFILE'] = lockfile.to_s
    pid = nil
    begin
      begin
        pid = Process.fork do
          begin
            # Abandon database connections inherited from parent
            # process.  Credit to
            # https://github.com/kstephens/rails_is_forked
            ActiveRecord::Base.connection_handler.connection_pools.each_value do |pool|
              pool.instance_eval do
                @reserved_connections = {}
                @connections = []
              end
            end
            ActiveRecord::Base.establish_connection

            dispatch = CrunchDispatch.new
            dispatch.stubs(:did_recently).returns true
            dispatch.run []
          ensure
            # exit! skips at_exit handlers, so the child never falls
            # through into the parent's test runner.
            Process.exit!
          end
        end
        assert_with_timeout 5, "Dispatch did not lock #{lockfile}" do
          !can_lock(lockfile)
        end
      ensure
        # pid is nil if fork itself failed; signalling nil would raise a
        # TypeError and hide the original error.
        if pid
          Process.kill("TERM", pid)
          # Reap the child in the background once it exits so it does not
          # stay around as a zombie.  Detach only after signalling, so the
          # kill above can never hit an already-reaped pid.
          Process.detach(pid)
        end
      end
      assert_with_timeout 20, "Dispatch did not unlock #{lockfile}" do
        can_lock(lockfile)
      end
    ensure
      # Assigning nil removes the variable if it was not set before.
      ENV['CRUNCH_DISPATCH_LOCKFILE'] = saved_lockfile_env
    end
  end

  test 'override --cgroup-root with CRUNCH_CGROUP_ROOT' do
    saved_cgroup_root = ENV['CRUNCH_CGROUP_ROOT']
    ENV['CRUNCH_CGROUP_ROOT'] = '/path/to/cgroup'
    Rails.configuration.crunch_job_wrapper = :none
    begin
      act_as_system_user do
        # create! so that a validation failure is reported directly
        # instead of surfacing later as an unexplained failed assertion.
        j = Job.create!(repository: 'active/foo',
                        script: 'hash',
                        script_version: '4fe459abe02d9b365932b8f5dc419439ab4e2577',
                        script_parameters: {})
        ok = false
        Open3.expects(:popen3).at_least_once.with do |*args|
          # Only inspect the invocation for the job created above; other
          # queued jobs may be started too.
          if args.index(j.uuid)
            i = args.index '--cgroup-root'
            ok = !i.nil? && args[i+1] == '/path/to/cgroup'
          end
          true
        end.raises(StandardError.new('all is well'))
        dispatch = CrunchDispatch.new
        dispatch.parse_argv ['--jobs']
        dispatch.refresh_todo
        dispatch.start_jobs
        assert ok, "popen3 was not called with --cgroup-root /path/to/cgroup for job #{j.uuid}"
      end
    ensure
      # Assigning nil removes the variable if it was not set before.
      ENV['CRUNCH_CGROUP_ROOT'] = saved_cgroup_root
    end
  end

  # Polls the given block about every 0.1 seconds until it returns a
  # truthy value.  The block is always evaluated at least once.  If it
  # has not succeeded once +timeout+ seconds of wall-clock time have
  # passed, the test fails with +message+ (plus the waited duration).
  def assert_with_timeout timeout, message
    # A monotonic deadline also accounts for time spent inside the block
    # and avoids accumulating floating point error.
    deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout
    loop do
      return if yield
      break if Process.clock_gettime(Process::CLOCK_MONOTONIC) >= deadline
      sleep 0.1
    end
    assert false, "#{message} (waited #{timeout} seconds)"
  end

  # Tries to take an exclusive, non-blocking flock on +lockfile+ (an
  # object responding to #open like Pathname), creating the file if
  # needed.  Returns a truthy value (0) when the lock could be taken and
  # false when another process holds it.  The file is closed, and the
  # lock therefore released, before returning.
  def can_lock lockfile
    lockfile.open(File::RDWR|File::CREAT, 0644) do |f|
      f.flock(File::LOCK_EX|File::LOCK_NB)
    end
  end

  test 'rate limit of partial line segments' do
    act_as_system_user do
      Rails.configuration.crunch_log_partial_line_throttle_period = 1

      # Minimal stand-in for the per-job throttle state that rate_limit
      # reads and updates.
      job = {
        bytes_logged: 0,
        log_throttle_bytes_so_far: 0,
        log_throttle_lines_so_far: 0,
        log_throttle_bytes_skipped: 0,
        log_throttle_is_open: true,
        log_throttle_partial_line_last_at: Time.new(0),
        log_throttle_first_partial_line: true,
      }

      dispatch = CrunchDispatch.new

      line = "first log line"
      limit = dispatch.rate_limit(job, line)
      assert_equal true, limit
      assert_equal "first log line", line
      assert_equal 1, job[:log_throttle_lines_so_far]

      # first partial line segment is skipped and counted towards skipped lines
      now = Time.now.strftime('%Y-%m-%d-%H:%M:%S')
      line = "#{now} localhost 100 0 stderr [...] this is first partial line segment [...]"
      limit = dispatch.rate_limit(job, line)
      assert_equal true, limit
      assert_includes line, "Rate-limiting partial segments of long lines", line
      assert_equal 2, job[:log_throttle_lines_so_far]

      # next partial line segment within throttle interval is skipped but not counted towards skipped lines
      line = "#{now} localhost 100 0 stderr [...] second partial line segment within the interval [...]"
      limit = dispatch.rate_limit(job, line)
      assert_equal false, limit
      assert_equal 2, job[:log_throttle_lines_so_far]

      # next partial line after interval is counted towards skipped lines
      # (sleep for the throttle period configured at the top of this test)
      sleep(1)
      line = "#{now} localhost 100 0 stderr [...] third partial line segment after the interval [...]"
      limit = dispatch.rate_limit(job, line)
      assert_equal false, limit
      assert_equal 3, job[:log_throttle_lines_so_far]

      # this is not a valid line segment
      line = "#{now} localhost 100 0 stderr [...] does not end with [...] and is not a partial segment"
      limit = dispatch.rate_limit(job, line)
      assert_equal true, limit
      assert_equal "#{now} localhost 100 0 stderr [...] does not end with [...] and is not a partial segment", line
      assert_equal 4, job[:log_throttle_lines_so_far]

      # this also is not a valid line segment
      line = "#{now} localhost 100 0 stderr does not start correctly but ends with [...]"
      limit = dispatch.rate_limit(job, line)
      assert_equal true, limit
      assert_equal "#{now} localhost 100 0 stderr does not start correctly but ends with [...]", line
      assert_equal 5, job[:log_throttle_lines_so_far]
    end
  end

  test 'scancel orphaned job nodes' do
    Rails.configuration.crunch_job_wrapper = :slurm_immediate
    act_as_system_user do
      dispatch = CrunchDispatch.new

      # Canned squeue output: two well-formed job uuids, one string that
      # is not a uuid at all, and one uuid with a different type infix.
      # These real pipes must be opened before IO.popen is stubbed below.
      squeue_resp = IO.popen("echo zzzzz-8i9sb-pshmckwoma9plh7\necho thisisnotvalidjobuuid\necho zzzzz-8i9sb-4cf0abc123e809j\necho zzzzz-dz642-o04e3r651turtdr\n")
      scancel_resp = IO.popen("true")

      IO.expects(:popen).
        with(['squeue', '-a', '-h', '-o', '%j']).
        returns(squeue_resp)

      # Only this one name is expected to be cancelled.
      # NOTE(review): presumably the other job uuid matches a fixture job
      # that is still considered active -- confirm against fixtures.
      IO.expects(:popen).
        with(dispatch.sudo_preface + ['scancel', '-n', 'zzzzz-8i9sb-4cf0abc123e809j']).
        returns(scancel_resp)

      dispatch.check_orphaned_slurm_jobs
    end
  end
end