Merge branch '3877-log-memory-leak' closes #3877
[arvados.git] / services / api / script / update_node_attributes.rb
1 #!/usr/bin/env ruby
2
3 # Keep node.info[:running_job_uuid] and node.info[:slurm_state] up to date.
4 #
5 # use:     script/update_node_attributes.rb [rails_env] [update_interval]
6 # example: script/update_node_attributes.rb production 10
7
# Choose the Rails environment before the application is loaded; fall back to
# "development" when none is named on the command line.
ENV["RAILS_ENV"] = ARGV.first || "development"

# Argument passed to `sleep` between polling passes (second CLI argument,
# default 5).
@update_interval = (ARGV[1] || 5).to_i

# Load the Rails application that lives one directory above this script.
script_dir = File.dirname(__FILE__)
require script_dir + '/../config/boot'
require script_dir + '/../config/environment'

include ApplicationHelper
# NOTE(review): presumably grants this script system-user privileges for the
# Node updates below -- confirm against ApplicationHelper.
act_as_system_user

# Caches, keyed by node name, of the values most recently recorded for each
# node, so unchanged nodes are not looked up again on every pass.
@slurm_state = {}
@running_job_uuid = {}
19
# Run a shell command and return its output as an array of lines.
#
# The block form of IO.popen closes the pipe and waits for the child process
# before returning, so the long-running loop below does not accumulate open
# file descriptors or unreaped children.
def slurm_output_lines(command)
  IO.popen(command, &:readlines)
end

# Expand a Slurm node list token into individual node names.
#
#   "compute[0-2,7]" => ["compute0", "compute1", "compute2", "compute7"]
#   "compute3"       => ["compute3"]
#
# Numbers produced from a range are zero-padded to the width of the range's
# lower bound, so "n[08-10]" yields n08, n09, n10 (consistent with how
# single entries such as "n[08]" are kept verbatim).
def expand_slurm_nodenames(nodespec)
  bracketed = nodespec.match(/\A([^\[]*)\[([-\d,]+)\]\z/)
  return [nodespec] unless bracketed

  prefix = bracketed[1]
  bracketed[2].split(',').flat_map do |number_range|
    if number_range.include?('-')
      first, last = number_range.split('-', 2)
      (first.to_i..last.to_i).map do |n|
        "#{prefix}#{n.to_s.rjust(first.length, '0')}"
      end
    else
      ["#{prefix}#{number_range}"]
    end
  end
end

# Look up the Node record whose hostname is +nodename+ and whose slot number
# equals the trailing digits of that name.
#
# Raises RuntimeError when the name has no trailing digits or no matching
# record exists.
def find_node_by_name(nodename)
  slot = nodename[/(\d+)\z/, 1]
  node = slot && Node.
    where('slot_number=? and hostname=?', slot.to_i, nodename).
    first
  raise "Fatal: Node does not exist: #{nodename}" unless node
  node
end

loop do
  # Pass 1: record each node's slurm state. In the sinfo output the first
  # token is the node name (or bracketed node list) and the last is the state.
  slurm_output_lines('sinfo --noheader --Node || true').each do |line|
    tokens = line.split
    next if tokens.empty?
    nodestate = tokens.last.downcase

    expand_slurm_nodenames(tokens.first).each do |nodename|
      # Only touch the database when the state differs from the cached one.
      next if @slurm_state[nodename] == nodestate

      has_no_job = !%w[alloc comp].include?(nodestate)
      node = find_node_by_name(nodename)

      puts "Node #{node.uuid} slot #{node.slot_number} name #{node.hostname} state #{nodestate}#{' (has_no_job)' if has_no_job}"
      node_info_was = node.info.dup
      node.info[:slurm_state] = nodestate
      if has_no_job
        node.info[:running_job_uuid] = nil
        # Keep the job cache in step with what was just written, so the next
        # job reported on this node is always recorded.
        @running_job_uuid.delete(nodename)
      end
      if node_info_was != node.info && !node.save
        raise "Fail: update node #{node.uuid} state #{nodestate}"
      end
      @slurm_state[nodename] = nodestate
    end
  end

  # Pass 2: record which job is running on each node. The squeue format is
  # "%j %t %N": job name, job state, node list.
  slurm_output_lines('squeue --noheader --format="%j %t %N" || true').each do |line|
    tokens = line.split
    # A line without a third field has no node list (e.g. a job that has not
    # been allocated nodes yet), so there is nothing to update.
    next if tokens.size < 3
    running_job_uuid = tokens.first

    # The node list is the LAST token; the first one is the job name.
    expand_slurm_nodenames(tokens.last).each do |nodename|
      next if @running_job_uuid[nodename] == running_job_uuid

      node = find_node_by_name(nodename)
      puts "Node #{node.uuid} slot #{node.slot_number} name #{node.hostname} running_job_uuid #{running_job_uuid}"
      if node.info[:running_job_uuid] != running_job_uuid
        node.info[:running_job_uuid] = running_job_uuid
        unless node.save
          raise "Fail: update node #{node.uuid} running_job_uuid #{running_job_uuid}"
        end
      end
      @running_job_uuid[nodename] = running_job_uuid
    end
  end

  sleep @update_interval
end