Merge branch 'master' of git.curoverse.com:arvados into 3408-production-datamanager
[arvados.git] / services / api / app / models / job.rb
1 class Job < ArvadosModel
  include HasUuid
  include KindAndEtag
  include CommonApiTemplate
  # docker_image_locator is protected from mass assignment; within this
  # class it is only written by find_docker_image_locator.
  attr_protected :docker_image_locator
  # These attributes are serialized as Hash values.
  serialize :script_parameters, Hash
  serialize :runtime_constraints, Hash
  serialize :tasks_summary, Hash
  before_create :ensure_unique_submit_id
  after_commit :trigger_crunch_dispatch_if_cancelled, :on => :update
  # before_validation hooks, declared in this order: default the
  # priority, then sync timestamps and the legacy running/success flags
  # from "state", then derive "state" from the legacy attributes.
  before_validation :set_priority
  before_validation :update_timestamps_when_state_changes
  before_validation :update_state_from_old_state_attrs
  validate :ensure_script_version_is_commit
  validate :find_docker_image_locator
  validate :validate_status
  validate :validate_state_change

  # commit_ancestors rows are joined on descendant == script_version.
  has_many :commit_ancestors, :foreign_key => :descendant, :primary_key => :script_version
  # Nodes whose job_uuid equals this job's uuid.
  has_many(:nodes, foreign_key: :job_uuid, primary_key: :uuid)
21
22   class SubmitIdReused < StandardError
23   end
24
25   api_accessible :user, extend: :common do |t|
26     t.add :submit_id
27     t.add :priority
28     t.add :script
29     t.add :script_parameters
30     t.add :script_version
31     t.add :cancelled_at
32     t.add :cancelled_by_client_uuid
33     t.add :cancelled_by_user_uuid
34     t.add :started_at
35     t.add :finished_at
36     t.add :output
37     t.add :success
38     t.add :running
39     t.add :state
40     t.add :is_locked_by_uuid
41     t.add :log
42     t.add :runtime_constraints
43     t.add :tasks_summary
44     t.add :dependencies
45     t.add :nondeterministic
46     t.add :repository
47     t.add :supplied_script_version
48     t.add :docker_image_locator
49     t.add :queue_position
50     t.add :node_uuids
51     t.add :description
52   end
53
54   # Supported states for a job
55   States = [
56             (Queued = 'Queued'),
57             (Running = 'Running'),
58             (Cancelled = 'Cancelled'),
59             (Failed = 'Failed'),
60             (Complete = 'Complete'),
61            ]
62
63   def assert_finished
64     update_attributes(finished_at: finished_at || Time.now,
65                       success: success.nil? ? false : success,
66                       running: false)
67   end
68
69   def node_uuids
70     nodes.map(&:uuid)
71   end
72
73   def self.queue
74     self.where('state = ?', Queued).order('priority desc, created_at')
75   end
76
77   def queue_position
78     i = 0
79     Job::queue.each do |j|
80       if j[:uuid] == self.uuid
81         return i
82       end
83     end
84     nil
85   end
86
87   def self.running
88     self.where('running = ?', true).
89       order('priority desc, created_at')
90   end
91
92   def lock locked_by_uuid
93     transaction do
94       self.reload
95       unless self.state == Queued and self.is_locked_by_uuid.nil?
96         raise AlreadyLockedError
97       end
98       self.state = Running
99       self.is_locked_by_uuid = locked_by_uuid
100       self.save!
101     end
102   end
103
104   protected
105
106   def foreign_key_attributes
107     super + %w(output log)
108   end
109
110   def skip_uuid_read_permission_check
111     super + %w(cancelled_by_client_uuid)
112   end
113
114   def skip_uuid_existence_check
115     super + %w(output log)
116   end
117
118   def set_priority
119     if self.priority.nil?
120       self.priority = 0
121     end
122     true
123   end
124
125   def ensure_script_version_is_commit
126     if self.state == Running
127       # Apparently client has already decided to go for it. This is
128       # needed to run a local job using a local working directory
129       # instead of a commit-ish.
130       return true
131     end
132     if new_record? or script_version_changed?
133       sha1 = Commit.find_commit_range(current_user, self.repository, nil, self.script_version, nil)[0] rescue nil
134       if sha1
135         self.supplied_script_version = self.script_version if self.supplied_script_version.nil? or self.supplied_script_version.empty?
136         self.script_version = sha1
137       else
138         self.errors.add :script_version, "#{self.script_version} does not resolve to a commit"
139         return false
140       end
141     end
142   end
143
144   def ensure_unique_submit_id
145     if !submit_id.nil?
146       if Job.where('submit_id=?',self.submit_id).first
147         raise SubmitIdReused.new
148       end
149     end
150     true
151   end
152
153   def find_docker_image_locator
154     # Find the Collection that holds the Docker image specified in the
155     # runtime constraints, and store its locator in docker_image_locator.
156     unless runtime_constraints.is_a? Hash
157       # We're still in validation stage, so we can't assume
158       # runtime_constraints isn't something horrible like an array or
159       # a string. Treat those cases as "no docker image supplied";
160       # other validations will fail anyway.
161       self.docker_image_locator = nil
162       return true
163     end
164     image_search = runtime_constraints['docker_image']
165     image_tag = runtime_constraints['docker_image_tag']
166     if image_search.nil?
167       self.docker_image_locator = nil
168       true
169     elsif coll = Collection.for_latest_docker_image(image_search, image_tag)
170       self.docker_image_locator = coll.portable_data_hash
171       true
172     else
173       errors.add(:docker_image_locator, "not found for #{image_search}")
174       false
175     end
176   end
177
178   def dependencies
179     deps = {}
180     queue = self.script_parameters.values
181     while not queue.empty?
182       queue = queue.flatten.compact.collect do |v|
183         if v.is_a? Hash
184           v.values
185         elsif v.is_a? String
186           v.match(/^(([0-9a-f]{32})\b(\+[^,]+)?,?)*$/) do |locator|
187             deps[locator.to_s] = true
188           end
189           nil
190         end
191       end
192     end
193     deps.keys
194   end
195
  # Decide whether the current user may save the pending changes.
  #
  # Returns false (after logging a warning) when the change is refused
  # here; otherwise defers to super.
  def permission_to_update
    # Case 1: the job was already locked, and the caller is neither the
    # lock holder nor the system user (this also covers "no current
    # user"). Such callers may not touch the protected attributes below.
    if is_locked_by_uuid_was and !(current_user and
                                   (current_user.uuid == is_locked_by_uuid_was or
                                    current_user.uuid == system_user.uuid))
      # The cancellation fields are only protected once cancelled_at
      # already had a value.
      if script_changed? or
          script_parameters_changed? or
          script_version_changed? or
          (!cancelled_at_was.nil? and
           (cancelled_by_client_uuid_changed? or
            cancelled_by_user_uuid_changed? or
            cancelled_at_changed?)) or
          started_at_changed? or
          finished_at_changed? or
          running_changed? or
          success_changed? or
          output_changed? or
          log_changed? or
          tasks_summary_changed? or
          state_changed?
        logger.warn "User #{current_user.uuid if current_user} tried to change protected job attributes on locked #{self.class.to_s} #{uuid_was}"
        return false
      end
    end
    # Case 2: the lock itself. If it is not being changed, the usual
    # permission check applies.
    if !is_locked_by_uuid_changed?
      super
    else
      # NOTE(review): unlike the check above, this branch has no
      # exemption for the system user -- confirm that is intended.
      if !current_user
        logger.warn "Anonymous user tried to change lock on #{self.class.to_s} #{uuid_was}"
        false
      elsif is_locked_by_uuid_was and is_locked_by_uuid_was != current_user.uuid
        # Someone else holds the lock; it cannot be taken or released.
        logger.warn "User #{current_user.uuid} tried to steal lock on #{self.class.to_s} #{uuid_was} from #{is_locked_by_uuid_was}"
        false
      elsif !is_locked_by_uuid.nil? and is_locked_by_uuid != current_user.uuid
        # A user may only lock a job on their own behalf.
        logger.warn "User #{current_user.uuid} tried to lock #{self.class.to_s} #{uuid_was} with uuid #{is_locked_by_uuid}"
        false
      else
        super
      end
    end
  end
236
237   def update_modified_by_fields
238     if self.cancelled_at_changed?
239       # Ensure cancelled_at cannot be set to arbitrary non-now times,
240       # or changed once it is set.
241       if self.cancelled_at and not self.cancelled_at_was
242         self.cancelled_at = Time.now
243         self.cancelled_by_user_uuid = current_user.uuid
244         self.cancelled_by_client_uuid = current_api_client.andand.uuid
245         @need_crunch_dispatch_trigger = true
246       else
247         self.cancelled_at = self.cancelled_at_was
248         self.cancelled_by_user_uuid = self.cancelled_by_user_uuid_was
249         self.cancelled_by_client_uuid = self.cancelled_by_client_uuid_was
250       end
251     end
252     super
253   end
254
255   def trigger_crunch_dispatch_if_cancelled
256     if @need_crunch_dispatch_trigger
257       File.open(Rails.configuration.crunch_refresh_trigger, 'wb') do
258         # That's all, just create/touch a file for crunch-job to see.
259       end
260     end
261   end
262
263   def update_timestamps_when_state_changes
264     return if not (state_changed? or new_record?)
265     case state
266     when Running
267       self.started_at ||= Time.now
268     when Failed, Complete
269       self.finished_at ||= Time.now
270     when Cancelled
271       self.cancelled_at ||= Time.now
272     end
273
274     # TODO: Remove the following case block when old "success" and
275     # "running" attrs go away. Until then, this ensures we still
276     # expose correct success/running flags to older clients, even if
277     # some new clients are writing only the new state attribute.
278     case state
279     when Queued
280       self.running = false
281       self.success = nil
282     when Running
283       self.running = true
284       self.success = nil
285     when Cancelled, Failed
286       self.running = false
287       self.success = false
288     when Complete
289       self.running = false
290       self.success = true
291     end
292     self.running ||= false # Default to false instead of nil.
293
294     true
295   end
296
297   def update_state_from_old_state_attrs
298     # If a client has touched the legacy state attrs, update the
299     # "state" attr to agree with the updated values of the legacy
300     # attrs.
301     #
302     # TODO: Remove this method when old "success" and "running" attrs
303     # go away.
304     if cancelled_at_changed? or
305         success_changed? or
306         running_changed? or
307         state.nil?
308       if cancelled_at
309         self.state = Cancelled
310       elsif success == false
311         self.state = Failed
312       elsif success == true
313         self.state = Complete
314       elsif running == true
315         self.state = Running
316       else
317         self.state = Queued
318       end
319     end
320     true
321   end
322
323   def validate_status
324     if self.state.in?(States)
325       true
326     else
327       errors.add :state, "#{state.inspect} must be one of: #{States.inspect}"
328       false
329     end
330   end
331
332   def validate_state_change
333     ok = true
334     if self.state_changed?
335       ok = case self.state_was
336            when nil
337              # state isn't set yet
338              true
339            when Queued
340              # Permit going from queued to any state
341              true
342            when Running
343              # From running, may only transition to a finished state
344              [Complete, Failed, Cancelled].include? self.state
345            when Complete, Failed, Cancelled
346              # Once in a finished state, don't permit any more state changes
347              false
348            else
349              # Any other state transition is also invalid
350              false
351            end
352       if not ok
353         errors.add :state, "invalid change from #{self.state_was} to #{self.state}"
354       end
355     end
356     ok
357   end
358 end