closes #9824
[arvados.git] / services / api / app / models / job.rb
# A Job record describes one run of a script, identified by a repository
# and script_version, together with its script_parameters and
# runtime_constraints. It tracks the run through the states listed in
# States and keeps the legacy "running"/"success" flags in sync with the
# newer "state" attribute.
class Job < ArvadosModel
  include HasUuid
  include KindAndEtag
  include CommonApiTemplate
  serialize :components, Hash
  attr_protected :arvados_sdk_version, :docker_image_locator
  serialize :script_parameters, Hash
  serialize :runtime_constraints, Hash
  serialize :tasks_summary, Hash
  before_create :ensure_unique_submit_id
  after_commit :trigger_crunch_dispatch_if_cancelled, :on => :update
  before_validation :set_priority
  before_validation :update_state_from_old_state_attrs
  before_validation :update_script_parameters_digest
  validate :ensure_script_version_is_commit
  validate :find_docker_image_locator
  validate :find_arvados_sdk_version
  validate :validate_status
  validate :validate_state_change
  validate :ensure_no_collection_uuids_in_script_params
  before_save :tag_version_in_internal_repository
  before_save :update_timestamps_when_state_changes

  has_many :commit_ancestors, :foreign_key => :descendant, :primary_key => :script_version
  has_many(:nodes, foreign_key: :job_uuid, primary_key: :uuid)

  # Raised by ensure_unique_submit_id when another job already uses the
  # same submit_id.
  class SubmitIdReused < StandardError
  end

  api_accessible :user, extend: :common do |t|
    t.add :submit_id
    t.add :priority
    t.add :script
    t.add :script_parameters
    t.add :script_version
    t.add :cancelled_at
    t.add :cancelled_by_client_uuid
    t.add :cancelled_by_user_uuid
    t.add :started_at
    t.add :finished_at
    t.add :output
    t.add :success
    t.add :running
    t.add :state
    t.add :is_locked_by_uuid
    t.add :log
    t.add :runtime_constraints
    t.add :tasks_summary
    t.add :nondeterministic
    t.add :repository
    t.add :supplied_script_version
    t.add :arvados_sdk_version
    t.add :docker_image_locator
    t.add :queue_position
    t.add :node_uuids
    t.add :description
    t.add :components
  end

  # Supported states for a job
  States = [
            (Queued = 'Queued'),
            (Running = 'Running'),
            (Cancelled = 'Cancelled'),
            (Failed = 'Failed'),
            (Complete = 'Complete'),
           ]

  # Record that the job is no longer running: keep an existing
  # finished_at (otherwise use the current database time), keep an
  # existing success value (otherwise record false), and clear the
  # running flag.
  def assert_finished
    update_attributes(finished_at: finished_at || db_current_time,
                      success: success.nil? ? false : success,
                      running: false)
  end

  # UUIDs of the nodes associated with this job via nodes.job_uuid.
  def node_uuids
    nodes.map(&:uuid)
  end

  # Relation of jobs in the Queued state, highest priority first and
  # oldest first within a priority.
  def self.queue
    self.where('state = ?', Queued).order('priority desc, created_at')
  end

  def queue_position
    # We used to report this accurately, but the implementation made queue
    # API requests O(n**2) for the size of the queue.  See #8800.
    # We've soft-disabled it because it's not clear we even want this
    # functionality: now that we have Node Manager with support for multiple
    # node sizes, "queue position" tells you very little about when a job will
    # run.
    state == Queued ? 0 : nil
  end

  # Relation of jobs whose legacy "running" flag is true, highest
  # priority first and oldest first within a priority.
  def self.running
    self.where('running = ?', true).
      order('priority desc, created_at')
  end

  # Inside with_lock, move a Queued job that nobody has locked to the
  # Running state and record locked_by_uuid as the lock holder.
  # Raises AlreadyLockedError if the job is not Queued or is already
  # locked; save! raises if the record cannot be saved.
  def lock locked_by_uuid
    with_lock do
      unless self.state == Queued and self.is_locked_by_uuid.nil?
        raise AlreadyLockedError
      end
      self.state = Running
      self.is_locked_by_uuid = locked_by_uuid
      self.save!
    end
  end

  # before_validation callback: refresh script_parameters_digest from
  # the current script_parameters.
  def update_script_parameters_digest
    self.script_parameters_digest = self.class.sorted_hash_digest(script_parameters)
  end

  # Exclude the derived digest column from the superclass's list of
  # searchable columns.
  def self.searchable_columns operator
    super - ["script_parameters_digest"]
  end

  protected

  # NOTE: "protected" does not apply to methods defined with
  # "def self.", so the two class methods below remain public. That is
  # what allows update_script_parameters_digest to call
  # self.class.sorted_hash_digest with an explicit receiver.

  # MD5 hex digest of h serialized with Oj after its keys have been
  # sorted recursively, so the digest does not depend on key order.
  def self.sorted_hash_digest h
    Digest::MD5.hexdigest(Oj.dump(deep_sort_hash(h)))
  end

  # Return a copy of h with keys sorted at every nesting level.
  # Anything that is not a Hash (including arrays) is returned as is.
  def self.deep_sort_hash h
    return h unless h.is_a? Hash
    h.sort.collect do |k, v|
      [k, deep_sort_hash(v)]
    end.to_h
  end

  def foreign_key_attributes
    super + %w(output log)
  end

  def skip_uuid_read_permission_check
    super + %w(cancelled_by_client_uuid)
  end

  def skip_uuid_existence_check
    super + %w(output log)
  end

  # before_validation callback: default a missing priority to 0.
  # Always returns true so the callback chain continues.
  def set_priority
    if self.priority.nil?
      self.priority = 0
    end
    true
  end

  # Validation: resolve script_version to a commit sha1 in repository
  # whenever the record is new or repository/script_version changed.
  # The value the client supplied is preserved in
  # supplied_script_version if that attribute is still blank.
  def ensure_script_version_is_commit
    if state == Running
      # Apparently client has already decided to go for it. This is
      # needed to run a local job using a local working directory
      # instead of a commit-ish.
      return true
    end
    if new_record? or repository_changed? or script_version_changed?
      sha1 = Commit.find_commit_range(repository,
                                      nil, script_version, nil).first
      if not sha1
        errors.add :script_version, "#{script_version} does not resolve to a commit"
        return false
      end
      if supplied_script_version.nil? or supplied_script_version.empty?
        self.supplied_script_version = script_version
      end
      self.script_version = sha1
    end
    true
  end

  # before_save callback: tag script_version in the internal repository
  # under this job's uuid when the record is new or its
  # repository/script_version changed. If tagging raises, the uuid is
  # restored to its value from before assign_uuid and the exception is
  # re-raised.
  def tag_version_in_internal_repository
    if state == Running
      # No point now. See ensure_script_version_is_commit.
      true
    elsif errors.any?
      # Won't be saved, and script_version might not even be valid.
      true
    elsif new_record? or repository_changed? or script_version_changed?
      previous_uuid = uuid
      begin
        assign_uuid
        Commit.tag_in_internal_repository repository, script_version, uuid
      rescue
        # Use the attribute writer here: a bare "uuid = ..." would only
        # create a local variable and leave the attribute untouched.
        self.uuid = previous_uuid
        raise
      end
    end
  end

  # before_create callback: refuse to create a job whose submit_id is
  # already used by another job. Raises SubmitIdReused in that case.
  def ensure_unique_submit_id
    if !submit_id.nil?
      # exists? answers the question without instantiating a record.
      if Job.where('submit_id=?',self.submit_id).exists?
        raise SubmitIdReused.new
      end
    end
    true
  end

  # Shared helper for the runtime-constraint validations.
  #
  # If runtime_constraints is a Hash with a truthy value under key, the
  # value is yielded to the block, which must return [ok, result].
  # Otherwise the outcome is [true, nil]. On success, result is assigned
  # to the attribute named by attr_sym; on failure, result is added as
  # an error message on that attribute. Returns ok.
  def resolve_runtime_constraint(key, attr_sym)
    if ((runtime_constraints.is_a? Hash) and
        (search = runtime_constraints[key]))
      ok, result = yield search
    else
      ok, result = true, nil
    end
    if ok
      send("#{attr_sym}=".to_sym, result)
    else
      errors.add(attr_sym, result)
    end
    ok
  end

  # Validation: resolve the "arvados_sdk_version" runtime constraint to
  # a commit in the "arvados" repository. The constraint is rejected
  # unless a "docker_image" constraint is also present.
  def find_arvados_sdk_version
    resolve_runtime_constraint("arvados_sdk_version",
                               :arvados_sdk_version) do |git_search|
      commits = Commit.find_commit_range("arvados",
                                         nil, git_search, nil)
      if commits.empty?
        [false, "#{git_search} does not resolve to a commit"]
      elsif not runtime_constraints["docker_image"]
        [false, "cannot be specified without a Docker image constraint"]
      else
        [true, commits.first]
      end
    end
  end

  # Validation: resolve the "docker_image" runtime constraint (with the
  # optional "docker_image_tag") to the portable data hash of the
  # matching collection. When the constraint is absent and a default
  # image is configured, the default is filled in first.
  def find_docker_image_locator
    if (runtime_constraints.is_a? Hash) and
        runtime_constraints['docker_image'].nil?
      # Only consult the configuration when a default could be applied.
      default_image = Rails.configuration.default_docker_image_for_jobs
      runtime_constraints['docker_image'] = default_image if default_image
    end
    resolve_runtime_constraint("docker_image",
                               :docker_image_locator) do |image_search|
      image_tag = runtime_constraints['docker_image_tag']
      if coll = Collection.for_latest_docker_image(image_search, image_tag)
        [true, coll.portable_data_hash]
      else
        [false, "not found for #{image_search}"]
      end
    end
  end

  # Permission check for updates.
  #
  # While a job is locked, only the lock holder or the system user may
  # change the protected attributes listed below. Changing
  # is_locked_by_uuid itself additionally requires a logged-in user who
  # is not taking over someone else's lock and is not setting the lock
  # to a uuid other than their own. Everything else is delegated to the
  # superclass.
  def permission_to_update
    if is_locked_by_uuid_was and !(current_user and
                                   (current_user.uuid == is_locked_by_uuid_was or
                                    current_user.uuid == system_user.uuid))
      if script_changed? or
          script_parameters_changed? or
          script_version_changed? or
          (!cancelled_at_was.nil? and
           (cancelled_by_client_uuid_changed? or
            cancelled_by_user_uuid_changed? or
            cancelled_at_changed?)) or
          started_at_changed? or
          finished_at_changed? or
          running_changed? or
          success_changed? or
          output_changed? or
          log_changed? or
          tasks_summary_changed? or
          state_changed? or
          components_changed?
        logger.warn "User #{current_user.uuid if current_user} tried to change protected job attributes on locked #{self.class.to_s} #{uuid_was}"
        return false
      end
    end
    if !is_locked_by_uuid_changed?
      super
    else
      if !current_user
        logger.warn "Anonymous user tried to change lock on #{self.class.to_s} #{uuid_was}"
        false
      elsif is_locked_by_uuid_was and is_locked_by_uuid_was != current_user.uuid
        logger.warn "User #{current_user.uuid} tried to steal lock on #{self.class.to_s} #{uuid_was} from #{is_locked_by_uuid_was}"
        false
      elsif !is_locked_by_uuid.nil? and is_locked_by_uuid != current_user.uuid
        logger.warn "User #{current_user.uuid} tried to lock #{self.class.to_s} #{uuid_was} with uuid #{is_locked_by_uuid}"
        false
      else
        super
      end
    end
  end

  # When cancelled_at is being set for the first time, stamp it with the
  # current database time and record who cancelled the job; any other
  # change to cancelled_at is reverted to the previously stored values.
  # Then defers to the superclass.
  def update_modified_by_fields
    if self.cancelled_at_changed?
      # Ensure cancelled_at cannot be set to arbitrary non-now times,
      # or changed once it is set.
      if self.cancelled_at and not self.cancelled_at_was
        self.cancelled_at = db_current_time
        self.cancelled_by_user_uuid = current_user.uuid
        self.cancelled_by_client_uuid = current_api_client.andand.uuid
        @need_crunch_dispatch_trigger = true
      else
        self.cancelled_at = self.cancelled_at_was
        self.cancelled_by_user_uuid = self.cancelled_by_user_uuid_was
        self.cancelled_by_client_uuid = self.cancelled_by_client_uuid_was
      end
    end
    super
  end

  # after_commit (update) callback: if an earlier callback flagged this
  # record, create/truncate the configured crunch refresh trigger file.
  def trigger_crunch_dispatch_if_cancelled
    if @need_crunch_dispatch_trigger
      File.open(Rails.configuration.crunch_refresh_trigger, 'wb') do
        # That's all, just create/touch a file for crunch-job to see.
      end
    end
  end

  # before_save callback: when the state changed (or the record is
  # new), fill in the timestamp that belongs to the new state, bring the
  # legacy running/success flags in line with it, and flag the record
  # for the crunch dispatch trigger.
  def update_timestamps_when_state_changes
    return if not (state_changed? or new_record?)

    case state
    when Running
      self.started_at ||= db_current_time
    when Failed, Complete
      self.finished_at ||= db_current_time
    when Cancelled
      self.cancelled_at ||= db_current_time
    end

    # TODO: Remove the following case block when old "success" and
    # "running" attrs go away. Until then, this ensures we still
    # expose correct success/running flags to older clients, even if
    # some new clients are writing only the new state attribute.
    case state
    when Queued
      self.running = false
      self.success = nil
    when Running
      self.running = true
      self.success = nil
    when Cancelled, Failed
      self.running = false
      self.success = false
    when Complete
      self.running = false
      self.success = true
    end
    self.running ||= false # Default to false instead of nil.

    @need_crunch_dispatch_trigger = true

    true
  end

  def update_state_from_old_state_attrs
    # If a client has touched the legacy state attrs, update the
    # "state" attr to agree with the updated values of the legacy
    # attrs.
    #
    # TODO: Remove this method when old "success" and "running" attrs
    # go away.
    if cancelled_at_changed? or
        success_changed? or
        running_changed? or
        state.nil?
      if cancelled_at
        self.state = Cancelled
      elsif success == false
        self.state = Failed
      elsif success == true
        self.state = Complete
      elsif running == true
        self.state = Running
      else
        self.state = Queued
      end
    end
    true
  end

  # Validation: state must be one of States.
  def validate_status
    if self.state.in?(States)
      true
    else
      errors.add :state, "#{state.inspect} must be one of: #{States.inspect}"
      false
    end
  end

  # Validation: only allow the state transitions described in the
  # branches below; anything else adds an error on :state.
  def validate_state_change
    ok = true
    if self.state_changed?
      ok = case self.state_was
           when nil
             # state isn't set yet
             true
           when Queued
             # Permit going from queued to any state
             true
           when Running
             # From running, may only transition to a finished state
             [Complete, Failed, Cancelled].include? self.state
           when Complete, Failed, Cancelled
             # Once in a finished state, don't permit any more state changes
             false
           else
             # Any other state transition is also invalid
             false
           end
      if not ok
        errors.add :state, "invalid change from #{self.state_was} to #{self.state}"
      end
    end
    ok
  end

  # recursive_hash_search searches recursively through hashes and
  # arrays in 'thing' for string fields matching regular expression
  # 'pattern'.  Returns true if pattern is found, false otherwise.
  # Hash keys are not examined, only hash values and array elements.
  def recursive_hash_search thing, pattern
    case thing
    when Hash
      thing.values.any? { |value| recursive_hash_search(value, pattern) }
    when Array
      thing.any? { |element| recursive_hash_search(element, pattern) }
    when String
      thing.match(pattern) ? true : false
    else
      false
    end
  end

  # Fail validation if any script_parameters field includes a string containing a
  # collection uuid pattern.
  def ensure_no_collection_uuids_in_script_params
    if self.script_parameters_changed?
      if recursive_hash_search(self.script_parameters, Collection.uuid_regex)
        self.errors.add :script_parameters, "must use portable_data_hash instead of collection uuid"
        return false
      end
    end
    true
  end
end