Merge branch '8840-lock-job-record' closes #8840
[arvados.git] / services / api / app / models / job.rb
# Model for a job record: which script/commit to run, with which
# parameters and runtime constraints, plus the state, lock and
# timestamp bookkeeping around queueing, running and finishing it.
class Job < ArvadosModel
  include HasUuid
  include KindAndEtag
  include CommonApiTemplate
  serialize :components, Hash
  attr_protected :arvados_sdk_version, :docker_image_locator
  serialize :script_parameters, Hash
  serialize :runtime_constraints, Hash
  serialize :tasks_summary, Hash
  before_create :ensure_unique_submit_id
  after_commit :trigger_crunch_dispatch_if_cancelled, :on => :update
  before_validation :set_priority
  before_validation :update_state_from_old_state_attrs
  validate :ensure_script_version_is_commit
  validate :find_docker_image_locator
  validate :find_arvados_sdk_version
  validate :validate_status
  validate :validate_state_change
  validate :ensure_no_collection_uuids_in_script_params
  before_save :tag_version_in_internal_repository
  before_save :update_timestamps_when_state_changes

  has_many :commit_ancestors, :foreign_key => :descendant, :primary_key => :script_version
  has_many(:nodes, foreign_key: :job_uuid, primary_key: :uuid)

  # Raised by ensure_unique_submit_id when another job already uses the
  # same submit_id.
  class SubmitIdReused < StandardError
  end

  api_accessible :user, extend: :common do |t|
    t.add :submit_id
    t.add :priority
    t.add :script
    t.add :script_parameters
    t.add :script_version
    t.add :cancelled_at
    t.add :cancelled_by_client_uuid
    t.add :cancelled_by_user_uuid
    t.add :started_at
    t.add :finished_at
    t.add :output
    t.add :success
    t.add :running
    t.add :state
    t.add :is_locked_by_uuid
    t.add :log
    t.add :runtime_constraints
    t.add :tasks_summary
    t.add :nondeterministic
    t.add :repository
    t.add :supplied_script_version
    t.add :arvados_sdk_version
    t.add :docker_image_locator
    t.add :queue_position
    t.add :node_uuids
    t.add :description
    t.add :components
  end

  # Supported states for a job
  States = [
            (Queued = 'Queued'),
            (Running = 'Running'),
            (Cancelled = 'Cancelled'),
            (Failed = 'Failed'),
            (Complete = 'Complete'),
           ]

  # Mark the job as no longer running. Existing finished_at and success
  # values are kept; a missing finished_at becomes the current database
  # time and a missing success becomes false.
  def assert_finished
    update_attributes(finished_at: finished_at || db_current_time,
                      success: success.nil? ? false : success,
                      running: false)
  end

  # UUIDs of the nodes associated with this job through the :nodes
  # association.
  def node_uuids
    nodes.map(&:uuid)
  end

  # Relation of jobs in the Queued state, highest priority first, then
  # oldest first.
  def self.queue
    self.where('state = ?', Queued).order('priority desc, created_at')
  end

  def queue_position
    # We used to report this accurately, but the implementation made queue
    # API requests O(n**2) for the size of the queue.  See #8800.
    # We've soft-disabled it because it's not clear we even want this
    # functionality: now that we have Node Manager with support for multiple
    # node sizes, "queue position" tells you very little about when a job will
    # run.
    state == Queued ? 0 : nil
  end

  # Relation of jobs whose legacy "running" flag is true, highest
  # priority first, then oldest first.
  def self.running
    self.where('running = ?', true).
      order('priority desc, created_at')
  end

  # Lock this job on behalf of locked_by_uuid and move it to Running.
  #
  # The check and the update happen inside with_lock, so they are
  # evaluated against the row as held under ActiveRecord's pessimistic
  # lock rather than against a possibly stale in-memory copy.
  #
  # Raises AlreadyLockedError (defined outside this file) unless the job
  # is Queued and not locked by anyone. save! raises if the record
  # cannot be saved.
  def lock(locked_by_uuid)
    with_lock do
      unless self.state == Queued and self.is_locked_by_uuid.nil?
        raise AlreadyLockedError
      end
      self.state = Running
      self.is_locked_by_uuid = locked_by_uuid
      self.save!
    end
  end

  protected

  # Extends the superclass list with the output and log attributes.
  def foreign_key_attributes
    super + %w(output log)
  end

  # Extends the superclass list with cancelled_by_client_uuid.
  def skip_uuid_read_permission_check
    super + %w(cancelled_by_client_uuid)
  end

  # Extends the superclass list with the output and log attributes.
  def skip_uuid_existence_check
    super + %w(output log)
  end

  # before_validation callback: default a missing priority to 0.
  def set_priority
    if self.priority.nil?
      self.priority = 0
    end
    true
  end

  # Validation: resolve script_version to a commit in the job's
  # repository and store the resolved value in script_version. The
  # value originally supplied is remembered in supplied_script_version
  # unless the client already provided one.
  def ensure_script_version_is_commit
    if state == Running
      # Apparently client has already decided to go for it. This is
      # needed to run a local job using a local working directory
      # instead of a commit-ish.
      return true
    end
    if new_record? or repository_changed? or script_version_changed?
      sha1 = Commit.find_commit_range(repository,
                                      nil, script_version, nil).first
      if not sha1
        errors.add :script_version, "#{script_version} does not resolve to a commit"
        return false
      end
      if supplied_script_version.nil? or supplied_script_version.empty?
        self.supplied_script_version = script_version
      end
      self.script_version = sha1
    end
    true
  end

  # before_save callback: tag the job's commit in the internal
  # repository, using the job's UUID as the last argument to
  # Commit.tag_in_internal_repository.
  #
  # A UUID is assigned first so that a new record has one to pass
  # along. If tagging raises, the previous UUID is put back before the
  # exception is re-raised.
  def tag_version_in_internal_repository
    if state == Running
      # No point now. See ensure_script_version_is_commit.
      true
    elsif errors.any?
      # Won't be saved, and script_version might not even be valid.
      true
    elsif new_record? or repository_changed? or script_version_changed?
      previous_uuid = uuid
      begin
        assign_uuid
        Commit.tag_in_internal_repository repository, script_version, uuid
      rescue
        # The explicit receiver is required: a bare "uuid = ..." would
        # only create a local variable and leave the attribute untouched.
        self.uuid = previous_uuid
        raise
      end
    end
  end

  # before_create callback: refuse to create a job whose submit_id is
  # already used by an existing job.
  #
  # Raises SubmitIdReused when a job with the same submit_id exists.
  def ensure_unique_submit_id
    if !submit_id.nil?
      if Job.where('submit_id=?',self.submit_id).first
        raise SubmitIdReused.new
      end
    end
    true
  end

  # Resolve one runtime constraint into a model attribute.
  #
  # key      - key to look up in runtime_constraints.
  # attr_sym - attribute that receives the resolved value, and under
  #            which an error is recorded on failure.
  #
  # When runtime_constraints is a Hash with a truthy value for key, that
  # value is yielded to the block, which must return [ok, result]. On
  # success result is assigned to the attribute; on failure result is
  # added as an error message. When the constraint is absent the
  # attribute is set to nil. Returns ok.
  def resolve_runtime_constraint(key, attr_sym)
    if ((runtime_constraints.is_a? Hash) and
        (search = runtime_constraints[key]))
      ok, result = yield search
    else
      ok, result = true, nil
    end
    if ok
      send("#{attr_sym}=".to_sym, result)
    else
      errors.add(attr_sym, result)
    end
    ok
  end

  # Validation: resolve the "arvados_sdk_version" runtime constraint to
  # a commit in the "arvados" repository and store it in
  # arvados_sdk_version. The constraint is rejected unless a
  # "docker_image" constraint is also present.
  def find_arvados_sdk_version
    resolve_runtime_constraint("arvados_sdk_version",
                               :arvados_sdk_version) do |git_search|
      commits = Commit.find_commit_range("arvados",
                                         nil, git_search, nil)
      if commits.empty?
        [false, "#{git_search} does not resolve to a commit"]
      elsif not runtime_constraints["docker_image"]
        [false, "cannot be specified without a Docker image constraint"]
      else
        [true, commits.first]
      end
    end
  end

  # Validation: resolve the "docker_image" runtime constraint (with the
  # optional "docker_image_tag") to a collection and store that
  # collection's portable_data_hash in docker_image_locator.
  #
  # When the job names no image and a site-wide default is configured,
  # the default is written into runtime_constraints first.
  def find_docker_image_locator
    # The configuration is only consulted when the job has no image of
    # its own, matching the short-circuit order of the checks below.
    if (runtime_constraints.is_a? Hash) and
        (runtime_constraints['docker_image']).nil? and
        Rails.configuration.default_docker_image_for_jobs
      runtime_constraints['docker_image'] =
        Rails.configuration.default_docker_image_for_jobs
    end
    resolve_runtime_constraint("docker_image",
                               :docker_image_locator) do |image_search|
      image_tag = runtime_constraints['docker_image_tag']
      if coll = Collection.for_latest_docker_image(image_search, image_tag)
        [true, coll.portable_data_hash]
      else
        [false, "not found for #{image_search}"]
      end
    end
  end

  # Permission check for updates, layered on top of the superclass check.
  #
  # 1. While the job is locked, only the lock holder or the system user
  #    may change the protected attributes listed below.
  # 2. The lock itself may only be changed by an authenticated user, may
  #    not be taken over from another holder, and may only be set to the
  #    current user's own UUID (or cleared).
  #
  # Returns false when a rule is violated, otherwise whatever super
  # returns.
  def permission_to_update
    if is_locked_by_uuid_was and !(current_user and
                                   (current_user.uuid == is_locked_by_uuid_was or
                                    current_user.uuid == system_user.uuid))
      if script_changed? or
          script_parameters_changed? or
          script_version_changed? or
          (!cancelled_at_was.nil? and
           (cancelled_by_client_uuid_changed? or
            cancelled_by_user_uuid_changed? or
            cancelled_at_changed?)) or
          started_at_changed? or
          finished_at_changed? or
          running_changed? or
          success_changed? or
          output_changed? or
          log_changed? or
          tasks_summary_changed? or
          state_changed? or
          components_changed?
        logger.warn "User #{current_user.uuid if current_user} tried to change protected job attributes on locked #{self.class.to_s} #{uuid_was}"
        return false
      end
    end
    if !is_locked_by_uuid_changed?
      super
    else
      if !current_user
        logger.warn "Anonymous user tried to change lock on #{self.class.to_s} #{uuid_was}"
        false
      elsif is_locked_by_uuid_was and is_locked_by_uuid_was != current_user.uuid
        logger.warn "User #{current_user.uuid} tried to steal lock on #{self.class.to_s} #{uuid_was} from #{is_locked_by_uuid_was}"
        false
      elsif !is_locked_by_uuid.nil? and is_locked_by_uuid != current_user.uuid
        logger.warn "User #{current_user.uuid} tried to lock #{self.class.to_s} #{uuid_was} with uuid #{is_locked_by_uuid}"
        false
      else
        super
      end
    end
  end

  # Normalizes the cancellation fields before delegating to super.
  #
  # The first time cancelled_at is set, it is replaced with the current
  # database time and the cancelling user and API client are recorded.
  # Any other change to cancelled_at is reverted to the stored values.
  def update_modified_by_fields
    if self.cancelled_at_changed?
      # Ensure cancelled_at cannot be set to arbitrary non-now times,
      # or changed once it is set.
      if self.cancelled_at and not self.cancelled_at_was
        self.cancelled_at = db_current_time
        self.cancelled_by_user_uuid = current_user.uuid
        self.cancelled_by_client_uuid = current_api_client.andand.uuid
        @need_crunch_dispatch_trigger = true
      else
        self.cancelled_at = self.cancelled_at_was
        self.cancelled_by_user_uuid = self.cancelled_by_user_uuid_was
        self.cancelled_by_client_uuid = self.cancelled_by_client_uuid_was
      end
    end
    super
  end

  # after_commit (on update) callback: if an earlier callback flagged
  # the need, create/truncate the configured crunch refresh trigger
  # file.
  def trigger_crunch_dispatch_if_cancelled
    if @need_crunch_dispatch_trigger
      File.open(Rails.configuration.crunch_refresh_trigger, 'wb') do
        # That's all, just create/touch a file for crunch-job to see.
      end
    end
  end

  # before_save callback: when the state changes (or the record is
  # new), fill in the timestamp that belongs to the new state, keep the
  # legacy running/success flags in step with it, and flag that the
  # crunch dispatch trigger is needed.
  def update_timestamps_when_state_changes
    return if not (state_changed? or new_record?)

    case state
    when Running
      self.started_at ||= db_current_time
    when Failed, Complete
      self.finished_at ||= db_current_time
    when Cancelled
      self.cancelled_at ||= db_current_time
    end

    # TODO: Remove the following case block when old "success" and
    # "running" attrs go away. Until then, this ensures we still
    # expose correct success/running flags to older clients, even if
    # some new clients are writing only the new state attribute.
    case state
    when Queued
      self.running = false
      self.success = nil
    when Running
      self.running = true
      self.success = nil
    when Cancelled, Failed
      self.running = false
      self.success = false
    when Complete
      self.running = false
      self.success = true
    end
    self.running ||= false # Default to false instead of nil.

    @need_crunch_dispatch_trigger = true

    true
  end

  def update_state_from_old_state_attrs
    # If a client has touched the legacy state attrs, update the
    # "state" attr to agree with the updated values of the legacy
    # attrs.
    #
    # TODO: Remove this method when old "success" and "running" attrs
    # go away.
    if cancelled_at_changed? or
        success_changed? or
        running_changed? or
        state.nil?
      # Checked in order: cancellation wins, then an explicit
      # success/failure, then running; anything else is Queued.
      if cancelled_at
        self.state = Cancelled
      elsif success == false
        self.state = Failed
      elsif success == true
        self.state = Complete
      elsif running == true
        self.state = Running
      else
        self.state = Queued
      end
    end
    true
  end

  # Validation: state must be one of States.
  def validate_status
    if self.state.in?(States)
      true
    else
      errors.add :state, "#{state.inspect} must be one of: #{States.inspect}"
      false
    end
  end

  # Validation: only permit legal state transitions. Returns true when
  # the state is unchanged or the transition is allowed; otherwise adds
  # an error on :state and returns false.
  def validate_state_change
    ok = true
    if self.state_changed?
      ok = case self.state_was
           when nil
             # state isn't set yet
             true
           when Queued
             # Permit going from queued to any state
             true
           when Running
             # From running, may only transition to a finished state
             [Complete, Failed, Cancelled].include? self.state
           when Complete, Failed, Cancelled
             # Once in a finished state, don't permit any more state changes
             false
           else
             # Any other state transition is also invalid
             false
           end
      if not ok
        errors.add :state, "invalid change from #{self.state_was} to #{self.state}"
      end
    end
    ok
  end

  # Validation: fail if any script_parameters field includes a string
  # containing a collection uuid pattern.
  def ensure_no_collection_uuids_in_script_params
    if self.script_parameters_changed?
      if recursive_hash_search(self.script_parameters, Collection.uuid_regex)
        self.errors.add :script_parameters, "must use portable_data_hash instead of collection uuid"
        return false
      end
    end
    true
  end

  # recursive_hash_search searches recursively through hashes and
  # arrays in 'thing' for string fields matching regular expression
  # 'pattern'.  Returns true if pattern is found, false otherwise.
  #
  # Only hash values are searched, not hash keys. Anything that is not
  # a Hash, Array or String never matches.
  def recursive_hash_search(thing, pattern)
    case thing
    when Hash
      thing.values.any? { |value| recursive_hash_search(value, pattern) }
    when Array
      thing.any? { |element| recursive_hash_search(element, pattern) }
    when String
      thing.match(pattern) ? true : false
    else
      false
    end
  end
end