3168: Instead of crashing using save! log an error and go to the next job.
[arvados.git] / services / api / script / cancel_stale_jobs.rb
1 #!/usr/bin/env ruby
2
3
# Optional single-instance guard. When CRUNCH_DISPATCH_LOCKFILE is set, the
# variable is removed from ENV and a non-blocking exclusive flock is taken on
# the named file (created if missing); the script aborts if the lock is
# already held. The handle stays referenced by `lockfile` and is not closed
# anywhere in this script, so the lock is presumably held until the process
# exits.
lockfilename = ENV.delete "CRUNCH_DISPATCH_LOCKFILE"
if lockfilename
  lockfile = File.open(lockfilename, File::RDWR|File::CREAT, 0644)
  lock_acquired = lockfile.flock(File::LOCK_EX|File::LOCK_NB)
  abort "Lock unavailable on #{lockfilename} - exit" unless lock_acquired
end

# Environment precedence: first command-line argument, then an already
# exported RAILS_ENV, then "development".
rails_env = ARGV[0] || ENV["RAILS_ENV"] || "development"
ENV["RAILS_ENV"] = rails_env

# Load the Rails application that lives one directory above this script.
script_dir = File.dirname(__FILE__)
require script_dir + '/../config/boot'
require script_dir + '/../config/environment'
16
# One-shot sweep that fails jobs which are still flagged as running but have
# stopped producing log entries.
class CancelJobs
  include ApplicationHelper

  # Walks every record in Job.running and, for each one whose newest Log row
  # (matched on object_uuid) is more than 300 seconds old, clears `running`,
  # fills in `cancelled_at` / `finished_at` if unset, sets `success` to false
  # if it is nil, and saves the record.
  #
  # Jobs with no Log rows at all are left untouched.
  #
  # A failure while saving one job is reported on $stderr and the sweep moves
  # on to the next job, so a single bad record cannot prevent the remaining
  # stale jobs from being cancelled.
  def cancel_stale_jobs
    # act_as_system_user is not defined in this file; presumably provided by
    # ApplicationHelper so the updates run with system privileges -- confirm.
    act_as_system_user do
      Job.running.each do |jobrecord|
        latest_log = Log.where("object_uuid=?", jobrecord.uuid).limit(1).order("created_at desc").first
        next unless latest_log

        # One timestamp per job keeps the age check and both *_at fields
        # consistent with each other.
        now = Time.now
        age = now - latest_log.created_at
        next unless age > 300

        $stderr.puts "dispatch: failing orphan job #{jobrecord.uuid}, last log is #{age} seconds old"
        # job is marked running, but not known to crunch-dispatcher, and
        # hasn't produced any log entries for 5 minutes, so mark it as failed.
        jobrecord.running = false
        jobrecord.cancelled_at ||= now
        jobrecord.finished_at ||= now
        jobrecord.success = false if jobrecord.success.nil?

        begin
          jobrecord.save!
        rescue StandardError => e
          # Do not let one unsaveable record abort the whole sweep.
          $stderr.puts "dispatch: jobrecord.save! failed for #{jobrecord.uuid}: #{e.class}: #{e.message}"
        end
      end
    end
  end
end
43
# Script entry point: perform a single stale-job sweep.
stale_job_canceller = CancelJobs.new
stale_job_canceller.cancel_stale_jobs