# Fail every job still in the Running state that started before a cutoff.
#
# before - selects the cutoff time:
#          'reboot'            : boot time of this host, taken from the
#                                "btime" line of /proc/stat
#          other non-nil value : parsed by Time.parse, with the current
#                                time supplying any missing fields
#          nil (default)       : db_current_time
#
# If crunch_job_wrapper is :slurm_immediate, each squeue entry whose job
# name equals a stale job's uuid is passed to scancel before that job is
# handed to fail_job.
#
# Runs inside act_as_system_user. Raises RuntimeError when before is
# 'reboot' and no btime value can be read from /proc/stat.
def fail_jobs(before: nil)
  act_as_system_user do
    threshold =
      if before == 'reboot'
        boottime = nil
        # Block form of File.foreach closes /proc/stat once iteration
        # finishes, so no file handle is left open.
        File.foreach('/proc/stat') do |line|
          stat, t = line.split
          boottime = t if stat == 'btime'
        end
        raise "Could not find btime in /proc/stat" unless boottime
        Time.at(boottime.to_i)
      elsif before
        Time.parse(before, Time.now)
      else
        db_current_time
      end
    Rails.logger.info "fail_jobs: threshold is #{threshold}"

    # slurm job name => [slurm job id, ...], in squeue output order.
    slurm_ids_by_name = {}
    if Rails.configuration.crunch_job_wrapper == :slurm_immediate
      # Block form of popen closes the pipe and waits for squeue to exit.
      # Each output line is "<slurm_job_id> <slurm_job_name>".
      IO.popen(['squeue', '-h', '-o', '%i %j'], &:readlines).each do |line|
        slurm_id, slurm_name = line.strip.split(' ', 2)
        (slurm_ids_by_name[slurm_name] ||= []) << slurm_id
      end
    end

    Job.where('state = ? and started_at < ?', Job::Running, threshold).
      each do |job|
      Rails.logger.debug "fail_jobs: #{job.uuid} started #{job.started_at}"
      slurm_ids_by_name.fetch(job.uuid, []).each do |slurm_id|
        Rails.logger.info "fail_jobs: scancel #{slurm_id} for #{job.uuid}"
        scancel slurm_id
      end
      fail_job(job, "cleaned up stale job: started before #{threshold}",
               skip_lock: true)
    end
  end
end