+ def fail_jobs before: nil
+ act_as_system_user do
+ threshold = nil
+ if before == 'reboot'
+ boottime = nil
+ open('/proc/stat').map(&:split).each do |stat, t|
+ if stat == 'btime'
+ boottime = t
+ end
+ end
+ if not boottime
+ raise "Could not find btime in /proc/stat"
+ end
+ threshold = Time.at(boottime.to_i)
+ elsif before
+ threshold = Time.parse(before, Time.now)
+ else
+ threshold = db_current_time
+ end
+ Rails.logger.info "fail_jobs: threshold is #{threshold}"
+
+ squeue = squeue_jobs
+ Job.where('state = ? and started_at < ?', Job::Running, threshold).
+ each do |job|
+ Rails.logger.debug "fail_jobs: #{job.uuid} started #{job.started_at}"
+ squeue.each do |slurm_name|
+ if slurm_name == job.uuid
+ Rails.logger.info "fail_jobs: scancel #{job.uuid}"
+ scancel slurm_name
+ end
+ end
+ fail_job(job, "cleaned up stale job: started before #{threshold}",
+ skip_lock: true)
+ end
+ end
+ end
+
+ def check_orphaned_slurm_jobs
+ act_as_system_user do
+ squeue_uuids = squeue_jobs.select{|uuid| uuid.match(HasUuid::UUID_REGEX)}.
+ select{|uuid| !@running.has_key?(uuid)}
+
+ return if squeue_uuids.size == 0
+
+ scancel_uuids = squeue_uuids - Job.where('uuid in (?) and (state in (?) or modified_at>?)',
+ squeue_uuids,
+ ['Running', 'Queued'],
+ (Time.now - 60)).
+ collect(&:uuid)
+ scancel_uuids.each do |uuid|
+ Rails.logger.info "orphaned job: scancel #{uuid}"
+ scancel uuid
+ end
+ end
+ end
+
+ def sudo_preface
+ return [] if not Server::Application.config.crunch_job_user
+ ["sudo", "-E", "-u",
+ Server::Application.config.crunch_job_user,
+ "LD_LIBRARY_PATH=#{ENV['LD_LIBRARY_PATH']}",
+ "PATH=#{ENV['PATH']}",
+ "PERLLIB=#{ENV['PERLLIB']}",
+ "PYTHONPATH=#{ENV['PYTHONPATH']}",
+ "RUBYLIB=#{ENV['RUBYLIB']}",
+ "GEM_PATH=#{ENV['GEM_PATH']}"]
+ end
+