2955: Moved logic to clean jobs table from crunch-dispatch into a standalone
author Peter Amstutz <peter.amstutz@curoverse.com>
Thu, 5 Jun 2014 16:43:09 +0000 (12:43 -0400)
committer Peter Amstutz <peter.amstutz@curoverse.com>
Thu, 5 Jun 2014 16:43:09 +0000 (12:43 -0400)
script.

services/api/script/clean_orphan_jobs.rb [new file with mode: 0755]
services/api/script/crunch-dispatch.rb

diff --git a/services/api/script/clean_orphan_jobs.rb b/services/api/script/clean_orphan_jobs.rb
new file mode 100755 (executable)
index 0000000..35673c3
--- /dev/null
+++ b/services/api/script/clean_orphan_jobs.rb
@@ -0,0 +1,37 @@
+#!/usr/bin/env ruby
+# If CRUNCH_DISPATCH_LOCKFILE is set, take an exclusive, non-blocking lock on that file or abort.
+if ENV["CRUNCH_DISPATCH_LOCKFILE"]
+  lockfilename = ENV.delete "CRUNCH_DISPATCH_LOCKFILE"  # ENV.delete also removes the variable from ENV
+  lockfile = File.open(lockfilename, File::RDWR|File::CREAT, 0644)  # created if missing; never closed below
+  unless lockfile.flock File::LOCK_EX|File::LOCK_NB  # LOCK_NB: flock returns false instead of waiting
+    abort "Lock unavailable on #{lockfilename} - exit"
+  end
+end
+# Environment name: first command-line argument, else the inherited RAILS_ENV, else "development".
+ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
+# Load config/boot and config/environment from the directory above this script, after RAILS_ENV is set.
+require File.dirname(__FILE__) + '/../config/boot'
+require File.dirname(__FILE__) + '/../config/environment'
+
+# Fail running jobs that have stopped producing log entries.
+#
+# For every job returned by Job.running, looks up the newest Log row for the
+# job's uuid; jobs with no log rows at all are left untouched.
+def refresh_running
+  Job.running.each do |jobrecord|
+    latest_log = Log.where("object_uuid=?", jobrecord.uuid).order("created_at desc").limit(1).first
+    next unless latest_log
+    age = Time.now - latest_log.created_at
+    next unless age > 300
+    $stderr.puts "dispatch: failing orphan job #{jobrecord.uuid}, last log is #{age} seconds old"
+    # The job is flagged as running but has logged nothing for more than
+    # 5 minutes: stop it, keeping any timestamps and result already recorded.
+    jobrecord.running = false
+    jobrecord.cancelled_at ||= Time.now
+    jobrecord.finished_at ||= Time.now
+    jobrecord.success = false if jobrecord.success.nil?
+    jobrecord.save!
+  end
+end
+
+refresh_running
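diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 9a8280fac5d0a43937432b9a82e61394b53d3ed0..87acb651a4347c2900689c296eb20b108f35e213 100755 (executable)
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb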
@@ -41,27 +41,6 @@ class Dispatcher
     return act_as_system_user
   end
 
-  def refresh_running
-    Job.running.each do |jobrecord|
-      if !@running[jobrecord.uuid]
-        f = Log.where("object_uuid=?", jobrecord.uuid).limit(1).order("created_at desc").first
-        age = (Time.now - f.created_at)
-        if age > 300
-          $stderr.puts "dispatch: failing orphan job #{jobrecord.uuid}, last log is #{age} seconds old"
-          # job is marked running, but not known to crunch-dispatcher, and
-          # hasn't produced any log entries for 5 minutes, so mark it as failed.
-          jobrecord.running = false
-          jobrecord.canceled_at ||= Time.now
-          jobrecord.finished_at ||= Time.now
-          if jobrecord.success.nil?
-            jobrecord.success = false
-          end
-          jobrecord.save!
-        end
-      end
-    end
-  end
-
   def refresh_todo
     @todo = Job.queue.select do |j| j.repository end
     @todo_pipelines = PipelineInstance.queue
@@ -409,7 +388,6 @@ class Dispatcher
           end
         end
       else
-        refresh_running unless did_recently(:refresh_running, 60.0)
         refresh_todo unless did_recently(:refresh_todo, 1.0)
         update_node_status
         unless @todo.empty? or did_recently(:start_jobs, 1.0) or $signal[:term]