Merge branch 'master' into 2955-fail-orphan-jobs
author Peter Amstutz <peter.amstutz@curoverse.com>
Fri, 6 Jun 2014 14:31:18 +0000 (10:31 -0400)
committer Peter Amstutz <peter.amstutz@curoverse.com>
Fri, 6 Jun 2014 14:31:18 +0000 (10:31 -0400)
sdk/cli/bin/arv-run-pipeline-instance
services/api/script/cancel_stale_jobs.rb [new file with mode: 0755]
services/api/script/crunch-dispatch.rb

index fc636dff507213ac09bdfc2a9c90c4d7fd62a126..e9b3f00b61e84004a2820cedb6f472e96126c3b2 100755 (executable)
@@ -578,7 +578,7 @@ class WhRunPipelineInstance
     failed = 0
     @components.each do |cname, c|
       if c[:job]
-        if c[:job][:finished_at]
+        if c[:job][:finished_at] or c[:job][:cancelled_at] or (c[:job][:running] == false and c[:job][:success] == false)
           ended += 1
           if c[:job][:success] == true
             succeeded += 1
diff --git a/services/api/script/cancel_stale_jobs.rb b/services/api/script/cancel_stale_jobs.rb
new file mode 100755 (executable)
index 0000000..dde4cbe
--- /dev/null
@@ -0,0 +1,37 @@
+#!/usr/bin/env ruby
+# Optional guard: when CRUNCH_DISPATCH_LOCKFILE is set, hold an exclusive lock on that file or abort.
+if ENV["CRUNCH_DISPATCH_LOCKFILE"]
+  lockfilename = ENV.delete "CRUNCH_DISPATCH_LOCKFILE" # read the path and remove the variable from ENV
+  lockfile = File.open(lockfilename, File::RDWR|File::CREAT, 0644) # created if missing
+  unless lockfile.flock File::LOCK_EX|File::LOCK_NB # non-blocking: do not wait for the current holder
+    abort "Lock unavailable on #{lockfilename} - exit"
+  end
+end
+
+ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development" # first command-line argument wins
+
+require File.dirname(__FILE__) + '/../config/boot' # loaded only after RAILS_ENV has been chosen above
+require File.dirname(__FILE__) + '/../config/environment'
+
+def cancel_stale_jobs
+  Job.running.each do |jobrecord|
+    f = Log.where("object_uuid=?", jobrecord.uuid).limit(1).order("created_at desc").first
+    if f
+      age = (Time.now - f.created_at)
+      if age > 300
+        $stderr.puts "dispatch: failing orphan job #{jobrecord.uuid}, last log is #{age} seconds old"
+        # job is marked running, but not known to crunch-dispatcher, and
+        # hasn't produced any log entries for 5 minutes, so mark it as failed.
+        jobrecord.running = false
+        jobrecord.cancelled_at ||= Time.now
+        jobrecord.finished_at ||= Time.now
+        if jobrecord.success.nil?
+          jobrecord.success = false
+        end
+        jobrecord.save!
+      end
+    end
+  end
+end
+
+cancel_stale_jobs # script entry point: perform a single sweep over the running jobs
index 3ddf83da18ad878bc8f84efb5ab0810ecf3f6552..ee8076b305fad984aee6bdde12ef5cfe803e14d4 100755 (executable)
@@ -1,5 +1,7 @@
 #!/usr/bin/env ruby
 
+require 'trollop'
+
 include Process
 
 $warned = {}
@@ -20,6 +22,10 @@ if ENV["CRUNCH_DISPATCH_LOCKFILE"]
   end
 end
 
+$trollopts = Trollop::options do
+    opt :use_env, "Pass selected environment variables (PATH, PYTHONPATH, RUBYLIB, GEM_PATH, PERLLIB) to crunch-job"
+end
+
 ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
 
 require File.dirname(__FILE__) + '/../config/boot'
@@ -180,10 +186,10 @@ class Dispatcher
       cmd_args << '--git-dir'
       cmd_args << arvados_internal
 
-      $stderr.puts "dispatch: #{cmd_args.join ' '}"
+      $stderr.puts "dispatch: #{cmd_args}"
 
       begin
-        i, o, e, t = Open3.popen3(*cmd_args)
+        i, o, e, t = Open3.popen3({}, *cmd_args, { :unsetenv_others => true})
       rescue
         $stderr.puts "dispatch: popen3: #{$!}"
         sleep 1