2955: crunch-dispatch now sends a clean environment to crunch-job. (cherry-picked...
[arvados.git] / services / api / script / crunch-dispatch.rb
index a9b75982036669471f7ee1c8cdb05879aa315a37..f49f21b04c1e529f40f5368f252d02938d33c18d 100755 (executable)
@@ -1,5 +1,7 @@
 #!/usr/bin/env ruby
 
+require 'trollop'
+
 include Process
 
 $warned = {}
@@ -20,6 +22,10 @@ if ENV["CRUNCH_DISPATCH_LOCKFILE"]
   end
 end
 
+$trollopts = Trollop::options do
+    opt :use_env, "Pass selected environment variables (PATH, PYTHONPATH, RUBYLIB, GEM_PATH, PERLLIB) to crunch-job"
+end
+
 ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
 
 require File.dirname(__FILE__) + '/../config/boot'
@@ -35,6 +41,24 @@ class Dispatcher
     return act_as_system_user
   end
 
+  def refresh_running
+    Job.running.each do |jobrecord|
+      if !@running[jobrecord.uuid]
+        f = Log.filter(["object_uuid", "=", jobrecord.uuid]).limit(1).order("created_at desc").results.first
+        if (Time.now - f.created_at) > 300
+          # job is marked running, but not known to crunch-dispatcher, and
+          # hasn't produced any log entries for 5 minutes, so mark it as failed.
+          jobrecord.running = false
+          jobrecord.finished_at ||= Time.now
+          if jobrecord.success.nil?
+            jobrecord.success = false
+          end
+          jobrecord.save!
+        end
+      end
+    end
+  end
+
   def refresh_todo
     @todo = Job.queue.select do |j| j.repository end
     @todo_pipelines = PipelineInstance.queue
@@ -134,9 +158,23 @@ class Dispatcher
       end
 
       if Server::Application.config.crunch_job_user
-        cmd_args.unshift("sudo", "-E", "-u",
-                         Server::Application.config.crunch_job_user,
-                         "PERLLIB=#{ENV['PERLLIB']}")
+        cmd_args.unshift("sudo", "-E", "-u", Server::Application.config.crunch_job_user)
+      end
+
+      cmd_args << "HOME=/dev/null"
+      cmd_args << "ARVADOS_API_HOST=#{ENV['ARVADOS_API_HOST']}"
+      cmd_args << "ARVADOS_API_HOST_INSECURE=#{ENV['ARVADOS_API_HOST_INSECURE']}" if ENV['ARVADOS_API_HOST_INSECURE']
+
+      ENV.each do |k, v|
+        cmd_args << "#{k}=#{v}" if k.starts_with? "CRUNCH_"
+      end
+
+      if $trollopts.use_env
+        cmd_args << "PATH=#{ENV['PATH']}"
+        cmd_args << "PYTHONPATH=#{ENV['PYTHONPATH']}"
+        cmd_args << "PERLLIB=#{ENV['PERLLIB']}"
+        cmd_args << "RUBYLIB=#{ENV['RUBYLIB']}"
+        cmd_args << "GEM_PATH=#{ENV['GEM_PATH']}"
       end
 
       job_auth = ApiClientAuthorization.
@@ -176,10 +214,10 @@ class Dispatcher
       cmd_args << '--git-dir'
       cmd_args << arvados_internal
 
-      $stderr.puts "dispatch: #{cmd_args.join ' '}"
+      $stderr.puts "dispatch: #{cmd_args}"
 
       begin
-        i, o, e, t = Open3.popen3(*cmd_args)
+        i, o, e, t = Open3.popen3({}, *cmd_args, { :unsetenv_others => true})
       rescue
         $stderr.puts "dispatch: popen3: #{$!}"
         sleep 1
@@ -368,6 +406,7 @@ class Dispatcher
           end
         end
       else
+        refresh_running unless did_recently(:refresh_running, 60.0)
         refresh_todo unless did_recently(:refresh_todo, 1.0)
         update_node_status
         unless @todo.empty? or did_recently(:start_jobs, 1.0) or $signal[:term]