Allow users to cancel a running crunch job by updating cancelled_at
authorTom Clegg <tom@clinicalfuture.com>
Fri, 22 Nov 2013 23:34:31 +0000 (15:34 -0800)
committerTom Clegg <tom@clinicalfuture.com>
Fri, 22 Nov 2013 23:41:57 +0000 (15:41 -0800)
(to something other than nil).

Also:
* Add test fixtures for jobs
* Take care to update the Job object piecemeal during crunch-job
  (avoid overwriting cancelled_at)

12 files changed:
sdk/cli/bin/crunch-job
sdk/perl/lib/Arvados/ResourceProxy.pm
services/api/Gemfile
services/api/Gemfile.lock
services/api/app/models/job.rb
services/api/config/environments/development.rb.example
services/api/config/environments/production.rb.example
services/api/config/environments/test.rb.example
services/api/script/crunch-dispatch.rb
services/api/test/fixtures/api_clients.yml
services/api/test/fixtures/jobs.yml [new file with mode: 0644]
services/api/test/functional/arvados/v1/jobs_controller_test.rb

index c2738e224f6222613415bd7350c6d7dc33c50c97..c2e87d2db5a747c4cc62c108f31b99b954dcefbf 100755 (executable)
@@ -58,7 +58,8 @@ Save a checkpoint and continue.
 =item SIGHUP
 
 Refresh node allocation (i.e., check whether any nodes have been added
-or unallocated). Currently this is a no-op.
+or unallocated) and attributes of the Job record that should affect
+behavior (e.g., cancel job if cancelled_at becomes non-nil).
 
 =back
 
@@ -107,10 +108,6 @@ my $job_has_uuid = $jobspec =~ /^[-a-z\d]+$/;
 my $local_job = !$job_has_uuid;
 
 
-$SIG{'HUP'} = sub
-{
-  1;
-};
 $SIG{'USR1'} = sub
 {
   $main::ENV{CRUNCH_DEBUG} = 1;
@@ -257,20 +254,17 @@ my $jobmanager_id;
 if ($job_has_uuid)
 {
   # Claim this job, and make sure nobody else does
-
-  $Job->{'is_locked_by_uuid'} = $User->{'uuid'};
-  $Job->{'started_at'} = gmtime;
-  $Job->{'running'} = 1;
-  $Job->{'success'} = undef;
-  $Job->{'tasks_summary'} = { 'failed' => 0,
-                              'todo' => 1,
-                              'running' => 0,
-                              'done' => 0 };
-  if ($job_has_uuid) {
-    unless ($Job->save() && $Job->{'is_locked_by_uuid'} == $User->{'uuid'}) {
-      croak("Error while updating / locking job");
-    }
+  unless ($Job->update_attributes('is_locked_by_uuid' => $User->{'uuid'}) &&
+          $Job->{'is_locked_by_uuid'} == $User->{'uuid'}) {
+    croak("Error while updating / locking job");
   }
+  $Job->update_attributes('started_at' => gmtime,
+                          'running' => 1,
+                          'success' => undef,
+                          'tasks_summary' => { 'failed' => 0,
+                                               'todo' => 1,
+                                               'running' => 0,
+                                               'done' => 0 });
 }
 
 
@@ -281,9 +275,12 @@ $SIG{'TERM'} = \&croak;
 $SIG{'TSTP'} = sub { $main::please_freeze = 1; };
 $SIG{'ALRM'} = sub { $main::please_info = 1; };
 $SIG{'CONT'} = sub { $main::please_continue = 1; };
+$SIG{'HUP'} = sub { $main::please_refresh = 1; };
+
 $main::please_freeze = 0;
 $main::please_info = 0;
 $main::please_continue = 0;
+$main::please_refresh = 0;
 my $jobsteps_must_output_keys = 0;     # becomes 1 when any task outputs a key
 
 grep { $ENV{$1} = $2 if /^(NOCACHE.*?)=(.*)/ } split ("\n", $$Job{knobs});
@@ -421,7 +418,9 @@ else
        Log (undef, "Using commit $commit for tree-ish $treeish");
         if ($commit ne $treeish) {
           $Job->{'script_version'} = $commit;
-          !$job_has_uuid or $Job->save() or croak("Error while updating job");
+          !$job_has_uuid or
+              $Job->update_attributes('script_version' => $commit) or
+              croak("Error while updating job");
         }
       }
     }
@@ -609,6 +608,24 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
         (@slot > @freeslot && $todo_ptr+1 > $#jobstep_todo))
   {
     last THISROUND if $main::please_freeze;
+    if ($main::please_refresh)
+    {
+      $main::please_refresh = 0;
+      if ($job_has_uuid) {
+        $Job2 = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
+        for my $attr ('cancelled_at',
+                      'cancelled_by_user_uuid',
+                      'cancelled_by_client_uuid') {
+          $Job->{$attr} = $Job2->{$attr};
+        }
+        if ($Job->{'cancelled_at'}) {
+          Log (undef, "Job cancelled at " . $Job->{cancelled_at} .
+               " by user " . $Job->{cancelled_by_user_uuid});
+          $main::success = 0;
+          $main::please_freeze = 1;
+        }
+      }
+    }
     if ($main::please_info)
     {
       $main::please_info = 0;
@@ -710,12 +727,12 @@ goto ONELEVEL if !defined $main::success;
 
 release_allocation();
 freeze();
-$Job->reload;
-$Job->{'output'} = &collate_output();
-$Job->{'running'} = 0;
-$Job->{'success'} = $Job->{'output'} && $main::success;
-$Job->{'finished_at'} = gmtime;
-$Job->save if $job_has_uuid;
+if ($job_has_uuid) {
+  $Job->update_attributes('output' => &collate_output(),
+                          'running' => 0,
+                          'success' => $Job->{'output'} && $main::success,
+                          'finished_at' => gmtime)
+}
 
 if ($Job->{'output'})
 {
@@ -749,7 +766,9 @@ sub update_progress_stats
   $Job->{'tasks_summary'}->{'todo'} = $todo;
   $Job->{'tasks_summary'}->{'done'} = $done;
   $Job->{'tasks_summary'}->{'running'} = $running;
-  $Job->save if $job_has_uuid;
+  if ($job_has_uuid) {
+    $Job->update_attributes('tasks_summary' => $Job->{'tasks_summary'});
+  }
   Log (undef, "status: $done done, $running running, $todo todo");
   $progress_is_dirty = 0;
 }
@@ -1036,8 +1055,7 @@ sub collate_output
   if ($joboutput)
   {
     Log (undef, "output $joboutput");
-    $Job->{'output'} = $joboutput;
-    $Job->save if $job_has_uuid;
+    $Job->update_attributes('output' => $joboutput) if $job_has_uuid;
   }
   else
   {
@@ -1129,11 +1147,9 @@ sub croak
 sub cleanup
 {
   return if !$job_has_uuid;
-  $Job->reload;
-  $Job->{'running'} = 0;
-  $Job->{'success'} = 0;
-  $Job->{'finished_at'} = gmtime;
-  $Job->save;
+  $Job->update_attributes('running' => 0,
+                          'success' => 0,
+                          'finished_at' => gmtime);
 }
 
 
@@ -1147,7 +1163,7 @@ sub save_meta
   undef $metastream if !$justcheckpoint; # otherwise Log() will try to use it
   Log (undef, "meta key is $loglocator");
   $Job->{'log'} = $loglocator;
-  $Job->save if $job_has_uuid;
+  $Job->update_attributes('log', $loglocator) if $job_has_uuid;
 }
 
 
index c81d87effb346d4ff0734bf0e0ec58d93faa96a6..5127d0c9f678b9c4bf58d7a4b706fff524856208 100644 (file)
@@ -21,6 +21,19 @@ sub save
     $self;
 }
 
+sub update_attributes
+{
+    my $self = shift;
+    my %updates = @_;
+    $response = $self->{'resourceAccessor'}->{'update'}->execute('uuid' => $self->{'uuid'}, $self->resource_parameter_name() => \%updates);
+    foreach my $param (keys %updates) {
+        if (exists $response->{$param}) {
+            $self->{$param} = $response->{$param};
+        }
+    }
+    $self;
+}
+
 sub reload
 {
     my $self = shift;
index ae90f677317f55925bdb20efa444ae4b80b00689..59b16cc7eabb8029eac745d98d1acd6a64b7c9bf 100644 (file)
@@ -51,3 +51,5 @@ gem 'omniauth-oauth2', '1.1.1'
 
 gem 'andand'
 gem 'redis'
+
+gem 'test_after_commit', :group => :test
index 6e255078c3ecd2dcf29ecc2422613d32b0d5044b..3929125b3724a6156c99bb697077fd9124aafbdc 100644 (file)
@@ -137,6 +137,7 @@ GEM
       multi_json (~> 1.0)
       rack (~> 1.0)
       tilt (~> 1.1, != 1.3.0)
+    test_after_commit (0.2.2)
     therubyracer (0.12.0)
       libv8 (~> 3.16.14.0)
       ref
@@ -168,5 +169,6 @@ DEPENDENCIES
   redis
   rvm-capistrano
   sass-rails (>= 3.2.0)
+  test_after_commit
   therubyracer
   uglifier (>= 1.0.3)
index e37e29279111eeefc937958ab5f817e87607ce0b..3ef52164d203ddf732a04560e051cf65fc49a7e5 100644 (file)
@@ -8,6 +8,7 @@ class Job < ArvadosModel
   before_create :ensure_unique_submit_id
   before_create :ensure_script_version_is_commit
   before_update :ensure_script_version_is_commit
+  after_commit :trigger_crunch_dispatch_if_cancelled, :on => :update
 
   has_many :commit_ancestors, :foreign_key => :descendant, :primary_key => :script_version
 
@@ -104,9 +105,10 @@ class Job < ArvadosModel
       if script_changed? or
           script_parameters_changed? or
           script_version_changed? or
-          cancelled_by_client_changed? or
-          cancelled_by_user_changed? or
-          cancelled_at_changed? or
+          (!cancelled_at_was.nil? and
+           (cancelled_by_client_changed? or
+            cancelled_by_user_changed? or
+            cancelled_at_changed?)) or
           started_at_changed? or
           finished_at_changed? or
           running_changed? or
@@ -135,4 +137,31 @@ class Job < ArvadosModel
       end
     end
   end
+
+  def update_modified_by_fields
+    if self.cancelled_at_changed?
+      # Ensure cancelled_at cannot be set to arbitrary non-now times,
+      # or changed once it is set.
+      if self.cancelled_at and not self.cancelled_at_was
+        self.cancelled_at = Time.now
+        self.cancelled_by_user_uuid = current_user.uuid
+        self.cancelled_by_client_uuid = current_api_client.uuid
+        @need_crunch_dispatch_trigger = true
+      else
+        self.cancelled_at = self.cancelled_at_was
+        self.cancelled_by_user_uuid = self.cancelled_by_user_uuid_was
+        self.cancelled_by_client_uuid = self.cancelled_by_client_uuid_was
+      end
+    end
+    super
+  end
+
+  def trigger_crunch_dispatch_if_cancelled
+    if @need_crunch_dispatch_trigger
+      File.open(Rails.configuration.crunch_dispatch_hup_trigger, 'wb') do
+        # That's all, just create a file for crunch-dispatch to see.
+      end
+    end
+  end
+
 end
index 932cc4e4f53cd57cd7d3fa5a98f70dc5e29a361f..48daed212594189fdb04824bf07c86906ce51409 100644 (file)
@@ -43,6 +43,10 @@ Server::Application.configure do
   config.crunch_job_wrapper = :none
   config.crunch_job_user = 'crunch' # if false, do not set uid when running jobs
 
+  # The web service must be able to create this file, and
+  # crunch_dispatch.rb must be able to unlink it.
+  config.crunch_dispatch_hup_trigger = '/tmp/crunch_dispatch_hup_trigger'
+
   # config.dnsmasq_conf_dir = '/etc/dnsmasq.d'
 
   # config.compute_node_ami = 'ami-cbca41a2'
index 9147f4f7da9ec74411a1e7028d09cbe6099caf97..eadb2047022bcf22f051c97c16e997491d6682a6 100644 (file)
@@ -65,6 +65,10 @@ Server::Application.configure do
   config.crunch_job_wrapper = :slurm_immediate
   config.crunch_job_user = 'crunch' # if false, do not set uid when running jobs
 
+  # The web service must be able to create this file, and
+  # crunch_dispatch.rb must be able to unlink it.
+  config.crunch_dispatch_hup_trigger = '/tmp/crunch_dispatch_hup_trigger'
+
   # config.dnsmasq_conf_dir = '/etc/dnsmasq.d'
 
   # config.compute_node_ami = 'ami-cbca41a2'
index bb07cfbe903bb4f63a46bdc1dbd2ab0ee9298fe1..e34bebc7b83192c80995aa378fca37c9d18d7423 100644 (file)
@@ -45,6 +45,10 @@ Server::Application.configure do
   config.crunch_job_wrapper = :slurm_immediate
   config.crunch_job_user = 'crunch' # if false, do not set uid when running jobs
 
+  # The web service must be able to create this file, and
+  # crunch_dispatch.rb must be able to unlink it.
+  config.crunch_dispatch_hup_trigger = '/tmp/crunch_dispatch_hup_trigger_test'
+
   # config.dnsmasq_conf_dir = '/etc/dnsmasq.d'
 
   # config.compute_node_ami = 'ami-cbca41a2'
index 88840d53b69a4358e6686b7e3f99b154dd5ddc31..dff68d867e668dbee9cd3f088d8a88319360016f 100755 (executable)
@@ -2,6 +2,7 @@
 
 include Process
 
+$warned = {}
 $signal = {}
 %w{TERM INT}.each do |sig|
   signame = sig
@@ -10,6 +11,10 @@ $signal = {}
     $signal[:term] = true
   end
 end
+Signal.trap('HUP') do
+  $stderr.puts "Received HUP signal"
+  $signal[:hup] = true
+end
 
 if ENV["CRUNCH_DISPATCH_LOCKFILE"]
   lockfilename = ENV.delete "CRUNCH_DISPATCH_LOCKFILE"
@@ -309,6 +314,30 @@ class Dispatcher
           end
         end
       else
+        if File.exists?(Rails.configuration.crunch_dispatch_hup_trigger)
+          begin
+            File.unlink(Rails.configuration.crunch_dispatch_hup_trigger)
+            $signal[:hup] = true
+          rescue Errno::ENOENT
+            $stderr.puts "Weird, hup_trigger file was deleted by someone else."
+          rescue Errno::EPERM
+            if not $warned[:hup_trigger_perm]
+              $warned[:hup_trigger_perm] = true
+              $stderr.puts "Install problem: I see the hup_trigger file but cannot delete it."
+            end
+          end
+        end
+        if $signal[:hup]
+          # Pass HUP through to all crunch-job processes.
+          @running.each do |uuid, j|
+            begin
+              Process.kill 'HUP', j[:wait_thr].pid
+            rescue Errno::ESRCH
+              # Process ended but hasn't been reaped. Nothing to do.
+            end
+          end
+          $signal.delete :hup
+        end
         refresh_todo unless did_recently(:refresh_todo, 1.0)
         update_node_status
         start_jobs unless @todo.empty? or did_recently(:start_jobs, 1.0)
index 619482c765bc5587052e672309daaca5728a6bd0..beb061cff79a1245cf292c7a25841e39964eac95 100644 (file)
@@ -1,11 +1,13 @@
 # Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/Fixtures.html
 
 trusted_workbench:
+  uuid: zzzzz-ozdt8-teyxzyd8qllg11h
   name: Official Workbench
   url_prefix: https://official-workbench.local/
   is_trusted: true
 
 untrusted:
+  uuid: zzzzz-ozdt8-obw7foaks3qjyej
   name: Untrusted
   url_prefix: https://untrusted.local/
   is_trusted: false
diff --git a/services/api/test/fixtures/jobs.yml b/services/api/test/fixtures/jobs.yml
new file mode 100644 (file)
index 0000000..b6a08fe
--- /dev/null
@@ -0,0 +1,41 @@
+running:
+  uuid: zzzzz-8i9sb-pshmckwoma9plh7
+  owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  cancelled_at: ~
+  cancelled_by_user_uuid: ~
+  cancelled_by_client_uuid: ~
+  started_at: <%= 3.minute.ago.to_s(:db) %>
+  finished_at: ~
+  running: true
+  success: ~
+  output: ~
+  priority: ~
+  log: ~
+  is_locked_by_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  tasks_summary:
+    failed: 0
+    todo: 3
+    running: 1
+    done: 1
+  runtime_constraints: {}
+
+running_cancelled:
+  uuid: zzzzz-8i9sb-4cf0nhn6xte809j
+  owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  cancelled_at: <%= 1.minute.ago.to_s(:db) %>
+  cancelled_by_user_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  cancelled_by_client_uuid: zzzzz-ozdt8-obw7foaks3qjyej
+  started_at: <%= 3.minute.ago.to_s(:db) %>
+  finished_at: ~
+  running: true
+  success: ~
+  output: ~
+  priority: ~
+  log: ~
+  is_locked_by_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  tasks_summary:
+    failed: 0
+    todo: 3
+    running: 1
+    done: 1
+  runtime_constraints: {}
index bfbb717f4dda742a039ca2af43422d648fad876a..45e66ca85caf2b246612247c1fb77fc13d71bcb5 100644 (file)
@@ -1,4 +1,56 @@
 require 'test_helper'
 
 class Arvados::V1::JobsControllerTest < ActionController::TestCase
+
+  test "submit a job" do
+    authorize_with :active
+    post :create, job: {
+      script: "hash",
+      script_version: "master",
+      script_parameters: {}
+    }
+    assert_response :success
+    assert_not_nil assigns(:object)
+    new_job = JSON.parse(@response.body)
+    assert_not_nil new_job['uuid']
+  end
+
+  test "cancel a running job" do
+    # We need to verify that "cancel" creates a trigger file, so first
+    # let's make sure there is no stale trigger file.
+    begin
+      File.unlink(Rails.configuration.crunch_dispatch_hup_trigger)
+    rescue Errno::ENOENT
+    end
+
+    authorize_with :active
+    put :update, {
+      id: jobs(:running).uuid,
+      job: {
+        cancelled_at: 4.day.ago
+      }
+    }
+    assert_response :success
+    assert_not_nil assigns(:object)
+    job = JSON.parse(@response.body)
+    assert_not_nil job['uuid']
+    assert_not_nil job['cancelled_at']
+    assert_not_nil job['cancelled_by_user_uuid']
+    assert_not_nil job['cancelled_by_client_uuid']
+    assert_equal(true, job['cancelled_at'] > 1.minute.ago,
+                 'bogus cancelled_at corrected by server')
+    assert_equal(true,
+                 File.exists?(Rails.configuration.crunch_dispatch_hup_trigger),
+                 'trigger file should be created when job is cancelled')
+
+    put :update, {
+      id: jobs(:running).uuid,
+      job: {
+        cancelled_at: nil
+      }
+    }
+    job = JSON.parse(@response.body)
+    assert_not_nil job['cancelled_at'], 'un-cancelled job stays cancelled'
+  end
+
 end