From f99ffd6844ae9b88cfd17c8fe6e9fcc85a55d79c Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Fri, 22 Nov 2013 15:34:31 -0800 Subject: [PATCH] Allow users to cancel a running crunch job by updating cancelled_at (to something other than nil). Also: * Add test fixtures for jobs * Take care to update the Job object piecemeal during crunch-job (avoid overwriting cancelled_at) --- sdk/cli/bin/crunch-job | 84 +++++++++++-------- sdk/perl/lib/Arvados/ResourceProxy.pm | 13 +++ services/api/Gemfile | 2 + services/api/Gemfile.lock | 2 + services/api/app/models/job.rb | 35 +++++++- .../environments/development.rb.example | 4 + .../config/environments/production.rb.example | 4 + .../api/config/environments/test.rb.example | 4 + services/api/script/crunch-dispatch.rb | 29 +++++++ services/api/test/fixtures/api_clients.yml | 2 + services/api/test/fixtures/jobs.yml | 41 +++++++++ .../arvados/v1/jobs_controller_test.rb | 52 ++++++++++++ 12 files changed, 235 insertions(+), 37 deletions(-) create mode 100644 services/api/test/fixtures/jobs.yml diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index c2738e224f..c2e87d2db5 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -58,7 +58,8 @@ Save a checkpoint and continue. =item SIGHUP Refresh node allocation (i.e., check whether any nodes have been added -or unallocated). Currently this is a no-op. +or unallocated) and attributes of the Job record that should affect +behavior (e.g., cancel job if cancelled_at becomes non-nil). =back @@ -107,10 +108,6 @@ my $job_has_uuid = $jobspec =~ /^[-a-z\d]+$/; my $local_job = !$job_has_uuid; -$SIG{'HUP'} = sub -{ - 1; -}; $SIG{'USR1'} = sub { $main::ENV{CRUNCH_DEBUG} = 1; @@ -257,20 +254,17 @@ my $jobmanager_id; if ($job_has_uuid) { # Claim this job, and make sure nobody else does - - $Job->{'is_locked_by_uuid'} = $User->{'uuid'}; - $Job->{'started_at'} = gmtime; - $Job->{'running'} = 1; - $Job->{'success'} = undef; - $Job->{'tasks_summary'} = { 'failed' => 0, - 'todo' => 1, - 'running' => 0, - 'done' => 0 }; - if ($job_has_uuid) { - unless ($Job->save() && $Job->{'is_locked_by_uuid'} == $User->{'uuid'}) { - croak("Error while updating / locking job"); - } + unless ($Job->update_attributes('is_locked_by_uuid' => $User->{'uuid'}) && + $Job->{'is_locked_by_uuid'} == $User->{'uuid'}) { + croak("Error while updating / locking job"); } + $Job->update_attributes('started_at' => gmtime, + 'running' => 1, + 'success' => undef, + 'tasks_summary' => { 'failed' => 0, + 'todo' => 1, + 'running' => 0, + 'done' => 0 }); } @@ -281,9 +275,12 @@ $SIG{'TERM'} = \&croak; $SIG{'TSTP'} = sub { $main::please_freeze = 1; }; $SIG{'ALRM'} = sub { $main::please_info = 1; }; $SIG{'CONT'} = sub { $main::please_continue = 1; }; +$SIG{'HUP'} = sub { $main::please_refresh = 1; }; + $main::please_freeze = 0; $main::please_info = 0; $main::please_continue = 0; +$main::please_refresh = 0; my $jobsteps_must_output_keys = 0; # becomes 1 when any task outputs a key grep { $ENV{$1} = $2 if /^(NOCACHE.*?)=(.*)/ } split ("\n", $$Job{knobs}); @@ -421,7 +418,9 @@ else Log (undef, "Using commit $commit for tree-ish $treeish"); if ($commit ne $treeish) { $Job->{'script_version'} = $commit; - !$job_has_uuid or $Job->save() or croak("Error while updating job"); + !$job_has_uuid or + $Job->update_attributes('script_version' => $commit) or + croak("Error while updating job"); } } } @@ -609,6 +608,24 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) (@slot > @freeslot && $todo_ptr+1 > $#jobstep_todo)) { last THISROUND if $main::please_freeze; + if ($main::please_refresh) + { + $main::please_refresh = 0; + if ($job_has_uuid) { + $Job2 = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec); + for my $attr ('cancelled_at', + 'cancelled_by_user_uuid', + 'cancelled_by_client_uuid') { + $Job->{$attr} = $Job2->{$attr}; + } + if ($Job->{'cancelled_at'}) { + Log (undef, "Job cancelled at " . $Job->{cancelled_at} . + " by user " . $Job->{cancelled_by_user_uuid}); + $main::success = 0; + $main::please_freeze = 1; + } + } + } if ($main::please_info) { $main::please_info = 0; @@ -710,12 +727,12 @@ goto ONELEVEL if !defined $main::success; release_allocation(); freeze(); -$Job->reload; -$Job->{'output'} = &collate_output(); -$Job->{'running'} = 0; -$Job->{'success'} = $Job->{'output'} && $main::success; -$Job->{'finished_at'} = gmtime; -$Job->save if $job_has_uuid; +if ($job_has_uuid) { + $Job->update_attributes('output' => &collate_output(), + 'running' => 0, + 'success' => $Job->{'output'} && $main::success, + 'finished_at' => gmtime) +} if ($Job->{'output'}) { @@ -749,7 +766,9 @@ sub update_progress_stats $Job->{'tasks_summary'}->{'todo'} = $todo; $Job->{'tasks_summary'}->{'done'} = $done; $Job->{'tasks_summary'}->{'running'} = $running; - $Job->save if $job_has_uuid; + if ($job_has_uuid) { + $Job->update_attributes('tasks_summary' => $Job->{'tasks_summary'}); + } Log (undef, "status: $done done, $running running, $todo todo"); $progress_is_dirty = 0; } @@ -1036,8 +1055,7 @@ sub collate_output if ($joboutput) { Log (undef, "output $joboutput"); - $Job->{'output'} = $joboutput; - $Job->save if $job_has_uuid; + $Job->update_attributes('output' => $joboutput) if $job_has_uuid; } else { @@ -1129,11 +1147,9 @@ sub croak sub cleanup { return if !$job_has_uuid; - $Job->reload; - $Job->{'running'} = 0; - $Job->{'success'} = 0; - $Job->{'finished_at'} = gmtime; - $Job->save; + $Job->update_attributes('running' => 0, + 'success' => 0, + 'finished_at' => gmtime); } @@ -1147,7 +1163,7 @@ sub save_meta undef $metastream if !$justcheckpoint; # otherwise Log() will try to use it Log (undef, "meta key is $loglocator"); $Job->{'log'} = $loglocator; - $Job->save if $job_has_uuid; + $Job->update_attributes('log', $loglocator) if $job_has_uuid; } diff --git a/sdk/perl/lib/Arvados/ResourceProxy.pm b/sdk/perl/lib/Arvados/ResourceProxy.pm index c81d87effb..5127d0c9f6 100644 --- a/sdk/perl/lib/Arvados/ResourceProxy.pm +++ b/sdk/perl/lib/Arvados/ResourceProxy.pm @@ -21,6 +21,19 @@ sub save $self; } +sub update_attributes +{ + my $self = shift; + my %updates = @_; + $response = $self->{'resourceAccessor'}->{'update'}->execute('uuid' => $self->{'uuid'}, $self->resource_parameter_name() => \%updates); + foreach my $param (keys %updates) { + if (exists $response->{$param}) { + $self->{$param} = $response->{$param}; + } + } + $self; +} + sub reload { my $self = shift; diff --git a/services/api/Gemfile b/services/api/Gemfile index ae90f67731..59b16cc7ea 100644 --- a/services/api/Gemfile +++ b/services/api/Gemfile @@ -51,3 +51,5 @@ gem 'omniauth-oauth2', '1.1.1' gem 'andand' gem 'redis' + +gem 'test_after_commit', :group => :test diff --git a/services/api/Gemfile.lock b/services/api/Gemfile.lock index 6e255078c3..3929125b37 100644 --- a/services/api/Gemfile.lock +++ b/services/api/Gemfile.lock @@ -137,6 +137,7 @@ GEM multi_json (~> 1.0) rack (~> 1.0) tilt (~> 1.1, != 1.3.0) + test_after_commit (0.2.2) therubyracer (0.12.0) libv8 (~> 3.16.14.0) ref @@ -168,5 +169,6 @@ DEPENDENCIES redis rvm-capistrano sass-rails (>= 3.2.0) + test_after_commit therubyracer uglifier (>= 1.0.3) diff --git a/services/api/app/models/job.rb b/services/api/app/models/job.rb index e37e292791..3ef52164d2 100644 --- a/services/api/app/models/job.rb +++ b/services/api/app/models/job.rb @@ -8,6 +8,7 @@ class Job < ArvadosModel before_create :ensure_unique_submit_id before_create :ensure_script_version_is_commit before_update :ensure_script_version_is_commit + after_commit :trigger_crunch_dispatch_if_cancelled, :on => :update has_many :commit_ancestors, :foreign_key => :descendant, :primary_key => :script_version @@ -104,9 +105,10 @@ class Job < ArvadosModel if script_changed? or script_parameters_changed? or script_version_changed? or - cancelled_by_client_changed? or - cancelled_by_user_changed? or - cancelled_at_changed? or + (!cancelled_at_was.nil? and + (cancelled_by_client_changed? or + cancelled_by_user_changed? or + cancelled_at_changed?)) or started_at_changed? or finished_at_changed? or running_changed? or @@ -135,4 +137,31 @@ class Job < ArvadosModel end end end + + def update_modified_by_fields + if self.cancelled_at_changed? + # Ensure cancelled_at cannot be set to arbitrary non-now times, + # or changed once it is set. + if self.cancelled_at and not self.cancelled_at_was + self.cancelled_at = Time.now + self.cancelled_by_user_uuid = current_user.uuid + self.cancelled_by_client_uuid = current_api_client.uuid + @need_crunch_dispatch_trigger = true + else + self.cancelled_at = self.cancelled_at_was + self.cancelled_by_user_uuid = self.cancelled_by_user_uuid_was + self.cancelled_by_client_uuid = self.cancelled_by_client_uuid_was + end + end + super + end + + def trigger_crunch_dispatch_if_cancelled + if @need_crunch_dispatch_trigger + File.open(Rails.configuration.crunch_dispatch_hup_trigger, 'wb') do + # That's all, just create a file for crunch-dispatch to see. + end + end + end + end diff --git a/services/api/config/environments/development.rb.example b/services/api/config/environments/development.rb.example index 932cc4e4f5..48daed2125 100644 --- a/services/api/config/environments/development.rb.example +++ b/services/api/config/environments/development.rb.example @@ -43,6 +43,10 @@ Server::Application.configure do config.crunch_job_wrapper = :none config.crunch_job_user = 'crunch' # if false, do not set uid when running jobs + # The web service must be able to create this file, and + # crunch_dispatch.rb must be able to unlink it. + config.crunch_dispatch_hup_trigger = '/tmp/crunch_dispatch_hup_trigger' + # config.dnsmasq_conf_dir = '/etc/dnsmasq.d' # config.compute_node_ami = 'ami-cbca41a2' diff --git a/services/api/config/environments/production.rb.example b/services/api/config/environments/production.rb.example index 9147f4f7da..eadb204702 100644 --- a/services/api/config/environments/production.rb.example +++ b/services/api/config/environments/production.rb.example @@ -65,6 +65,10 @@ Server::Application.configure do config.crunch_job_wrapper = :slurm_immediate config.crunch_job_user = 'crunch' # if false, do not set uid when running jobs + # The web service must be able to create this file, and + # crunch_dispatch.rb must be able to unlink it. + config.crunch_dispatch_hup_trigger = '/tmp/crunch_dispatch_hup_trigger' + # config.dnsmasq_conf_dir = '/etc/dnsmasq.d' # config.compute_node_ami = 'ami-cbca41a2' diff --git a/services/api/config/environments/test.rb.example b/services/api/config/environments/test.rb.example index bb07cfbe90..e34bebc7b8 100644 --- a/services/api/config/environments/test.rb.example +++ b/services/api/config/environments/test.rb.example @@ -45,6 +45,10 @@ Server::Application.configure do config.crunch_job_wrapper = :slurm_immediate config.crunch_job_user = 'crunch' # if false, do not set uid when running jobs + # The web service must be able to create this file, and + # crunch_dispatch.rb must be able to unlink it. + config.crunch_dispatch_hup_trigger = '/tmp/crunch_dispatch_hup_trigger_test' + # config.dnsmasq_conf_dir = '/etc/dnsmasq.d' # config.compute_node_ami = 'ami-cbca41a2' diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb index 88840d53b6..dff68d867e 100755 --- a/services/api/script/crunch-dispatch.rb +++ b/services/api/script/crunch-dispatch.rb @@ -2,6 +2,7 @@ include Process +$warned = {} $signal = {} %w{TERM INT}.each do |sig| signame = sig @@ -10,6 +11,10 @@ $signal = {} $signal[:term] = true end end +Signal.trap('HUP') do + $stderr.puts "Received HUP signal" + $signal[:hup] = true +end if ENV["CRUNCH_DISPATCH_LOCKFILE"] lockfilename = ENV.delete "CRUNCH_DISPATCH_LOCKFILE" @@ -309,6 +314,30 @@ class Dispatcher end end else + if File.exists?(Rails.configuration.crunch_dispatch_hup_trigger) + begin + File.unlink(Rails.configuration.crunch_dispatch_hup_trigger) + $signal[:hup] = true + rescue Errno::ENOENT + $stderr.puts "Weird, hup_trigger file was deleted by someone else." + rescue Errno::EPERM + if not $warned[:hup_trigger_perm] + $warned[:hup_trigger_perm] = true + $stderr.puts "Install problem: I see the hup_trigger file but cannot delete it." + end + end + end + if $signal[:hup] + # Pass HUP through to all crunch-job processes. + @running.each do |uuid, j| + begin + Process.kill 'HUP', j[:wait_thr].pid + rescue Errno::ESRCH + # Process ended but hasn't been reaped. Nothing to do. + end + end + $signal.delete :hup + end refresh_todo unless did_recently(:refresh_todo, 1.0) update_node_status start_jobs unless @todo.empty? or did_recently(:start_jobs, 1.0) diff --git a/services/api/test/fixtures/api_clients.yml b/services/api/test/fixtures/api_clients.yml index 619482c765..beb061cff7 100644 --- a/services/api/test/fixtures/api_clients.yml +++ b/services/api/test/fixtures/api_clients.yml @@ -1,11 +1,13 @@ # Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/Fixtures.html trusted_workbench: + uuid: zzzzz-ozdt8-teyxzyd8qllg11h name: Official Workbench url_prefix: https://official-workbench.local/ is_trusted: true untrusted: + uuid: zzzzz-ozdt8-obw7foaks3qjyej name: Untrusted url_prefix: https://untrusted.local/ is_trusted: false diff --git a/services/api/test/fixtures/jobs.yml b/services/api/test/fixtures/jobs.yml new file mode 100644 index 0000000000..b6a08feb0b --- /dev/null +++ b/services/api/test/fixtures/jobs.yml @@ -0,0 +1,41 @@ +running: + uuid: zzzzz-8i9sb-pshmckwoma9plh7 + owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz + cancelled_at: ~ + cancelled_by_user_uuid: ~ + cancelled_by_client_uuid: ~ + started_at: <%= 3.minute.ago.to_s(:db) %> + finished_at: ~ + running: true + success: ~ + output: ~ + priority: ~ + log: ~ + is_locked_by_uuid: zzzzz-tpzed-xurymjxw79nv3jz + tasks_summary: + failed: 0 + todo: 3 + running: 1 + done: 1 + runtime_constraints: {} + +running_cancelled: + uuid: zzzzz-8i9sb-4cf0nhn6xte809j + owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz + cancelled_at: <%= 1.minute.ago.to_s(:db) %> + cancelled_by_user_uuid: zzzzz-tpzed-xurymjxw79nv3jz + cancelled_by_client_uuid: zzzzz-ozdt8-obw7foaks3qjyej + started_at: <%= 3.minute.ago.to_s(:db) %> + finished_at: ~ + running: true + success: ~ + output: ~ + priority: ~ + log: ~ + is_locked_by_uuid: zzzzz-tpzed-xurymjxw79nv3jz + tasks_summary: + failed: 0 + todo: 3 + running: 1 + done: 1 + runtime_constraints: {} diff --git a/services/api/test/functional/arvados/v1/jobs_controller_test.rb b/services/api/test/functional/arvados/v1/jobs_controller_test.rb index bfbb717f4d..45e66ca85c 100644 --- a/services/api/test/functional/arvados/v1/jobs_controller_test.rb +++ b/services/api/test/functional/arvados/v1/jobs_controller_test.rb @@ -1,4 +1,56 @@ require 'test_helper' class Arvados::V1::JobsControllerTest < ActionController::TestCase + + test "submit a job" do + authorize_with :active + post :create, job: { + script: "hash", + script_version: "master", + script_parameters: {} + } + assert_response :success + assert_not_nil assigns(:object) + new_job = JSON.parse(@response.body) + assert_not_nil new_job['uuid'] + end + + test "cancel a running job" do + # We need to verify that "cancel" creates a trigger file, so first + # let's make sure there is no stale trigger file. + begin + File.unlink(Rails.configuration.crunch_dispatch_hup_trigger) + rescue Errno::ENOENT + end + + authorize_with :active + put :update, { + id: jobs(:running).uuid, + job: { + cancelled_at: 4.day.ago + } + } + assert_response :success + assert_not_nil assigns(:object) + job = JSON.parse(@response.body) + assert_not_nil job['uuid'] + assert_not_nil job['cancelled_at'] + assert_not_nil job['cancelled_by_user_uuid'] + assert_not_nil job['cancelled_by_client_uuid'] + assert_equal(true, job['cancelled_at'] > 1.minute.ago, + 'bogus cancelled_at corrected by server') + assert_equal(true, + File.exists?(Rails.configuration.crunch_dispatch_hup_trigger), + 'trigger file should be created when job is cancelled') + + put :update, { + id: jobs(:running).uuid, + job: { + cancelled_at: nil + } + } + job = JSON.parse(@response.body) + assert_not_nil job['cancelled_at'], 'un-cancelled job stays cancelled' + end + end -- 2.30.2