Merge branch 'master' into 8654-arv-jobs-cwl-runner
author: Peter Amstutz <peter.amstutz@curoverse.com>
Mon, 28 Mar 2016 13:37:37 +0000 (09:37 -0400)
committer: Peter Amstutz <peter.amstutz@curoverse.com>
Mon, 28 Mar 2016 13:37:37 +0000 (09:37 -0400)
Conflicts:
docker/jobs/Dockerfile

14 files changed:
apps/workbench/app/views/pipeline_instances/_running_component.html.erb
build/run-build-docker-jobs-image.sh
build/run-build-packages-one-target.sh
build/run-build-packages.sh
docker/build_tools/Makefile
docker/jobs/Dockerfile
services/api/app/controllers/arvados/v1/api_client_authorizations_controller.rb
services/api/app/controllers/arvados/v1/jobs_controller.rb
services/api/app/models/job.rb
services/api/test/functional/arvados/v1/api_client_authorizations_controller_test.rb
services/api/test/unit/job_test.rb
services/fuse/arvados_fuse/fusefile.py
services/keepstore/azure_blob_volume.go
services/nodemanager/arvnodeman/computenode/dispatch/slurm.py

index d4e0944b5c510e7bcd4216a4f9808090fea834c8..ded535ef3ad5109e81a33ea1fd9815cde8ac6905 100644 (file)
             <div class="col-md-6">
               <% queuetime = Time.now - Time.parse(current_job[:created_at].to_s) %>
               Queued for <%= render_runtime(queuetime, false) %>.
-              <% begin %>
-                <% if current_job[:queue_position] == 0 %>
-                  This job is next in the queue to run.
-                <% elsif current_job[:queue_position] == 1 %>
-                  There is 1 job in the queue ahead of this one.
-                <% elsif current_job[:queue_position] %>
-                  There are <%= current_job[:queue_position] %> jobs in the queue ahead of this one.
-                <% end %>
-              <% rescue %>
-              <% end %>
             </div>
           <% elsif current_job[:state] == "Running" %>
             <%# column offset 8 %>
index fcf849bc4df9fe7e991046b150305830a711cce9..15f788163ed604ea4592daae668c3dbe9e888923 100755 (executable)
@@ -54,21 +54,23 @@ do
     esac
 done
 
-
 EXITCODE=0
 
-COLUMNS=80
-
-title () {
-    printf "\n%*s\n\n" $(((${#title}+$COLUMNS)/2)) "********** $1 **********"
+exit_cleanly() {
+    trap - INT
+    report_outcomes
+    exit $EXITCODE
 }
 
+COLUMNS=80
+. $WORKSPACE/build/run-library.sh
+
 docker_push () {
     if [[ ! -z "$tags" ]]
     then
         for tag in $( echo $tags|tr "," " " )
         do
-             $DOCKER tag -f $1 $1:$tag
+             $DOCKER tag $1 $1:$tag
         done
     fi
 
@@ -82,17 +84,9 @@ docker_push () {
     done
 
     if [[ "$ECODE" != "0" ]]; then
-        title "!!!!!! docker push $* failed !!!!!!"
         EXITCODE=$(($EXITCODE + $ECODE))
     fi
-}
-
-timer_reset() {
-    t0=$SECONDS
-}
-
-timer() {
-    echo -n "$(($SECONDS - $t0))s"
+    checkexit $ECODE "docker push $*"
 }
 
 # Sanity check
@@ -131,15 +125,19 @@ rm -f config.yml
 # Get test config.yml file
 cp $HOME/docker/config.yml .
 
-./build.sh jobs-image
+if [[ ! -z "$tags" ]]; then
+  COMMIT=${tags/,*/} ./build.sh jobs-image
+else
+  ./build.sh jobs-image
+fi
 
 ECODE=$?
 
 if [[ "$ECODE" != "0" ]]; then
-    title "!!!!!! docker BUILD FAILED !!!!!!"
     EXITCODE=$(($EXITCODE + $ECODE))
 fi
 
+checkexit $ECODE "docker build"
 title "docker build complete (`timer`)"
 
 title "uploading images"
@@ -155,10 +153,10 @@ else
         docker login -u arvados
 
         docker_push arvados/jobs
-        title "upload arvados images complete (`timer`)"
+        title "upload arvados images finished (`timer`)"
     else
-        title "upload arvados images SKIPPED because no --upload option set"
+        title "upload arvados images SKIPPED because no --upload option set (`timer`)"
     fi
 fi
 
-exit $EXITCODE
+exit_cleanly
index c5e0a89e3827dd266974045d0ebe16718db7f344..e5e7f0bfa4308bf5fb9c49b3bc24a6b7164812a8 100755 (executable)
@@ -137,11 +137,11 @@ if test -z "$packages" ; then
     case "$TARGET" in
         centos6)
             packages="$packages python27-python-arvados-fuse
-                  python27-python-arvados-python-client"
+                  python27-python-arvados-python-client python27-python-arvados-cwl-runner"
             ;;
         *)
             packages="$packages python-arvados-fuse
-                  python-arvados-python-client"
+                  python-arvados-python-client python-arvados-cwl-runner"
             ;;
     esac
 fi
index 5690a29bb292c86abab26a2ad0c36f9ba15ac88f..6e086b91875efbdc3ee49c17820a72dc99799ab7 100755 (executable)
@@ -93,8 +93,8 @@ case "$TARGET" in
             oauth2client==1.5.2 pyasn1==0.1.7 pyasn1-modules==0.0.5 \
             rsa uritemplate httplib2 ws4py pykka six pyexecjs jsonschema \
             ciso8601 pycrypto backports.ssl_match_hostname llfuse==0.41.1 \
-            'pycurl<7.21.5' contextlib2 pyyaml 'rdflib>=4.2.0' 'rdflib-jsonld>=0.3.0' \
-            shellescape mistune)
+            'pycurl<7.21.5' contextlib2 pyyaml 'rdflib>=4.2.0' \
+            shellescape mistune typing avro)
         PYTHON3_BACKPORTS=(docker-py six requests websocket-client)
         ;;
     debian8)
@@ -107,8 +107,8 @@ case "$TARGET" in
             oauth2client==1.5.2 pyasn1==0.1.7 pyasn1-modules==0.0.5 \
             rsa uritemplate httplib2 ws4py pykka six pyexecjs jsonschema \
             ciso8601 pycrypto backports.ssl_match_hostname llfuse==0.41.1 \
-            'pycurl<7.21.5' pyyaml 'rdflib>=4.2.0' 'rdflib-jsonld>=0.3.0' \
-            shellescape mistune)
+            'pycurl<7.21.5' pyyaml 'rdflib>=4.2.0' \
+            shellescape mistune typing avro)
         PYTHON3_BACKPORTS=(docker-py six requests websocket-client)
         ;;
     ubuntu1204)
@@ -122,7 +122,7 @@ case "$TARGET" in
             rsa uritemplate httplib2 ws4py pykka six pyexecjs jsonschema \
             ciso8601 pycrypto backports.ssl_match_hostname llfuse==0.41.1 \
             contextlib2 'pycurl<7.21.5' pyyaml 'rdflib>=4.2.0' \
-            'rdflib-jsonld>=0.3.0' shellescape mistune)
+            shellescape mistune typing avro isodate)
         PYTHON3_BACKPORTS=(docker-py six requests websocket-client)
         ;;
     ubuntu1404)
@@ -134,7 +134,7 @@ case "$TARGET" in
         PYTHON_BACKPORTS=(pyasn1==0.1.7 pyasn1-modules==0.0.5 llfuse==0.41.1 ciso8601 \
             google-api-python-client==1.4.2 six uritemplate oauth2client==1.5.2 httplib2 \
             rsa 'pycurl<7.21.5' backports.ssl_match_hostname pyyaml 'rdflib>=4.2.0' \
-            'rdflib-jsonld>=0.3.0' shellescape mistune)
+            shellescape mistune typing avro)
         PYTHON3_BACKPORTS=(docker-py requests websocket-client)
         ;;
     centos6)
@@ -148,7 +148,8 @@ case "$TARGET" in
             rsa uritemplate httplib2 ws4py pykka six pyexecjs jsonschema \
             ciso8601 pycrypto backports.ssl_match_hostname 'pycurl<7.21.5' \
             python-daemon lockfile llfuse==0.41.1 'pbr<1.0' pyyaml \
-            'rdflib>=4.2.0' 'rdflib-jsonld>=0.3.0' shellescape mistune)
+            'rdflib>=4.2.0' shellescape mistune typing avro requests \
+            isodate pyparsing sparqlwrapper html5lib keepalive)
         PYTHON3_BACKPORTS=(docker-py six requests websocket-client)
         export PYCURL_SSL_LIBRARY=nss
         ;;
@@ -405,7 +406,7 @@ fpm_build $WORKSPACE/sdk/python "${PYTHON2_PKG_PREFIX}-arvados-python-client" 'C
 # cwl-runner
 cd $WORKSPACE/packages/$TARGET
 rm -rf "$WORKSPACE/sdk/cwl/build"
-fpm_build $WORKSPACE/sdk/cwl "${PYTHON2_PKG_PREFIX}-arvados-cwl-runner" 'Curoverse, Inc.' 'python' "$(awk '($1 == "Version:"){print $2}' $WORKSPACE/sdk/cwl/arvados_cwl_runner.egg-info/PKG-INFO)" "--url=https://arvados.org" "--description=The Arvados CWL runner"
+fpm_build $WORKSPACE/sdk/cwl "${PYTHON2_PKG_PREFIX}-arvados-cwl-runner" 'Curoverse, Inc.' 'python' "$(awk '($1 == "Version:"){print $2}' $WORKSPACE/sdk/cwl/arvados_cwl_runner.egg-info/PKG-INFO)" "--url=https://arvados.org" "--description=The Arvados CWL runner" --iteration 2
 
 # schema_salad. This is a python dependency of arvados-cwl-runner,
 # but we can't use the usual PYTHONPACKAGES way to build this package due to the
@@ -422,10 +423,13 @@ fpm_build $WORKSPACE/sdk/cwl "${PYTHON2_PKG_PREFIX}-arvados-cwl-runner" 'Curover
 # So we build this thing separately.
 #
 # Ward, 2016-03-17
-fpm --maintainer='Ward Vandewege <ward@curoverse.com>' -s python -t deb --exclude=*/dist-packages/tests/* --exclude=*/site-packages/tests/* --deb-ignore-iteration-in-dependencies -n python-schema-salad --iteration 1 --python-bin python2.7 --python-easyinstall easy_install-2.7 --python-package-name-prefix python --depends python2.7 -v 1.7.20160316203940 schema_salad
+fpm --maintainer='Ward Vandewege <ward@curoverse.com>' -s python -t $FORMAT --exclude=*/dist-packages/tests/* --exclude=*/site-packages/tests/* --deb-ignore-iteration-in-dependencies -n "${PYTHON2_PKG_PREFIX}-schema-salad" --iteration 1 --python-bin python2.7 --python-easyinstall "$EASY_INSTALL2" --python-package-name-prefix "$PYTHON2_PKG_PREFIX" --depends "$PYTHON2_PACKAGE" -v 1.7.20160316203940 schema_salad
 
 # And for cwltool we have the same problem as for schema_salad. Ward, 2016-03-17
-fpm --maintainer='Ward Vandewege <ward@curoverse.com>' -s python -t deb --exclude=*/dist-packages/tests/* --exclude=*/site-packages/tests/* --deb-ignore-iteration-in-dependencies -n python-cwltool --iteration 1 --python-bin python2.7 --python-easyinstall easy_install-2.7 --python-package-name-prefix python --depends python2.7 -v 1.0.20160316204054 cwltool
+fpm --maintainer='Ward Vandewege <ward@curoverse.com>' -s python -t $FORMAT --exclude=*/dist-packages/tests/* --exclude=*/site-packages/tests/* --deb-ignore-iteration-in-dependencies -n "${PYTHON2_PKG_PREFIX}-cwltool" --iteration 1 --python-bin python2.7 --python-easyinstall "$EASY_INSTALL2" --python-package-name-prefix "$PYTHON2_PKG_PREFIX" --depends "$PYTHON2_PACKAGE" -v 1.0.20160316204054 cwltool
+
+# FPM eats the trailing .0 in the python-rdflib-jsonld package when built with 'rdflib-jsonld>=0.3.0'. Force the version. Ward, 2016-03-25
+fpm --maintainer='Ward Vandewege <ward@curoverse.com>' -s python -t $FORMAT --exclude=*/dist-packages/tests/* --exclude=*/site-packages/tests/* --deb-ignore-iteration-in-dependencies --verbose --log info -n "${PYTHON2_PKG_PREFIX}-rdflib-jsonld" --iteration 1 --python-bin python2.7 --python-easyinstall "$EASY_INSTALL2" --python-package-name-prefix "$PYTHON2_PKG_PREFIX" --depends "$PYTHON2_PACKAGE" -v 0.3.0 rdflib-jsonld
 
 # The PAM module
 if [[ $TARGET =~ debian|ubuntu ]]; then
index 75702960133b3be2555b851babb9d5bf92e4c9a3..8a757d014e6f73919f3f74e8c5eef40faea20585 100644 (file)
@@ -217,7 +217,7 @@ keep-proxy-image: debian-arvados-image $(BUILD) $(KEEP_PROXY_DEPS)
        date >keep-proxy-image
 
 jobs-image: debian-arvados-image $(BUILD) $(JOBS_DEPS)
-       $(DOCKER_BUILD) -t arvados/jobs jobs
+       $(DOCKER_BUILD) --build-arg COMMIT=$(COMMIT) -t arvados/jobs jobs
        date >jobs-image
 
 java-bwa-samtools-image: jobs-image $(BUILD) $(JAVA_BWA_SAMTOOLS_DEPS)
index bbe7844b7dd8f5accd7745c331bf802271a2682a..d80c3a882defe43676476df144401eee64d97728 100644 (file)
@@ -6,13 +6,14 @@ ENV DEBIAN_FRONTEND noninteractive
 
 ADD apt.arvados.org.list /etc/apt/sources.list.d/
 RUN apt-key adv --keyserver pool.sks-keyservers.net --recv 1078ECD7
+RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3
 
-RUN apt-get update -q && apt-get install -qy git python-pip python-virtualenv python-arvados-python-client python-dev libcurl4-gnutls-dev nodejs python-arvados-cwl-runner
+ARG COMMIT=latest
+RUN echo $COMMIT && apt-get update -q
 
-RUN gpg --keyserver pool.sks-keyservers.net --recv-keys D39DC0E3
+RUN apt-get install -qy git python-pip python-virtualenv python-arvados-python-client python-dev libcurl4-gnutls-dev nodejs python-arvados-cwl-runner
 
 # Install dependencies and set up system.
-# The FUSE packages help ensure that we can install the Python SDK (arv-mount).
 RUN /usr/sbin/adduser --disabled-password \
       --gecos 'Crunch execution user' crunch && \
     /usr/bin/install --directory --owner=crunch --group=crunch --mode=0700 /keep /tmp/crunch-src /tmp/crunch-job
index 56d0d85a82b51b1c0b6e2af981f8053c267ebd88..83968be75262ae75a7f797945ae30cda527c6a31 100644 (file)
@@ -69,14 +69,27 @@ class Arvados::V1::ApiClientAuthorizationsController < ApplicationController
         val.is_a?(String) && (attr == 'uuid' || attr == 'api_token')
       }
     end
-    @objects = model_class.
-      includes(:user, :api_client).
-      where('user_id=?', current_user.id)
-    super
-    wanted_scopes.compact.each do |scope_list|
-      sorted_scopes = scope_list.sort
-      @objects = @objects.select { |auth| auth.scopes.sort == sorted_scopes }
+    @objects = model_class.where('user_id=?', current_user.id)
+    if wanted_scopes.compact.any?
+      # We can't filter on scopes effectively using AR/postgres.
+      # Instead we get the entire result set, do our own filtering on
+      # scopes to get a list of UUIDs, then start a new query
+      # (restricted to the selected UUIDs) so super can apply the
+      # offset/limit/order params in the usual way.
+      @request_limit = @limit
+      @request_offset = @offset
+      @limit = @objects.count
+      @offset = 0
+      super
+      wanted_scopes.compact.each do |scope_list|
+        sorted_scopes = scope_list.sort
+        @objects = @objects.select { |auth| auth.scopes.sort == sorted_scopes }
+      end
+      @limit = @request_limit
+      @offset = @request_offset
+      @objects = model_class.where('uuid in (?)', @objects.collect(&:uuid))
     end
+    super
   end
 
   def find_object_by_uuid
@@ -110,8 +123,10 @@ class Arvados::V1::ApiClientAuthorizationsController < ApplicationController
     # The @filters test here also prevents a non-trusted token from
     # filtering on its own scopes, and discovering whether any _other_
     # equally scoped tokens exist (403=yes, 200=no).
-    if (@objects.andand.count == 1 and
-        @objects.first.uuid == current_api_client_authorization.andand.uuid and
+    return forbidden if !@objects
+    full_set = @objects.except(:limit).except(:offset) if @objects
+    if (full_set.count == 1 and
+        full_set.first.uuid == current_api_client_authorization.andand.uuid and
         (@filters.map(&:first) & %w(uuid api_token)).any?)
       return true
     end
index f1ef2d824054f3a0dbe3bb338a966d3a00341b10..0190537e8bd918130e44f4855b713e29cb079099 100644 (file)
@@ -143,7 +143,7 @@ class Arvados::V1::JobsController < ApplicationController
               end
             end
           end
-          job_queue = Job.queue
+          job_queue = Job.queue.select(:uuid)
           n_queued_before_me = 0
           job_queue.each do |j|
             break if j.uuid == @job.uuid
@@ -152,7 +152,7 @@ class Arvados::V1::JobsController < ApplicationController
           yield "#{db_current_time}" \
             " job #{@job.uuid}" \
             " queue_position #{n_queued_before_me}" \
-            " queue_size #{job_queue.size}" \
+            " queue_size #{job_queue.count}" \
             " nodes_idle #{nodes_in_state[:idle]}" \
             " nodes_alloc #{nodes_in_state[:alloc]}\n"
           last_ack_at = db_current_time
index 6c24293334f6d4cc5af371b1b2b9a0d370466530..afaae26375b6e8b1659205c439306cbb2e1d4eea 100644 (file)
@@ -78,12 +78,13 @@ class Job < ArvadosModel
   end
 
   def queue_position
-    Job::queue.each_with_index do |job, index|
-      if job[:uuid] == self.uuid
-        return index
-      end
-    end
-    nil
+    # We used to report this accurately, but the implementation made queue
+    # API requests O(n**2) for the size of the queue.  See #8800.
+    # We've soft-disabled it because it's not clear we even want this
+    # functionality: now that we have Node Manager with support for multiple
+    # node sizes, "queue position" tells you very little about when a job will
+    # run.
+    state == Queued ? 0 : nil
   end
 
   def self.running
index 192e6b956dad89bb7e70dea714800a986f9574ab..9f0f555d55eb8963f8f13eb74ef9536d42694274 100644 (file)
@@ -38,9 +38,11 @@ class Arvados::V1::ApiClientAuthorizationsControllerTest < ActionController::Tes
     assert_response 403
   end
 
-  def assert_found_tokens(auth, search_params, *expected_tokens)
+  def assert_found_tokens(auth, search_params, expected)
     authorize_with auth
-    expected_tokens.map! { |name| api_client_authorizations(name).api_token }
+    expected_tokens = expected.map do |name|
+      api_client_authorizations(name).api_token
+    end
     get :index, search_params
     assert_response :success
     got_tokens = JSON.parse(@response.body)['items']
@@ -52,19 +54,26 @@ class Arvados::V1::ApiClientAuthorizationsControllerTest < ActionController::Tes
   # Three-tuples with auth to use, scopes to find, and expected tokens.
   # Make two tests for each tuple, one searching with where and the other
   # with filter.
-  [[:admin_trustedclient, [], :admin_noscope],
-   [:active_trustedclient, ["GET /arvados/v1/users"], :active_userlist],
+  [[:admin_trustedclient, [], [:admin_noscope]],
+   [:active_trustedclient, ["GET /arvados/v1/users"], [:active_userlist]],
    [:active_trustedclient,
     ["POST /arvados/v1/api_client_authorizations",
      "GET /arvados/v1/api_client_authorizations"],
-    :active_apitokens],
-  ].each do |auth, scopes, *expected|
+    [:active_apitokens]],
+  ].each do |auth, scopes, expected|
     test "#{auth.to_s} can find auths where scopes=#{scopes.inspect}" do
-      assert_found_tokens(auth, {where: {scopes: scopes}}, *expected)
+      assert_found_tokens(auth, {where: {scopes: scopes}}, expected)
     end
 
     test "#{auth.to_s} can find auths filtered with scopes=#{scopes.inspect}" do
-      assert_found_tokens(auth, {filters: [['scopes', '=', scopes]]}, *expected)
+      assert_found_tokens(auth, {filters: [['scopes', '=', scopes]]}, expected)
+    end
+
+    test "#{auth.to_s} offset works with filter scopes=#{scopes.inspect}" do
+      assert_found_tokens(auth, {
+                            offset: expected.length,
+                            filters: [['scopes', '=', scopes]]
+                          }, [])
     end
   end
 
@@ -112,6 +121,20 @@ class Arvados::V1::ApiClientAuthorizationsControllerTest < ActionController::Tes
       assert_response expect_list_response
       if expect_list_items
         assert_equal assigns(:objects).length, expect_list_items
+        assert_equal json_response['items_available'], expect_list_items
+      end
+    end
+
+    if expect_list_items
+      test "using '#{user}', list '#{token}' by uuid with offset" do
+        authorize_with user
+        get :index, {
+          filters: [['uuid','=',api_client_authorizations(token).uuid]],
+          offset: expect_list_items,
+        }
+        assert_response expect_list_response
+        assert_equal json_response['items_available'], expect_list_items
+        assert_equal json_response['items'].length, 0
       end
     end
 
@@ -123,6 +146,7 @@ class Arvados::V1::ApiClientAuthorizationsControllerTest < ActionController::Tes
       assert_response expect_list_response
       if expect_list_items
         assert_equal assigns(:objects).length, expect_list_items
+        assert_equal json_response['items_available'], expect_list_items
       end
     end
   end
index f16c8b2ec497e0caaaf5d78bdf5bb063b58badad..832338a3cc5de1ce742eb5088992ba91d4fe5fdc 100644 (file)
@@ -316,7 +316,6 @@ class JobTest < ActiveSupport::TestCase
 
     assert_not_nil job1.queue_position, "Expected non-nil queue position for job1"
     assert_not_nil job2.queue_position, "Expected non-nil queue position for job2"
-    assert_not_equal job1.queue_position, job2.queue_position
   end
 
   SDK_MASTER = "ca68b24e51992e790f29df5cc4bc54ce1da4a1c2"
index e731327dec5524432d0eb12c7d9dfc2b900fafd4..3f0e4932fddb181d84a17def278e21bd3035b6db 100644 (file)
@@ -95,6 +95,12 @@ class ObjectFile(StringFile):
         return self.object_uuid
 
     def update(self, obj=None):
+        if obj is None:
+            # TODO: retrieve the current record for self.object_uuid
+            # from the server. For now, at least don't crash when
+            # someone tells us it's a good time to update but doesn't
+            # pass us a fresh obj. See #8345
+            return
         self._mtime = convertTime(obj['modified_at']) if 'modified_at' in obj else 0
         self.contents = json.dumps(obj, indent=4, sort_keys=True) + "\n"
 
index 687c2fb36b7526bbbe498b86fd3c154d07ce9bd4..f08cebff63c65dc5cbbd941407602c262779708a 100644 (file)
@@ -257,7 +257,7 @@ func (v *AzureBlobVolume) Put(loc string, block []byte) error {
        if v.readonly {
                return MethodDisabledError
        }
-       return v.bsClient.CreateBlockBlobFromReader(v.containerName, loc, uint64(len(block)), bytes.NewReader(block))
+       return v.bsClient.CreateBlockBlobFromReader(v.containerName, loc, uint64(len(block)), bytes.NewReader(block), nil)
 }
 
 // Touch updates the last-modified property of a block blob.
index 4d70436801564e9a35675e95c18f33fddc125806..255e50a53018351ecf3f972d12002229f16954a2 100644 (file)
@@ -36,15 +36,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
     def _get_slurm_state(self):
         return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])
 
-    # The following methods retry on OSError.  This is intended to mitigate bug
-    # #6321 where fork() of node manager raises "OSError: [Errno 12] Cannot
-    # allocate memory" resulting in the untimely death of the shutdown actor
-    # and tends to result in node manager getting into a wedged state where it
-    # won't allocate new nodes or shut down gracefully.  The underlying causes
-    # of the excessive memory usage that result in the "Cannot allocate memory"
-    # error are still being investigated.
-
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
+    @RetryMixin._retry((subprocess.CalledProcessError,))
     def cancel_shutdown(self, reason):
         if self._nodename:
             if self._get_slurm_state() in self.SLURM_DRAIN_STATES:
@@ -56,14 +48,14 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
                 pass
         return super(ComputeNodeShutdownActor, self).cancel_shutdown(reason)
 
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
+    @RetryMixin._retry((subprocess.CalledProcessError,))
     @ShutdownActorBase._stop_if_window_closed
     def issue_slurm_drain(self):
         self._set_node_state('DRAIN', 'Reason=Node Manager shutdown')
         self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
         self._later.await_slurm_drain()
 
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
+    @RetryMixin._retry((subprocess.CalledProcessError,))
     @ShutdownActorBase._stop_if_window_closed
     def await_slurm_drain(self):
         output = self._get_slurm_state()