closes #8936
author radhika <radhika@curoverse.com>
Thu, 21 Apr 2016 18:34:20 +0000 (14:34 -0400)
committer radhika <radhika@curoverse.com>
Thu, 21 Apr 2016 18:34:20 +0000 (14:34 -0400)
Merge branch '8936-ttl-in-signing-key'

31 files changed:
build/run-build-packages-one-target.sh
build/run-build-packages.sh
sdk/cli/bin/arv-run-pipeline-instance
sdk/cwl/setup.py
sdk/go/keepclient/collectionreader.go
sdk/go/keepclient/collectionreader_test.go
sdk/ruby/arvados.gemspec
sdk/ruby/lib/arvados.rb
sdk/ruby/lib/arvados/collection.rb
sdk/ruby/lib/arvados/keep.rb
sdk/ruby/test/test_keep_manifest.rb
services/crunch-run/crunchrun_test.go
services/fuse/tests/test_mount.py
services/keep-web/handler.go
services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
services/nodemanager/arvnodeman/computenode/dispatch/transitions.py [new file with mode: 0644]
services/nodemanager/arvnodeman/daemon.py
services/nodemanager/tests/test_computenode_dispatch.py
services/nodemanager/tests/test_computenode_dispatch_slurm.py
services/nodemanager/tests/test_daemon.py
services/nodemanager/tests/testutil.py
tools/arvbox/bin/arvbox
tools/arvbox/lib/arvbox/docker/Dockerfile.base
tools/arvbox/lib/arvbox/docker/api-setup.sh [new file with mode: 0755]
tools/arvbox/lib/arvbox/docker/common.sh
tools/arvbox/lib/arvbox/docker/service/api/run-service
tools/arvbox/lib/arvbox/docker/service/websockets/log/main/.gitstub [new file with mode: 0644]
tools/arvbox/lib/arvbox/docker/service/websockets/log/run [new symlink]
tools/arvbox/lib/arvbox/docker/service/websockets/run [new symlink]
tools/arvbox/lib/arvbox/docker/service/websockets/run-service [new file with mode: 0755]

index e5e7f0bfa4308bf5fb9c49b3bc24a6b7164812a8..7034e3fafe043357dc876f782d4137e6ab7eb4c1 100755 (executable)
@@ -130,6 +130,7 @@ if test -z "$packages" ; then
         crunchstat
         keepproxy
         keep-rsync
+        keep-block-check
         keepstore
         keep-web
         libarvados-perl"
index 5dcf3e94a137ef4ce2502d12313e2ae7f1a7e7f5..470b451111ccda90ffc756615aa83487dff7d042 100755 (executable)
@@ -392,6 +392,8 @@ package_go_binary services/crunchstat crunchstat \
     "Gather cpu/memory/network statistics of running Crunch jobs"
 package_go_binary tools/keep-rsync keep-rsync \
     "Copy all data from one set of Keep servers to another"
+package_go_binary tools/keep-block-check keep-block-check \
+    "Verify that all data from one set of Keep servers to another was copied"
 package_go_binary sdk/go/crunchrunner crunchrunner \
     "Crunchrunner executes a command inside a container and uploads the output"
 
index 70e2f42ede56ab9949f7ee9105c730b83a545966..6dc82c5a20b841b1aeb1400ecdaf7dd6c21d4ed5 100755 (executable)
@@ -17,7 +17,7 @@ begin
   require 'trollop'
   require 'google/api_client'
 rescue LoadError => l
-  puts $:
+  $stderr.puts $:
   abort <<-EOS
 #{$0}: fatal: #{l.message}
 Some runtime dependencies may be missing.
@@ -132,7 +132,7 @@ if $options[:instance]
     abort "#{$0}: syntax error: --instance cannot be combined with --template or --submit."
   end
 elsif not $options[:template]
-  puts "error: you must supply a --template or --instance."
+  $stderr.puts "error: you must supply a --template or --instance."
   p.educate
   abort
 end
index c061309f70ddb548ca1f543d4ffee54ba3447504..149c0bae860388dc37e9c6881318ae65807ab17c 100644 (file)
@@ -30,7 +30,7 @@ setup(name='arvados-cwl-runner',
           'bin/arvados-cwl-runner'
       ],
       install_requires=[
-          'cwltool>=1.0.20160325200114',
+          'cwltool==1.0.20160421140153',
           'arvados-python-client>=0.1.20160322001610'
       ],
       test_suite='tests',
index d2c171d96111af3e3c6922f73511427d0d41ae2a..bed60f499562a36c4585018932860fe35df34701 100644 (file)
@@ -209,6 +209,10 @@ GET:
                }
                var buf = make([]byte, fs.Offset+fs.Len)
                _, err = io.ReadFull(rdr, buf)
+               errClosing := rdr.Close()
+               if err == nil {
+                       err = errClosing
+               }
                if err != nil {
                        r.err = err
                        close(r.errNotNil)
index 58a047c55a053c14d8324266363fc7ad7fae33fa..2cc23738855dfeab3cd8ab2ef33cb27055a35fa1 100644 (file)
@@ -220,4 +220,5 @@ func (s *CollectionReaderUnit) TestCollectionReaderDataError(c *check.C) {
                c.Check(err, check.NotNil)
                c.Check(err, check.Not(check.Equals), io.EOF)
        }
+       c.Check(rdr.Close(), check.NotNil)
 }
index 3d090f4b5cc69aa23eec3dc057041c200024ed9a..355ed5d2d814b9542a6fca773a214de0e9af1292 100644 (file)
@@ -18,13 +18,15 @@ Gem::Specification.new do |s|
   s.files       = ["lib/arvados.rb", "lib/arvados/google_api_client.rb",
                    "lib/arvados/collection.rb", "lib/arvados/keep.rb",
                    "README", "LICENSE-2.0.txt"]
-  s.required_ruby_version = '>= 2.1.0'
+  s.required_ruby_version = '>= 1.8.7'
   # activesupport <4.2.6 only because https://dev.arvados.org/issues/8222
-  s.add_dependency('activesupport', '>= 3.2.13', '< 4.2.6')
+  s.add_dependency('activesupport', '>= 3', '< 4.2.6')
   s.add_dependency('andand', '~> 1.3', '>= 1.3.3')
-  s.add_dependency('google-api-client', '~> 0.6.3', '>= 0.6.3')
+  s.add_dependency('google-api-client', '>= 0.7', '< 0.9')
+  # work around undeclared dependency on i18n in some activesupport 3.x.x:
+  s.add_dependency('i18n', '~> 0')
   s.add_dependency('json', '~> 1.7', '>= 1.7.7')
-  s.add_runtime_dependency('jwt', '>= 0.1.5', '< 1.0.0')
+  s.add_runtime_dependency('jwt', '<2', '>= 0.1.5')
   s.homepage    =
     'https://arvados.org'
 end
index 753c518b3191ebbfefbd4407ca67c2f9b83daa45..7a3f4b4226210646cbdd098d5594ff354f42778e 100644 (file)
@@ -209,7 +209,7 @@ class Arvados
                 :parameters => parameters,
                 :body_object => body,
                 :headers => {
-                  authorization: 'OAuth2 '+arvados.config['ARVADOS_API_TOKEN']
+                  :authorization => 'OAuth2 '+arvados.config['ARVADOS_API_TOKEN']
                 })
       resp = JSON.parse result.body, :symbolize_names => true
       if resp[:errors]
@@ -217,7 +217,7 @@ class Arvados
       elsif resp[:uuid] and resp[:etag]
         self.new(resp)
       elsif resp[:items].is_a? Array
-        resp.merge(items: resp[:items].collect do |i|
+        resp.merge(:items => resp[:items].collect do |i|
                      self.new(i)
                    end)
       else
index 07b751908f7da26b93fd5321fe8a5c192872a8d6..474241dc41832e657b53d4d55b5746e479e79075 100644 (file)
@@ -44,7 +44,7 @@ module Arv
     end
 
     def cp_r(source, target, source_collection=nil)
-      opts = {descend_target: !source.end_with?("/")}
+      opts = {:descend_target => !source.end_with?("/")}
       copy(:merge, source.chomp("/"), target, source_collection, opts)
     end
 
@@ -70,7 +70,7 @@ module Arv
     end
 
     def rm_r(source)
-      remove(source, recursive: true)
+      remove(source, :recursive => true)
     end
 
     protected
@@ -155,7 +155,7 @@ module Arv
       modified
     end
 
-    LocatorSegment = Struct.new(:locators, :start_pos, :length)
+    Struct.new("LocatorSegment", :locators, :start_pos, :length)
 
     class LocatorRange < Range
       attr_reader :locator
@@ -187,9 +187,9 @@ module Arv
           end_index = search_for_byte(start_pos + length - 1, start_index)
         end
         seg_ranges = @ranges[start_index..end_index]
-        LocatorSegment.new(seg_ranges.map(&:locator),
-                           start_pos - seg_ranges.first.begin,
-                           length)
+        Struct::LocatorSegment.new(seg_ranges.map(&:locator),
+                                   start_pos - seg_ranges.first.begin,
+                                   length)
       end
 
       private
index 3c6b26b765f59c4938465aaa7dcc589187fa7722..489eeeeebb7e11a3ea3dda78375b33809fdf97a6 100644 (file)
@@ -47,19 +47,19 @@ module Keep
         raise ArgumentError.new "locator is nil or empty"
       end
 
-      m = LOCATOR_REGEXP.match(tok.strip)
+      m = LOCATOR_REGEXP.match(tok)
       unless m
         raise ArgumentError.new "not a valid locator #{tok}"
       end
 
-      tokhash, _, toksize, _, trailer = m[1..5]
+      tokhash, _, toksize, _, _, trailer = m[1..6]
       tokhints = []
       if trailer
         trailer.split('+').each do |hint|
-          if hint =~ /^[[:upper:]][[:alnum:]@_-]+$/
+          if hint =~ /^[[:upper:]][[:alnum:]@_-]*$/
             tokhints.push(hint)
           else
-            raise ArgumentError.new "unknown hint #{hint}"
+            raise ArgumentError.new "invalid hint #{hint}"
           end
         end
       end
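Loosening the hint pattern from + to * after the initial uppercase letter is
what lets single-character hints such as "+Z" parse. A rough Python rendering
of the same pattern, for illustration only (the SDK code is the Ruby above):

    import re

    # One uppercase letter, then zero or more of [A-Za-z0-9@_-]; the old
    # pattern required at least one trailing character, rejecting "Z".
    HINT = re.compile(r'^[A-Z][A-Za-z0-9@_-]*$')

    for hint in ('Z', 'Fizz', 'Ad41d8cd98f00b204e9800998ecf8427e00000000', 'bad'):
        print(hint, bool(HINT.match(hint)))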
index 5ed9cfc2b186b6e2122d41a879603054962bd0d3..fa1dc3f2e83da930e84e2f468f1b083f91cd965d 100644 (file)
@@ -266,6 +266,8 @@ class ManifestTest < Minitest::Test
    [true, 'd41d8cd98f00b204e9800998ecf8427e+0', '+0','0',nil],
    [true, 'd41d8cd98f00b204e9800998ecf8427e+0+Fizz+Buzz','+0','0','+Fizz+Buzz'],
    [true, 'd41d8cd98f00b204e9800998ecf8427e+Fizz+Buzz', nil,nil,'+Fizz+Buzz'],
+   [true, 'd41d8cd98f00b204e9800998ecf8427e+0+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo', '+0','0','+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo'],
+   [true, 'd41d8cd98f00b204e9800998ecf8427e+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo', nil,nil,'+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo'],
    [true, 'd41d8cd98f00b204e9800998ecf8427e+0+Z', '+0','0','+Z'],
    [true, 'd41d8cd98f00b204e9800998ecf8427e+Z', nil,nil,'+Z'],
   ].each do |ok, locator, match2, match3, match4|
@@ -278,6 +280,18 @@ class ManifestTest < Minitest::Test
         assert_equal match4, match[4]
       end
     end
+    define_method "test_parse_method_on_#{locator.inspect}" do
+      loc = Keep::Locator.parse locator
+      if !ok
+        assert_nil loc
+      else
+        refute_nil loc
+        assert loc.is_a?(Keep::Locator)
+        #assert loc.hash
+        #assert loc.size
+        #assert loc.hints.is_a?(Array)
+      end
+    end
   end
 
   [
@@ -301,6 +315,7 @@ class ManifestTest < Minitest::Test
     [true, ". d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\040\n"],
     [true, ". 00000000000000000000000000000000+0 0:0:0\n"],
     [true, ". 00000000000000000000000000000000+0 0:0:d41d8cd98f00b204e9800998ecf8427e+0+Ad41d8cd98f00b204e9800998ecf8427e00000000@ffffffff\n"],
+    [true, ". d41d8cd98f00b204e9800998ecf8427e+0+Ad41d8cd98f00b204e9800998ecf8427e00000000@ffffffff 0:0:empty.txt\n"],
     [false, '. d41d8cd98f00b204e9800998ecf8427e 0:0:abc.txt',
       "Invalid manifest: does not end with newline"],
     [false, "abc d41d8cd98f00b204e9800998ecf8427e 0:0:abc.txt\n",
index 659b3c0ede524a31af3ada93369fcc6cab808e2a..c826a21e74e08405ab6bca75d02716cdad1fb079 100644 (file)
@@ -524,7 +524,7 @@ func (s *TestSuite) TestFullRunStderr(c *C) {
                t.finish <- dockerclient.WaitResult{ExitCode: 1}
        })
 
-       c.Check(api.Calls, Equals, 8)
+       c.Assert(api.Calls, Equals, 8)
        c.Check(api.Content[7]["container"].(arvadosclient.Dict)["log"], NotNil)
        c.Check(api.Content[7]["container"].(arvadosclient.Dict)["exit_code"], Equals, 1)
        c.Check(api.Content[7]["container"].(arvadosclient.Dict)["state"], Equals, "Complete")
index fa48849626a5d251c57ba1584b78855d1c73d0a6..e534e3273747372ce0f9ba19d7b08e9a21b3b7a8 100644 (file)
@@ -1142,7 +1142,6 @@ class TokenExpiryTest(MountTestBase):
 
     @mock.patch('arvados.keep.KeepClient.get')
     def runTest(self, mocked_get):
-        logging.getLogger('arvados.arvados_fuse').setLevel(logging.DEBUG)
         self.api._rootDesc = {"blobSignatureTtl": 2}
         mnt = self.make_mount(fuse.CollectionDirectory, collection_record='zzzzz-4zz18-op4e2lbej01tcvu')
         mocked_get.return_value = 'fake data'
index e1b23621af8f70aa214e43639140ca23ed2784c4..6f5f66ae0ef1bf57979f04189fe4d110818b1bd6 100644 (file)
@@ -320,6 +320,12 @@ func (h *handler) ServeHTTP(wOrig http.ResponseWriter, r *http.Request) {
                statusCode, statusText = http.StatusInternalServerError, err.Error()
                return
        }
+       if kc.Client != nil && kc.Client.Transport != nil {
+               // Workaround for https://dev.arvados.org/issues/9005
+               if t, ok := kc.Client.Transport.(*http.Transport); ok {
+                       defer t.CloseIdleConnections()
+               }
+       }
        rdr, err := kc.CollectionFileReader(collection, filename)
        if os.IsNotExist(err) {
                statusCode = http.StatusNotFound
index 552ed01b728d9e1e7bc11df7940eee2970ab01ff..412a5d7547313505e11da861172add90ef5292e1 100644 (file)
@@ -14,6 +14,7 @@ from .. import \
     arvados_node_missing, RetryMixin
 from ...clientactor import _notify_subscribers
 from ... import config
+from .transitions import transitions
 
 class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
     """Base class for actors that change a compute node's state.
@@ -208,17 +209,6 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         self._logger.info("Shutdown cancelled: %s.", reason)
         self._finished(success_flag=False)
 
-    def _stop_if_window_closed(orig_func):
-        @functools.wraps(orig_func)
-        def stop_wrapper(self, *args, **kwargs):
-            if (self.cancellable and
-                  (self._monitor.shutdown_eligible().get() is not True)):
-                self._later.cancel_shutdown(self.WINDOW_CLOSED)
-                return None
-            else:
-                return orig_func(self, *args, **kwargs)
-        return stop_wrapper
-
     def _cancel_on_exception(orig_func):
         @functools.wraps(orig_func)
         def finish_wrapper(self, *args, **kwargs):
@@ -230,7 +220,6 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         return finish_wrapper
 
     @_cancel_on_exception
-    @_stop_if_window_closed
     @RetryMixin._retry()
     def shutdown_node(self):
         self._logger.info("Starting shutdown")
@@ -254,9 +243,6 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         self._clean_arvados_node(arvados_node, "Shut down by Node Manager")
         self._finished(success_flag=True)
 
-    # Make the decorator available to subclasses.
-    _stop_if_window_closed = staticmethod(_stop_if_window_closed)
-
 
 class ComputeNodeUpdateActor(config.actor_class):
     """Actor to dispatch one-off cloud management requests.
@@ -370,50 +356,63 @@ class ComputeNodeMonitorActor(config.actor_class):
         return result
 
     def shutdown_eligible(self):
-        """Return True if eligible for shutdown, or a string explaining why the node
-        is not eligible for shutdown."""
+        """Determine if node is candidate for shut down.
+
+        Returns a tuple of (boolean, string) where the first value is whether
+        the node is candidate for shut down, and the second value is the
+        reason for the decision.
+        """
+
+        # Collect the current states, then consult the state transition
+        # table to decide whether we should shut down.  Possible states are:
+        # crunch_worker_state = ['unpaired', 'busy', 'idle', 'down']
+        # window = ["open", "closed"]
+        # boot_grace = ["boot wait", "boot exceeded"]
+        # idle_grace = ["not idle", "idle wait", "idle exceeded"]
 
-        if not self._shutdowns.window_open():
-            return "shutdown window is not open."
         if self.arvados_node is None:
-            # Node is unpaired.
-            # If it hasn't pinged Arvados after boot_fail seconds, shut it down
-            if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
-                return "node is still booting, will be considered a failed boot at %s" % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.cloud_node_start_time + self.boot_fail_after))
-            else:
-                return True
-        missing = arvados_node_missing(self.arvados_node, self.node_stale_after)
-        if missing and self._cloud.broken(self.cloud_node):
-            # Node is paired, but Arvados says it is missing and the cloud says the node
-            # is in an error state, so shut it down.
-            return True
-        if missing is None and self._cloud.broken(self.cloud_node):
-            self._logger.info(
-                "Cloud node considered 'broken' but paired node %s last_ping_at is None, " +
-                "cannot check node_stale_after (node may be shut down and we just haven't gotten the message yet).",
-                self.arvados_node['uuid'])
-        if self.in_state('idle'):
-            return True
+            crunch_worker_state = 'unpaired'
+        elif not timestamp_fresh(arvados_node_mtime(self.arvados_node), self.node_stale_after):
+            return (False, "node state is stale")
+        elif self.arvados_node['crunch_worker_state']:
+            crunch_worker_state = self.arvados_node['crunch_worker_state']
+        else:
+            return (False, "node is paired but crunch_worker_state is '%s'" % self.arvados_node['crunch_worker_state'])
+
+        window = "open" if self._shutdowns.window_open() else "closed"
+
+        if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
+            boot_grace = "boot wait"
         else:
-            return "node is not idle."
+            boot_grace = "boot exceeded"
 
-    def resume_node(self):
-        pass
+        # API server side not implemented yet.
+        idle_grace = 'idle exceeded'
+
+        node_state = (crunch_worker_state, window, boot_grace, idle_grace)
+        t = transitions[node_state]
+        if t is not None:
+            # yes, shutdown eligible
+            return (True, "node state is %s" % (node_state,))
+        else:
+            # no, return a reason
+            return (False, "node state is %s" % (node_state,))
 
     def consider_shutdown(self):
         try:
+            eligible, reason = self.shutdown_eligible()
             next_opening = self._shutdowns.next_opening()
-            eligible = self.shutdown_eligible()
-            if eligible is True:
-                self._debug("Suggesting shutdown.")
+            if eligible:
+                self._debug("Suggesting shutdown because %s", reason)
                 _notify_subscribers(self.actor_ref.proxy(), self.subscribers)
-            elif self._shutdowns.window_open():
-                self._debug("Cannot shut down because %s", eligible)
-            elif self.last_shutdown_opening != next_opening:
-                self._debug("Shutdown window closed.  Next at %s.",
-                            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
-                self._timer.schedule(next_opening, self._later.consider_shutdown)
-                self.last_shutdown_opening = next_opening
+            else:
+                self._debug("Not eligible for shut down because %s", reason)
+
+                if self.last_shutdown_opening != next_opening:
+                    self._debug("Shutdown window closed.  Next at %s.",
+                                time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
+                    self._timer.schedule(next_opening, self._later.consider_shutdown)
+                    self.last_shutdown_opening = next_opening
         except Exception:
             self._logger.exception("Unexpected exception")
 
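A minimal sketch of the table lookup shutdown_eligible() now reduces to
(assuming the arvnodeman package is importable; the example state is invented
for illustration):

    from arvnodeman.computenode.dispatch.transitions import transitions

    # One value per axis: crunch_worker_state, window, boot_grace, idle_grace.
    node_state = ('unpaired', 'open', 'boot exceeded', 'idle exceeded')

    action = transitions[node_state]  # "START_SHUTDOWN", "START_DRAIN", or None
    eligible = action is not None     # True here: a failed boot gets shut down
    print(eligible, "node state is %s" % (node_state,))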
index 41919db07e12efe7a262c4635e9a8432febdec2f..cae87197f72f0241ef8763c637e6eecb885acc94 100644 (file)
@@ -6,9 +6,8 @@ import subprocess
 import time
 
 from . import \
-    ComputeNodeSetupActor, ComputeNodeUpdateActor
+    ComputeNodeSetupActor, ComputeNodeUpdateActor, ComputeNodeMonitorActor
 from . import ComputeNodeShutdownActor as ShutdownActorBase
-from . import ComputeNodeMonitorActor as MonitorActorBase
 from .. import RetryMixin
 
 class SlurmMixin(object):
@@ -52,43 +51,27 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
         return super(ComputeNodeShutdownActor, self).cancel_shutdown(reason)
 
     @RetryMixin._retry((subprocess.CalledProcessError,))
-    @ShutdownActorBase._stop_if_window_closed
     def issue_slurm_drain(self):
-        self._set_node_state(self._nodename, 'DRAIN', 'Reason=Node Manager shutdown')
-        self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
-        self._later.await_slurm_drain()
+        if self.cancel_reason is not None:
+            return
+        if self._nodename:
+            self._set_node_state(self._nodename, 'DRAIN', 'Reason=Node Manager shutdown')
+            self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
+            self._later.await_slurm_drain()
+        else:
+            self._later.shutdown_node()
 
     @RetryMixin._retry((subprocess.CalledProcessError,))
-    @ShutdownActorBase._stop_if_window_closed
     def await_slurm_drain(self):
+        if self.cancel_reason is not None:
+            return
         output = self._get_slurm_state(self._nodename)
-        if output in self.SLURM_END_STATES:
-            self._later.shutdown_node()
-        else:
+        if output in ("drng\n", "alloc\n", "drng*\n", "alloc*\n"):
             self._timer.schedule(time.time() + 10,
                                  self._later.await_slurm_drain)
-
-
-class ComputeNodeMonitorActor(SlurmMixin, MonitorActorBase):
-
-    def shutdown_eligible(self):
-        if self.arvados_node is not None:
-            state = self._get_slurm_state(self.arvados_node['hostname'])
-            # Automatically eligible for shutdown if it's down or failed, but
-            # not drain to avoid a race condition with resume_node().
-            if state in self.SLURM_END_STATES:
-                if state in self.SLURM_DRAIN_STATES:
-                    return "node is draining"
-                else:
-                    return True
-        return super(ComputeNodeMonitorActor, self).shutdown_eligible()
-
-    def resume_node(self):
-        try:
-            if (self.arvados_node is not None and
-                self._get_slurm_state(self.arvados_node['hostname']) in self.SLURM_DRAIN_STATES):
-                # Resume from "drng" or "drain"
-                self._set_node_state(self.arvados_node['hostname'], 'RESUME')
-        except Exception as error:
-            self._logger.warn(
-                "Exception reenabling node: %s", error, exc_info=error)
+        elif output in ("idle\n"):
+            # Not in "drng" so cancel self.
+            self.cancel_shutdown("slurm state is %s" % output.strip())
+        else:
+            # any other state.
+            self._later.shutdown_node()
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/transitions.py b/services/nodemanager/arvnodeman/computenode/dispatch/transitions.py
new file mode 100644 (file)
index 0000000..2ff3c94
--- /dev/null
@@ -0,0 +1,52 @@
+transitions = {
+ ('busy', 'closed', 'boot exceeded', 'idle exceeded'): None,
+ ('busy', 'closed', 'boot exceeded', 'idle wait'): None,
+ ('busy', 'closed', 'boot exceeded', 'not idle'): None,
+ ('busy', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('busy', 'closed', 'boot wait', 'idle wait'): None,
+ ('busy', 'closed', 'boot wait', 'not idle'): None,
+ ('busy', 'open', 'boot exceeded', 'idle exceeded'): None,
+ ('busy', 'open', 'boot exceeded', 'idle wait'): None,
+ ('busy', 'open', 'boot exceeded', 'not idle'): None,
+ ('busy', 'open', 'boot wait', 'idle exceeded'): None,
+ ('busy', 'open', 'boot wait', 'idle wait'): None,
+ ('busy', 'open', 'boot wait', 'not idle'): None,
+
+ ('down', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('down', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('down', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('down', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('down', 'closed', 'boot wait', 'idle wait'): None,
+ ('down', 'closed', 'boot wait', 'not idle'): None,
+ ('down', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('down', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('down', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('down', 'open', 'boot wait', 'idle exceeded'): "START_SHUTDOWN",
+ ('down', 'open', 'boot wait', 'idle wait'): "START_SHUTDOWN",
+ ('down', 'open', 'boot wait', 'not idle'): "START_SHUTDOWN",
+
+ ('idle', 'closed', 'boot exceeded', 'idle exceeded'): None,
+ ('idle', 'closed', 'boot exceeded', 'idle wait'): None,
+ ('idle', 'closed', 'boot exceeded', 'not idle'): None,
+ ('idle', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('idle', 'closed', 'boot wait', 'idle wait'): None,
+ ('idle', 'closed', 'boot wait', 'not idle'): None,
+ ('idle', 'open', 'boot exceeded', 'idle exceeded'): "START_DRAIN",
+ ('idle', 'open', 'boot exceeded', 'idle wait'): None,
+ ('idle', 'open', 'boot exceeded', 'not idle'): None,
+ ('idle', 'open', 'boot wait', 'idle exceeded'): "START_DRAIN",
+ ('idle', 'open', 'boot wait', 'idle wait'): None,
+ ('idle', 'open', 'boot wait', 'not idle'): None,
+
+ ('unpaired', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('unpaired', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('unpaired', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('unpaired', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('unpaired', 'closed', 'boot wait', 'idle wait'): None,
+ ('unpaired', 'closed', 'boot wait', 'not idle'): None,
+ ('unpaired', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('unpaired', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('unpaired', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('unpaired', 'open', 'boot wait', 'idle exceeded'): None,
+ ('unpaired', 'open', 'boot wait', 'idle wait'): None,
+ ('unpaired', 'open', 'boot wait', 'not idle'): None}
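One property worth noting: the table is total over its four axes, with one
entry for every combination of 4 crunch_worker_states, 2 window states,
2 boot_grace states and 3 idle_grace states (4 * 2 * 2 * 3 = 48 keys), so the
lookup in shutdown_eligible() can never raise KeyError. A quick sketch of that
check, assuming transitions is imported as in the dispatch module:

    import itertools

    axes = (('unpaired', 'busy', 'idle', 'down'),
            ('open', 'closed'),
            ('boot wait', 'boot exceeded'),
            ('not idle', 'idle wait', 'idle exceeded'))
    assert set(itertools.product(*axes)) == set(transitions)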
index a02da230d8a8b8765debcc18f6971bb1769aacb2..11204409c03cd1894ac4128f1bfc84e3a4767652 100644 (file)
@@ -151,9 +151,6 @@ class NodeManagerDaemonActor(actor_class):
     def _update_poll_time(self, poll_key):
         self.last_polls[poll_key] = time.time()
 
-    def _resume_node(self, node_record):
-        node_record.actor.resume_node()
-
     def _pair_nodes(self, node_record, arvados_node):
         self._logger.info("Cloud node %s is now paired with Arvados node %s",
                           node_record.cloud_node.name, arvados_node['uuid'])
@@ -221,15 +218,6 @@ class NodeManagerDaemonActor(actor_class):
                 if cloud_rec.actor.offer_arvados_pair(arv_node).get():
                     self._pair_nodes(cloud_rec, arv_node)
                     break
-        for rec in self.cloud_nodes.nodes.itervalues():
-            # crunch-dispatch turns all slurm states that are not either "idle"
-            # or "alloc" into "down", but in case that behavior changes, assume
-            # any state that is not "idle" or "alloc" could be a state we want
-            # to try to resume from.
-            if (rec.arvados_node is not None and
-                rec.arvados_node["info"].get("slurm_state") not in ("idle", "alloc") and
-                rec.cloud_node.id not in self.shutdowns):
-                self._resume_node(rec)
 
     def _nodes_booting(self, size):
         s = sum(1
@@ -314,17 +302,17 @@ class NodeManagerDaemonActor(actor_class):
         booting_count = self._nodes_booting(size) + self._nodes_unpaired(size)
         shutdown_count = self._size_shutdowns(size)
         busy_count = self._nodes_busy(size)
-        up_count = self._nodes_up(size) - (shutdown_count + busy_count + self._nodes_missing(size))
+        idle_count = self._nodes_up(size) - (busy_count + self._nodes_missing(size))
 
         self._logger.info("%s: wishlist %i, up %i (booting %i, idle %i, busy %i), shutting down %i", size.name,
                           self._size_wishlist(size),
-                          up_count + busy_count,
+                          idle_count + busy_count,
                           booting_count,
-                          up_count - booting_count,
+                          idle_count - booting_count,
                           busy_count,
                           shutdown_count)
 
-        wanted = self._size_wishlist(size) - up_count
+        wanted = self._size_wishlist(size) - idle_count
         if wanted > 0 and self.max_total_price and ((total_price + (size.price*wanted)) > self.max_total_price):
             can_boot = int((self.max_total_price - total_price) / size.price)
             if can_boot == 0:
@@ -335,7 +323,7 @@ class NodeManagerDaemonActor(actor_class):
             return wanted
 
     def _nodes_excess(self, size):
-        up_count = self._nodes_up(size) - self._size_shutdowns(size)
+        up_count = (self._nodes_booting(size) + self._nodes_booted(size)) - self._size_shutdowns(size)
         if size.id == self.min_cloud_size.id:
             up_count -= self.min_nodes
         return up_count - self._nodes_busy(size) - self._size_wishlist(size)
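A worked example of the revised excess-node arithmetic, using invented counts
for a size that happens to be the configured minimum size:

    # 1 booting + 5 booted, 1 already shutting down, 2 busy, wishlist of 2.
    booting, booted, shutting_down = 1, 5, 1
    busy, wishlist, min_nodes = 2, 2, 1

    up_count = (booting + booted) - shutting_down  # 5 nodes up or coming up
    up_count -= min_nodes                          # honor the min_nodes floor
    excess = up_count - busy - wishlist            # 0: nothing extra to cull
    print(excess)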
index 14cd85e414ec271f2d8d1a3a9c283ac3c724fe7c..8def8535cf488e7deb028c6f01d0f9dc45ba3ffa 100644 (file)
@@ -205,7 +205,9 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
         cloud_node = testutil.cloud_node_mock(61)
         arv_node = testutil.arvados_node_mock(61)
         self.make_mocks(cloud_node, arv_node, shutdown_open=False)
+        self.cloud_client.destroy_node.return_value = False
         self.make_actor(cancellable=True)
+        self.shutdown_actor.cancel_shutdown("test")
         self.check_success_flag(False, 2)
         self.assertFalse(self.arvados_client.nodes().update.called)
 
@@ -219,14 +221,6 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
         self.check_success_flag(True)
         self.assertTrue(self.cloud_client.destroy_node.called)
 
-    def test_shutdown_cancelled_when_window_closes(self):
-        self.make_mocks(shutdown_open=False)
-        self.make_actor()
-        self.check_success_flag(False, 2)
-        self.assertFalse(self.cloud_client.destroy_node.called)
-        self.assertEqual(self.ACTOR_CLASS.WINDOW_CLOSED,
-                         self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
-
     def test_shutdown_retries_when_cloud_fails(self):
         self.make_mocks()
         self.cloud_client.destroy_node.return_value = False
@@ -356,28 +350,29 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_no_shutdown_booting(self):
         self.make_actor()
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is still booting"))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+                          (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
 
     def test_shutdown_without_arvados_node(self):
         self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
-        self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('unpaired', 'open', 'boot exceeded', 'idle exceeded')"))
 
-    def test_no_shutdown_missing(self):
+    def test_shutdown_missing(self):
         arv_node = testutil.arvados_node_mock(10, job_uuid=None,
                                               crunch_worker_state="down",
                                               last_ping_at='1970-01-01T01:02:03.04050607Z')
         self.make_actor(10, arv_node)
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
 
-    def test_no_shutdown_running_broken(self):
+    def test_shutdown_running_broken(self):
         arv_node = testutil.arvados_node_mock(12, job_uuid=None,
                                               crunch_worker_state="down")
         self.make_actor(12, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
 
     def test_shutdown_missing_broken(self):
         arv_node = testutil.arvados_node_mock(11, job_uuid=None,
@@ -386,27 +381,31 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(11, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
 
     def test_no_shutdown_when_window_closed(self):
         self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("shutdown window is not open."))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+                          (False, "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')"))
 
     def test_no_shutdown_when_node_running_job(self):
         self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+                          (False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"))
 
     def test_no_shutdown_when_node_state_unknown(self):
         self.make_actor(5, testutil.arvados_node_mock(
             5, crunch_worker_state=None))
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+                          (False, "node is paired but crunch_worker_state is 'None'"))
 
     def test_no_shutdown_when_node_state_stale(self):
         self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+                          (False, "node state is stale"))
 
     def test_arvados_node_match(self):
         self.make_actor(2)
index 135b817d91b725d26f712a8f2b1f5a5bb1f93144..85a40ceeb25964e15e3696141b373d9ef87cee4d 100644 (file)
@@ -41,11 +41,11 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
             self.check_success_after_reset(proc_mock, end_state)
         return test
 
-    for wait_state in ['alloc\n', 'drng\n', 'idle*\n']:
+    for wait_state in ['alloc\n', 'drng\n']:
         locals()['test_wait_while_' + wait_state.strip()
                  ] = make_wait_state_test(start_state=wait_state)
 
-    for end_state in ['down\n', 'down*\n', 'drain\n', 'fail\n']:
+    for end_state in ['idle*\n', 'down\n', 'down*\n', 'drain\n', 'fail\n']:
         locals()['test_wait_until_' + end_state.strip()
                  ] = make_wait_state_test(end_state=end_state)
 
@@ -55,27 +55,30 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
 
     def test_slurm_bypassed_when_no_arvados_node(self, proc_mock):
         # Test we correctly handle a node that failed to bootstrap.
-        proc_mock.return_value = 'idle\n'
+        proc_mock.return_value = 'down\n'
         self.make_actor(start_time=0)
         self.check_success_flag(True)
         self.assertFalse(proc_mock.called)
 
-    def test_node_undrained_when_shutdown_window_closes(self, proc_mock):
-        proc_mock.side_effect = iter(['drng\n', 'idle\n'])
-        self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
-        self.make_actor()
-        self.check_success_flag(False, 2)
-        self.check_slurm_got_args(proc_mock, 'NodeName=compute99', 'State=RESUME')
-
-    def test_alloc_node_undrained_when_shutdown_window_closes(self, proc_mock):
-        proc_mock.side_effect = iter(['alloc\n'])
-        self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
-        self.make_actor()
-        self.check_success_flag(False, 2)
-        self.check_slurm_got_args(proc_mock, 'sinfo', '--noheader', '-o', '%t', '-n', 'compute99')
+    def test_node_undrained_when_shutdown_cancelled(self, proc_mock):
+        try:
+            proc_mock.side_effect = iter(['', 'drng\n', 'drng\n', ''])
+            self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
+            self.timer = testutil.MockTimer(False)
+            self.make_actor()
+            self.busywait(lambda: proc_mock.call_args is not None)
+            self.shutdown_actor.cancel_shutdown("test").get(self.TIMEOUT)
+            self.check_success_flag(False, 2)
+            self.assertEqual(proc_mock.call_args_list,
+                             [mock.call(['scontrol', 'update', 'NodeName=compute99', 'State=DRAIN', 'Reason=Node Manager shutdown']),
+                              mock.call(['sinfo', '--noheader', '-o', '%t', '-n', 'compute99']),
+                              mock.call(['sinfo', '--noheader', '-o', '%t', '-n', 'compute99']),
+                              mock.call(['scontrol', 'update', 'NodeName=compute99', 'State=RESUME'])])
+        finally:
+            self.shutdown_actor.actor_ref.stop()
 
     def test_cancel_shutdown_retry(self, proc_mock):
-        proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n'])
+        proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n', 'idle\n'])
         self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
         self.make_actor()
         self.check_success_flag(False, 2)
@@ -88,66 +91,3 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
         proc_mock.return_value = 'drain\n'
         super(SLURMComputeNodeShutdownActorTestCase,
               self).test_arvados_node_cleaned_after_shutdown()
-
-class SLURMComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
-                                      unittest.TestCase):
-
-    def make_mocks(self, node_num):
-        self.shutdowns = testutil.MockShutdownTimer()
-        self.shutdowns._set_state(False, 300)
-        self.timer = mock.MagicMock(name='timer_mock')
-        self.updates = mock.MagicMock(name='update_mock')
-        self.cloud_mock = testutil.cloud_node_mock(node_num)
-        self.subscriber = mock.Mock(name='subscriber_mock')
-        self.cloud_client = mock.MagicMock(name='cloud_client')
-        self.cloud_client.broken.return_value = False
-
-    def make_actor(self, node_num=1, arv_node=None, start_time=None):
-        if not hasattr(self, 'cloud_mock'):
-            self.make_mocks(node_num)
-        if start_time is None:
-            start_time = time.time()
-        self.node_actor = slurm_dispatch.ComputeNodeMonitorActor.start(
-            self.cloud_mock, start_time, self.shutdowns,
-            testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
-            arv_node, boot_fail_after=300).proxy()
-        self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
-
-    @mock.patch("subprocess.check_output")
-    def test_resume_node(self, check_output):
-        arv_node = testutil.arvados_node_mock()
-        self.make_actor(arv_node=arv_node)
-        check_output.return_value = "drain\n"
-        self.node_actor.resume_node().get(self.TIMEOUT)
-        self.assertIn(mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]), check_output.call_args_list)
-        self.assertIn(mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME']), check_output.call_args_list)
-
-    @mock.patch("subprocess.check_output")
-    def test_no_resume_idle_node(self, check_output):
-        arv_node = testutil.arvados_node_mock()
-        self.make_actor(arv_node=arv_node)
-        check_output.return_value = "idle\n"
-        self.node_actor.resume_node().get(self.TIMEOUT)
-        self.assertIn(mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]), check_output.call_args_list)
-        self.assertNotIn(mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME']), check_output.call_args_list)
-
-    @mock.patch("subprocess.check_output")
-    def test_resume_node_exception(self, check_output):
-        arv_node = testutil.arvados_node_mock()
-        self.make_actor(arv_node=arv_node)
-        check_output.side_effect = Exception()
-        self.node_actor.resume_node().get(self.TIMEOUT)
-        self.assertIn(mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]), check_output.call_args_list)
-        self.assertNotIn(mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME']), check_output.call_args_list)
-
-    @mock.patch("subprocess.check_output")
-    def test_shutdown_down_node(self, check_output):
-        check_output.return_value = "down\n"
-        self.make_actor(arv_node=testutil.arvados_node_mock())
-        self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-
-    @mock.patch("subprocess.check_output")
-    def test_no_shutdown_drain_node(self, check_output):
-        check_output.return_value = "drain\n"
-        self.make_actor(arv_node=testutil.arvados_node_mock())
-        self.assertEquals('node is draining', self.node_actor.shutdown_eligible().get(self.TIMEOUT))
index 00e05a147ffaf58ea46d345b4b3f4fd84a56a333..d52cdae65172f95061798bddcfc4ac8b2ec12d5a 100644 (file)
@@ -233,6 +233,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         arv_node = testutil.arvados_node_mock(2, job_uuid=True)
         self.make_daemon([testutil.cloud_node_mock(2, size=size)], [arv_node],
                          [size], avail_sizes=[(size, {"cores":1})])
+        self.busywait(lambda: self.node_setup.start.called)
         self.stop_proxy(self.daemon)
         self.assertTrue(self.node_setup.start.called)
 
@@ -522,7 +523,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
     def test_nodes_shutting_down_replaced_below_max_nodes(self):
         size = testutil.MockSize(6)
         cloud_node = testutil.cloud_node_mock(6, size=size)
-        self.make_daemon([cloud_node], [testutil.arvados_node_mock(6)],
+        self.make_daemon([cloud_node], [testutil.arvados_node_mock(6, crunch_worker_state='down')],
                          avail_sizes=[(size, {"cores":1})])
         self.assertEqual(1, self.alive_monitor_count())
         monitor = self.monitor_list()[0].proxy()
@@ -602,13 +603,6 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.stop_proxy(self.daemon)
         self.assertEqual(1, self.last_shutdown.stop.call_count)
 
-    def busywait(self, f):
-        n = 0
-        while not f() and n < 10:
-            time.sleep(.1)
-            n += 1
-        self.assertTrue(f())
-
     def test_node_create_two_sizes(self):
         small = testutil.MockSize(1)
         big = testutil.MockSize(2)
@@ -718,25 +712,3 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         # test for that.
         self.assertEqual(2, sizecounts[small.id])
         self.assertEqual(1, sizecounts[big.id])
-
-    @mock.patch("arvnodeman.daemon.NodeManagerDaemonActor._resume_node")
-    def test_resume_drained_nodes(self, resume_node):
-        cloud_node = testutil.cloud_node_mock(1)
-        arv_node = testutil.arvados_node_mock(1, info={"ec2_instance_id": "1", "slurm_state": "down"})
-        self.make_daemon([cloud_node], [arv_node])
-        resume_node.assert_called_with(self.daemon.cloud_nodes.get(self.TIMEOUT).nodes.values()[0])
-        self.stop_proxy(self.daemon)
-
-    @mock.patch("arvnodeman.daemon.NodeManagerDaemonActor._resume_node")
-    def test_no_resume_shutdown_nodes(self, resume_node):
-        cloud_node = testutil.cloud_node_mock(1)
-        arv_node = testutil.arvados_node_mock(1, info={"ec2_instance_id": "1", "slurm_state": "down"})
-
-        self.make_daemon([cloud_node], [])
-
-        self.node_shutdown = mock.MagicMock(name='shutdown_mock')
-        self.daemon.shutdowns.get(self.TIMEOUT)[cloud_node.id] = self.node_shutdown
-
-        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        resume_node.assert_not_called()
index a17f0882e65366529882bdc99e4b3ea01034fcd1..1b6aab3cafed16cfc0960d1a39a32d669fe53ffb 100644 (file)
@@ -119,6 +119,13 @@ class ActorTestMixin(object):
             if result is not unassigned:
                 return result
 
+    def busywait(self, f):
+        n = 0
+        while not f() and n < 10:
+            time.sleep(.1)
+            n += 1
+        self.assertTrue(f())
+
 
 class DriverTestMixin(object):
     def setUp(self):
index 2e46886681ac77b2edde0fd8694734a3cc209020..991ccec674e1bd65350e3e59ecbf79ee8eebd8c1 100755 (executable)
@@ -130,7 +130,8 @@ run() {
               --publish=25100:25100
               --publish=25107:25107
               --publish=25108:25108
-              --publish=8001:8001"
+              --publish=8001:8001
+              --publish=8002:8002"
     else
         PUBLIC=""
     fi
index 280ac6854e1540f8ff82905a29d5572953731d50..4c9a4ef318bb7c854119be26a520e268e52c453e 100644 (file)
@@ -35,7 +35,7 @@ ADD fuse.conf /etc/
 ADD crunch-setup.sh gitolite.rc \
     keep-setup.sh common.sh createusers.sh \
     logger runsu.sh waitforpostgres.sh \
-    application_yml_override.py \
+    application_yml_override.py api-setup.sh \
     /usr/local/lib/arvbox/
 
 # Start the supervisor.
diff --git a/tools/arvbox/lib/arvbox/docker/api-setup.sh b/tools/arvbox/lib/arvbox/docker/api-setup.sh
new file mode 100755 (executable)
index 0000000..67c43b4
--- /dev/null
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+exec 2>&1
+set -ex -o pipefail
+
+. /usr/local/lib/arvbox/common.sh
+
+cd /usr/src/arvados/services/api
+export RAILS_ENV=development
+
+set -u
+
+if ! test -s /var/lib/arvados/api_uuid_prefix ; then
+    ruby -e 'puts "#{rand(2**64).to_s(36)[0,5]}"' > /var/lib/arvados/api_uuid_prefix
+fi
+uuid_prefix=$(cat /var/lib/arvados/api_uuid_prefix)
+
+if ! test -s /var/lib/arvados/api_secret_token ; then
+    ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/api_secret_token
+fi
+secret_token=$(cat /var/lib/arvados/api_secret_token)
+
+if ! test -s /var/lib/arvados/blob_signing_key ; then
+    ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/blob_signing_key
+fi
+blob_signing_key=$(cat /var/lib/arvados/blob_signing_key)
+
+# self signed key will be created by SSO server script.
+test -s /var/lib/arvados/self-signed.key
+
+sso_app_secret=$(cat /var/lib/arvados/sso_app_secret)
+
+if test -s /var/lib/arvados/vm-uuid ; then
+    vm_uuid=$(cat /var/lib/arvados/vm-uuid)
+else
+    vm_uuid=$uuid_prefix-2x53u-$(ruby -e 'puts rand(2**400).to_s(36)[0,15]')
+    echo $vm_uuid > /var/lib/arvados/vm-uuid
+fi
+
+cat >config/application.yml <<EOF
+development:
+  uuid_prefix: $uuid_prefix
+  secret_token: $secret_token
+  blob_signing_key: $blob_signing_key
+  sso_app_secret: $sso_app_secret
+  sso_app_id: arvados-server
+  sso_provider_url: "https://$localip:${services[sso]}"
+  sso_insecure: true
+  workbench_address: "http://$localip/"
+  websocket_address: "ws://$localip:${services[websockets]}/websocket"
+  git_repo_ssh_base: "git@$localip:"
+  git_repo_https_base: "http://$localip:${services[arv-git-httpd]}/"
+  new_users_are_active: true
+  auto_admin_first_user: true
+  auto_setup_new_users: true
+  auto_setup_new_users_with_vm_uuid: $vm_uuid
+  auto_setup_new_users_with_repository: true
+  default_collection_replication: 1
+EOF
+
+(cd config && /usr/local/lib/arvbox/application_yml_override.py)
+
+if ! test -f /var/lib/arvados/api_database_pw ; then
+    ruby -e 'puts rand(2**128).to_s(36)' > /var/lib/arvados/api_database_pw
+fi
+database_pw=$(cat /var/lib/arvados/api_database_pw)
+
+if ! (psql postgres -c "\du" | grep "^ arvados ") >/dev/null ; then
+    psql postgres -c "create user arvados with password '$database_pw'"
+    psql postgres -c "ALTER USER arvados CREATEDB;"
+fi
+
+sed "s/password:.*/password: $database_pw/" <config/database.yml.example >config/database.yml
+
+if ! test -f /var/lib/arvados/api_database_setup ; then
+   bundle exec rake db:setup
+   touch /var/lib/arvados/api_database_setup
+fi
+
+if ! test -s /var/lib/arvados/superuser_token ; then
+    bundle exec ./script/create_superuser_token.rb > /var/lib/arvados/superuser_token
+fi
+
+rm -rf tmp
+
+bundle exec rake db:migrate
index 0e50a06ef59b4de5c3e18516bd63b3ed8af27f32..3733fa2ecb187b7a1daa5ce7851546faf5856603 100644 (file)
@@ -21,6 +21,7 @@ services=(
   [keepstore1]=25108
   [ssh]=22
   [doc]=8001
+  [websockets]=8002
 )
 
 if test "$(id arvbox -u 2>/dev/null)" = 0 ; then
index 058939c477723d703960b19ccb4331641b1d56eb..a36205c9678d4e67063bbec141072df1a737cea9 100755 (executable)
@@ -15,88 +15,14 @@ if test "$1" = "--only-deps" ; then
     exit
 fi
 
-set -u
-
-if ! test -s /var/lib/arvados/api_uuid_prefix ; then
-    ruby -e 'puts "#{rand(2**64).to_s(36)[0,5]}"' > /var/lib/arvados/api_uuid_prefix
-fi
-uuid_prefix=$(cat /var/lib/arvados/api_uuid_prefix)
-
-if ! test -s /var/lib/arvados/api_secret_token ; then
-    ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/api_secret_token
-fi
-secret_token=$(cat /var/lib/arvados/api_secret_token)
-
-if ! test -s /var/lib/arvados/blob_signing_key ; then
-    ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/blob_signing_key
-fi
-blob_signing_key=$(cat /var/lib/arvados/blob_signing_key)
-
-# self signed key will be created by SSO server script.
-test -s /var/lib/arvados/self-signed.key
-
-sso_app_secret=$(cat /var/lib/arvados/sso_app_secret)
-
-if test -s /var/lib/arvados/vm-uuid ; then
-    vm_uuid=$(cat /var/lib/arvados/vm-uuid)
-else
-    vm_uuid=$uuid_prefix-2x53u-$(ruby -e 'puts rand(2**400).to_s(36)[0,15]')
-    echo $vm_uuid > /var/lib/arvados/vm-uuid
-fi
-
-cat >config/application.yml <<EOF
-development:
-  uuid_prefix: $uuid_prefix
-  secret_token: $secret_token
-  blob_signing_key: $blob_signing_key
-  sso_app_secret: $sso_app_secret
-  sso_app_id: arvados-server
-  sso_provider_url: "https://$localip:${services[sso]}"
-  sso_insecure: true
-  workbench_address: "http://$localip/"
-  git_repo_ssh_base: "git@$localip:"
-  git_repo_https_base: "http://$localip:${services[arv-git-httpd]}/"
-  new_users_are_active: true
-  auto_admin_first_user: true
-  auto_setup_new_users: true
-  auto_setup_new_users_with_vm_uuid: $vm_uuid
-  auto_setup_new_users_with_repository: true
-  default_collection_replication: 1
-EOF
-
-(cd config && /usr/local/lib/arvbox/application_yml_override.py)
-
-if ! test -f /var/lib/arvados/api_database_pw ; then
-    ruby -e 'puts rand(2**128).to_s(36)' > /var/lib/arvados/api_database_pw
-fi
-database_pw=$(cat /var/lib/arvados/api_database_pw)
-
-if ! (psql postgres -c "\du" | grep "^ arvados ") >/dev/null ; then
-    psql postgres -c "create user arvados with password '$database_pw'"
-    psql postgres -c "ALTER USER arvados CREATEDB;"
-fi
-
-sed "s/password:.*/password: $database_pw/" <config/database.yml.example >config/database.yml
-
-if ! test -f /var/lib/arvados/api_database_setup ; then
-   bundle exec rake db:setup
-   touch /var/lib/arvados/api_database_setup
-fi
-
-if ! test -s /var/lib/arvados/superuser_token ; then
-    bundle exec ./script/create_superuser_token.rb > /var/lib/arvados/superuser_token
-fi
-
-rm -rf tmp
-
-bundle exec rake db:migrate
+flock /var/lib/arvados/api.lock /usr/local/lib/arvbox/api-setup.sh
 
 set +u
 if test "$1" = "--only-setup" ; then
     exit
 fi
 
-ARVADOS_WEBSOCKETS=1 exec bundle exec passenger start --port=${services[api]} \
+exec bundle exec passenger start --port=${services[api]} \
                   --runtime-dir=/var/lib/passenger \
                   --ssl --ssl-certificate=/var/lib/arvados/self-signed.pem \
                   --ssl-certificate-key=/var/lib/arvados/self-signed.key
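This run script and the new websockets run-service below both funnel Rails
setup through flock(1) on /var/lib/arvados/api.lock, so whichever service
starts second blocks until the first has finished api-setup.sh. The same
serialization pattern, sketched in Python with fcntl (the lock path and setup
body are stand-ins):

    import fcntl

    def run_setup():
        # stand-in for /usr/local/lib/arvbox/api-setup.sh
        print("running one-time API setup")

    with open('/tmp/api.lock', 'w') as lockfile:
        fcntl.flock(lockfile, fcntl.LOCK_EX)  # blocks while another holder has it
        run_setup()
    # the lock is released when the file is closed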
diff --git a/tools/arvbox/lib/arvbox/docker/service/websockets/log/main/.gitstub b/tools/arvbox/lib/arvbox/docker/service/websockets/log/main/.gitstub
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/tools/arvbox/lib/arvbox/docker/service/websockets/log/run b/tools/arvbox/lib/arvbox/docker/service/websockets/log/run
new file mode 120000 (symlink)
index 0000000..d6aef4a
--- /dev/null
@@ -0,0 +1 @@
+/usr/local/lib/arvbox/logger
\ No newline at end of file
diff --git a/tools/arvbox/lib/arvbox/docker/service/websockets/run b/tools/arvbox/lib/arvbox/docker/service/websockets/run
new file mode 120000 (symlink)
index 0000000..a388c8b
--- /dev/null
@@ -0,0 +1 @@
+/usr/local/lib/arvbox/runsu.sh
\ No newline at end of file
diff --git a/tools/arvbox/lib/arvbox/docker/service/websockets/run-service b/tools/arvbox/lib/arvbox/docker/service/websockets/run-service
new file mode 100755 (executable)
index 0000000..d0c0b5d
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+exec 2>&1
+set -ex -o pipefail
+
+. /usr/local/lib/arvbox/common.sh
+
+cd /usr/src/arvados/services/api
+export RAILS_ENV=development
+
+run_bundler --without=development
+
+if test "$1" = "--only-deps" ; then
+    exit
+fi
+
+flock /var/lib/arvados/api.lock /usr/local/lib/arvbox/api-setup.sh
+
+set +u
+if test "$1" = "--only-setup" ; then
+    exit
+fi
+
+export ARVADOS_WEBSOCKETS=ws-only
+
+# serving ssl directly doesn't work, gets
+# Rack app error: #<TypeError: no implicit conversion of Puma::MiniSSL::Socket into Integer>
+#exec bundle exec puma -b "ssl://0.0.0.0:${services[websockets]}?cert=/var/lib/arvados/self-signed.pem&key=/var/lib/arvados/self-signed.key"
+
+exec bundle exec puma -p${services[websockets]}