crunchstat
keepproxy
keep-rsync
+ keep-block-check
keepstore
keep-web
libarvados-perl"
"Gather cpu/memory/network statistics of running Crunch jobs"
package_go_binary tools/keep-rsync keep-rsync \
"Copy all data from one set of Keep servers to another"
+package_go_binary tools/keep-block-check keep-block-check \
+ "Verify that all data from one set of Keep servers to another was copied"
package_go_binary sdk/go/crunchrunner crunchrunner \
"Crunchrunner executes a command inside a container and uploads the output"
require 'trollop'
require 'google/api_client'
rescue LoadError => l
- puts $:
+ $stderr.puts $:
abort <<-EOS
#{$0}: fatal: #{l.message}
Some runtime dependencies may be missing.
abort "#{$0}: syntax error: --instance cannot be combined with --template or --submit."
end
elsif not $options[:template]
- puts "error: you must supply a --template or --instance."
+ $stderr.puts "error: you must supply a --template or --instance."
p.educate
abort
end
'bin/arvados-cwl-runner'
],
install_requires=[
- 'cwltool>=1.0.20160325200114',
+ 'cwltool==1.0.20160421140153',
'arvados-python-client>=0.1.20160322001610'
],
test_suite='tests',
}
var buf = make([]byte, fs.Offset+fs.Len)
_, err = io.ReadFull(rdr, buf)
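+ // Close the reader; report its error only if the read itself
+ // succeeded, so a read error is never masked by the Close error.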
+ errClosing := rdr.Close()
+ if err == nil {
+ err = errClosing
+ }
if err != nil {
r.err = err
close(r.errNotNil)
c.Check(err, check.NotNil)
c.Check(err, check.Not(check.Equals), io.EOF)
}
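+ // Close after a failed read should also return a non-nil error.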
+ c.Check(rdr.Close(), check.NotNil)
}
s.files = ["lib/arvados.rb", "lib/arvados/google_api_client.rb",
"lib/arvados/collection.rb", "lib/arvados/keep.rb",
"README", "LICENSE-2.0.txt"]
- s.required_ruby_version = '>= 2.1.0'
+ s.required_ruby_version = '>= 1.8.7'
# activesupport <4.2.6 only because https://dev.arvados.org/issues/8222
- s.add_dependency('activesupport', '>= 3.2.13', '< 4.2.6')
+ s.add_dependency('activesupport', '>= 3', '< 4.2.6')
s.add_dependency('andand', '~> 1.3', '>= 1.3.3')
- s.add_dependency('google-api-client', '~> 0.6.3', '>= 0.6.3')
+ s.add_dependency('google-api-client', '>= 0.7', '< 0.9')
+ # work around undeclared dependency on i18n in some activesupport 3.x.x:
+ s.add_dependency('i18n', '~> 0')
s.add_dependency('json', '~> 1.7', '>= 1.7.7')
- s.add_runtime_dependency('jwt', '>= 0.1.5', '< 1.0.0')
+ s.add_runtime_dependency('jwt', '<2', '>= 0.1.5')
s.homepage =
'https://arvados.org'
end
:parameters => parameters,
:body_object => body,
:headers => {
- authorization: 'OAuth2 '+arvados.config['ARVADOS_API_TOKEN']
+ :authorization => 'OAuth2 '+arvados.config['ARVADOS_API_TOKEN']
})
resp = JSON.parse result.body, :symbolize_names => true
if resp[:errors]
elsif resp[:uuid] and resp[:etag]
self.new(resp)
elsif resp[:items].is_a? Array
- resp.merge(items: resp[:items].collect do |i|
+ resp.merge(:items => resp[:items].collect do |i|
self.new(i)
end)
else
end
def cp_r(source, target, source_collection=nil)
- opts = {descend_target: !source.end_with?("/")}
+ opts = {:descend_target => !source.end_with?("/")}
copy(:merge, source.chomp("/"), target, source_collection, opts)
end
end
def rm_r(source)
- remove(source, recursive: true)
+ remove(source, :recursive => true)
end
protected
modified
end
- LocatorSegment = Struct.new(:locators, :start_pos, :length)
+ Struct.new("LocatorSegment", :locators, :start_pos, :length)
class LocatorRange < Range
attr_reader :locator
end_index = search_for_byte(start_pos + length - 1, start_index)
end
seg_ranges = @ranges[start_index..end_index]
- LocatorSegment.new(seg_ranges.map(&:locator),
- start_pos - seg_ranges.first.begin,
- length)
+ Struct::LocatorSegment.new(seg_ranges.map(&:locator),
+ start_pos - seg_ranges.first.begin,
+ length)
end
private
raise ArgumentError.new "locator is nil or empty"
end
- m = LOCATOR_REGEXP.match(tok.strip)
+ m = LOCATOR_REGEXP.match(tok)
unless m
raise ArgumentError.new "not a valid locator #{tok}"
end
- tokhash, _, toksize, _, trailer = m[1..5]
+ tokhash, _, toksize, _, _, trailer = m[1..6]
tokhints = []
if trailer
trailer.split('+').each do |hint|
- if hint =~ /^[[:upper:]][[:alnum:]@_-]+$/
+ if hint =~ /^[[:upper:]][[:alnum:]@_-]*$/
tokhints.push(hint)
else
- raise ArgumentError.new "unknown hint #{hint}"
+ raise ArgumentError.new "invalid hint #{hint}"
end
end
end
[true, 'd41d8cd98f00b204e9800998ecf8427e+0', '+0','0',nil],
[true, 'd41d8cd98f00b204e9800998ecf8427e+0+Fizz+Buzz','+0','0','+Fizz+Buzz'],
[true, 'd41d8cd98f00b204e9800998ecf8427e+Fizz+Buzz', nil,nil,'+Fizz+Buzz'],
+ [true, 'd41d8cd98f00b204e9800998ecf8427e+0+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo', '+0','0','+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo'],
+ [true, 'd41d8cd98f00b204e9800998ecf8427e+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo', nil,nil,'+Ad41d8cd98f00b204e9800998ecf8427e00000000+Foo'],
[true, 'd41d8cd98f00b204e9800998ecf8427e+0+Z', '+0','0','+Z'],
[true, 'd41d8cd98f00b204e9800998ecf8427e+Z', nil,nil,'+Z'],
].each do |ok, locator, match2, match3, match4|
assert_equal match4, match[4]
end
end
+ define_method "test_parse_method_on_#{locator.inspect}" do
+ loc = Keep::Locator.parse locator
+ if !ok
+ assert_nil loc
+ else
+ refute_nil loc
+ assert loc.is_a?(Keep::Locator)
+ #assert loc.hash
+ #assert loc.size
+ #assert loc.hints.is_a?(Array)
+ end
+ end
end
[
[true, ". d41d8cd98f00b204e9800998ecf8427e+0 0:0:\\040\n"],
[true, ". 00000000000000000000000000000000+0 0:0:0\n"],
[true, ". 00000000000000000000000000000000+0 0:0:d41d8cd98f00b204e9800998ecf8427e+0+Ad41d8cd98f00b204e9800998ecf8427e00000000@ffffffff\n"],
+ [true, ". d41d8cd98f00b204e9800998ecf8427e+0+Ad41d8cd98f00b204e9800998ecf8427e00000000@ffffffff 0:0:empty.txt\n"],
[false, '. d41d8cd98f00b204e9800998ecf8427e 0:0:abc.txt',
"Invalid manifest: does not end with newline"],
[false, "abc d41d8cd98f00b204e9800998ecf8427e 0:0:abc.txt\n",
t.finish <- dockerclient.WaitResult{ExitCode: 1}
})
- c.Check(api.Calls, Equals, 8)
+ c.Assert(api.Calls, Equals, 8)
c.Check(api.Content[7]["container"].(arvadosclient.Dict)["log"], NotNil)
c.Check(api.Content[7]["container"].(arvadosclient.Dict)["exit_code"], Equals, 1)
c.Check(api.Content[7]["container"].(arvadosclient.Dict)["state"], Equals, "Complete")
@mock.patch('arvados.keep.KeepClient.get')
def runTest(self, mocked_get):
- logging.getLogger('arvados.arvados_fuse').setLevel(logging.DEBUG)
self.api._rootDesc = {"blobSignatureTtl": 2}
mnt = self.make_mount(fuse.CollectionDirectory, collection_record='zzzzz-4zz18-op4e2lbej01tcvu')
mocked_get.return_value = 'fake data'
statusCode, statusText = http.StatusInternalServerError, err.Error()
return
}
+ if kc.Client != nil && kc.Client.Transport != nil {
+ // Workaround for https://dev.arvados.org/issues/9005
+ if t, ok := kc.Client.Transport.(*http.Transport); ok {
+ defer t.CloseIdleConnections()
+ }
+ }
rdr, err := kc.CollectionFileReader(collection, filename)
if os.IsNotExist(err) {
statusCode = http.StatusNotFound
arvados_node_missing, RetryMixin
from ...clientactor import _notify_subscribers
from ... import config
+from .transitions import transitions
class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
"""Base class for actors that change a compute node's state.
self._logger.info("Shutdown cancelled: %s.", reason)
self._finished(success_flag=False)
- def _stop_if_window_closed(orig_func):
- @functools.wraps(orig_func)
- def stop_wrapper(self, *args, **kwargs):
- if (self.cancellable and
- (self._monitor.shutdown_eligible().get() is not True)):
- self._later.cancel_shutdown(self.WINDOW_CLOSED)
- return None
- else:
- return orig_func(self, *args, **kwargs)
- return stop_wrapper
-
def _cancel_on_exception(orig_func):
@functools.wraps(orig_func)
def finish_wrapper(self, *args, **kwargs):
return finish_wrapper
@_cancel_on_exception
- @_stop_if_window_closed
@RetryMixin._retry()
def shutdown_node(self):
self._logger.info("Starting shutdown")
self._clean_arvados_node(arvados_node, "Shut down by Node Manager")
self._finished(success_flag=True)
- # Make the decorator available to subclasses.
- _stop_if_window_closed = staticmethod(_stop_if_window_closed)
-
class ComputeNodeUpdateActor(config.actor_class):
"""Actor to dispatch one-off cloud management requests.
return result
def shutdown_eligible(self):
- """Return True if eligible for shutdown, or a string explaining why the node
- is not eligible for shutdown."""
+ """Determine if node is candidate for shut down.
+
+ Returns a tuple of (boolean, string) where the first value is whether
+ the node is candidate for shut down, and the second value is the
+ reason for the decision.
+ """
+
+ # Collect the current states, then consult the state transition table
+ # to decide whether we should shut down. Possible states are:
+ # crunch_worker_state = ['unpaired', 'busy', 'idle', 'down']
+ # window = ["open", "closed"]
+ # boot_grace = ["boot wait", "boot exceeded"]
+ # idle_grace = ["not idle", "idle wait", "idle exceeded"]
- if not self._shutdowns.window_open():
- return "shutdown window is not open."
if self.arvados_node is None:
- # Node is unpaired.
- # If it hasn't pinged Arvados after boot_fail seconds, shut it down
- if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
- return "node is still booting, will be considered a failed boot at %s" % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.cloud_node_start_time + self.boot_fail_after))
- else:
- return True
- missing = arvados_node_missing(self.arvados_node, self.node_stale_after)
- if missing and self._cloud.broken(self.cloud_node):
- # Node is paired, but Arvados says it is missing and the cloud says the node
- # is in an error state, so shut it down.
- return True
- if missing is None and self._cloud.broken(self.cloud_node):
- self._logger.info(
- "Cloud node considered 'broken' but paired node %s last_ping_at is None, " +
- "cannot check node_stale_after (node may be shut down and we just haven't gotten the message yet).",
- self.arvados_node['uuid'])
- if self.in_state('idle'):
- return True
+ crunch_worker_state = 'unpaired'
+ elif not timestamp_fresh(arvados_node_mtime(self.arvados_node), self.node_stale_after):
+ return (False, "node state is stale")
+ elif self.arvados_node['crunch_worker_state']:
+ crunch_worker_state = self.arvados_node['crunch_worker_state']
+ else:
+ return (False, "node is paired but crunch_worker_state is '%s'" % self.arvados_node['crunch_worker_state'])
+
+ window = "open" if self._shutdowns.window_open() else "closed"
+
+ if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
+ boot_grace = "boot wait"
else:
- return "node is not idle."
+ boot_grace = "boot exceeded"
- def resume_node(self):
- pass
+ # Not implemented on the API server side yet; assume the idle grace
+ # period has been exceeded.
+ idle_grace = 'idle exceeded'
+
+ node_state = (crunch_worker_state, window, boot_grace, idle_grace)
+ t = transitions[node_state]
+ if t is not None:
+ # yes, shutdown eligible
+ return (True, "node state is %s" % (node_state,))
+ else:
+ # no, return a reason
+ return (False, "node state is %s" % (node_state,))
def consider_shutdown(self):
try:
+ eligible, reason = self.shutdown_eligible()
next_opening = self._shutdowns.next_opening()
- eligible = self.shutdown_eligible()
- if eligible is True:
- self._debug("Suggesting shutdown.")
+ if eligible:
+ self._debug("Suggesting shutdown because %s", reason)
_notify_subscribers(self.actor_ref.proxy(), self.subscribers)
- elif self._shutdowns.window_open():
- self._debug("Cannot shut down because %s", eligible)
- elif self.last_shutdown_opening != next_opening:
- self._debug("Shutdown window closed. Next at %s.",
- time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
- self._timer.schedule(next_opening, self._later.consider_shutdown)
- self.last_shutdown_opening = next_opening
+ else:
+ self._debug("Not eligible for shut down because %s", reason)
+
+ if self.last_shutdown_opening != next_opening:
+ self._debug("Shutdown window closed. Next at %s.",
+ time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
+ self._timer.schedule(next_opening, self._later.consider_shutdown)
+ self.last_shutdown_opening = next_opening
except Exception:
self._logger.exception("Unexpected exception")
import time
from . import \
- ComputeNodeSetupActor, ComputeNodeUpdateActor
+ ComputeNodeSetupActor, ComputeNodeUpdateActor, ComputeNodeMonitorActor
from . import ComputeNodeShutdownActor as ShutdownActorBase
-from . import ComputeNodeMonitorActor as MonitorActorBase
from .. import RetryMixin
class SlurmMixin(object):
return super(ComputeNodeShutdownActor, self).cancel_shutdown(reason)
@RetryMixin._retry((subprocess.CalledProcessError,))
- @ShutdownActorBase._stop_if_window_closed
def issue_slurm_drain(self):
- self._set_node_state(self._nodename, 'DRAIN', 'Reason=Node Manager shutdown')
- self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
- self._later.await_slurm_drain()
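+ # Do nothing if this shutdown has already been cancelled.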
+ if self.cancel_reason is not None:
+ return
+ if self._nodename:
+ self._set_node_state(self._nodename, 'DRAIN', 'Reason=Node Manager shutdown')
+ self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
+ self._later.await_slurm_drain()
+ else:
+ self._later.shutdown_node()
@RetryMixin._retry((subprocess.CalledProcessError,))
- @ShutdownActorBase._stop_if_window_closed
def await_slurm_drain(self):
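+ # Stop polling slurm if this shutdown has been cancelled in the meantime.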
+ if self.cancel_reason is not None:
+ return
output = self._get_slurm_state(self._nodename)
- if output in self.SLURM_END_STATES:
- self._later.shutdown_node()
- else:
+ if output in ("drng\n", "alloc\n", "drng*\n", "alloc*\n"):
self._timer.schedule(time.time() + 10,
self._later.await_slurm_drain)
-
-
-class ComputeNodeMonitorActor(SlurmMixin, MonitorActorBase):
-
- def shutdown_eligible(self):
- if self.arvados_node is not None:
- state = self._get_slurm_state(self.arvados_node['hostname'])
- # Automatically eligible for shutdown if it's down or failed, but
- # not drain to avoid a race condition with resume_node().
- if state in self.SLURM_END_STATES:
- if state in self.SLURM_DRAIN_STATES:
- return "node is draining"
- else:
- return True
- return super(ComputeNodeMonitorActor, self).shutdown_eligible()
-
- def resume_node(self):
- try:
- if (self.arvados_node is not None and
- self._get_slurm_state(self.arvados_node['hostname']) in self.SLURM_DRAIN_STATES):
- # Resume from "drng" or "drain"
- self._set_node_state(self.arvados_node['hostname'], 'RESUME')
- except Exception as error:
- self._logger.warn(
- "Exception reenabling node: %s", error, exc_info=error)
+ elif output in ("idle\n",):
+ # Node is no longer draining; cancel our own shutdown.
+ self.cancel_shutdown("slurm state is %s" % output.strip())
+ else:
+ # Any other state: proceed with the shutdown.
+ self._later.shutdown_node()
--- /dev/null
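+# Shutdown rule table, keyed by the node state tuple
+# (crunch_worker_state, window, boot_grace, idle_grace).
+# None means the node must not be shut down in that state; any other
+# value names the shutdown action to take.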
+transitions = {
+ ('busy', 'closed', 'boot exceeded', 'idle exceeded'): None,
+ ('busy', 'closed', 'boot exceeded', 'idle wait'): None,
+ ('busy', 'closed', 'boot exceeded', 'not idle'): None,
+ ('busy', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('busy', 'closed', 'boot wait', 'idle wait'): None,
+ ('busy', 'closed', 'boot wait', 'not idle'): None,
+ ('busy', 'open', 'boot exceeded', 'idle exceeded'): None,
+ ('busy', 'open', 'boot exceeded', 'idle wait'): None,
+ ('busy', 'open', 'boot exceeded', 'not idle'): None,
+ ('busy', 'open', 'boot wait', 'idle exceeded'): None,
+ ('busy', 'open', 'boot wait', 'idle wait'): None,
+ ('busy', 'open', 'boot wait', 'not idle'): None,
+
+ ('down', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('down', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('down', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('down', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('down', 'closed', 'boot wait', 'idle wait'): None,
+ ('down', 'closed', 'boot wait', 'not idle'): None,
+ ('down', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('down', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('down', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('down', 'open', 'boot wait', 'idle exceeded'): "START_SHUTDOWN",
+ ('down', 'open', 'boot wait', 'idle wait'): "START_SHUTDOWN",
+ ('down', 'open', 'boot wait', 'not idle'): "START_SHUTDOWN",
+
+ ('idle', 'closed', 'boot exceeded', 'idle exceeded'): None,
+ ('idle', 'closed', 'boot exceeded', 'idle wait'): None,
+ ('idle', 'closed', 'boot exceeded', 'not idle'): None,
+ ('idle', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('idle', 'closed', 'boot wait', 'idle wait'): None,
+ ('idle', 'closed', 'boot wait', 'not idle'): None,
+ ('idle', 'open', 'boot exceeded', 'idle exceeded'): "START_DRAIN",
+ ('idle', 'open', 'boot exceeded', 'idle wait'): None,
+ ('idle', 'open', 'boot exceeded', 'not idle'): None,
+ ('idle', 'open', 'boot wait', 'idle exceeded'): "START_DRAIN",
+ ('idle', 'open', 'boot wait', 'idle wait'): None,
+ ('idle', 'open', 'boot wait', 'not idle'): None,
+
+ ('unpaired', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('unpaired', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('unpaired', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('unpaired', 'closed', 'boot wait', 'idle exceeded'): None,
+ ('unpaired', 'closed', 'boot wait', 'idle wait'): None,
+ ('unpaired', 'closed', 'boot wait', 'not idle'): None,
+ ('unpaired', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('unpaired', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('unpaired', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('unpaired', 'open', 'boot wait', 'idle exceeded'): None,
+ ('unpaired', 'open', 'boot wait', 'idle wait'): None,
+ ('unpaired', 'open', 'boot wait', 'not idle'): None}
def _update_poll_time(self, poll_key):
self.last_polls[poll_key] = time.time()
- def _resume_node(self, node_record):
- node_record.actor.resume_node()
-
def _pair_nodes(self, node_record, arvados_node):
self._logger.info("Cloud node %s is now paired with Arvados node %s",
node_record.cloud_node.name, arvados_node['uuid'])
if cloud_rec.actor.offer_arvados_pair(arv_node).get():
self._pair_nodes(cloud_rec, arv_node)
break
- for rec in self.cloud_nodes.nodes.itervalues():
- # crunch-dispatch turns all slurm states that are not either "idle"
- # or "alloc" into "down", but in case that behavior changes, assume
- # any state that is not "idle" or "alloc" could be a state we want
- # to try to resume from.
- if (rec.arvados_node is not None and
- rec.arvados_node["info"].get("slurm_state") not in ("idle", "alloc") and
- rec.cloud_node.id not in self.shutdowns):
- self._resume_node(rec)
def _nodes_booting(self, size):
s = sum(1
booting_count = self._nodes_booting(size) + self._nodes_unpaired(size)
shutdown_count = self._size_shutdowns(size)
busy_count = self._nodes_busy(size)
- up_count = self._nodes_up(size) - (shutdown_count + busy_count + self._nodes_missing(size))
+ idle_count = self._nodes_up(size) - (busy_count + self._nodes_missing(size))
self._logger.info("%s: wishlist %i, up %i (booting %i, idle %i, busy %i), shutting down %i", size.name,
self._size_wishlist(size),
- up_count + busy_count,
+ idle_count + busy_count,
booting_count,
- up_count - booting_count,
+ idle_count - booting_count,
busy_count,
shutdown_count)
- wanted = self._size_wishlist(size) - up_count
+ wanted = self._size_wishlist(size) - idle_count
if wanted > 0 and self.max_total_price and ((total_price + (size.price*wanted)) > self.max_total_price):
can_boot = int((self.max_total_price - total_price) / size.price)
if can_boot == 0:
return wanted
def _nodes_excess(self, size):
- up_count = self._nodes_up(size) - self._size_shutdowns(size)
+ up_count = (self._nodes_booting(size) + self._nodes_booted(size)) - self._size_shutdowns(size)
if size.id == self.min_cloud_size.id:
up_count -= self.min_nodes
return up_count - self._nodes_busy(size) - self._size_wishlist(size)
cloud_node = testutil.cloud_node_mock(61)
arv_node = testutil.arvados_node_mock(61)
self.make_mocks(cloud_node, arv_node, shutdown_open=False)
+ self.cloud_client.destroy_node.return_value = False
self.make_actor(cancellable=True)
+ self.shutdown_actor.cancel_shutdown("test")
self.check_success_flag(False, 2)
self.assertFalse(self.arvados_client.nodes().update.called)
self.check_success_flag(True)
self.assertTrue(self.cloud_client.destroy_node.called)
- def test_shutdown_cancelled_when_window_closes(self):
- self.make_mocks(shutdown_open=False)
- self.make_actor()
- self.check_success_flag(False, 2)
- self.assertFalse(self.cloud_client.destroy_node.called)
- self.assertEqual(self.ACTOR_CLASS.WINDOW_CLOSED,
- self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
-
def test_shutdown_retries_when_cloud_fails(self):
self.make_mocks()
self.cloud_client.destroy_node.return_value = False
def test_no_shutdown_booting(self):
self.make_actor()
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is still booting"))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+ (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
def test_shutdown_without_arvados_node(self):
self.make_actor(start_time=0)
self.shutdowns._set_state(True, 600)
- self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('unpaired', 'open', 'boot exceeded', 'idle exceeded')"))
- def test_no_shutdown_missing(self):
+ def test_shutdown_missing(self):
arv_node = testutil.arvados_node_mock(10, job_uuid=None,
crunch_worker_state="down",
last_ping_at='1970-01-01T01:02:03.04050607Z')
self.make_actor(10, arv_node)
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
- def test_no_shutdown_running_broken(self):
+ def test_shutdown_running_broken(self):
arv_node = testutil.arvados_node_mock(12, job_uuid=None,
crunch_worker_state="down")
self.make_actor(12, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
def test_shutdown_missing_broken(self):
arv_node = testutil.arvados_node_mock(11, job_uuid=None,
self.make_actor(11, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
- self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
def test_no_shutdown_when_window_closed(self):
self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("shutdown window is not open."))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+ (False, "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')"))
def test_no_shutdown_when_node_running_job(self):
self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+ (False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"))
def test_no_shutdown_when_node_state_unknown(self):
self.make_actor(5, testutil.arvados_node_mock(
5, crunch_worker_state=None))
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+ (False, "node is paired but crunch_worker_state is 'None'"))
def test_no_shutdown_when_node_state_stale(self):
self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+ (False, "node state is stale"))
def test_arvados_node_match(self):
self.make_actor(2)
self.check_success_after_reset(proc_mock, end_state)
return test
- for wait_state in ['alloc\n', 'drng\n', 'idle*\n']:
+ for wait_state in ['alloc\n', 'drng\n']:
locals()['test_wait_while_' + wait_state.strip()
] = make_wait_state_test(start_state=wait_state)
- for end_state in ['down\n', 'down*\n', 'drain\n', 'fail\n']:
+ for end_state in ['idle*\n', 'down\n', 'down*\n', 'drain\n', 'fail\n']:
locals()['test_wait_until_' + end_state.strip()
] = make_wait_state_test(end_state=end_state)
def test_slurm_bypassed_when_no_arvados_node(self, proc_mock):
# Test we correctly handle a node that failed to bootstrap.
- proc_mock.return_value = 'idle\n'
+ proc_mock.return_value = 'down\n'
self.make_actor(start_time=0)
self.check_success_flag(True)
self.assertFalse(proc_mock.called)
- def test_node_undrained_when_shutdown_window_closes(self, proc_mock):
- proc_mock.side_effect = iter(['drng\n', 'idle\n'])
- self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
- self.make_actor()
- self.check_success_flag(False, 2)
- self.check_slurm_got_args(proc_mock, 'NodeName=compute99', 'State=RESUME')
-
- def test_alloc_node_undrained_when_shutdown_window_closes(self, proc_mock):
- proc_mock.side_effect = iter(['alloc\n'])
- self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
- self.make_actor()
- self.check_success_flag(False, 2)
- self.check_slurm_got_args(proc_mock, 'sinfo', '--noheader', '-o', '%t', '-n', 'compute99')
+ def test_node_undrained_when_shutdown_cancelled(self, proc_mock):
+ try:
+ proc_mock.side_effect = iter(['', 'drng\n', 'drng\n', ''])
+ self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
+ self.timer = testutil.MockTimer(False)
+ self.make_actor()
+ self.busywait(lambda: proc_mock.call_args is not None)
+ self.shutdown_actor.cancel_shutdown("test").get(self.TIMEOUT)
+ self.check_success_flag(False, 2)
+ self.assertEqual(proc_mock.call_args_list,
+ [mock.call(['scontrol', 'update', 'NodeName=compute99', 'State=DRAIN', 'Reason=Node Manager shutdown']),
+ mock.call(['sinfo', '--noheader', '-o', '%t', '-n', 'compute99']),
+ mock.call(['sinfo', '--noheader', '-o', '%t', '-n', 'compute99']),
+ mock.call(['scontrol', 'update', 'NodeName=compute99', 'State=RESUME'])])
+ finally:
+ self.shutdown_actor.actor_ref.stop()
def test_cancel_shutdown_retry(self, proc_mock):
- proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n'])
+ proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n', 'idle\n'])
self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
self.make_actor()
self.check_success_flag(False, 2)
proc_mock.return_value = 'drain\n'
super(SLURMComputeNodeShutdownActorTestCase,
self).test_arvados_node_cleaned_after_shutdown()
-
-class SLURMComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
- unittest.TestCase):
-
- def make_mocks(self, node_num):
- self.shutdowns = testutil.MockShutdownTimer()
- self.shutdowns._set_state(False, 300)
- self.timer = mock.MagicMock(name='timer_mock')
- self.updates = mock.MagicMock(name='update_mock')
- self.cloud_mock = testutil.cloud_node_mock(node_num)
- self.subscriber = mock.Mock(name='subscriber_mock')
- self.cloud_client = mock.MagicMock(name='cloud_client')
- self.cloud_client.broken.return_value = False
-
- def make_actor(self, node_num=1, arv_node=None, start_time=None):
- if not hasattr(self, 'cloud_mock'):
- self.make_mocks(node_num)
- if start_time is None:
- start_time = time.time()
- self.node_actor = slurm_dispatch.ComputeNodeMonitorActor.start(
- self.cloud_mock, start_time, self.shutdowns,
- testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
- arv_node, boot_fail_after=300).proxy()
- self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
-
- @mock.patch("subprocess.check_output")
- def test_resume_node(self, check_output):
- arv_node = testutil.arvados_node_mock()
- self.make_actor(arv_node=arv_node)
- check_output.return_value = "drain\n"
- self.node_actor.resume_node().get(self.TIMEOUT)
- self.assertIn(mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]), check_output.call_args_list)
- self.assertIn(mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME']), check_output.call_args_list)
-
- @mock.patch("subprocess.check_output")
- def test_no_resume_idle_node(self, check_output):
- arv_node = testutil.arvados_node_mock()
- self.make_actor(arv_node=arv_node)
- check_output.return_value = "idle\n"
- self.node_actor.resume_node().get(self.TIMEOUT)
- self.assertIn(mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]), check_output.call_args_list)
- self.assertNotIn(mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME']), check_output.call_args_list)
-
- @mock.patch("subprocess.check_output")
- def test_resume_node_exception(self, check_output):
- arv_node = testutil.arvados_node_mock()
- self.make_actor(arv_node=arv_node)
- check_output.side_effect = Exception()
- self.node_actor.resume_node().get(self.TIMEOUT)
- self.assertIn(mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]), check_output.call_args_list)
- self.assertNotIn(mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME']), check_output.call_args_list)
-
- @mock.patch("subprocess.check_output")
- def test_shutdown_down_node(self, check_output):
- check_output.return_value = "down\n"
- self.make_actor(arv_node=testutil.arvados_node_mock())
- self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-
- @mock.patch("subprocess.check_output")
- def test_no_shutdown_drain_node(self, check_output):
- check_output.return_value = "drain\n"
- self.make_actor(arv_node=testutil.arvados_node_mock())
- self.assertEquals('node is draining', self.node_actor.shutdown_eligible().get(self.TIMEOUT))
arv_node = testutil.arvados_node_mock(2, job_uuid=True)
self.make_daemon([testutil.cloud_node_mock(2, size=size)], [arv_node],
[size], avail_sizes=[(size, {"cores":1})])
+ self.busywait(lambda: self.node_setup.start.called)
self.stop_proxy(self.daemon)
self.assertTrue(self.node_setup.start.called)
def test_nodes_shutting_down_replaced_below_max_nodes(self):
size = testutil.MockSize(6)
cloud_node = testutil.cloud_node_mock(6, size=size)
- self.make_daemon([cloud_node], [testutil.arvados_node_mock(6)],
+ self.make_daemon([cloud_node], [testutil.arvados_node_mock(6, crunch_worker_state='down')],
avail_sizes=[(size, {"cores":1})])
self.assertEqual(1, self.alive_monitor_count())
monitor = self.monitor_list()[0].proxy()
self.stop_proxy(self.daemon)
self.assertEqual(1, self.last_shutdown.stop.call_count)
- def busywait(self, f):
- n = 0
- while not f() and n < 10:
- time.sleep(.1)
- n += 1
- self.assertTrue(f())
-
def test_node_create_two_sizes(self):
small = testutil.MockSize(1)
big = testutil.MockSize(2)
# test for that.
self.assertEqual(2, sizecounts[small.id])
self.assertEqual(1, sizecounts[big.id])
-
- @mock.patch("arvnodeman.daemon.NodeManagerDaemonActor._resume_node")
- def test_resume_drained_nodes(self, resume_node):
- cloud_node = testutil.cloud_node_mock(1)
- arv_node = testutil.arvados_node_mock(1, info={"ec2_instance_id": "1", "slurm_state": "down"})
- self.make_daemon([cloud_node], [arv_node])
- resume_node.assert_called_with(self.daemon.cloud_nodes.get(self.TIMEOUT).nodes.values()[0])
- self.stop_proxy(self.daemon)
-
- @mock.patch("arvnodeman.daemon.NodeManagerDaemonActor._resume_node")
- def test_no_resume_shutdown_nodes(self, resume_node):
- cloud_node = testutil.cloud_node_mock(1)
- arv_node = testutil.arvados_node_mock(1, info={"ec2_instance_id": "1", "slurm_state": "down"})
-
- self.make_daemon([cloud_node], [])
-
- self.node_shutdown = mock.MagicMock(name='shutdown_mock')
- self.daemon.shutdowns.get(self.TIMEOUT)[cloud_node.id] = self.node_shutdown
-
- self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
- self.stop_proxy(self.daemon)
- resume_node.assert_not_called()
if result is not unassigned:
return result
+ def busywait(self, f):
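+ # Poll f() up to ten times, 0.1s apart, before asserting that it holds.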
+ n = 0
+ while not f() and n < 10:
+ time.sleep(.1)
+ n += 1
+ self.assertTrue(f())
+
class DriverTestMixin(object):
def setUp(self):
--publish=25100:25100
--publish=25107:25107
--publish=25108:25108
- --publish=8001:8001"
+ --publish=8001:8001
+ --publish=8002:8002"
else
PUBLIC=""
fi
ADD crunch-setup.sh gitolite.rc \
keep-setup.sh common.sh createusers.sh \
logger runsu.sh waitforpostgres.sh \
- application_yml_override.py \
+ application_yml_override.py api-setup.sh \
/usr/local/lib/arvbox/
# Start the supervisor.
--- /dev/null
+#!/bin/bash
+
+exec 2>&1
+set -ex -o pipefail
+
+. /usr/local/lib/arvbox/common.sh
+
+cd /usr/src/arvados/services/api
+export RAILS_ENV=development
+
+set -u
+
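+# Generate random identifiers and secrets once, and reuse the stored
+# values on later runs so the cluster configuration stays stable.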
+if ! test -s /var/lib/arvados/api_uuid_prefix ; then
+ ruby -e 'puts "#{rand(2**64).to_s(36)[0,5]}"' > /var/lib/arvados/api_uuid_prefix
+fi
+uuid_prefix=$(cat /var/lib/arvados/api_uuid_prefix)
+
+if ! test -s /var/lib/arvados/api_secret_token ; then
+ ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/api_secret_token
+fi
+secret_token=$(cat /var/lib/arvados/api_secret_token)
+
+if ! test -s /var/lib/arvados/blob_signing_key ; then
+ ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/blob_signing_key
+fi
+blob_signing_key=$(cat /var/lib/arvados/blob_signing_key)
+
+# The self-signed key will be created by the SSO server script.
+test -s /var/lib/arvados/self-signed.key
+
+sso_app_secret=$(cat /var/lib/arvados/sso_app_secret)
+
+if test -s /var/lib/arvados/vm-uuid ; then
+ vm_uuid=$(cat /var/lib/arvados/vm-uuid)
+else
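+ # "2x53u" is the Arvados object-type infix for virtual machine UUIDs.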
+ vm_uuid=$uuid_prefix-2x53u-$(ruby -e 'puts rand(2**400).to_s(36)[0,15]')
+ echo $vm_uuid > /var/lib/arvados/vm-uuid
+fi
+
+cat >config/application.yml <<EOF
+development:
+ uuid_prefix: $uuid_prefix
+ secret_token: $secret_token
+ blob_signing_key: $blob_signing_key
+ sso_app_secret: $sso_app_secret
+ sso_app_id: arvados-server
+ sso_provider_url: "https://$localip:${services[sso]}"
+ sso_insecure: true
+ workbench_address: "http://$localip/"
+ websocket_address: "ws://$localip:${services[websockets]}/websocket"
+ git_repo_ssh_base: "git@$localip:"
+ git_repo_https_base: "http://$localip:${services[arv-git-httpd]}/"
+ new_users_are_active: true
+ auto_admin_first_user: true
+ auto_setup_new_users: true
+ auto_setup_new_users_with_vm_uuid: $vm_uuid
+ auto_setup_new_users_with_repository: true
+ default_collection_replication: 1
+EOF
+
+(cd config && /usr/local/lib/arvbox/application_yml_override.py)
+
+if ! test -f /var/lib/arvados/api_database_pw ; then
+ ruby -e 'puts rand(2**128).to_s(36)' > /var/lib/arvados/api_database_pw
+fi
+database_pw=$(cat /var/lib/arvados/api_database_pw)
+
+if ! (psql postgres -c "\du" | grep "^ arvados ") >/dev/null ; then
+ psql postgres -c "create user arvados with password '$database_pw'"
+ psql postgres -c "ALTER USER arvados CREATEDB;"
+fi
+
+sed "s/password:.*/password: $database_pw/" <config/database.yml.example >config/database.yml
+
+if ! test -f /var/lib/arvados/api_database_setup ; then
+ bundle exec rake db:setup
+ touch /var/lib/arvados/api_database_setup
+fi
+
+if ! test -s /var/lib/arvados/superuser_token ; then
+ bundle exec ./script/create_superuser_token.rb > /var/lib/arvados/superuser_token
+fi
+
+rm -rf tmp
+
+bundle exec rake db:migrate
[keepstore1]=25108
[ssh]=22
[doc]=8001
+ [websockets]=8002
)
if test "$(id arvbox -u 2>/dev/null)" = 0 ; then
exit
fi
-set -u
-
-if ! test -s /var/lib/arvados/api_uuid_prefix ; then
- ruby -e 'puts "#{rand(2**64).to_s(36)[0,5]}"' > /var/lib/arvados/api_uuid_prefix
-fi
-uuid_prefix=$(cat /var/lib/arvados/api_uuid_prefix)
-
-if ! test -s /var/lib/arvados/api_secret_token ; then
- ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/api_secret_token
-fi
-secret_token=$(cat /var/lib/arvados/api_secret_token)
-
-if ! test -s /var/lib/arvados/blob_signing_key ; then
- ruby -e 'puts rand(2**400).to_s(36)' > /var/lib/arvados/blob_signing_key
-fi
-blob_signing_key=$(cat /var/lib/arvados/blob_signing_key)
-
-# self signed key will be created by SSO server script.
-test -s /var/lib/arvados/self-signed.key
-
-sso_app_secret=$(cat /var/lib/arvados/sso_app_secret)
-
-if test -s /var/lib/arvados/vm-uuid ; then
- vm_uuid=$(cat /var/lib/arvados/vm-uuid)
-else
- vm_uuid=$uuid_prefix-2x53u-$(ruby -e 'puts rand(2**400).to_s(36)[0,15]')
- echo $vm_uuid > /var/lib/arvados/vm-uuid
-fi
-
-cat >config/application.yml <<EOF
-development:
- uuid_prefix: $uuid_prefix
- secret_token: $secret_token
- blob_signing_key: $blob_signing_key
- sso_app_secret: $sso_app_secret
- sso_app_id: arvados-server
- sso_provider_url: "https://$localip:${services[sso]}"
- sso_insecure: true
- workbench_address: "http://$localip/"
- git_repo_ssh_base: "git@$localip:"
- git_repo_https_base: "http://$localip:${services[arv-git-httpd]}/"
- new_users_are_active: true
- auto_admin_first_user: true
- auto_setup_new_users: true
- auto_setup_new_users_with_vm_uuid: $vm_uuid
- auto_setup_new_users_with_repository: true
- default_collection_replication: 1
-EOF
-
-(cd config && /usr/local/lib/arvbox/application_yml_override.py)
-
-if ! test -f /var/lib/arvados/api_database_pw ; then
- ruby -e 'puts rand(2**128).to_s(36)' > /var/lib/arvados/api_database_pw
-fi
-database_pw=$(cat /var/lib/arvados/api_database_pw)
-
-if ! (psql postgres -c "\du" | grep "^ arvados ") >/dev/null ; then
- psql postgres -c "create user arvados with password '$database_pw'"
- psql postgres -c "ALTER USER arvados CREATEDB;"
-fi
-
-sed "s/password:.*/password: $database_pw/" <config/database.yml.example >config/database.yml
-
-if ! test -f /var/lib/arvados/api_database_setup ; then
- bundle exec rake db:setup
- touch /var/lib/arvados/api_database_setup
-fi
-
-if ! test -s /var/lib/arvados/superuser_token ; then
- bundle exec ./script/create_superuser_token.rb > /var/lib/arvados/superuser_token
-fi
-
-rm -rf tmp
-
-bundle exec rake db:migrate
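+# Setup is shared with the websockets service; the lock keeps the two
+# services from running it concurrently.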
+flock /var/lib/arvados/api.lock /usr/local/lib/arvbox/api-setup.sh
set +u
if test "$1" = "--only-setup" ; then
exit
fi
-ARVADOS_WEBSOCKETS=1 exec bundle exec passenger start --port=${services[api]} \
+exec bundle exec passenger start --port=${services[api]} \
--runtime-dir=/var/lib/passenger \
--ssl --ssl-certificate=/var/lib/arvados/self-signed.pem \
--ssl-certificate-key=/var/lib/arvados/self-signed.key
--- /dev/null
+/usr/local/lib/arvbox/logger
\ No newline at end of file
--- /dev/null
+/usr/local/lib/arvbox/runsu.sh
\ No newline at end of file
--- /dev/null
+#!/bin/bash
+
+exec 2>&1
+set -ex -o pipefail
+
+. /usr/local/lib/arvbox/common.sh
+
+cd /usr/src/arvados/services/api
+export RAILS_ENV=development
+
+run_bundler --without=development
+
+if test "$1" = "--only-deps" ; then
+ exit
+fi
+
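+# Run the same API setup as the api service, serialized by the shared lock.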
+flock /var/lib/arvados/api.lock /usr/local/lib/arvbox/api-setup.sh
+
+set +u
+if test "$1" = "--only-setup" ; then
+ exit
+fi
+
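+# "ws-only" tells the Rails server to serve only websocket connections.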
+export ARVADOS_WEBSOCKETS=ws-only
+
+# Serving SSL directly doesn't work; Puma fails with:
+# Rack app error: #<TypeError: no implicit conversion of Puma::MiniSSL::Socket into Integer>
+#exec bundle exec puma -b "ssl://0.0.0.0:${services[websockets]}?cert=/var/lib/arvados/self-signed.pem&key=/var/lib/arvados/self-signed.key"
+
+exec bundle exec puma -p${services[websockets]}