def arvados_create_kwargs(self, size, arvados_node):
name = self.create_cloud_name(arvados_node)
+
+ if size.scratch > 375000:
+ self._logger.warning("Requested %d MB scratch space, but the GCE driver currently only supports attaching a single 375 GB disk.", size.scratch)
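+ # A GCE local SSD is 375 GB; with the conservative GB->MB conversion
+ # used for size.scratch (multiply by 1000, not 1024), that is exactly
+ # the 375000 MB threshold tested above.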
+
disks = [
{'autoDelete': True,
'boot': True,
self.cores = kwargs.pop('cores')
# libcloud disk sizes are in GB, Arvados/SLURM are in MB
# multiply by 1000 instead of 1024 to err on low side
+ if self.disk is None:
+ self.disk = 0
self.scratch = self.disk * 1000
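# e.g. an advertised 80 GB disk is recorded as 80 * 1000 = 80000 MB of
# scratch rather than 80 * 1024 = 81920, so Node Manager never promises
# more scratch than the node actually provides.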
self.ram = int(self.ram * node_mem_scaling)
for name, override in kwargs.iteritems():
self._calculator = server_calc
def is_common_error(self, exception):
- return self._client.is_cloud_exception(exception)
+ return isinstance(exception, config.CLOUD_ERRORS)
def _item_key(self, node):
return node.id
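
For reference, the replacement above works because config.CLOUD_ERRORS is a
tuple of exception classes, so a single isinstance() test covers every cloud
library's error types. A minimal sketch of the pattern, using stand-in
exception classes rather than the actual Arvados definitions:

    import socket
    import ssl

    # Stand-ins for the driver libraries' error classes (assumed, not the
    # real CLOUD_ERRORS contents):
    CLOUD_ERRORS = (socket.error, ssl.SSLError)

    def is_common_error(exception):
        # isinstance() accepts a tuple and matches any listed class/subclass.
        return isinstance(exception, CLOUD_ERRORS)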
from arvnodeman.computenode import ARVADOS_TIMEFMT
-from libcloud.compute.base import NodeSize, Node, NodeDriver, NodeState
+from libcloud.compute.base import NodeSize, Node, NodeDriver, NodeState, NodeImage
+from libcloud.compute.drivers.gce import GCEDiskType
from libcloud.common.exceptions import BaseHTTPError
all_nodes = []
ex_resource_group=None,
ex_user_name=None,
ex_tags=None,
+ ex_metadata=None,
ex_network=None,
ex_userdata=None):
global all_nodes, create_calls
create_calls += 1
- n = Node(name, name, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags})
+ nodeid = "node%i" % create_calls
+ n = Node(nodeid, nodeid, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags})
all_nodes.append(n)
if ex_customdata:
- ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0] + "&instance_id=" + name
+ ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0]
if ex_userdata:
ping_url = ex_userdata
+ if ex_metadata:
+ ping_url = ex_metadata["arv-ping-url"]
+ ping_url += "&instance_id=" + nodeid
ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
ctx.verify_mode = ssl.CERT_NONE
f = urllib.urlopen(ping_url, "", context=ctx)
return [NodeSize("m3.xlarge", "Extra Large Instance", 3500, 80, 0, 0, self),
NodeSize("m4.xlarge", "Extra Large Instance", 3500, 0, 0, 0, self),
NodeSize("m4.2xlarge", "Double Extra Large Instance", 7000, 0, 0, 0, self)]
+
+
+class FakeGceDriver(FakeDriver):
+
+ def create_node(self, name=None,
+ size=None,
+ image=None,
+ auth=None,
+ external_ip=None,
+ ex_metadata=None,
+ ex_tags=None,
+ ex_disks_gce_struct=None):
+ n = super(FakeGceDriver, self).create_node(name=name,
+ size=size,
+ image=image,
+ auth=auth,
+ ex_metadata=ex_metadata)
+ n.extra = {
+ "metadata": {
+ "items": [{"key": k, "value": v} for k,v in ex_metadata.iteritems()]
+ },
+ "zone": "fake"
+ }
+ return n
+
+ def list_images(self, ex_project=None):
+ return [NodeImage("fake_image_id", "fake_image_id", self)]
+
+ def list_sizes(self, **kwargs):
+ return [NodeSize("n1-standard-1", "Standard", 3750, None, 0, 0, self),
+ NodeSize("n1-standard-2", "Double standard", 7500, None, 0, 0, self)]
+
+ def ex_list_disktypes(self, zone=None):
+ return [GCEDiskType("pd-standard", "pd-standard", zone, self,
+ extra={"selfLink": "pd-standard"}),
+ GCEDiskType("local-ssd", "local-ssd", zone, self,
+ extra={"selfLink": "local-ssd"})]
+
+ def ex_get_node(self, name, zone=None):
+ global all_nodes
+ for n in all_nodes:
+ if n.id == name:
+ return n
+ return None
+
+ def ex_set_node_metadata(self, n, items):
+ n.extra["metadata"]["items"] = items
--- /dev/null
+# EC2 configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Manage]
+# The management server responds to http://addr:port/status.json with
+# a snapshot of internal state.
+
+# Management server listening address (default 127.0.0.1)
+#address = 0.0.0.0
+
+# Management server port number (default -1, server is disabled)
+#port = 8989
+
+[Daemon]
+# The dispatcher can customize the start and stop procedure for
+# cloud nodes. For example, the SLURM dispatcher drains nodes
+# through SLURM before shutting them down.
+#dispatcher = slurm
+
+# Node Manager will ensure that there are at least this many nodes running at
+# all times. If Node Manager needs to start new idle nodes for the purpose of
+# satisfying min_nodes, it will use the cheapest node type. However, depending
+# on usage patterns, it may also satisfy min_nodes by keeping alive some
+# more-expensive nodes.
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Upper limit on the rate of spending (in $/hr). Node Manager will not boot
+# additional nodes if the total price of already running nodes meets or
+# exceeds this threshold. The default of 0 means no limit.
+max_total_price = 0
+
+# Poll EC2 nodes and Arvados for new information every N seconds.
+poll_time = 5
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# If Node Manager boots a cloud node, and it does not pair with an Arvados
+# node before this long, assume that there was a cloud bootstrap failure and
+# shut it down. Note that normal shutdown windows apply (see the Cloud
+# section), so this should be shorter than the first shutdown window value.
+boot_fail_after = 45
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# Scaling factor to be applied to nodes' available RAM size. Usually there's a
+# variable discrepancy between the advertised RAM value on cloud nodes and the
+# actual amount available.
+# If not set, this value will be set to 0.95
+node_mem_scaling = 0.95
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+#file = node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = DEBUG
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = {host}
+token = {token}
+timeout = 15
+jobs_queue = no
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = yes
+
+[Cloud]
+provider = ec2
+driver_class = {driver_class}
+
+# Shutdown windows define periods of time when a node may and may not be shut
+# down. These are windows in full minutes, separated by commas. Counting from
+# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
+# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
+# and so on. For example, "20, 999999" means the node may shut down between
+# the 20th and 999999th minutes of uptime.
+# If your cloud provider bills by the minute, it makes sense to aggressively
+# shut down idle nodes. Specify at least two windows. You can add as many as
+# you need beyond that.
+shutdown_windows = 1, 999999
+
+[Cloud Credentials]
+
+key = 00000000-0000-0000-0000-000000000000
+secret = PASSWORD
+timeout = 60
+region = East US
+
+[Cloud List]
+
+[Cloud Create]
+# The image id
+image = fake_image_id
+
+# Path to a local ssh key file that will be used to provision new nodes.
+ssh_key = {ssh_key}
+
+# the API server to ping
+ping_host = {host}
+
+# You can define any number of Size sections to list EC2 sizes you're willing
+# to use. The Node Manager should boot the cheapest size(s) that can run jobs
+# in the queue. You must also provide the price per hour, since the compute
+# driver is not guaranteed to report prices.
+#
+# See https://aws.amazon.com/ec2/pricing/
+# for a list of known machine types that may be used as a Size parameter.
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs. You can also override the provider's reported
+# data fields by setting them here.
+
+[Size m4.xlarge]
+cores = 4
+price = 0.56
+scratch = 250
+
+[Size m4.2xlarge]
+cores = 8
+price = 1.12
+scratch = 500
--- /dev/null
+# GCE configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Manage]
+# The management server responds to http://addr:port/status.json with
+# a snapshot of internal state.
+
+# Management server listening address (default 127.0.0.1)
+#address = 0.0.0.0
+
+# Management server port number (default -1, server is disabled)
+#port = 8989
+
+[Daemon]
+# The dispatcher can customize the start and stop procedure for
+# cloud nodes. For example, the SLURM dispatcher drains nodes
+# through SLURM before shutting them down.
+#dispatcher = slurm
+
+# Node Manager will ensure that there are at least this many nodes running at
+# all times. If Node Manager needs to start new idle nodes for the purpose of
+# satisfying min_nodes, it will use the cheapest node type. However, depending
+# on usage patterns, it may also satisfy min_nodes by keeping alive some
+# more-expensive nodes.
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Upper limit on the rate of spending (in $/hr). Node Manager will not boot
+# additional nodes if the total price of already running nodes meets or
+# exceeds this threshold. The default of 0 means no limit.
+max_total_price = 0
+
+# Poll GCE nodes and Arvados for new information every N seconds.
+poll_time = 5
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# If Node Manager boots a cloud node, and it does not pair with an Arvados
+# node before this long, assume that there was a cloud bootstrap failure and
+# shut it down. Note that normal shutdown windows apply (see the Cloud
+# section), so this should be shorter than the first shutdown window value.
+boot_fail_after = 45
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# Scaling factor to be applied to nodes' available RAM size. Usually there's a
+# variable discrepancy between the advertised RAM value on cloud nodes and the
+# actual amount available.
+# If not set, this value will be set to 0.95
+node_mem_scaling = 0.95
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+#file = node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = DEBUG
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = {host}
+token = {token}
+timeout = 15
+jobs_queue = no
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = yes
+
+[Cloud]
+provider = gce
+driver_class = {driver_class}
+
+# Shutdown windows define periods of time when a node may and may not be shut
+# down. These are windows in full minutes, separated by commas. Counting from
+# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
+# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
+# and so on. For example, "20, 999999" means the node may shut down between
+# the 20th and 999999th minutes of uptime.
+# If your cloud provider bills by the minute, it makes sense to aggressively
+# shut down idle nodes. Specify at least two windows. You can add as many as
+# you need beyond that.
+shutdown_windows = 1, 999999
+
+[Cloud Credentials]
+key = 00000000-0000-0000-0000-000000000000
+secret = PASSWORD
+timeout = 60
+region = East US
+
+[Cloud List]
+
+[Cloud Create]
+# The image id
+image = fake_image_id
+
+# Path to a local ssh key file that will be used to provision new nodes.
+ssh_key = {ssh_key}
+
+# the API server to ping
+ping_host = {host}
+
+# You can define any number of Size sections to list GCE sizes you're willing
+# to use. The Node Manager should boot the cheapest size(s) that can run jobs
+# in the queue. You must also provide the price per hour, since the compute
+# driver is not guaranteed to report prices.
+#
+# See https://cloud.google.com/compute/pricing
+# for a list of known machine types that may be used as a Size parameter.
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for exposing
+# this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs. You can also override the provider's reported
+# data fields by setting them here.
+
+[Size n1-standard-1]
+cores = 1
+price = 0.56
+
+[Size n1-standard-2]
+cores = 2
+price = 1.12
\ No newline at end of file
checks[pattern] = partial(expect_count, count-1)
return 0
-def run_test(name, actions, checks, driver_class, jobs):
+def run_test(name, actions, checks, driver_class, jobs, provider):
code = 0
# Delete any stale node records
update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
# Write configuration file for test
- with open("tests/fake.cfg.template") as f:
+ with open("tests/fake_%s.cfg.template" % provider) as f:
open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
# Test lifecycle.
tests = {
- "test_single_node": (
+ "test_single_node_azure": (
[
(r".*Daemon started", set_squeue),
(r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
r".*Setting node quota.*": fail,
},
"arvnodeman.test.fake_driver.FakeDriver",
- {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"}),
+ {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+ "azure"),
"test_multiple_nodes": (
[
(r".*Daemon started", set_squeue),
"34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
- }),
+ }, "azure"),
"test_hit_quota": (
[
(r".*Daemon started", set_squeue),
"34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
- }),
+ }, "azure"),
"test_probe_quota": (
[
(r".*Daemon started", set_squeue),
"34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
- }),
+ }, "azure"),
"test_no_hang_failing_node_create": (
[
(r".*Daemon started", set_squeue),
"34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
"34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"
- }),
+ }, "azure"),
"test_retry_create": (
[
(r".*Daemon started", set_squeue),
{},
"arvnodeman.test.fake_driver.RetryDriver",
{"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"
- })
+ }, "azure"),
+ "test_single_node_aws": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+ r".*Setting node quota.*": fail,
+ },
+ "arvnodeman.test.fake_driver.FakeAwsDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+ "ec2"),
+ "test_single_node_gce": (
+ [
+ (r".*Daemon started", set_squeue),
+ (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
+ (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
+ (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
+ ], {
+ r".*Suggesting shutdown because node state is \('down', .*\)": fail,
+ r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
+ r".*Setting node quota.*": fail,
+ },
+ "arvnodeman.test.fake_driver.FakeGceDriver",
+ {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
+ "gce")
}
code = 0