--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import functools
+import json
+import time
+
+import libcloud.compute.base as cloud_base
+import libcloud.compute.providers as cloud_provider
+import libcloud.compute.types as cloud_types
+from libcloud.compute.drivers import gce
+
+from . import BaseComputeNodeDriver
+from .. import arvados_node_fqdn
+
class ComputeNodeDriver(BaseComputeNodeDriver):
    """Compute node driver wrapper for GCE.

    This translates cloud driver requests to GCE's specific parameters.
    """
    DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.GCE)
    SEARCH_CACHE = {}
    # Class-level defaults; overwritten per-instance by the _init_* hooks
    # below when the corresponding keys appear in create_kwargs.
    ssh_key = None
    service_accounts = None

    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
                 driver_class=DEFAULT_DRIVER):
        super(ComputeNodeDriver, self).__init__(
            auth_kwargs, list_kwargs, create_kwargs,
            driver_class)

        # Dispatch each create_kwargs entry to a matching _init_<key>
        # hook, if one exists.  Snapshot the keys with list() first:
        # the hooks pop entries out of create_kwargs, and mutating a
        # dict while iterating its live key view raises RuntimeError
        # on Python 3 (Python 2's .keys() happened to return a copy).
        for key in list(self.create_kwargs.keys()):
            init_method = getattr(self, '_init_' + key, None)
            if init_method is not None:
                # A hook may return a (key, value) pair to store back
                # into create_kwargs under a driver-specific name.
                new_pair = init_method(self.create_kwargs.pop(key))
                if new_pair is not None:
                    self.create_kwargs[new_pair[0]] = new_pair[1]

    def _init_image_id(self, image_id):
        # Resolve the configured image ID to a libcloud image object.
        return 'image', self.search_for(image_id, 'list_images')

    def _init_ping_host(self, ping_host):
        # Remembered for building the Arvados ping URL in
        # arvados_create_kwargs.
        self.ping_host = ping_host

    def _init_service_accounts(self, service_accounts_str):
        # Configured as a JSON string; see the GCE driver docs for the
        # expected structure of service account scopes.
        self.service_accounts = json.loads(service_accounts_str)

    def _init_network_id(self, subnet_id):
        # Resolve the configured network ID to a libcloud network object.
        return 'ex_network', self.search_for(subnet_id, 'ex_list_networks')

    def _init_ssh_key(self, filename):
        # Read the public key that will be installed on new nodes.
        with open(filename) as ssh_file:
            self.ssh_key = ssh_file.read().strip()

    def arvados_create_kwargs(self, arvados_node):
        """Return extra keyword arguments for creating this Arvados node.

        Builds GCE-specific create_node() arguments: node metadata,
        the Arvados ping URL (delivered as userdata), and optional
        service accounts and SSH keys.
        """
        result = {'ex_metadata': self.list_kwargs.copy()}
        ping_secret = arvados_node['info'].get('ping_secret')
        if ping_secret is not None:
            ping_url = ('https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.
                        format(self.ping_host, arvados_node['uuid'],
                               ping_secret))
            result['ex_userdata'] = ping_url
        if self.service_accounts is not None:
            result['ex_service_accounts'] = self.service_accounts

        # SSH keys are delivered to GCE nodes via ex_metadata: see
        # http://stackoverflow.com/questions/26752617/creating-sshkeys-for-gce-instance-using-libcloud
        if self.ssh_key is not None:
            result['ex_metadata']['sshKeys'] = 'root:{}'.format(self.ssh_key)
        return result

    # When an Arvados node is synced with a GCE node, the Arvados hostname
    # is forwarded in a GCE tag 'hostname-foo'.
    # TODO(twp): implement an ex_set_metadata method (at least until
    # libcloud supports the API setMetadata method) so we can pass this
    # sensibly in the node metadata.
    def sync_node(self, cloud_node, arvados_node):
        """Push the Arvados node's FQDN to the cloud node as a GCE tag."""
        tags = ['hostname-{}'.format(arvados_node_fqdn(arvados_node))]
        self.real.ex_set_node_tags(cloud_node, tags)

    @classmethod
    def node_start_time(cls, node):
        """Return the node's launch time as seconds since the epoch (UTC).

        GCE reports launch_time like '2014-01-01T00:00:00.000Z'; strip
        the fractional seconds, parse as UTC, and correct for mktime()
        interpreting the struct_time as local time.
        """
        time_str = node.extra['launch_time'].split('.', 2)[0] + 'UTC'
        return time.mktime(time.strptime(
            time_str, '%Y-%m-%dT%H:%M:%S%Z')) - time.timezone
# willing to use. The Node Manager should boot the cheapest size(s) that
# can run jobs in the queue (N.B.: defining more than one size has not been
# tested yet).
-# Each size section MUST define the number of cores it has. You may also
-# want to define the number of mebibytes of scratch space for Crunch jobs.
-# You can also override Amazon's provided data fields by setting the same
-# names here.
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for
+# exposing this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs. You can also override Amazon's provided
+# data fields by setting the same names here.
cores = 2
-scratch = 100
\ No newline at end of file
+scratch = 100
--- /dev/null
+# Google Compute Engine configuration for Arvados Node Manager.
+# All times are in seconds unless specified otherwise.
+
+[Daemon]
+# Node Manager will ensure that there are at least this many nodes
+# running at all times.
+min_nodes = 0
+
+# Node Manager will not start any compute nodes when at least this
+# many are running.
+max_nodes = 8
+
+# Poll compute nodes and Arvados for new information every N seconds.
+poll_time = 60
+
+# Polls have exponential backoff when services fail to respond.
+# This is the longest time to wait between polls.
+max_poll_time = 300
+
+# If Node Manager can't successfully poll a service for this long,
+# it will never start or stop compute nodes, on the assumption that its
+# information is too outdated.
+poll_stale_after = 600
+
+# "Node stale time" affects two related behaviors.
+# 1. If a compute node has been running for at least this long, but it
+# isn't paired with an Arvados node, do not shut it down, but leave it alone.
+# This prevents the node manager from shutting down a node that might
+# actually be doing work, but is having temporary trouble contacting the
+# API server.
+# 2. When the Node Manager starts a new compute node, it will try to reuse
+# an Arvados node that hasn't been updated for this long.
+node_stale_after = 14400
+
+# File path for Certificate Authorities
+certs_file = /etc/ssl/certs/ca-certificates.crt
+
+[Logging]
+# Log file path
+file = /var/log/arvados/node-manager.log
+
+# Log level for most Node Manager messages.
+# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
+# WARNING lets you know when polling a service fails.
+# INFO additionally lets you know when a compute node is started or stopped.
+level = INFO
+
+# You can also set different log levels for specific libraries.
+# Pykka is the Node Manager's actor library.
+# Setting this to DEBUG will display tracebacks for uncaught
+# exceptions in the actors, but it's also very chatty.
+pykka = WARNING
+
+# Setting apiclient to INFO will log the URL of every Arvados API request.
+apiclient = WARNING
+
+[Arvados]
+host = zyxwv.arvadosapi.com
+token = ARVADOS_TOKEN
+timeout = 15
+
+# Accept an untrusted SSL certificate from the API server?
+insecure = no
+
+[Cloud]
+provider = gce
+
+# XXX(twp): figure out good default settings for GCE
+# It's usually most cost-effective to shut down compute nodes during narrow
+# windows of time. For example, EC2 bills each node by the hour, so the best
+# time to shut down a node is right before a new hour of uptime starts.
+# Shutdown windows define these periods of time. These are windows in
+# full minutes, separated by commas. Counting from the time the node is
+# booted, the node WILL NOT shut down for N1 minutes; then it MAY shut down
+# for N2 minutes; then it WILL NOT shut down for N3 minutes; and so on.
+# For example, "54, 5, 1" means the node may shut down from the 54th to the
+# 59th minute of each hour of uptime.
+# Specify at least two windows. You can add as many as you need beyond that.
+shutdown_windows = 54, 5, 1
+
+[Cloud Credentials]
+user_id = USERID
+key = SECRET_KEY
+project = project_name
+timeout = 60 # used by NodeManagerConfig
+
+# Optional settings. For full documentation see
+# http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#libcloud.compute.drivers.gce.GCENodeDriver
+#
+# datacenter = 'us-central1-a'
+# auth_type = 'SA' # SA, IA or GCE
+# scopes = https://www.googleapis.com/auth/compute
+# credential_file =
+
+[Cloud List]
+# Keywords here will be used to populate the metadata field for a GCE node.
+
+[Cloud Create]
+# New compute nodes will send pings to Arvados at this host.
+# You may specify a port, and use brackets to disambiguate IPv6 addresses.
+ping_host = hostname:port
+
+# A file path for an SSH key that can log in to the compute node.
+# ssh_key = path
+
+# The GCE IDs of the image and network compute nodes should use.
+image_id = idstring
+network_id = idstring
+
+# JSON string of service account authorizations for this cluster.
+# See http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#specifying-service-account-scopes
+# service_accounts = [{'email':'account@example.com', 'scopes':['storage-ro']}]
+
+[Size n1-standard-2]
+# You can define any number of Size sections to list node sizes you're
+# willing to use. The Node Manager should boot the cheapest size(s) that
+# can run jobs in the queue (N.B.: defining more than one size has not been
+# tested yet).
+#
+# The Size fields are interpreted the same way as with a libcloud NodeSize:
+# http://libcloud.readthedocs.org/en/latest/compute/api.html#libcloud.compute.base.NodeSize
+#
+# Each size section MUST define the number of cores available in this
+# size class (since libcloud does not provide any consistent API for
+# exposing this setting).
+# You may also want to define the amount of scratch space (expressed
+# in GB) for Crunch jobs.
+cores = 2
+scratch = 100
+ram = 512
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import absolute_import, print_function
+
+import time
+import unittest
+
+import mock
+
+import arvnodeman.computenode.driver.gce as gce
+from . import testutil
+
class GCEComputeNodeDriverTestCase(unittest.TestCase):
    """Unit tests for the GCE ComputeNodeDriver wrapper."""

    def setUp(self):
        self.driver_mock = mock.MagicMock(name='driver_mock')

    def new_driver(self, auth_kwargs=None, list_kwargs=None,
                   create_kwargs=None):
        # Use None sentinels instead of mutable {} defaults: the old
        # version called setdefault() on a shared default dict, leaking
        # 'ping_host' (and any caller mutations) across test methods.
        auth_kwargs = {} if auth_kwargs is None else auth_kwargs
        list_kwargs = {} if list_kwargs is None else list_kwargs
        create_kwargs = {} if create_kwargs is None else create_kwargs
        create_kwargs.setdefault('ping_host', '100::')
        return gce.ComputeNodeDriver(
            auth_kwargs, list_kwargs, create_kwargs,
            driver_class=self.driver_mock)

    def test_driver_instantiation(self):
        kwargs = {'user_id': 'foo'}
        driver = self.new_driver(auth_kwargs=kwargs)
        self.assertTrue(self.driver_mock.called)
        self.assertEqual(kwargs, self.driver_mock.call_args[1])

    def test_create_location_loaded_at_initialization(self):
        kwargs = {'location': 'testregion'}
        driver = self.new_driver(create_kwargs=kwargs)
        self.assertTrue(self.driver_mock().list_locations)

    def test_create_image_loaded_at_initialization(self):
        kwargs = {'image': 'testimage'}
        driver = self.new_driver(create_kwargs=kwargs)
        self.assertTrue(self.driver_mock().list_images)

    def test_create_includes_ping_secret(self):
        arv_node = testutil.arvados_node_mock(info={'ping_secret': 'ssshh'})
        driver = self.new_driver()
        driver.create_node(testutil.MockSize(1), arv_node)
        create_method = self.driver_mock().create_node
        self.assertTrue(create_method.called)
        self.assertIn('ping_secret=ssshh',
                      create_method.call_args[1].get('ex_userdata',
                                                     'arg missing'))

    # NOTE(review): this method was previously defined twice with
    # identical bodies; the duplicate silently shadowed the first
    # definition, so only one copy is kept.
    def test_generate_metadata_for_new_arvados_node(self):
        arv_node = testutil.arvados_node_mock(8)
        driver = self.new_driver(list_kwargs={'list': 'test'})
        self.assertEqual({'ex_metadata': {'list': 'test'}},
                         driver.arvados_create_kwargs(arv_node))

    def test_tags_set_default_hostname_from_new_arvados_node(self):
        arv_node = testutil.arvados_node_mock(hostname=None)
        cloud_node = testutil.cloud_node_mock(1)
        driver = self.new_driver()
        driver.sync_node(cloud_node, arv_node)
        tag_mock = self.driver_mock().ex_set_node_tags
        self.assertTrue(tag_mock.called)
        self.assertEqual(['hostname-dynamic.compute.zzzzz.arvadosapi.com'],
                         tag_mock.call_args[0][1])

    def test_sync_node_sets_static_hostname(self):
        arv_node = testutil.arvados_node_mock(1)
        cloud_node = testutil.cloud_node_mock(2)
        driver = self.new_driver()
        driver.sync_node(cloud_node, arv_node)
        tag_mock = self.driver_mock().ex_set_node_tags
        self.assertTrue(tag_mock.called)
        self.assertEqual(['hostname-compute1.zzzzz.arvadosapi.com'],
                         tag_mock.call_args[0][1])

    def test_node_create_time(self):
        # Round-trip a known timestamp through node_start_time.
        refsecs = int(time.time())
        reftuple = time.gmtime(refsecs)
        node = testutil.cloud_node_mock()
        node.extra = {'launch_time': time.strftime('%Y-%m-%dT%H:%M:%S.000Z',
                                                   reftuple)}
        self.assertEqual(refsecs, gce.ComputeNodeDriver.node_start_time(node))

    def test_deliver_ssh_key_in_metadata(self):
        test_ssh_key = 'ssh-rsa-foo'
        arv_node = testutil.arvados_node_mock(1)
        with mock.patch('__builtin__.open',
                        mock.mock_open(read_data=test_ssh_key)) as mock_file:
            driver = self.new_driver(create_kwargs={'ssh_key': 'ssh-key-file'})
        mock_file.assert_called_once_with('ssh-key-file')
        self.assertEqual({'ex_metadata': {'sshKeys': 'root:ssh-rsa-foo'}},
                         driver.arvados_create_kwargs(arv_node))

    def test_create_driver_with_service_accounts(self):
        srv_acct_config = {
            'service_accounts':
            '{ "email": "foo@bar", "scopes":["storage-full"]}'}
        arv_node = testutil.arvados_node_mock(1)
        driver = self.new_driver(create_kwargs=srv_acct_config)
        create_kwargs = driver.arvados_create_kwargs(arv_node)
        self.assertEqual({u'email': u'foo@bar', u'scopes': [u'storage-full']},
                         create_kwargs['ex_service_accounts'])