From: Tim Pierce Date: Tue, 18 Nov 2014 18:49:10 +0000 (-0500) Subject: 4138: support for Google Cloud Engine. X-Git-Tag: 1.1.0~1796^2~10 X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/9d6a6eca3a634e4090d7e0fc4f094c411ab5817a 4138: support for Google Cloud Engine. * Added: ** nodemanager/arvnodeman/computenode/drivers/gce.py ** nodemanager/doc/gce.example.cfg ** nodemanager/tests/test_computenode_driver_gce.py Updated comment in nodemanager/arvnodeman/computenode/drivers/ec2.py. --- diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py new file mode 100644 index 0000000000..a4fd57deee --- /dev/null +++ b/services/nodemanager/arvnodeman/computenode/driver/gce.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python + +from __future__ import absolute_import, print_function + +import functools +import json +import time + +import libcloud.compute.base as cloud_base +import libcloud.compute.providers as cloud_provider +import libcloud.compute.types as cloud_types +from libcloud.compute.drivers import gce + +from . import BaseComputeNodeDriver +from .. import arvados_node_fqdn + +class ComputeNodeDriver(BaseComputeNodeDriver): + """Compute node driver wrapper for GCE + + This translates cloud driver requests to GCE's specific parameters. 
+ """ + DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.GCE) + SEARCH_CACHE = {} + ssh_key = None + service_accounts = None + + def __init__(self, auth_kwargs, list_kwargs, create_kwargs, + driver_class=DEFAULT_DRIVER): + super(ComputeNodeDriver, self).__init__( + auth_kwargs, list_kwargs, create_kwargs, + driver_class) + + for key in self.create_kwargs.keys(): + init_method = getattr(self, '_init_' + key, None) + if init_method is not None: + new_pair = init_method(self.create_kwargs.pop(key)) + if new_pair is not None: + self.create_kwargs[new_pair[0]] = new_pair[1] + + def _init_image_id(self, image_id): + return 'image', self.search_for(image_id, 'list_images') + + def _init_ping_host(self, ping_host): + self.ping_host = ping_host + + def _init_service_accounts(self, service_accounts_str): + self.service_accounts = json.loads(service_accounts_str) + + def _init_network_id(self, subnet_id): + return 'ex_network', self.search_for(subnet_id, 'ex_list_networks') + + def _init_ssh_key(self, filename): + with open(filename) as ssh_file: + self.ssh_key = ssh_file.read().strip() + + def arvados_create_kwargs(self, arvados_node): + result = {'ex_metadata': self.list_kwargs.copy() } + ping_secret = arvados_node['info'].get('ping_secret') + if ping_secret is not None: + ping_url = ('https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'. + format(self.ping_host, arvados_node['uuid'], + ping_secret)) + result['ex_userdata'] = ping_url + if self.service_accounts is not None: + result['ex_service_accounts'] = self.service_accounts + + # SSH keys are delivered to GCE nodes via ex_metadata: see + # http://stackoverflow.com/questions/26752617/creating-sshkeys-for-gce-instance-using-libcloud + if self.ssh_key is not None: + result['ex_metadata']['sshKeys'] = 'root:{}'.format(self.ssh_key) + return result + + # When an Arvados node is synced with a GCE node, the Arvados hostname + # is forwarded in a GCE tag 'hostname-foo'. 
+ # TODO(twp): implement an ex_set_metadata method (at least until + # libcloud supports the API setMetadata method) so we can pass this + # sensibly in the node metadata. + def sync_node(self, cloud_node, arvados_node): + tags = ['hostname-{}'.format(arvados_node_fqdn(arvados_node))] + self.real.ex_set_node_tags(cloud_node, tags) + + @classmethod + def node_start_time(cls, node): + time_str = node.extra['launch_time'].split('.', 2)[0] + 'UTC' + return time.mktime(time.strptime( + time_str,'%Y-%m-%dT%H:%M:%S%Z')) - time.timezone diff --git a/services/nodemanager/doc/ec2.example.cfg b/services/nodemanager/doc/ec2.example.cfg index 024ed2b59b..9b41ca14d5 100644 --- a/services/nodemanager/doc/ec2.example.cfg +++ b/services/nodemanager/doc/ec2.example.cfg @@ -128,9 +128,11 @@ security_groups = idstring1, idstring2 # willing to use. The Node Manager should boot the cheapest size(s) that # can run jobs in the queue (N.B.: defining more than one size has not been # tested yet). -# Each size section MUST define the number of cores it has. You may also -# want to define the number of mebibytes of scratch space for Crunch jobs. -# You can also override Amazon's provided data fields by setting the same -# names here. +# Each size section MUST define the number of cores are available in this +# size class (since libcloud does not provide any consistent API for exposing +# this setting). +# You may also want to define the amount of scratch space (expressed +# in GB) for Crunch jobs. You can also override Amazon's provided +# data fields by setting the same names here. cores = 2 -scratch = 100 \ No newline at end of file +scratch = 100 diff --git a/services/nodemanager/doc/gce.example.cfg b/services/nodemanager/doc/gce.example.cfg new file mode 100644 index 0000000000..d09396fac3 --- /dev/null +++ b/services/nodemanager/doc/gce.example.cfg @@ -0,0 +1,130 @@ +# Google Compute Engine configuration for Arvados Node Manager. +# All times are in seconds unless specified otherwise. 
+ +[Daemon] +# Node Manager will ensure that there are at least this many nodes +# running at all times. +min_nodes = 0 + +# Node Manager will not start any compute nodes when at least this +# many are running. +max_nodes = 8 + +# Poll compute nodes and Arvados for new information every N seconds. +poll_time = 60 + +# Polls have exponential backoff when services fail to respond. +# This is the longest time to wait between polls. +max_poll_time = 300 + +# If Node Manager can't successfully poll a service for this long, +# it will never start or stop compute nodes, on the assumption that its +# information is too outdated. +poll_stale_after = 600 + +# "Node stale time" affects two related behaviors. +# 1. If a compute node has been running for at least this long, but it +# isn't paired with an Arvados node, do not shut it down, but leave it alone. +# This prevents the node manager from shutting down a node that might +# actually be doing work, but is having temporary trouble contacting the +# API server. +# 2. When the Node Manager starts a new compute node, it will try to reuse +# an Arvados node that hasn't been updated for this long. +node_stale_after = 14400 + +# File path for Certificate Authorities +certs_file = /etc/ssl/certs/ca-certificates.crt + +[Logging] +# Log file path +file = /var/log/arvados/node-manager.log + +# Log level for most Node Manager messages. +# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL. +# WARNING lets you know when polling a service fails. +# INFO additionally lets you know when a compute node is started or stopped. +level = INFO + +# You can also set different log levels for specific libraries. +# Pykka is the Node Manager's actor library. +# Setting this to DEBUG will display tracebacks for uncaught +# exceptions in the actors, but it's also very chatty. +pykka = WARNING + +# Setting apiclient to INFO will log the URL of every Arvados API request. 
+apiclient = WARNING + +[Arvados] +host = zyxwv.arvadosapi.com +token = ARVADOS_TOKEN +timeout = 15 + +# Accept an untrusted SSL certificate from the API server? +insecure = no + +[Cloud] +provider = gce + +# XXX(twp): figure out good default settings for GCE +# It's usually most cost-effective to shut down compute nodes during narrow +# windows of time. For example, EC2 bills each node by the hour, so the best +# time to shut down a node is right before a new hour of uptime starts. +# Shutdown windows define these periods of time. These are windows in +# full minutes, separated by commas. Counting from the time the node is +# booted, the node WILL NOT shut down for N1 minutes; then it MAY shut down +# for N2 minutes; then it WILL NOT shut down for N3 minutes; and so on. +# For example, "54, 5, 1" means the node may shut down from the 54th to the +# 59th minute of each hour of uptime. +# Specify at least two windows. You can add as many as you need beyond that. +shutdown_windows = 54, 5, 1 + +[Cloud Credentials] +user_id = USERID +key = SECRET_KEY +project = project_name +timeout = 60 # used by NodeManagerConfig + +# Optional settings. For full documentation see +# http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#libcloud.compute.drivers.gce.GCENodeDriver +# +# datacenter = 'us-central1-a' +# auth_type = 'SA' # SA, IA or GCE +# scopes = https://www.googleapis.com/auth/compute +# credential_file = + +[Cloud List] +# Keywords here will be used to populate the metadata field for a GCE node. + +[Cloud Create] +# New compute nodes will send pings to Arvados at this host. +# You may specify a port, and use brackets to disambiguate IPv6 addresses. +ping_host = hostname:port + +# A file path for an SSH key that can log in to the compute node. +# ssh_key = path + +# The GCE IDs of the image and network compute nodes should use. +image_id = idstring +network_id = idstring + +# JSON string of service account authorizations for this cluster. 
+# See http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#specifying-service-account-scopes +# service_accounts = [{'email':'account@example.com', 'scopes':['storage-ro']}] + +[Size n1-standard-2] +# You can define any number of Size sections to list node sizes you're +# willing to use. The Node Manager should boot the cheapest size(s) that +# can run jobs in the queue (N.B.: defining more than one size has not been +# tested yet). +# +# The Size fields are interpreted the same way as with a libcloud NodeSize: +# http://libcloud.readthedocs.org/en/latest/compute/api.html#libcloud.compute.base.NodeSize +# +# Each size section MUST define the number of cores are available in this +# size class (since libcloud does not provide any consistent API for exposing +# this setting). +# You may also want to define the amount of scratch space (expressed +# in GB) for Crunch jobs. +cores = 2 +scratch = 100 +ram = 512 diff --git a/services/nodemanager/tests/test_computenode_driver_gce.py b/services/nodemanager/tests/test_computenode_driver_gce.py new file mode 100644 index 0000000000..075760ab7d --- /dev/null +++ b/services/nodemanager/tests/test_computenode_driver_gce.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python + +from __future__ import absolute_import, print_function + +import time +import unittest + +import mock + +import arvnodeman.computenode.driver.gce as gce +from . 
import testutil + +class GCEComputeNodeDriverTestCase(unittest.TestCase): + def setUp(self): + self.driver_mock = mock.MagicMock(name='driver_mock') + + def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}): + create_kwargs.setdefault('ping_host', '100::') + return gce.ComputeNodeDriver( + auth_kwargs, list_kwargs, create_kwargs, + driver_class=self.driver_mock) + + def test_driver_instantiation(self): + kwargs = {'user_id': 'foo'} + driver = self.new_driver(auth_kwargs=kwargs) + self.assertTrue(self.driver_mock.called) + self.assertEqual(kwargs, self.driver_mock.call_args[1]) + + def test_create_location_loaded_at_initialization(self): + kwargs = {'location': 'testregion'} + driver = self.new_driver(create_kwargs=kwargs) + self.assertTrue(self.driver_mock().list_locations) + + def test_create_image_loaded_at_initialization(self): + kwargs = {'image': 'testimage'} + driver = self.new_driver(create_kwargs=kwargs) + self.assertTrue(self.driver_mock().list_images) + + def test_create_includes_ping_secret(self): + arv_node = testutil.arvados_node_mock(info={'ping_secret': 'ssshh'}) + driver = self.new_driver() + driver.create_node(testutil.MockSize(1), arv_node) + create_method = self.driver_mock().create_node + self.assertTrue(create_method.called) + self.assertIn('ping_secret=ssshh', + create_method.call_args[1].get('ex_userdata', + 'arg missing')) + + def test_generate_metadata_for_new_arvados_node(self): + arv_node = testutil.arvados_node_mock(8) + driver = self.new_driver(list_kwargs={'list': 'test'}) + self.assertEqual({'ex_metadata': {'list': 'test'}}, + driver.arvados_create_kwargs(arv_node)) + + def test_tags_set_default_hostname_from_new_arvados_node(self): + arv_node = testutil.arvados_node_mock(hostname=None) + cloud_node = testutil.cloud_node_mock(1) + driver = self.new_driver() + driver.sync_node(cloud_node, arv_node) + tag_mock = self.driver_mock().ex_set_node_tags + self.assertTrue(tag_mock.called) + 
self.assertEqual(['hostname-dynamic.compute.zzzzz.arvadosapi.com'], + tag_mock.call_args[0][1]) + + def test_sync_node_sets_static_hostname(self): + arv_node = testutil.arvados_node_mock(1) + cloud_node = testutil.cloud_node_mock(2) + driver = self.new_driver() + driver.sync_node(cloud_node, arv_node) + tag_mock = self.driver_mock().ex_set_node_tags + self.assertTrue(tag_mock.called) + self.assertEqual(['hostname-compute1.zzzzz.arvadosapi.com'], + tag_mock.call_args[0][1]) + + def test_node_create_time(self): + refsecs = int(time.time()) + reftuple = time.gmtime(refsecs) + node = testutil.cloud_node_mock() + node.extra = {'launch_time': time.strftime('%Y-%m-%dT%H:%M:%S.000Z', + reftuple)} + self.assertEqual(refsecs, gce.ComputeNodeDriver.node_start_time(node)) + + def test_generate_metadata_for_new_arvados_node(self): + arv_node = testutil.arvados_node_mock(8) + driver = self.new_driver(list_kwargs={'list': 'test'}) + self.assertEqual({'ex_metadata': {'list': 'test'}}, + driver.arvados_create_kwargs(arv_node)) + + def test_deliver_ssh_key_in_metadata(self): + test_ssh_key = 'ssh-rsa-foo' + arv_node = testutil.arvados_node_mock(1) + with mock.patch('__builtin__.open', mock.mock_open(read_data=test_ssh_key)) as mock_file: + driver = self.new_driver(create_kwargs={'ssh_key': 'ssh-key-file'}) + mock_file.assert_called_once_with('ssh-key-file') + self.assertEqual({'ex_metadata': {'sshKeys': 'root:ssh-rsa-foo'}}, + driver.arvados_create_kwargs(arv_node)) + + def test_create_driver_with_service_accounts(self): + srv_acct_config = { 'service_accounts': '{ "email": "foo@bar", "scopes":["storage-full"]}' } + arv_node = testutil.arvados_node_mock(1) + driver = self.new_driver(create_kwargs=srv_acct_config) + create_kwargs = driver.arvados_create_kwargs(arv_node) + self.assertEqual({u'email': u'foo@bar', u'scopes': [u'storage-full']}, + create_kwargs['ex_service_accounts'])