Merge branch 'master' into 13804-no-shutdown-wanted-nodes
[arvados.git] / services / nodemanager / arvnodeman / config.py
index 30e8995baa26bdcc22154570dfa099aa5658d341..4fda7e76d69ed87aa1b5b032f1e3b1b8e608f5b5 100644 (file)
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
 
 from __future__ import absolute_import, print_function
 
@@ -14,11 +17,16 @@ from apiclient import errors as apierror
 
 from .baseactor import BaseNodeManagerActor
 
+from functools import partial
+from libcloud.common.types import LibcloudError
+from libcloud.common.exceptions import BaseHTTPError
+
 # IOError is the base class for socket.error, ssl.SSLError, and friends.
 # It seems like it hits the sweet spot for operations we want to retry:
 # it's low-level, but unlikely to catch code bugs.
 NETWORK_ERRORS = (IOError,)
 ARVADOS_ERRORS = NETWORK_ERRORS + (apierror.Error,)
+CLOUD_ERRORS = NETWORK_ERRORS + (LibcloudError, BaseHTTPError)
 
 actor_class = BaseNodeManagerActor
 
@@ -36,7 +44,10 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
         ConfigParser.SafeConfigParser.__init__(self, *args, **kwargs)
         for sec_name, settings in {
             'Arvados': {'insecure': 'no',
-                        'timeout': '15'},
+                        'timeout': '15',
+                        'jobs_queue': 'yes',
+                        'slurm_queue': 'yes'
+                    },
             'Daemon': {'min_nodes': '0',
                        'max_nodes': '1',
                        'poll_time': '60',
@@ -46,11 +57,13 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
                        'boot_fail_after': str(sys.maxint),
                        'node_stale_after': str(60 * 60 * 2),
                        'watchdog': '600',
-                       'node_mem_scaling': '0.95'},
+                       'node_mem_scaling': '0.95',
+                       'consecutive_idle_count': '2'},
             'Manage': {'address': '127.0.0.1',
-                       'port': '-1'},
+                       'port': '-1',
+                       'ManagementToken': ''},
             'Logging': {'file': '/dev/stderr',
-                        'level': 'WARNING'},
+                        'level': 'WARNING'}
         }.iteritems():
             if not self.has_section(sec_name):
                 self.add_section(sec_name)
@@ -58,12 +71,23 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
                 if not self.has_option(sec_name, opt_name):
                     self.set(sec_name, opt_name, value)
 
-    def get_section(self, section, transformer=None):
+    def get_section(self, section, transformers={}, default_transformer=None):
+        transformer_map = {
+            str: self.get,
+            int: self.getint,
+            bool: self.getboolean,
+            float: self.getfloat,
+        }
         result = self._dict()
         for key, value in self.items(section):
+            transformer = None
+            if transformers.get(key) in transformer_map:
+                transformer = partial(transformer_map[transformers[key]], section)
+            elif default_transformer in transformer_map:
+                transformer = partial(transformer_map[default_transformer], section)
             if transformer is not None:
                 try:
-                    value = transformer(value)
+                    value = transformer(key)
                 except (TypeError, ValueError):
                     pass
             result[key] = value
@@ -103,38 +127,55 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
     def new_cloud_client(self):
         module = importlib.import_module('arvnodeman.computenode.driver.' +
                                          self.get('Cloud', 'provider'))
+        driver_class = module.ComputeNodeDriver.DEFAULT_DRIVER
+        if self.has_option('Cloud', 'driver_class'):
+            d = self.get('Cloud', 'driver_class').split('.')
+            mod = '.'.join(d[:-1])
+            cls = d[-1]
+            driver_class = importlib.import_module(mod).__dict__[cls]
         auth_kwargs = self.get_section('Cloud Credentials')
         if 'timeout' in auth_kwargs:
             auth_kwargs['timeout'] = int(auth_kwargs['timeout'])
         return module.ComputeNodeDriver(auth_kwargs,
                                         self.get_section('Cloud List'),
-                                        self.get_section('Cloud Create'))
+                                        self.get_section('Cloud Create'),
+                                        driver_class=driver_class)
 
-    def node_sizes(self, all_sizes):
+    def node_sizes(self):
         """Finds all acceptable NodeSizes for our installation.
 
         Returns a list of (NodeSize, kwargs) pairs for each NodeSize object
         returned by libcloud that matches a size listed in our config file.
         """
-
+        all_sizes = self.new_cloud_client().list_sizes()
         size_kwargs = {}
+        section_types = {
+            'instance_type': str,
+            'price': float,
+            'preemptible': bool,
+        }
         for sec_name in self.sections():
             sec_words = sec_name.split(None, 2)
             if sec_words[0] != 'Size':
                 continue
-            size_spec = self.get_section(sec_name, int)
-            if 'price' in size_spec:
-                size_spec['price'] = float(size_spec['price'])
+            size_spec = self.get_section(sec_name, section_types, int)
+            if 'preemptible' not in size_spec:
+                size_spec['preemptible'] = False
+            if 'instance_type' not in size_spec:
+                # Assume instance type is Size name if missing
+                size_spec['instance_type'] = sec_words[1]
+            size_spec['id'] = sec_words[1]
             size_kwargs[sec_words[1]] = size_spec
         # EC2 node sizes are identified by id. GCE sizes are identified by name.
         matching_sizes = []
         for size in all_sizes:
-            if size.id in size_kwargs:
-                matching_sizes.append((size, size_kwargs[size.id]))
-            elif size.name in size_kwargs:
-                matching_sizes.append((size, size_kwargs[size.name]))
+            matching_sizes += [
+                (size, size_kwargs[s]) for s in size_kwargs
+                if size_kwargs[s]['instance_type'] == size.id
+                or size_kwargs[s]['instance_type'] == size.name
+            ]
         return matching_sizes
 
     def shutdown_windows(self):
-        return [int(n)
+        return [float(n)
                 for n in self.get('Cloud', 'shutdown_windows').split(',')]