7713: Node Manager blackholes broken nodes that can't shut down.
[arvados.git] / services / nodemanager / arvnodeman / config.py
1 #!/usr/bin/env python
2
3 from __future__ import absolute_import, print_function
4
5 import ConfigParser
6 import importlib
7 import logging
8 import ssl
9 import sys
10
11 import arvados
12 import httplib2
13 import pykka
14 from apiclient import errors as apierror
15
# IOError is the base class for socket.error and friends.
# It seems like it hits the sweet spot for operations we want to retry:
# it's low-level, but unlikely to catch code bugs.
NETWORK_ERRORS = (IOError, ssl.SSLError)
# Everything in NETWORK_ERRORS, plus the apiclient library's errors.Error,
# as one tuple suitable for use in a single `except` clause.
ARVADOS_ERRORS = NETWORK_ERRORS + (apierror.Error,)

# Module-level alias for pykka's threading actor implementation.
# NOTE(review): presumably the base class used by Node Manager actors
# defined elsewhere, so it can be swapped in one place -- confirm.
actor_class = pykka.ThreadingActor
23
class NodeManagerConfig(ConfigParser.SafeConfigParser):
    """Node Manager Configuration class.

    This is a standard Python ConfigParser, with additional helper methods
    to create objects instantiated with configuration information.
    """

    # Options in the [Logging] section that are not log level settings,
    # and so are left out of the result of log_levels().
    LOGGING_NONLEVELS = frozenset(['file'])

    def __init__(self, *args, **kwargs):
        """Initialize the parser and install built-in default settings.

        All arguments are passed through to SafeConfigParser unchanged.
        After that, the Arvados, Daemon, and Logging sections are created
        if necessary, and each default option below is set unless the
        parser already has a value for it.
        """
        # Can't use super() because SafeConfigParser is an old-style class.
        ConfigParser.SafeConfigParser.__init__(self, *args, **kwargs)
        for sec_name, settings in {
            'Arvados': {'insecure': 'no',
                        'timeout': '15'},
            'Daemon': {'min_nodes': '0',
                       'max_nodes': '1',
                       'poll_time': '60',
                       'max_poll_time': '300',
                       'poll_stale_after': '600',
                       'boot_fail_after': str(sys.maxint),
                       'node_stale_after': str(60 * 60 * 2)},
            'Logging': {'file': '/dev/stderr',
                        'level': 'WARNING'},
        }.iteritems():
            if not self.has_section(sec_name):
                self.add_section(sec_name)
            for opt_name, value in settings.iteritems():
                if not self.has_option(sec_name, opt_name):
                    self.set(sec_name, opt_name, value)

    def get_section(self, section, transformer=None):
        """Return the options of one section as a dictionary.

        Arguments:
        * section: The name of the configuration section to read.
        * transformer: An optional callable applied to each option value.
          When it raises TypeError or ValueError for a value, that value
          is kept as the original string.

        The result is built with the parser's own dictionary type.
        """
        result = self._dict()
        for key, value in self.items(section):
            if transformer is not None:
                try:
                    value = transformer(value)
                except (TypeError, ValueError):
                    # Deliberate best effort: values the transformer can't
                    # convert stay as the strings read from the config.
                    pass
            result[key] = value
        return result

    def log_levels(self):
        """Return a dictionary of log levels from the Logging section.

        Each option in the Logging section, except those named in
        LOGGING_NONLEVELS, is mapped to the attribute of the logging
        module named by the option's upper-cased value.
        """
        return {key: getattr(logging, self.get('Logging', key).upper())
                for key in self.options('Logging')
                if key not in self.LOGGING_NONLEVELS}

    def dispatch_classes(self):
        """Import the dispatch module and return its actor classes.

        The module is arvnodeman.computenode.dispatch, or the submodule of
        it named by the Daemon section's dispatcher option when that
        option is set.  Returns a tuple of the module's setup, shutdown,
        update, and monitor actor classes, in that order.
        """
        mod_name = 'arvnodeman.computenode.dispatch'
        if self.has_option('Daemon', 'dispatcher'):
            mod_name = '{}.{}'.format(mod_name,
                                      self.get('Daemon', 'dispatcher'))
        module = importlib.import_module(mod_name)
        return (module.ComputeNodeSetupActor,
                module.ComputeNodeShutdownActor,
                module.ComputeNodeUpdateActor,
                module.ComputeNodeMonitorActor)

    def new_arvados_client(self):
        """Return a new Arvados API client built from the configuration.

        The client uses an httplib2.Http object configured with the
        Arvados section's timeout and insecure options, and the Daemon
        section's certs_file option when one is set.
        """
        if self.has_option('Daemon', 'certs_file'):
            certs_file = self.get('Daemon', 'certs_file')
        else:
            certs_file = None
        insecure = self.getboolean('Arvados', 'insecure')
        http = httplib2.Http(timeout=self.getint('Arvados', 'timeout'),
                             ca_certs=certs_file,
                             disable_ssl_certificate_validation=insecure)
        return arvados.api(version='v1',
                           host=self.get('Arvados', 'host'),
                           token=self.get('Arvados', 'token'),
                           insecure=insecure,
                           http=http)

    def new_cloud_client(self):
        """Return a new ComputeNodeDriver for the configured provider.

        The driver module is the submodule of arvnodeman.computenode.driver
        named by the Cloud section's provider option.  Its
        ComputeNodeDriver is instantiated with the contents of the
        Cloud Credentials, Cloud List, and Cloud Create sections.
        """
        module = importlib.import_module('arvnodeman.computenode.driver.' +
                                         self.get('Cloud', 'provider'))
        auth_kwargs = self.get_section('Cloud Credentials')
        # Every other credential stays a string; only timeout is converted.
        if 'timeout' in auth_kwargs:
            auth_kwargs['timeout'] = int(auth_kwargs['timeout'])
        return module.ComputeNodeDriver(auth_kwargs,
                                        self.get_section('Cloud List'),
                                        self.get_section('Cloud Create'))

    def node_sizes(self, all_sizes):
        """Finds all acceptable NodeSizes for our installation.

        Returns a list of (NodeSize, kwargs) pairs for each NodeSize object
        returned by libcloud that matches a size listed in our config file.

        Sizes are configured in sections named "Size <identifier>".  The
        kwargs for a size are that section's options, converted to
        integers where possible.  Sections whose name has no identifier
        after "Size" can't match any size, so they are ignored.
        """

        size_kwargs = {}
        for sec_name in self.sections():
            sec_words = sec_name.split(None, 2)
            # Checking the length first keeps a bare "[Size]" section (or
            # a whitespace-only section name) from raising IndexError.
            if len(sec_words) < 2 or sec_words[0] != 'Size':
                continue
            size_kwargs[sec_words[1]] = self.get_section(sec_name, int)
        # EC2 node sizes are identified by id. GCE sizes are identified by name.
        matching_sizes = []
        for size in all_sizes:
            if size.id in size_kwargs:
                matching_sizes.append((size, size_kwargs[size.id]))
            elif size.name in size_kwargs:
                matching_sizes.append((size, size_kwargs[size.name]))
        return matching_sizes

    def shutdown_windows(self):
        """Return the Cloud section's shutdown_windows as a list of ints.

        The option value is a comma-separated list of integers.
        """
        return [int(n)
                for n in self.get('Cloud', 'shutdown_windows').split(',')]