X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/88c382d13b3d6e6f3b03ba0d5139ad9552c3c359..bb91f86ea02e3535e3953ee0916dd6877bf585f7:/services/nodemanager/arvnodeman/config.py

diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index dcfe1ceb13..e47f9fcb1d 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
 
 from __future__ import absolute_import, print_function
 
@@ -12,15 +15,19 @@ import httplib2
 import pykka
 from apiclient import errors as apierror
 
-from .fullstopactor import FullStopActor
+from .baseactor import BaseNodeManagerActor
+
+from libcloud.common.types import LibcloudError
+from libcloud.common.exceptions import BaseHTTPError
 
 # IOError is the base class for socket.error, ssl.SSLError, and friends.
 # It seems like it hits the sweet spot for operations we want to retry:
 # it's low-level, but unlikely to catch code bugs.
 NETWORK_ERRORS = (IOError,)
 ARVADOS_ERRORS = NETWORK_ERRORS + (apierror.Error,)
+CLOUD_ERRORS = NETWORK_ERRORS + (LibcloudError, BaseHTTPError)
 
-actor_class = FullStopActor
+actor_class = BaseNodeManagerActor
 
 class NodeManagerConfig(ConfigParser.SafeConfigParser):
     """Node Manager Configuration class.
@@ -36,7 +43,10 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
         ConfigParser.SafeConfigParser.__init__(self, *args, **kwargs)
         for sec_name, settings in {
             'Arvados': {'insecure': 'no',
-                        'timeout': '15'},
+                        'timeout': '15',
+                        'jobs_queue': 'yes',
+                        'slurm_queue': 'yes'
+                    },
             'Daemon': {'min_nodes': '0',
                        'max_nodes': '1',
                        'poll_time': '60',
@@ -44,9 +54,14 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
                        'poll_stale_after': '600',
                        'max_total_price': '0',
                        'boot_fail_after': str(sys.maxint),
-                       'node_stale_after': str(60 * 60 * 2)},
+                       'node_stale_after': str(60 * 60 * 2),
+                       'watchdog': '600',
+                       'node_mem_scaling': '0.95'},
+            'Manage': {'address': '127.0.0.1',
+                       'port': '-1',
+                       'ManagementToken': ''},
             'Logging': {'file': '/dev/stderr',
-                        'level': 'WARNING'},
+                        'level': 'WARNING'}
         }.iteritems():
             if not self.has_section(sec_name):
                 self.add_section(sec_name)
@@ -99,12 +114,19 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
     def new_cloud_client(self):
         module = importlib.import_module('arvnodeman.computenode.driver.' +
                                          self.get('Cloud', 'provider'))
+        driver_class = module.ComputeNodeDriver.DEFAULT_DRIVER
+        if self.has_option('Cloud', 'driver_class'):
+            d = self.get('Cloud', 'driver_class').split('.')
+            mod = '.'.join(d[:-1])
+            cls = d[-1]
+            driver_class = getattr(importlib.import_module(mod), cls)
         auth_kwargs = self.get_section('Cloud Credentials')
         if 'timeout' in auth_kwargs:
             auth_kwargs['timeout'] = int(auth_kwargs['timeout'])
         return module.ComputeNodeDriver(auth_kwargs,
                                         self.get_section('Cloud List'),
-                                        self.get_section('Cloud Create'))
+                                        self.get_section('Cloud Create'),
+                                        driver_class=driver_class)
 
     def node_sizes(self, all_sizes):
         """Finds all acceptable NodeSizes for our installation.