#!/usr/bin/env python
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
from __future__ import absolute_import, print_function
from ._version import __version__
node_daemon = None
+watchdog = None
def abort(msg, code=1):
print("arvados-node-manager: " + msg)
return root_logger
def build_server_calculator(config):
    """Build the ServerCalculator from the configured node sizes.

    Args:
        config: Node manager configuration object.  It must provide
            ``node_sizes()`` and ``getfloat(section, option)``.

    Returns:
        A ``ServerCalculator`` constructed from the configured size list
        and the ``[Daemon] node_mem_scaling`` value.

    If the configuration yields no node sizes, ``abort`` is called with
    an explanatory message.
    """
    # The size list now comes from the configuration alone; the cloud
    # driver is no longer asked to list sizes here.
    cloud_size_list = config.node_sizes()
    if not cloud_size_list:
        # NOTE(review): abort() is presumed to terminate the process, so
        # the return below is not reached with an empty list -- confirm.
        abort("No valid node sizes configured")
    return ServerCalculator(cloud_size_list,
                            config.getfloat('Daemon', 'node_mem_scaling'))
def launch_pollers(config, server_calculator):
    """Start the timer actor and the three list-polling actors.

    Args:
        config: Node manager configuration object.  Values are read from
            its ``Daemon`` and ``Arvados`` sections, and it supplies the
            cloud and Arvados API clients.
        server_calculator: Passed through to the cloud node poller and
            the job queue poller.

    Returns:
        A 4-tuple ``(timer, cloud_node_poller, arvados_node_poller,
        job_queue_poller)`` of tell-proxies for the started actors.
    """
    poll_time = config.getfloat('Daemon', 'poll_time')
    max_poll_time = config.getint('Daemon', 'max_poll_time')

    # Each poller may have its own interval.  A falsy value (e.g. 0)
    # falls back to the general poll_time.
    cloudlist_poll_time = config.getfloat('Daemon', 'cloudlist_poll_time') or poll_time
    nodelist_poll_time = config.getfloat('Daemon', 'nodelist_poll_time') or poll_time
    wishlist_poll_time = config.getfloat('Daemon', 'wishlist_poll_time') or poll_time

    # NOTE(review): the timer argument is derived from the general
    # poll_time only, not from the per-poller intervals above -- confirm
    # this is intended when a poller is configured with a shorter interval.
    timer = TimedCallBackActor.start(poll_time / 10.0).tell_proxy()
    cloud_node_poller = CloudNodeListMonitorActor.start(
        config.new_cloud_client(), timer, server_calculator,
        cloudlist_poll_time, max_poll_time).tell_proxy()
    arvados_node_poller = ArvadosNodeListMonitorActor.start(
        config.new_arvados_client(), timer,
        nodelist_poll_time, max_poll_time).tell_proxy()
    job_queue_poller = JobQueueMonitorActor.start(
        config.new_arvados_client(), timer, server_calculator,
        config.getboolean('Arvados', 'jobs_queue'),
        config.getboolean('Arvados', 'slurm_queue'),
        wishlist_poll_time, max_poll_time
    ).tell_proxy()
    return timer, cloud_node_poller, arvados_node_poller, job_queue_poller
_caught_signals = {}
pykka.ActorRegistry.stop_all()
sys.exit(-signal_code)
elif current_count == 0:
+ watchdog.stop()
node_daemon.shutdown()
elif current_count == 1:
pykka.ActorRegistry.stop_all()
sys.exit(-signal_code)
def main(args=None):
- global node_daemon
+ global node_daemon, watchdog
args = parse_cli(args)
config = load_config(args.config)
try:
root_logger = setup_logging(config.get('Logging', 'file'), **config.log_levels())
- root_logger.info("%s %s, libcloud %s", sys.argv[0], __version__, libcloud.__version__)
+ root_logger.info("%s %s started, libcloud %s", sys.argv[0], __version__, libcloud.__version__)
node_setup, node_shutdown, node_update, node_monitor = \
config.dispatch_classes()
server_calculator = build_server_calculator(config)
timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
launch_pollers(config, server_calculator)
- cloud_node_updater = node_update.start(config.new_cloud_client).tell_proxy()
+ cloud_node_updater = node_update.start(config.new_cloud_client, timer).tell_proxy()
node_daemon = NodeManagerDaemonActor.start(
job_queue_poller, arvados_node_poller, cloud_node_poller,
cloud_node_updater, timer,
config.getint('Daemon', 'boot_fail_after'),
config.getint('Daemon', 'node_stale_after'),
node_setup, node_shutdown, node_monitor,
- max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
+ max_total_price=config.getfloat('Daemon', 'max_total_price'),
+ consecutive_idle_count=config.getint('Daemon', 'consecutive_idle_count'),).tell_proxy()
- WatchdogActor.start(config.getint('Daemon', 'watchdog'),
+ watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
cloud_node_poller.actor_ref,
arvados_node_poller.actor_ref,
job_queue_poller.actor_ref,