Merge branch 'master' into 13804-no-shutdown-wanted-nodes
[arvados.git] / services / nodemanager / arvnodeman / launcher.py
index 11d38ecb76d22105b289d17cec5081aaa5bf3952..1439c94118a913179f09a928df38caaab4c24e04 100644 (file)
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
 
 from __future__ import absolute_import, print_function
 
@@ -22,6 +25,7 @@ from .timedcallback import TimedCallBackActor
 from ._version import __version__
 
 node_daemon = None
+watchdog = None
 
 def abort(msg, code=1):
     print("arvados-node-manager: " + msg)
@@ -67,7 +71,7 @@ def setup_logging(path, level, **sublevels):
     return root_logger
 
 def build_server_calculator(config):
-    cloud_size_list = config.node_sizes(config.new_cloud_client().list_sizes())
+    cloud_size_list = config.node_sizes()
     if not cloud_size_list:
         abort("No valid node sizes configured")
     return ServerCalculator(cloud_size_list,
@@ -76,7 +80,7 @@ def build_server_calculator(config):
                             config.getfloat('Daemon', 'node_mem_scaling'))
 
 def launch_pollers(config, server_calculator):
-    poll_time = config.getint('Daemon', 'poll_time')
+    poll_time = config.getfloat('Daemon', 'poll_time')
     max_poll_time = config.getint('Daemon', 'max_poll_time')
 
     timer = TimedCallBackActor.start(poll_time / 10.0).tell_proxy()
@@ -86,7 +90,10 @@ def launch_pollers(config, server_calculator):
         config.new_arvados_client(), timer, poll_time, max_poll_time).tell_proxy()
     job_queue_poller = JobQueueMonitorActor.start(
         config.new_arvados_client(), timer, server_calculator,
-        poll_time, max_poll_time).tell_proxy()
+        config.getboolean('Arvados', 'jobs_queue'),
+        config.getboolean('Arvados', 'slurm_queue'),
+        poll_time, max_poll_time
+    ).tell_proxy()
     return timer, cloud_node_poller, arvados_node_poller, job_queue_poller
 
 _caught_signals = {}
@@ -97,6 +104,7 @@ def shutdown_signal(signal_code, frame):
         pykka.ActorRegistry.stop_all()
         sys.exit(-signal_code)
     elif current_count == 0:
+        watchdog.stop()
         node_daemon.shutdown()
     elif current_count == 1:
         pykka.ActorRegistry.stop_all()
@@ -104,7 +112,7 @@ def shutdown_signal(signal_code, frame):
         sys.exit(-signal_code)
 
 def main(args=None):
-    global node_daemon
+    global node_daemon, watchdog
     args = parse_cli(args)
     config = load_config(args.config)
 
@@ -117,13 +125,13 @@ def main(args=None):
 
     try:
         root_logger = setup_logging(config.get('Logging', 'file'), **config.log_levels())
-        root_logger.info("%s %s, libcloud %s", sys.argv[0], __version__, libcloud.__version__)
+        root_logger.info("%s %s started, libcloud %s", sys.argv[0], __version__, libcloud.__version__)
         node_setup, node_shutdown, node_update, node_monitor = \
             config.dispatch_classes()
         server_calculator = build_server_calculator(config)
         timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
             launch_pollers(config, server_calculator)
-        cloud_node_updater = node_update.start(config.new_cloud_client).tell_proxy()
+        cloud_node_updater = node_update.start(config.new_cloud_client, timer).tell_proxy()
         node_daemon = NodeManagerDaemonActor.start(
             job_queue_poller, arvados_node_poller, cloud_node_poller,
             cloud_node_updater, timer,
@@ -136,9 +144,10 @@ def main(args=None):
             config.getint('Daemon', 'boot_fail_after'),
             config.getint('Daemon', 'node_stale_after'),
             node_setup, node_shutdown, node_monitor,
-            max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
+            max_total_price=config.getfloat('Daemon', 'max_total_price'),
+            consecutive_idle_count=config.getint('Daemon', 'consecutive_idle_count'),).tell_proxy()
 
-        WatchdogActor.start(config.getint('Daemon', 'watchdog'),
+        watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
                             cloud_node_poller.actor_ref,
                             arvados_node_poller.actor_ref,
                             job_queue_poller.actor_ref,