11652: Merge branch 'master' into 11652-recursive-contents
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 78bd2db5cc05fe9516c10e718506ef11734055db..11d38ecb76d22105b289d17cec5081aaa5bf3952 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -10,12 +10,16 @@ import time
 
 import daemon
 import pykka
+import libcloud
 
 from . import config as nmconfig
+from . import status
+from .baseactor import WatchdogActor
 from .daemon import NodeManagerDaemonActor
 from .jobqueue import JobQueueMonitorActor, ServerCalculator
 from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor
 from .timedcallback import TimedCallBackActor
+from ._version import __version__
 
 node_daemon = None
 
@@ -27,6 +31,10 @@ def parse_cli(args):
     parser = argparse.ArgumentParser(
         prog='arvados-node-manager',
         description="Dynamically allocate Arvados cloud compute nodes")
+    parser.add_argument(
+        '--version', action='version',
+        version="%s %s" % (sys.argv[0], __version__),
+        help='Print version and exit.')
     parser.add_argument(
         '--foreground', action='store_true', default=False,
         help="Run in the foreground.  Don't daemonize.")
@@ -56,6 +64,7 @@ def setup_logging(path, level, **sublevels):
     for logger_name, sublevel in sublevels.iteritems():
         sublogger = logging.getLogger(logger_name)
         sublogger.setLevel(sublevel)
+    return root_logger
 
 def build_server_calculator(config):
     cloud_size_list = config.node_sizes(config.new_cloud_client().list_sizes())
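
setup_logging() now returns the root logger so that main() can log a startup banner (see the hunk further down). Only the tail of the function appears in this hunk; a sketch of its presumed overall shape, with the handler and format details guessed rather than taken from this diff:

    import logging

    def setup_logging(path, level, **sublevels):
        # Attach a file handler to the root logger, set per-module levels,
        # and hand the root logger back to the caller.
        handler = logging.FileHandler(path)
        handler.setFormatter(logging.Formatter(
            '%(asctime)s %(name)s[%(process)d] %(levelname)s: %(message)s'))
        root_logger = logging.getLogger()
        root_logger.addHandler(handler)
        root_logger.setLevel(level)
        for logger_name, sublevel in sublevels.iteritems():  # Python 2, as in the file
            logging.getLogger(logger_name).setLevel(sublevel)
        return root_logger
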
@@ -63,7 +72,8 @@ def build_server_calculator(config):
         abort("No valid node sizes configured")
     return ServerCalculator(cloud_size_list,
                             config.getint('Daemon', 'max_nodes'),
-                            config.getfloat('Daemon', 'max_total_price'))
+                            config.getfloat('Daemon', 'max_total_price'),
+                            config.getfloat('Daemon', 'node_mem_scaling'))
 
 def launch_pollers(config, server_calculator):
     poll_time = config.getint('Daemon', 'poll_time')
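
ServerCalculator now receives a fourth value, node_mem_scaling, from the [Daemon] config section. In node manager this factor is used to discount a cloud size's advertised RAM (to allow for OS and runtime overhead) before matching sizes against job constraints. A hypothetical illustration of applying such a factor, not the actual ServerCalculator code:

    def usable_ram_mb(nominal_ram_mb, node_mem_scaling=0.95):
        # Hypothetical helper: treat only a fraction of the advertised RAM as
        # schedulable, so a job that asks for a size's full nominal memory is
        # steered to the next larger size instead.
        return int(nominal_ram_mb * node_mem_scaling)

    # e.g. an 8192 MiB size is treated as usable_ram_mb(8192) == 7782 MiB
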
@@ -71,7 +81,7 @@ def launch_pollers(config, server_calculator):
 
     timer = TimedCallBackActor.start(poll_time / 10.0).tell_proxy()
     cloud_node_poller = CloudNodeListMonitorActor.start(
-        config.new_cloud_client(), timer, poll_time, max_poll_time).tell_proxy()
+        config.new_cloud_client(), timer, server_calculator, poll_time, max_poll_time).tell_proxy()
     arvados_node_poller = ArvadosNodeListMonitorActor.start(
         config.new_arvados_client(), timer, poll_time, max_poll_time).tell_proxy()
     job_queue_poller = JobQueueMonitorActor.start(
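
CloudNodeListMonitorActor now also gets the server_calculator, so the raw libcloud sizes on polled cloud nodes can be mapped to the wrapped, price- and memory-aware sizes the rest of node manager works with. A hedged sketch of that kind of mapping, assuming a find_size()-style lookup on the calculator (the actual poller code is not part of this diff):

    def annotate_node_sizes(nodes, server_calculator):
        # Illustrative only: replace each node's raw libcloud size with the
        # calculator's wrapped equivalent, looked up by size id.
        for node in nodes:
            node.size = server_calculator.find_size(node.size.id)
        return nodes
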
@@ -103,8 +113,11 @@ def main(args=None):
     for sigcode in [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]:
         signal.signal(sigcode, shutdown_signal)
 
+    status.Server(config).start()
+
     try:
-        setup_logging(config.get('Logging', 'file'), **config.log_levels())
+        root_logger = setup_logging(config.get('Logging', 'file'), **config.log_levels())
+        root_logger.info("%s %s, libcloud %s", sys.argv[0], __version__, libcloud.__version__)
         node_setup, node_shutdown, node_update, node_monitor = \
             config.dispatch_classes()
         server_calculator = build_server_calculator(config)
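
Two startup changes in this hunk: a status server is started before the main try block, and the root logger returned by setup_logging() is used to log the node manager and libcloud versions at startup. The internals of status.Server are not part of this diff; a rough, illustrative sketch of a small JSON status endpoint of that kind (port, payload, and handler details are all assumptions):

    import BaseHTTPServer  # Python 2, matching the rest of node manager
    import json
    import threading

    class StatusHandler(BaseHTTPServer.BaseHTTPRequestHandler):
        def do_GET(self):
            # Illustrative payload only; a real status server would expose
            # the daemon's internal counters.
            body = json.dumps({'nodes_booting': 0, 'nodes_running': 0})
            self.send_response(200)
            self.send_header('Content-Type', 'application/json')
            self.end_headers()
            self.wfile.write(body)

    class StatusServerSketch(object):
        def __init__(self, config, port=8989):
            # config is accepted for parity with status.Server(config) but is
            # ignored in this sketch; the port is an arbitrary assumption.
            self._httpd = BaseHTTPServer.HTTPServer(('127.0.0.1', port), StatusHandler)

        def start(self):
            # Serve in a daemon thread so it never blocks shutdown.
            thread = threading.Thread(target=self._httpd.serve_forever)
            thread.daemon = True
            thread.start()
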
@@ -125,6 +138,12 @@ def main(args=None):
             node_setup, node_shutdown, node_monitor,
             max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
 
+        WatchdogActor.start(config.getint('Daemon', 'watchdog'),
+                            cloud_node_poller.actor_ref,
+                            arvados_node_poller.actor_ref,
+                            job_queue_poller.actor_ref,
+                            node_daemon.actor_ref)
+
         signal.pause()
         daemon_stopped = node_daemon.actor_ref.actor_stopped.is_set
         while not daemon_stopped():
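
The new WatchdogActor is handed the watchdog interval from config plus the actor_refs of the three pollers and the daemon. Its implementation lives in baseactor and is not shown here; the general pattern is to ping each monitored actor periodically and take the whole process down if one stops answering, so the init system can restart node manager. An illustrative sketch of that pattern, assuming each monitored actor exposes a trivial ping() method (not the real WatchdogActor):

    import os
    import signal
    import time

    def watchdog_loop(timeout, *monitored_refs):
        # Illustrative only. Every `timeout` seconds, call ping() on each
        # monitored actor via its proxy; if any actor is stopped or fails to
        # answer in time, send SIGQUIT to the whole process.
        proxies = [ref.proxy() for ref in monitored_refs]
        while True:
            for proxy in proxies:
                try:
                    proxy.ping().get(timeout)
                except Exception:
                    os.kill(os.getpid(), signal.SIGQUIT)
                    return
            time.sleep(timeout)
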