Merge branch 'master' into 13804-no-shutdown-wanted-nodes
[arvados.git] / services / nodemanager / arvnodeman / computenode / dispatch / __init__.py
index bb83a193fa30bae46dc7a7f6979843eb65fd42d1..77c515d565e8113681c2dad610d103fae4156a15 100644 (file)
@@ -130,7 +130,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
     @RetryMixin._retry()
     def create_cloud_node(self):
         self._logger.info("Sending create_node request for node size %s.",
-                          self.cloud_size.name)
+                          self.cloud_size.id)
         try:
             self.cloud_node = self._cloud.create_node(self.cloud_size,
                                                       self.arvados_node)
@@ -339,7 +339,7 @@ class ComputeNodeMonitorActor(config.actor_class):
     def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
                  timer_actor, update_actor, cloud_client,
                  arvados_node=None, poll_stale_after=600, node_stale_after=3600,
-                 boot_fail_after=1800
+                 boot_fail_after=1800, consecutive_idle_count=0
     ):
         super(ComputeNodeMonitorActor, self).__init__()
         self._later = self.actor_ref.tell_proxy()
@@ -354,6 +354,7 @@ class ComputeNodeMonitorActor(config.actor_class):
         self.boot_fail_after = boot_fail_after
         self.subscribers = set()
         self.arvados_node = None
+        self.consecutive_idle_count = consecutive_idle_count
         self.consecutive_idle = 0
         self._later.update_arvados_node(arvados_node)
         self.last_shutdown_opening = None
@@ -437,6 +438,11 @@ class ComputeNodeMonitorActor(config.actor_class):
         reason for the decision.
         """
 
+        # If this node's size is invalid (because it has a stale arvados_node_size
+        # tag), return True so that it's properly shut down.
+        if self.cloud_node.size.id == 'invalid':
+            return (True, "node's size tag '%s' not recognizable" % (self.cloud_node.extra['arvados_node_size'],))
+
         # Collect states and then consult state transition table whether we
         # should shut down.  Possible states are:
         # crunch_worker_state = ['unpaired', 'busy', 'idle', 'down']
@@ -457,8 +463,8 @@ class ComputeNodeMonitorActor(config.actor_class):
             boot_grace = "boot exceeded"
 
         if crunch_worker_state == "idle":
-            # Must report as "idle" at least two consecutive times
-            if self.consecutive_idle < 2:
+            # Must report as "idle" at least "consecutive_idle_count" times
+            if self.consecutive_idle < self.consecutive_idle_count:
                 idle_grace = 'idle wait'
             else:
                 idle_grace = 'idle exceeded'