def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
timer_actor, update_actor, cloud_client,
arvados_node=None, poll_stale_after=600, node_stale_after=3600,
- boot_fail_after=1800
+ boot_fail_after=1800, consecutive_idle_count=0
):
super(ComputeNodeMonitorActor, self).__init__()
self._later = self.actor_ref.tell_proxy()
self.boot_fail_after = boot_fail_after
self.subscribers = set()
self.arvados_node = None
+ self.consecutive_idle_count = consecutive_idle_count
self.consecutive_idle = 0
self._later.update_arvados_node(arvados_node)
self.last_shutdown_opening = None
if crunch_worker_state == "idle":
# Must report as "idle" at least self.consecutive_idle_count consecutive times
- if self.consecutive_idle < 2:
+ if self.consecutive_idle < self.consecutive_idle_count:
idle_grace = 'idle wait'
else:
idle_grace = 'idle exceeded'
node_setup_class=dispatch.ComputeNodeSetupActor,
node_shutdown_class=dispatch.ComputeNodeShutdownActor,
node_actor_class=dispatch.ComputeNodeMonitorActor,
- max_total_price=0):
+ max_total_price=0,
+ consecutive_idle_count=1):
super(NodeManagerDaemonActor, self).__init__()
self._node_setup = node_setup_class
self._node_shutdown = node_shutdown_class
self.poll_stale_after = poll_stale_after
self.boot_fail_after = boot_fail_after
self.node_stale_after = node_stale_after
+ self.consecutive_idle_count = consecutive_idle_count
self.last_polls = {}
for poll_name in ['server_wishlist', 'arvados_nodes', 'cloud_nodes']:
poll_actor = locals()[poll_name + '_actor']
poll_stale_after=self.poll_stale_after,
node_stale_after=self.node_stale_after,
cloud_client=self._cloud_driver,
- boot_fail_after=self.boot_fail_after)
+ boot_fail_after=self.boot_fail_after,
+ consecutive_idle_count=self.consecutive_idle_count)
actorTell = actor.tell_proxy()
actorTell.subscribe(self._later.node_can_shutdown)
self._cloud_nodes_actor.subscribe_to(cloud_node.id,
self.make_actor()
self.shutdowns._set_state(True, 600)
self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
- (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
+ (False, "node state is ('unpaired', 'open', 'boot wait', 'not idle')"))
def test_shutdown_without_arvados_node(self):
# Builds the actor with start_time=0 and passes no Arvados node argument to
# make_actor, then expects shutdown_eligible() to answer True together with
# the state tuple ('down', 'open', 'boot exceeded', <idle grace>).
# The removed/added pair below only changes the expected idle-grace element
# from 'idle exceeded' to 'not idle'.
# NOTE(review): presumably _set_state(True, 600) puts the mocked shutdown
# timer into an open window -- confirm against the test fixture.
# NOTE(review): assertEquals is a deprecated alias of assertEqual.
self.make_actor(start_time=0)
self.shutdowns._set_state(True, 600)
- self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'idle exceeded')"),
+ self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'not idle')"),
self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_missing(self):
last_ping_at='1970-01-01T01:02:03.04050607Z')
self.make_actor(10, arv_node)
self.shutdowns._set_state(True, 600)
- self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+ self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_running_broken(self):
self.make_actor(12, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
- self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+ self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_missing_broken(self):
self.make_actor(11, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
- self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'not idle')"))
def test_no_shutdown_when_window_closed(self):
self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
def test_no_shutdown_when_node_running_job(self):
# Builds actor 4 from a mock Arvados node created with job_uuid=True, then
# expects shutdown_eligible() to answer False together with the state tuple
# ('busy', 'open', 'boot wait', <idle grace>).
# The removed/added pair below only changes the expected idle-grace element
# from 'idle exceeded' to 'not idle'.
# NOTE(review): presumably _set_state(True, 600) puts the mocked shutdown
# timer into an open window -- confirm against the test fixture.
# NOTE(review): assertEquals is a deprecated alias of assertEqual.
self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
self.shutdowns._set_state(True, 600)
- self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"),
+ self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'not idle')"),
self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_when_node_state_unknown(self):
# Builds actor 5 from a mock Arvados node created with
# crunch_worker_state='fail', then expects shutdown_eligible() to answer
# True together with the state tuple ('fail', 'open', 'boot wait',
# <idle grace>).
# The removed/added pair below only changes the expected idle-grace element
# from 'idle exceeded' to 'not idle'.
# NOTE(review): presumably _set_state(True, 600) puts the mocked shutdown
# timer into an open window -- confirm against the test fixture.
# NOTE(review): assertEquals is a deprecated alias of assertEqual.
self.make_actor(5, testutil.arvados_node_mock(
5, crunch_worker_state='fail'))
self.shutdowns._set_state(True, 600)
- self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'idle exceeded')"),
+ self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'not idle')"),
self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_node_state_stale(self):