cloud_node=cloud_node,
cloud_node_start_time=start_time,
shutdown_timer=shutdown_timer,
+ cloud_fqdn_func=self._cloud_driver.node_fqdn,
update_actor=self._cloud_updater,
timer_actor=self._timer,
arvados_node=None,
poll_stale_after=self.poll_stale_after,
- node_stale_after=self.node_stale_after).proxy()
+ node_stale_after=self.node_stale_after,
+ cloud_client=self._cloud_driver,
+ boot_fail_after=self.boot_fail_after).proxy()
actor.subscribe(self._later.node_can_shutdown)
self._cloud_nodes_actor.subscribe_to(cloud_node.id,
actor.update_cloud_node)
self._pair_nodes(record, arv_rec.arvados_node)
break
for key, record in self.cloud_nodes.orphans.iteritems():
+ if key in self.shutdowns:
+ try:
+ self.shutdowns[key].stop().get()
+ except pykka.ActorDeadError:
+ pass
+ del self.shutdowns[key]
record.actor.stop()
record.cloud_node = None
- self.shutdowns.pop(key, None)
def update_arvados_nodes(self, nodelist):
self._update_poll_time('arvados_nodes')
[self.cloud_nodes, self.booted, self.booting])
def _nodes_busy(self):
- return sum(1 for idle in
- pykka.get_all(rec.actor.in_state('idle') for rec in
+ return sum(1 for busy in
+ pykka.get_all(rec.actor.in_state('busy') for rec in
self.cloud_nodes.nodes.itervalues())
- if idle is False)
+ if busy)
+
+ def _nodes_missing(self):
+ return sum(1 for arv_node in
+ pykka.get_all(rec.actor.arvados_node for rec in
+ self.cloud_nodes.nodes.itervalues()
+ if rec.actor.cloud_node.get().id not in self.shutdowns)
+ if arv_node and cnode.arvados_node_missing(arv_node, self.node_stale_after))
def _nodes_wanted(self):
up_count = self._nodes_up()
elif under_min > 0:
return under_min
else:
- up_count -= len(self.shutdowns) + self._nodes_busy()
+ up_count -= len(self.shutdowns) + self._nodes_busy() + self._nodes_missing()
return len(self.last_wishlist) - up_count
def _nodes_excess(self):
if (nodes_excess < 1) or not self.booting:
return None
for key, node in self.booting.iteritems():
- node.stop_if_no_cloud_node().get()
- if not node.actor_ref.is_alive():
+ if node.stop_if_no_cloud_node().get():
del self.booting[key]
if nodes_excess > 1:
self._later.stop_booting_node()
return None
shutdown = self._node_shutdown.start(
timer_actor=self._timer, cloud_client=self._new_cloud(),
+ arvados_client=self._new_arvados(),
node_monitor=node_actor.actor_ref, cancellable=cancellable).proxy()
self.shutdowns[cloud_node_id] = shutdown
shutdown.subscribe(self._later.node_finished_shutdown)
break
else:
return None
- if record.arvados_node is None:
+ if not record.actor.in_state('idle', 'busy').get():
self._begin_node_shutdown(record.actor, cancellable=False)
def node_finished_shutdown(self, shutdown_actor):
def shutdown(self):
self._logger.info("Shutting down after signal.")
self.poll_stale_after = -1 # Inhibit starting/stopping nodes
- for bootnode in self.booting.itervalues():
- bootnode.stop_if_no_cloud_node()
+ setup_stops = {key: node.stop_if_no_cloud_node()
+ for key, node in self.booting.iteritems()}
+ self.booting = {key: self.booting[key]
+ for key in setup_stops if not setup_stops[key].get()}
self._later.await_shutdown()
def await_shutdown(self):
- if any(node.actor_ref.is_alive() for node in self.booting.itervalues()):
+ if self.booting:
self._timer.schedule(time.time() + 1, self._later.await_shutdown)
else:
self.stop()