12085: When an idle node disappears from the cloud node list, clear its counter.
author Lucas Di Pentima <ldipentima@veritasgenetics.com>
Tue, 3 Apr 2018 16:50:35 +0000 (13:50 -0300)
committer Lucas Di Pentima <ldipentima@veritasgenetics.com>
Thu, 5 Apr 2018 14:17:03 +0000 (11:17 -0300)
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima@veritasgenetics.com>

services/nodemanager/arvnodeman/daemon.py
services/nodemanager/tests/test_daemon.py

index 5f15b284e491af2469365a423ec8d25d52c771b1..3147de328a9df1332edaf83e7e5d5cfd14b8caf0 100644 (file)
@@ -219,6 +219,11 @@ class NodeManagerDaemonActor(actor_class):
                 # actor if necessary and forget about the node.
                 if record.actor:
                     try:
+                        # If it's paired and idle, stop its idle time counter
+                        # before removing the monitor actor.
+                        if record.actor.get_state().get() == 'idle':
+                            status.tracker.idle_out(
+                                record.actor.arvados_node.get()['hostname'])
                         record.actor.stop()
                     except pykka.ActorDeadError:
                         pass
index c4970125afb8d91ce7175083c638527a7c3869f0..8050e6981411d69f127617e0cb2b44681470341d 100644 (file)
@@ -665,6 +665,27 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
         self.busywait(lambda: 1 == self.last_shutdown.stop.call_count)
 
+    def test_idle_node_disappearing_clears_status_idle_time_counter(self):
+        size = testutil.MockSize(1)
+        status.tracker._idle_nodes = {}
+        cloud_nodes = [testutil.cloud_node_mock(1, size=size)]
+        arv_nodes = [testutil.arvados_node_mock(1, job_uuid=None)]
+        self.make_daemon(cloud_nodes, arv_nodes, [size])
+        self.busywait(lambda: 1 == self.paired_monitor_count())
+        for mon_ref in self.monitor_list():
+            monitor = mon_ref.proxy()
+            if monitor.cloud_node.get(self.TIMEOUT) is cloud_nodes[-1]:
+                break
+        else:
+            self.fail("monitor for idle node not found")
+        self.assertEqual(1, status.tracker.get('nodes_idle'))
+        hostname = monitor.arvados_node.get()['hostname']
+        self.assertIn(hostname, status.tracker._idle_nodes)
+        # Simulate the node disappearing from the cloud node list
+        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
+        self.busywait(lambda: 0 == self.alive_monitor_count())
+        self.assertNotIn(hostname, status.tracker._idle_nodes)
+
     def test_shutdown_actor_cleanup_copes_with_dead_actors(self):
         self.make_daemon(cloud_nodes=[testutil.cloud_node_mock()])
         self.assertEqual(1, self.alive_monitor_count())