from libcloud.common.exceptions import BaseHTTPError
import arvnodeman.computenode.dispatch as dispatch
+import arvnodeman.status as status
from arvnodeman.computenode.driver import BaseComputeNodeDriver
from . import testutil
def check_success_flag(self, expected, allow_msg_count=1):
    """Assert that the shutdown actor's success flag becomes `expected`.

    The flag is read up to ``1 + allow_msg_count`` times, each read waiting
    at most ``self.TIMEOUT``.  The comparison uses identity (``is``), so
    ``True``/``False``/``None`` are not confused with other truthy or falsy
    values.

    expected -- the value the success flag must be.
    allow_msg_count -- the number of internal messages that may need to be
        handled for shutdown to finish.

    Calls ``self.fail()`` when no read returned `expected`.
    """
    # Bound up front so the failure message below still works when the loop
    # body never runs (e.g. a negative allow_msg_count).
    last_flag = None
    for _ in range(1 + allow_msg_count):
        last_flag = self.shutdown_actor.success.get(self.TIMEOUT)
        if last_flag is expected:
            break
    else:
        # for/else: only reached when the loop finished without a match.
        self.fail("success flag {} is not {}".format(last_flag, expected))
def test_boot_failure_counting(self, *mocks):
    # A boot failure happens when a node transitions from unpaired to
    # shutdown.  Start from a zeroed counter so the final assertion sees
    # only what this test contributed.
    status.tracker.update({'boot_failures': 0})
    unpaired_node = testutil.arvados_node_mock(crunch_worker_state="unpaired")
    self.make_mocks(shutdown_open=True, arvados_node=unpaired_node)
    destroy_node = self.cloud_client.destroy_node
    destroy_node.return_value = True
    self.make_actor(cancellable=False)
    self.check_success_flag(True, 2)
    self.assertTrue(destroy_node.called)
    # Exactly one boot failure should have been recorded.
    self.assertEqual(1, status.tracker.get('boot_failures'))
+
def test_cancellable_shutdown(self, *mocks):
self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy"))
self.cloud_client.destroy_node.return_value = True
self.assertFalse(self.cloud_client.destroy_node.called)
def test_uncancellable_shutdown(self, *mocks):
    # Zero the boot-failure counter first so the last assertion only
    # reflects this test's shutdown.
    status.tracker.update({'boot_failures': 0})
    busy_node = testutil.arvados_node_mock(crunch_worker_state="busy")
    self.make_mocks(shutdown_open=True, arvados_node=busy_node)
    destroy_node = self.cloud_client.destroy_node
    destroy_node.return_value = True
    self.make_actor(cancellable=False)
    self.check_success_flag(True, 4)
    self.assertTrue(destroy_node.called)
    # A normal shutdown shouldn't be counted as a boot failure.
    self.assertEqual(0, status.tracker.get('boot_failures'))
def test_arvados_node_cleaned_after_shutdown(self, *mocks):
if len(mocks) == 1:
self.assertTrue(self.node_state('down'))
def test_in_idle_state(self):
    # Copy the tracked idle-node keys into a set *before* the actor exists.
    # A bare .keys() is a live view on Python 3: it would also show nodes
    # added later, making the before/after difference always empty.
    # NOTE(review): assumes _idle_nodes is dict-like with hashable keys.
    idle_nodes_before = set(status.tracker._idle_nodes.keys())
    self.make_actor(2, arv_node=testutil.arvados_node_mock(job_uuid=None))
    self.assertTrue(self.node_state('idle'))
    self.assertFalse(self.node_state('busy'))
    self.assertTrue(self.node_state('idle', 'busy'))
    new_idle_nodes = set(status.tracker._idle_nodes.keys()) - idle_nodes_before
    # There should be 1 additional idle node
    self.assertEqual(1, len(new_idle_nodes))
def test_in_busy_state(self):
    # Copy the tracked idle-node keys into a set *before* the actor exists.
    # A bare .keys() is a live view on Python 3, so comparing it with a
    # later .keys() would always yield "no new nodes" and this test could
    # never fail.
    # NOTE(review): assumes _idle_nodes is dict-like with hashable keys.
    idle_nodes_before = set(status.tracker._idle_nodes.keys())
    self.make_actor(3, arv_node=testutil.arvados_node_mock(job_uuid=True))
    self.assertFalse(self.node_state('idle'))
    self.assertTrue(self.node_state('busy'))
    self.assertTrue(self.node_state('idle', 'busy'))
    new_idle_nodes = set(status.tracker._idle_nodes.keys()) - idle_nodes_before
    # There shouldn't be any additional idle nodes
    self.assertEqual(0, len(new_idle_nodes))
def test_init_shutdown_scheduling(self):
self.make_actor()
self.make_actor()
self.shutdowns._set_state(True, 600)
self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
- (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
+ (False, "node state is ('unpaired', 'open', 'boot wait', 'not idle')"))
+
def test_shutdown_when_invalid_cloud_node_size(self):
    # Give the cloud node a size id and an 'arvados_node_size' tag that
    # are both set to placeholder values, then open the shutdown window.
    self.make_mocks(1)
    self.cloud_mock.size.id = 'invalid'
    self.cloud_mock.extra['arvados_node_size'] = 'stale.type'
    self.make_actor()
    self.shutdowns._set_state(True, 600)
    # The node must be eligible for shutdown, with the tag named in the
    # reason.  assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual((True, "node's size tag 'stale.type' not recognizable"),
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_without_arvados_node(self):
    # Create the actor with start_time=0 and no Arvados node argument,
    # then open the shutdown window.
    self.make_actor(start_time=0)
    self.shutdowns._set_state(True, 600)
    # Expect eligibility with the 'down' / 'boot exceeded' state tuple.
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual((True, "node state is ('down', 'open', 'boot exceeded', 'not idle')"),
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_missing(self):
last_ping_at='1970-01-01T01:02:03.04050607Z')
self.make_actor(10, arv_node)
self.shutdowns._set_state(True, 600)
- self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+ self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_running_broken(self):
self.make_actor(12, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
- self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+ self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_missing_broken(self):
self.make_actor(11, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
- self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'not idle')"))
def test_no_shutdown_when_window_closed(self):
self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
def test_no_shutdown_when_node_running_job(self):
    # The Arvados node mock is built with job_uuid=True; the shutdown
    # window is then opened.
    self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
    self.shutdowns._set_state(True, 600)
    # Expect the node to be reported 'busy' and not eligible for shutdown.
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual((False, "node state is ('busy', 'open', 'boot wait', 'not idle')"),
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_when_node_state_unknown(self):
    # The Arvados node mock reports crunch_worker_state='fail'; the
    # shutdown window is then opened.
    self.make_actor(5, testutil.arvados_node_mock(
        5, crunch_worker_state='fail'))
    self.shutdowns._set_state(True, 600)
    # Expect the 'fail' state to make the node eligible for shutdown.
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual((True, "node state is ('fail', 'open', 'boot wait', 'not idle')"),
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_node_state_stale(self):