4670: Node Manager handles more libcloud exceptions.
authorBrett Smith <brett@curoverse.com>
Fri, 12 Dec 2014 18:18:51 +0000 (13:18 -0500)
committerBrett Smith <brett@curoverse.com>
Thu, 18 Dec 2014 21:19:50 +0000 (16:19 -0500)
libcloud compute drivers (at least EC2 and GCE) raise bare Exceptions
when there's some problem talking to the cloud service.  The previous
code was expecting to see a LibcloudError, so it wouldn't handle these
errors as intended.

I didn't want to just catch errors with "except Exception" everywhere,
so I added an is_cloud_exception class method to our driver classes to
more accurately identify exceptions that represent trouble talking to
the cloud service.  It recognizes exact Exceptions, plus the other
classes we were catching before.

While I was at this, I gave more specific names to the wrapper methods
in compute node actor decorators, as a debugging aid.

services/nodemanager/arvnodeman/clientactor.py
services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
services/nodemanager/arvnodeman/computenode/driver/__init__.py
services/nodemanager/arvnodeman/config.py
services/nodemanager/arvnodeman/nodelist.py
services/nodemanager/tests/test_computenode_driver_ec2.py

index 46a103eb02985a7ba9e24af464e96e0cf6dd5a7b..6319f4bbfc5cece52832842cfda05d9ae9253005 100644 (file)
@@ -30,12 +30,10 @@ class RemotePollLoopActor(actor_class):
     response to subscribers.  It takes care of error handling, and retrying
     requests with exponential backoff.
 
-    To use this actor, define CLIENT_ERRORS and the _send_request method.
-    If you also define an _item_key method, this class will support
-    subscribing to a specific item by key in responses.
+    To use this actor, define the _send_request method.  If you also
+    define an _item_key method, this class will support subscribing to
+    a specific item by key in responses.
     """
-    CLIENT_ERRORS = ()
-
     def __init__(self, client, timer_actor, poll_wait=60, max_poll_wait=180):
         super(RemotePollLoopActor, self).__init__()
         self._client = client
@@ -87,6 +85,9 @@ class RemotePollLoopActor(actor_class):
         return "{} got error: {} - waiting {} seconds".format(
             self.log_prefix, error, self.poll_wait)
 
+    def is_common_error(self, exception):
+        return False
+
     def poll(self, scheduled_start=None):
         self._logger.debug("%s sending poll", self.log_prefix)
         start_time = time.time()
@@ -96,7 +97,7 @@ class RemotePollLoopActor(actor_class):
             response = self._send_request()
         except Exception as error:
             errmsg = self._got_error(error)
-            if isinstance(error, self.CLIENT_ERRORS):
+            if self.is_common_error(error):
                 self._logger.warning(errmsg)
             else:
                 self._logger.exception(errmsg)
index c79d8f9588fbea18e276fd1e8f30474b3ac7677e..f50670e7ba118c4c00ce88cbaf45ef3e9509d9c4 100644 (file)
@@ -19,32 +19,38 @@ class ComputeNodeStateChangeBase(config.actor_class):
     This base class takes care of retrying changes and notifying
     subscribers when the change is finished.
     """
-    def __init__(self, logger_name, timer_actor, retry_wait, max_retry_wait):
+    def __init__(self, logger_name, cloud_client, timer_actor,
+                 retry_wait, max_retry_wait):
         super(ComputeNodeStateChangeBase, self).__init__()
         self._later = self.actor_ref.proxy()
-        self._timer = timer_actor
         self._logger = logging.getLogger(logger_name)
+        self._cloud = cloud_client
+        self._timer = timer_actor
         self.min_retry_wait = retry_wait
         self.max_retry_wait = max_retry_wait
         self.retry_wait = retry_wait
         self.subscribers = set()
 
     @staticmethod
-    def _retry(errors):
+    def _retry(errors=()):
         """Retry decorator for an actor method that makes remote requests.
 
         Use this function to decorator an actor method, and pass in a
         tuple of exceptions to catch.  This decorator will schedule
         retries of that method with exponential backoff if the
-        original method raises any of the given errors.
+        original method raises a known cloud driver error, or any of the
+        given exception types.
         """
         def decorator(orig_func):
             @functools.wraps(orig_func)
-            def wrapper(self, *args, **kwargs):
+            def retry_wrapper(self, *args, **kwargs):
                 start_time = time.time()
                 try:
                     orig_func(self, *args, **kwargs)
-                except errors as error:
+                except Exception as error:
+                    if not (isinstance(error, errors) or
+                            self._cloud.is_cloud_exception(error)):
+                        raise
                     self._logger.warning(
                         "Client error: %s - waiting %s seconds",
                         error, self.retry_wait)
@@ -56,7 +62,7 @@ class ComputeNodeStateChangeBase(config.actor_class):
                                           self.max_retry_wait)
                 else:
                     self.retry_wait = self.min_retry_wait
-            return wrapper
+            return retry_wrapper
         return decorator
 
     def _finished(self):
@@ -86,9 +92,9 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
                  cloud_size, arvados_node=None,
                  retry_wait=1, max_retry_wait=180):
         super(ComputeNodeSetupActor, self).__init__(
-            'arvnodeman.nodeup', timer_actor, retry_wait, max_retry_wait)
+            'arvnodeman.nodeup', cloud_client, timer_actor,
+            retry_wait, max_retry_wait)
         self._arvados = arvados_client
-        self._cloud = cloud_client
         self.cloud_size = cloud_size
         self.arvados_node = None
         self.cloud_node = None
@@ -97,12 +103,12 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
         else:
             self._later.prepare_arvados_node(arvados_node)
 
-    @ComputeNodeStateChangeBase._retry(config.ARVADOS_ERRORS)
+    @ComputeNodeStateChangeBase._retry()
     def create_arvados_node(self):
         self.arvados_node = self._arvados.nodes().create(body={}).execute()
         self._later.create_cloud_node()
 
-    @ComputeNodeStateChangeBase._retry(config.ARVADOS_ERRORS)
+    @ComputeNodeStateChangeBase._retry()
     def prepare_arvados_node(self, node):
         self.arvados_node = self._arvados.nodes().update(
             uuid=node['uuid'],
@@ -116,7 +122,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
             ).execute()
         self._later.create_cloud_node()
 
-    @ComputeNodeStateChangeBase._retry(config.CLOUD_ERRORS)
+    @ComputeNodeStateChangeBase._retry()
     def create_cloud_node(self):
         self._logger.info("Creating cloud node with size %s.",
                           self.cloud_size.name)
@@ -143,8 +149,8 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         # eligible.  Normal shutdowns based on job demand should be
         # cancellable; shutdowns based on node misbehavior should not.
         super(ComputeNodeShutdownActor, self).__init__(
-            'arvnodeman.nodedown', timer_actor, retry_wait, max_retry_wait)
-        self._cloud = cloud_client
+            'arvnodeman.nodedown', cloud_client, timer_actor,
+            retry_wait, max_retry_wait)
         self._monitor = node_monitor.proxy()
         self.cloud_node = self._monitor.cloud_node.get()
         self.cancellable = cancellable
@@ -159,7 +165,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
 
     def _stop_if_window_closed(orig_func):
         @functools.wraps(orig_func)
-        def wrapper(self, *args, **kwargs):
+        def stop_wrapper(self, *args, **kwargs):
             if (self.cancellable and
                   (not self._monitor.shutdown_eligible().get())):
                 self._logger.info(
@@ -169,10 +175,10 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
                 return None
             else:
                 return orig_func(self, *args, **kwargs)
-        return wrapper
+        return stop_wrapper
 
     @_stop_if_window_closed
-    @ComputeNodeStateChangeBase._retry(config.CLOUD_ERRORS)
+    @ComputeNodeStateChangeBase._retry()
     def shutdown_node(self):
         if self._cloud.destroy_node(self.cloud_node):
             self._logger.info("Cloud node %s shut down.", self.cloud_node.id)
@@ -210,14 +216,14 @@ class ComputeNodeUpdateActor(config.actor_class):
 
     def _throttle_errors(orig_func):
         @functools.wraps(orig_func)
-        def wrapper(self, *args, **kwargs):
+        def throttle_wrapper(self, *args, **kwargs):
             throttle_time = self.next_request_time - time.time()
             if throttle_time > 0:
                 time.sleep(throttle_time)
             self.next_request_time = time.time()
             try:
                 result = orig_func(self, *args, **kwargs)
-            except config.CLOUD_ERRORS:
+            except Exception as error:
                 self.error_streak += 1
                 self.next_request_time += min(2 ** self.error_streak,
                                               self.max_retry_wait)
@@ -225,7 +231,7 @@ class ComputeNodeUpdateActor(config.actor_class):
             else:
                 self.error_streak = 0
                 return result
-        return wrapper
+        return throttle_wrapper
 
     @_throttle_errors
     def sync_node(self, cloud_node, arvados_node):
index a20cfde37146a46f05438817aef14ac307565491..99b419ec4907572b0916e77431dfba7b0eadfde7 100644 (file)
@@ -2,6 +2,10 @@
 
 from __future__ import absolute_import, print_function
 
+import libcloud.common.types as cloud_types
+
+from ...config import NETWORK_ERRORS
+
 class BaseComputeNodeDriver(object):
     """Abstract base class for compute node drivers.
 
@@ -15,6 +19,8 @@ class BaseComputeNodeDriver(object):
     creation kwargs with information about the specific Arvados node
     record), sync_node, and node_start_time.
     """
+    CLOUD_ERRORS = NETWORK_ERRORS + (cloud_types.LibcloudError,)
+
     def __init__(self, auth_kwargs, list_kwargs, create_kwargs, driver_class):
         self.real = driver_class(**auth_kwargs)
         self.list_kwargs = list_kwargs
@@ -62,3 +68,11 @@ class BaseComputeNodeDriver(object):
     @classmethod
     def node_start_time(cls, node):
         raise NotImplementedError("BaseComputeNodeDriver.node_start_time")
+
+    @classmethod
+    def is_cloud_exception(cls, exception):
+        # libcloud compute drivers typically raise bare Exceptions to
+        # represent API errors.  Return True for any exception that is
+        # exactly an Exception, or a better-known higher-level exception.
+        return (isinstance(exception, cls.CLOUD_ERRORS) or
+                getattr(exception, '__class__', None) is Exception)
index f018015717c150b0065212248f35bbfbf3d1d4a7..b7ec1fc80d9a0211867b7d06e5dd8ffb272f1ff4 100644 (file)
@@ -10,7 +10,6 @@ import sys
 
 import arvados
 import httplib2
-import libcloud.common.types as cloud_types
 import pykka
 from apiclient import errors as apierror
 
@@ -19,7 +18,6 @@ from apiclient import errors as apierror
 # it's low-level, but unlikely to catch code bugs.
 NETWORK_ERRORS = (IOError, ssl.SSLError)
 ARVADOS_ERRORS = NETWORK_ERRORS + (apierror.Error,)
-CLOUD_ERRORS = NETWORK_ERRORS + (cloud_types.LibcloudError,)
 
 actor_class = pykka.ThreadingActor
 
index 7ddfb7ca33e8b97f8132117c66789529415d8b90..83dd93f077bfb504a8f41b6f0addd93c717ac7da 100644 (file)
@@ -11,10 +11,11 @@ class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
     This actor regularly polls the list of Arvados node records, and
     sends it to subscribers.
     """
-
-    CLIENT_ERRORS = config.ARVADOS_ERRORS
     LOGGER_NAME = 'arvnodeman.arvados_nodes'
 
+    def is_common_error(self, exception):
+        return isinstance(exception, config.ARVADOS_ERRORS)
+
     def _item_key(self, node):
         return node['uuid']
 
@@ -28,10 +29,11 @@ class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
     This actor regularly polls the cloud to get a list of running compute
     nodes, and sends it to subscribers.
     """
-
-    CLIENT_ERRORS = config.CLOUD_ERRORS
     LOGGER_NAME = 'arvnodeman.cloud_nodes'
 
+    def is_common_error(self, exception):
+        return self._client.is_cloud_exception(exception)
+
     def _item_key(self, node):
         return node.id
 
index fde103e10e606f68ca5e0b3ba262f0a350e6df64..c765587993d4984dde8d12284bb4f62ffed77bb4 100644 (file)
@@ -2,9 +2,11 @@
 
 from __future__ import absolute_import, print_function
 
+import ssl
 import time
 import unittest
 
+import libcloud.common.types as cloud_types
 import mock
 
 import arvnodeman.computenode.driver.ec2 as ec2
@@ -87,3 +89,16 @@ class EC2ComputeNodeDriverTestCase(unittest.TestCase):
         node.extra = {'launch_time': time.strftime('%Y-%m-%dT%H:%M:%S.000Z',
                                                    reftuple)}
         self.assertEqual(refsecs, ec2.ComputeNodeDriver.node_start_time(node))
+
+    def test_cloud_exceptions(self):
+        for error in [Exception("test exception"),
+                      IOError("test exception"),
+                      ssl.SSLError("test exception"),
+                      cloud_types.LibcloudError("test exception")]:
+            self.assertTrue(ec2.ComputeNodeDriver.is_cloud_exception(error),
+                            "{} not flagged as cloud exception".format(error))
+
+    def test_noncloud_exceptions(self):
+        self.assertFalse(
+            ec2.ComputeNodeDriver.is_cloud_exception(ValueError("test error")),
+            "ValueError flagged as cloud exception")