8437: Add FullStopActor which uses os.killpg() to terminate node manager on_failure...
author Peter Amstutz <peter.amstutz@curoverse.com>
Thu, 25 Feb 2016 14:08:01 +0000 (09:08 -0500)
committer Peter Amstutz <peter.amstutz@curoverse.com>
Thu, 25 Feb 2016 14:08:01 +0000 (09:08 -0500)
services/nodemanager/arvnodeman/config.py
services/nodemanager/arvnodeman/fullstopactor.py [new file with mode: 0644]

index dd45165deaa1514789c42428dacbb4bcf862b5a5..dcfe1ceb133e671527e70967dd25a783d64210a6 100644 (file)
@@ -12,13 +12,15 @@ import httplib2
 import pykka
 from apiclient import errors as apierror
 
+from .fullstopactor import FullStopActor
+
 # IOError is the base class for socket.error, ssl.SSLError, and friends.
 # It seems like it hits the sweet spot for operations we want to retry:
 # it's low-level, but unlikely to catch code bugs.
 NETWORK_ERRORS = (IOError,)
 ARVADOS_ERRORS = NETWORK_ERRORS + (apierror.Error,)
 
-actor_class = pykka.ThreadingActor
+actor_class = FullStopActor
 
 class NodeManagerConfig(ConfigParser.SafeConfigParser):
     """Node Manager Configuration class.
diff --git a/services/nodemanager/arvnodeman/fullstopactor.py b/services/nodemanager/arvnodeman/fullstopactor.py
new file mode 100644 (file)
index 0000000..07e0625
--- /dev/null
@@ -0,0 +1,17 @@
+from __future__ import absolute_import, print_function
+
+import errno
+import logging
+import os
+import threading
+import traceback
+
+import pykka
+
+class FullStopActor(pykka.ThreadingActor):
+    def on_failure(self, exception_type, exception_value, tb):
+        lg = getattr(self, "_logger", logging)
+        if (exception_type in (threading.ThreadError, MemoryError) or
+            exception_type is OSError and exception_value.errno == errno.ENOMEM):
+            lg.critical("Unhandled exception is a fatal error, killing Node Manager")
+            os.killpg(os.getpgid(0), 9)