From fc6a133423d79b224adff0e666b9da594524b460 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 25 Feb 2016 09:08:01 -0500 Subject: [PATCH] 8437: Add FullStopActor which uses os.killpg() to terminate node manager on_failure. Added test. --- services/nodemanager/arvnodeman/config.py | 4 +++- .../nodemanager/arvnodeman/fullstopactor.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 services/nodemanager/arvnodeman/fullstopactor.py diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py index dd45165dea..dcfe1ceb13 100644 --- a/services/nodemanager/arvnodeman/config.py +++ b/services/nodemanager/arvnodeman/config.py @@ -12,13 +12,15 @@ import httplib2 import pykka from apiclient import errors as apierror +from .fullstopactor import FullStopActor + # IOError is the base class for socket.error, ssl.SSLError, and friends. # It seems like it hits the sweet spot for operations we want to retry: # it's low-level, but unlikely to catch code bugs. NETWORK_ERRORS = (IOError,) ARVADOS_ERRORS = NETWORK_ERRORS + (apierror.Error,) -actor_class = pykka.ThreadingActor +actor_class = FullStopActor class NodeManagerConfig(ConfigParser.SafeConfigParser): """Node Manager Configuration class. 
# services/nodemanager/arvnodeman/fullstopactor.py
from __future__ import absolute_import, print_function

import errno
import logging
import os
import signal
import threading
import traceback

import pykka


class FullStopActor(pykka.ThreadingActor):
    """Threading actor that kills the whole process group on fatal errors.

    Overrides the ``on_failure`` hook so that a small set of unrecoverable
    exceptions terminates Node Manager outright instead of leaving it
    running after one of its actors has failed.
    """

    def on_failure(self, exception_type, exception_value, tb):
        """Kill the current process group if the failure is fatal.

        A failure is treated as fatal when the exception type is exactly
        ``threading.ThreadError`` or ``MemoryError``, or when it is exactly
        ``OSError`` with ``errno`` equal to ``ENOMEM``.  Types are compared
        by identity, so subclasses of these exceptions are not matched.
        Any other failure is ignored by this method.

        Arguments:
          exception_type -- class of the unhandled exception.
          exception_value -- the exception instance; its ``errno`` attribute
            is read only when ``exception_type`` is ``OSError``.
          tb -- traceback of the exception (unused here).
        """
        # Use the actor's own logger when the subclass defines one,
        # otherwise fall back to the logging module's root-level functions.
        lg = getattr(self, "_logger", logging)
        # NOTE(review): these conditions presumably all signal resource
        # exhaustion (no more threads / no more memory) -- confirm intent.
        if (exception_type in (threading.ThreadError, MemoryError) or
                (exception_type is OSError and
                 exception_value.errno == errno.ENOMEM)):
            lg.critical("Unhandled exception is a fatal error, killing Node Manager")
            # getpgid(0) is the calling process's own group; SIGKILL cannot
            # be caught, so nothing after this line is expected to run.
            os.killpg(os.getpgid(0), signal.SIGKILL)