X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/0934e5663b3e3dc0567ecfc71423d970a313578a..3a0aa1db801154916f50b1b299d5100945a3e1df:/services/nodemanager/arvnodeman/baseactor.py diff --git a/services/nodemanager/arvnodeman/baseactor.py b/services/nodemanager/arvnodeman/baseactor.py index 840ba4c931..68ea97ab75 100644 --- a/services/nodemanager/arvnodeman/baseactor.py +++ b/services/nodemanager/arvnodeman/baseactor.py @@ -84,7 +84,7 @@ class BaseNodeManagerActor(pykka.ThreadingActor): if (exception_type in (threading.ThreadError, MemoryError) or exception_type is OSError and exception_value.errno == errno.ENOMEM): lg.critical("Unhandled exception is a fatal error, killing Node Manager") - os.kill(os.getpid(), signal.SIGQUIT) + os.kill(os.getpid(), signal.SIGKILL) def ping(self): return True @@ -94,29 +94,25 @@ class WatchdogActor(pykka.ThreadingActor): def __init__(self, timeout, *args, **kwargs): super(pykka.ThreadingActor, self).__init__(*args, **kwargs) self.timeout = timeout + self.actors = [a.proxy() for a in args] self.actor_ref = TellableActorRef(self) self._later = self.actor_ref.tell_proxy() - def kill_self(self, act): + def kill_self(self, e, act): lg = getattr(self, "_logger", logging) + lg.critical("Watchdog exception", exc_info=e) lg.critical("Actor %s watchdog ping time out, killing Node Manager", act) - os.kill(os.getpid(), signal.SIGQUIT) + os.kill(os.getpid(), signal.SIGKILL) def on_start(self): self._later.run() def run(self): - actors = pykka.ActorRegistry.get_all() - for a in actors: - if a.actor_class is WatchdogActor: - continue - try: - a.proxy().ping().get(self.timeout) - except pykka.ActorDeadError: - pass - except pykka.Timeout: - self.kill_self(a) - return - - time.sleep(20) - self._later.run() + a = None + try: + for a in self.actors: + a.ping().get(self.timeout) + time.sleep(20) + self._later.run() + except Exception as e: + self.kill_self(e, a)