X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/1eb5f8fe7b290813e2c40a8d248280d488fc37fb..a20c1480b7ed827d02511d1630e0894696814107:/services/nodemanager/arvnodeman/baseactor.py diff --git a/services/nodemanager/arvnodeman/baseactor.py b/services/nodemanager/arvnodeman/baseactor.py index 61695874be..bdfe5d45a7 100644 --- a/services/nodemanager/arvnodeman/baseactor.py +++ b/services/nodemanager/arvnodeman/baseactor.py @@ -1,3 +1,7 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0 + from __future__ import absolute_import, print_function import errno @@ -10,6 +14,8 @@ import traceback import pykka +from .status import tracker + class _TellCallableProxy(object): """Internal helper class for proxying callables.""" @@ -78,17 +84,21 @@ class BaseNodeManagerActor(pykka.ThreadingActor): def __init__(self, *args, **kwargs): super(pykka.ThreadingActor, self).__init__(*args, **kwargs) self.actor_ref = TellableActorRef(self) + self._killfunc = kwargs.get("killfunc", os.kill) def on_failure(self, exception_type, exception_value, tb): lg = getattr(self, "_logger", logging) if (exception_type in (threading.ThreadError, MemoryError) or exception_type is OSError and exception_value.errno == errno.ENOMEM): lg.critical("Unhandled exception is a fatal error, killing Node Manager") - os.kill(os.getpid(), signal.SIGKILL) + self._killfunc(os.getpid(), signal.SIGKILL) + tracker.counter_add('actor_exceptions') def ping(self): return True + def get_thread(self): + return threading.current_thread() class WatchdogActor(pykka.ThreadingActor): def __init__(self, timeout, *args, **kwargs): @@ -97,11 +107,13 @@ class WatchdogActor(pykka.ThreadingActor): self.actors = [a.proxy() for a in args] self.actor_ref = TellableActorRef(self) self._later = self.actor_ref.tell_proxy() + self._killfunc = kwargs.get("killfunc", os.kill) - def kill_self(self, act): + def kill_self(self, e, act): lg = getattr(self, "_logger", logging) + lg.critical("Watchdog exception", exc_info=e) lg.critical("Actor %s watchdog ping time out, killing Node Manager", act) - os.kill(os.getpid(), signal.SIGKILL) + self._killfunc(os.getpid(), signal.SIGKILL) def on_start(self): self._later.run() @@ -113,5 +125,5 @@ class WatchdogActor(pykka.ThreadingActor): a.ping().get(self.timeout) time.sleep(20) self._later.run() - except: - self.kill_self(a) + except Exception as e: + self.kill_self(e, a)