15964: Remove qr1hi from a few more places. Delete unused includes.
[arvados.git] / services / nodemanager / arvnodeman / baseactor.py
index 840ba4c931ad856ecb1de94e3cc1aa48518867fd..bdfe5d45a7ac444575192b9f1bb95fcadbf18bfe 100644 (file)
@@ -1,3 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
 from __future__ import absolute_import, print_function
 
 import errno
@@ -10,6 +14,8 @@ import traceback
 
 import pykka
 
+from .status import tracker
+
 class _TellCallableProxy(object):
     """Internal helper class for proxying callables."""
 
@@ -78,45 +84,46 @@ class BaseNodeManagerActor(pykka.ThreadingActor):
     def __init__(self, *args, **kwargs):
          super(pykka.ThreadingActor, self).__init__(*args, **kwargs)
          self.actor_ref = TellableActorRef(self)
+         self._killfunc = kwargs.get("killfunc", os.kill)  # kill hook, defaults to os.kill; NOTE(review): "killfunc" is also forwarded to the parent __init__ via **kwargs above - confirm it is accepted there
 
     def on_failure(self, exception_type, exception_value, tb):
         lg = getattr(self, "_logger", logging)
         if (exception_type in (threading.ThreadError, MemoryError) or
             exception_type is OSError and exception_value.errno == errno.ENOMEM):
             lg.critical("Unhandled exception is a fatal error, killing Node Manager")
-            os.kill(os.getpid(), signal.SIGQUIT)
+            self._killfunc(os.getpid(), signal.SIGKILL)  # signal this process through the kill hook (SIGKILL replaces the former SIGQUIT)
+        tracker.counter_add('actor_exceptions')  # outside the `if`: also runs for failures that are not treated as fatal
 
     def ping(self):
         return True
 
+    def get_thread(self):
+        return threading.current_thread()  # the Thread object for whichever thread executes this call
 
 class WatchdogActor(pykka.ThreadingActor):
     def __init__(self, timeout, *args, **kwargs):
          super(pykka.ThreadingActor, self).__init__(*args, **kwargs)
          self.timeout = timeout
+         self.actors = [a.proxy() for a in args]  # one proxy per positional arg (the actors to ping); NOTE(review): the same *args are also forwarded to the parent __init__ above - confirm
          self.actor_ref = TellableActorRef(self)
          self._later = self.actor_ref.tell_proxy()
+         self._killfunc = kwargs.get("killfunc", os.kill)  # kill hook, defaults to os.kill
 
-    def kill_self(self, act):
+    def kill_self(self, e, act):  # e: exception caught in run(); act: proxy being pinged when it was raised, or None
         lg = getattr(self, "_logger", logging)
+        lg.critical("Watchdog exception", exc_info=e)  # log the exception handed over by run() before the message below
         lg.critical("Actor %s watchdog ping time out, killing Node Manager", act)
-        os.kill(os.getpid(), signal.SIGQUIT)
+        self._killfunc(os.getpid(), signal.SIGKILL)  # signal this process through the kill hook (SIGKILL replaces the former SIGQUIT)
 
     def on_start(self):
         self._later.run()
 
     def run(self):
-        actors = pykka.ActorRegistry.get_all()
-        for a in actors:
-            if a.actor_class is WatchdogActor:
-                continue
-            try:
-                a.proxy().ping().get(self.timeout)
-            except pykka.ActorDeadError:
-                pass
-            except pykka.Timeout:
-                self.kill_self(a)
-                return
-
-        time.sleep(20)
-        self._later.run()
+        a = None  # stays None if an exception is raised before the first actor is reached
+        try:
+            for a in self.actors:  # only the proxies built in __init__, no longer every actor in the registry
+                a.ping().get(self.timeout)  # wait for the ping reply, passing self.timeout as the limit
+            time.sleep(20)  # pause 20 seconds before the next round
+            self._later.run()  # invoke run() again through the tell proxy
+        except Exception as e:  # any exception is fatal here; NOTE(review): the removed code ignored pykka.ActorDeadError - confirm this is intended
+            self.kill_self(e, a)  # a is the proxy that was being pinged, or None