Merge branch '8567-docker-migrator' refs #8567
author Peter Amstutz <peter.amstutz@curoverse.com>
Fri, 24 Mar 2017 13:31:36 +0000 (09:31 -0400)
committer Peter Amstutz <peter.amstutz@curoverse.com>
Fri, 24 Mar 2017 13:31:36 +0000 (09:31 -0400)
32 files changed:
doc/_config.yml
doc/install/crunch2-slurm/install-compute-node.html.textile.liquid
doc/install/crunch2-slurm/install-dispatch.html.textile.liquid
doc/install/install-api-server.html.textile.liquid
doc/install/install-ws.html.textile.liquid
sdk/cli/arvados-cli.gemspec
sdk/cli/test/test_arv-collection-create.rb
sdk/python/arvados/api.py
sdk/python/arvados/cache.py [new file with mode: 0644]
sdk/python/tests/run_test_server.py
sdk/python/tests/test_cache.py [new file with mode: 0644]
services/api/app/models/log.rb
services/api/config/application.default.yml
services/api/lib/audit_logs.rb [new file with mode: 0644]
services/api/lib/crunch_dispatch.rb
services/api/test/fixtures/logs.yml
services/api/test/unit/crunch_dispatch_test.rb
services/api/test/unit/fail_jobs_test.rb
services/api/test/unit/log_test.rb
services/arv-git-httpd/arvados-git-httpd.service
services/crunch-dispatch-slurm/crunch-dispatch-slurm.service
services/crunch-run/crunchrun.go
services/crunch-run/crunchrun_test.go
services/dockercleaner/arvados-docker-cleaner.service
services/keep-balance/keep-balance.service
services/keep-web/keep-web.service
services/keepproxy/keepproxy.service
services/keepstore/keepstore.service
services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
services/nodemanager/arvnodeman/daemon.py
services/nodemanager/tests/test_daemon.py
services/ws/arvados-ws.service

diff --git a/doc/_config.yml b/doc/_config.yml
index 8c3d42a397691b3a745b77b1a9de5f4f61316c24..55edc3d3a293e5bc2916dc8141cc39f6dca96659 100644
@@ -151,6 +151,7 @@ navbar:
       - install/install-postgresql.html.textile.liquid
       - install/install-sso.html.textile.liquid
       - install/install-api-server.html.textile.liquid
+      - install/install-ws.html.textile.liquid
       - install/install-arv-git-httpd.html.textile.liquid
       - install/install-workbench-app.html.textile.liquid
       - install/install-shell-server.html.textile.liquid
diff --git a/doc/install/crunch2-slurm/install-compute-node.html.textile.liquid b/doc/install/crunch2-slurm/install-compute-node.html.textile.liquid
index 330cc3ae4835b5f6e79e992741713e2e1f54b49a..25e105b4e97cf2ed8cfcfccbeaa39fd9803c8b10 100644
@@ -33,8 +33,6 @@ On Debian-based systems:
 
 h2. Set up SLURM
 
-Install SLURM following "the same process you used to install the Crunch dispatcher":install-crunch-dispatch.html#slurm.
+Install SLURM on the compute node using the same process you used on the API server in the "previous step":install-slurm.html.
 
-h2. Copy configuration files from the dispatcher (API server)
-
-The @slurm.conf@ and @/etc/munge/munge.key@ files need to be identical across the dispatcher and all compute nodes. Copy the files you created in the "Install the Crunch dispatcher":install-crunch-dispatch.html step to this compute node.
+The @slurm.conf@ and @/etc/munge/munge.key@ files must be identical on all SLURM nodes. Copy the files you created on the API server in the "previous step":install-slurm.html to each compute node.
diff --git a/doc/install/crunch2-slurm/install-dispatch.html.textile.liquid b/doc/install/crunch2-slurm/install-dispatch.html.textile.liquid
index e6b8e1ea01d2deab5c408b1dd2c6015473d6d00c..f50534e1986d2b3ba1a77cf08ce1811dd5131e48 100644
@@ -120,6 +120,21 @@ You can work around this issue by disabling the Docker daemon's systemd integrat
 
 {% include 'notebox_end' %}
 
+h3. CrunchRunCommand: Using host networking for containers
+
+Older Linux kernels (prior to 3.18) have bugs in network namespace handling which can lead to compute node lockups.  This is indicated by blocked kernel tasks in "Workqueue: netns cleanup_net".  If you are experiencing this problem, as a workaround you can disable Docker's use of network namespaces across the cluster.  Be aware this reduces container isolation, which may be a security risk.
+
+<notextile>
+<pre><code class="userinput">Client:
+  APIHost: <b>zzzzz.arvadosapi.com</b>
+  AuthToken: <b>zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz</b>
+CrunchRunCommand:
+- <b>crunch-run</b>
+- <b>"-container-enable-networking=always"</b>
+- <b>"-container-network-mode=host"</b>
+</code></pre>
+</notextile>
+
 h3. MinRetryPeriod: Rate-limit repeated attempts to start containers
 
 If SLURM is unable to run a container, the dispatcher will submit it again after the next PollPeriod. If PollPeriod is very short, this can be excessive. If MinRetryPeriod is set, the dispatcher will avoid submitting the same container to SLURM more than once in the given time span.
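A sketch of the corresponding @/etc/arvados/crunch-dispatch-slurm/crunch-dispatch-slurm.yml@ entry; the 30-second floor between resubmissions is illustrative, not a recommended value:

<notextile>
<pre><code>Client:
  APIHost: zzzzz.arvadosapi.com
  AuthToken: zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
MinRetryPeriod: 30s
</code></pre>
</notextile>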
diff --git a/doc/install/install-api-server.html.textile.liquid b/doc/install/install-api-server.html.textile.liquid
index b4019a418ab523c106c7b10df4698deb47cf0abb..fa07f889f16942d352faef398aeb9def9796122f 100644
@@ -153,45 +153,15 @@ Example @application.yml@:
 </code></pre>
 </notextile>
 
-h2(#set_up). Set up Web servers
+h2(#set_up). Set up Nginx and Passenger
 
-For best performance, we recommend you use Nginx as your Web server front-end, with a Passenger backend for the main API server and a Puma backend for API server Websockets.  To do that:
+The Nginx server will serve API requests using Passenger. It will also be used to proxy SSL requests to other services, which are covered later in this guide.
 
-<notextile>
-<ol>
-<li><a href="https://www.phusionpassenger.com/library/walkthroughs/deploy/ruby/ownserver/nginx/oss/install_passenger_main.html">Install Nginx and Phusion Passenger</a>.</li>
-
-<li><p>Install runit to supervise the Puma daemon.  {% include 'install_runit' %}<notextile></p></li>
-
-<li><p>Install the script below as the run script for the Puma service, modifying it as directed by the comments.</p>
-
-<pre><code>#!/bin/bash
-
-set -e
-exec 2>&1
-
-# Uncomment the line below if you're using RVM.
-#source /etc/profile.d/rvm.sh
-
-envdir="`pwd`/env"
-mkdir -p "$envdir"
-echo ws-only > "$envdir/ARVADOS_WEBSOCKETS"
-
-cd /var/www/arvados-api/current
-echo "Starting puma in `pwd`"
-
-# Change arguments below to match your deployment, "webserver-user" and
-# "webserver-group" should be changed to the user and group of the web server
-# process.  This is typically "www-data:www-data" on Debian systems by default,
-# other systems may use different defaults such the name of the web server
-# software (for example, "nginx:nginx").
-exec chpst -m 1073741824 -u webserver-user:webserver-group -e "$envdir" \
-  bundle exec puma -t 0:512 -e production -b tcp://127.0.0.1:8100
-</code></pre>
-</li>
+First, "Install Nginx and Phusion Passenger":https://www.phusionpassenger.com/library/walkthroughs/deploy/ruby/ownserver/nginx/oss/install_passenger_main.html.
 
-<li><p>Edit the http section of your Nginx configuration to run the Passenger server, and act as a front-end for both it and Puma.  You might add a block like the following, adding SSL and logging parameters to taste:</p>
+Edit the http section of your Nginx configuration to run the Passenger server, and serve SSL requests. Add a block like the following, adding SSL and logging parameters to taste:
 
+<notextile>
 <pre><code>server {
   listen 127.0.0.1:8000;
   server_name localhost-api;
@@ -216,11 +186,6 @@ upstream api {
   server     127.0.0.1:8000  fail_timeout=10s;
 }
 
-upstream websockets {
-  # The address below must match the one specified in puma's -b option.
-  server     127.0.0.1:8100  fail_timeout=10s;
-}
-
 proxy_http_version 1.1;
 
 # When Keep clients request a list of Keep services from the API server, the
@@ -259,41 +224,14 @@ server {
     proxy_set_header      X-Forwarded-For $proxy_add_x_forwarded_for;
   }
 }
-
-server {
-  listen       <span class="userinput">[your public IP address]</span>:443 ssl;
-  server_name  ws.<span class="userinput">uuid_prefix.your.domain</span>;
-
-  ssl on;
-  ssl_certificate     <span class="userinput">/YOUR/PATH/TO/cert.pem</span>;
-  ssl_certificate_key <span class="userinput">/YOUR/PATH/TO/cert.key</span>;
-
-  index  index.html index.htm index.php;
-
-  location / {
-    proxy_pass            http://websockets;
-    proxy_redirect        off;
-    proxy_connect_timeout 90s;
-    proxy_read_timeout    300s;
-
-    proxy_set_header      Upgrade $http_upgrade;
-    proxy_set_header      Connection "upgrade";
-    proxy_set_header      Host $host;
-    proxy_set_header      X-Real-IP $remote_addr;
-    proxy_set_header      X-Forwarded-For $proxy_add_x_forwarded_for;
-  }
-}
 </code></pre>
-</li>
+</notextile>
 
-<li><p>Restart Nginx:</p>
+Restart Nginx to apply the new configuration.
 
+<notextile>
 <pre><code>~$ <span class="userinput">sudo nginx -s reload</span>
 </code></pre>
-
-</li>
-
-</ol>
 </notextile>
 
 h2. Prepare the API server deployment
@@ -303,7 +241,9 @@ h2. Prepare the API server deployment
 
 {% include 'notebox_begin' %}
 You can safely ignore the following messages if they appear while this command runs:
-<pre>Don't run Bundler as root. Bundler can ask for sudo if it is needed, and installing your bundle as root will
-break this application for all non-root users on this machine.</pre>
-<pre>fatal: Not a git repository (or any of the parent directories): .git</pre>
+
+<notextile><pre>Don't run Bundler as root. Bundler can ask for sudo if it is needed, and installing your bundle as root will
+break this application for all non-root users on this machine.</pre></notextile>
+
+<notextile><pre>fatal: Not a git repository (or any of the parent directories): .git</pre></notextile>
 {% include 'notebox_end' %}
diff --git a/doc/install/install-ws.html.textile.liquid b/doc/install/install-ws.html.textile.liquid
index a36a59a56f9847654e95397835b5bb8d129cca27..9887f462c6559d76d8238ba5231bec3e2d0cb86a 100644
@@ -4,13 +4,7 @@ navsection: installguide
 title: Install the websocket server
 ...
 
-{% include 'notebox_begin_warning' %}
-
-This websocket server is an alternative to the puma server that comes with the API server. It is available as an *experimental pre-release* and is not recommended for production sites.
-
-{% include 'notebox_end' %}
-
-The arvados-ws server provides event notifications to websocket clients. It can be installed anywhere with access to Postgres database and the Arvados API server, typically behind a web proxy that provides SSL support. See the "godoc page":http://godoc.org/github.com/curoverse/arvados/services/keep-web for additional information.
+The arvados-ws server provides event notifications to websocket clients. It can be installed anywhere with access to the PostgreSQL database and the Arvados API server, typically behind a web proxy that provides SSL support. See the "godoc page":http://godoc.org/github.com/curoverse/arvados/services/ws for additional information.
 
 By convention, we use the following hostname for the websocket service.
 
@@ -175,7 +169,9 @@ server {
 }
 </pre></notextile>
 
-If Nginx is already configured to proxy @ws@ requests to puma, move that configuration out of the way or change its @server_name@ so it doesn't conflict.
+{% include 'notebox_begin' %}
+If you are upgrading a cluster where Nginx is configured to proxy @ws@ requests to puma, change the @server_name@ value in the old configuration block so it doesn't conflict. When the new configuration is working, delete the old Nginx configuration sections (i.e., the "upstream websockets" block, and the "server" block that references @http://websockets@), and disable/remove the runit or systemd files for the puma server.
+{% include 'notebox_end' %}
 
 h3. Update API server configuration
 
diff --git a/sdk/cli/arvados-cli.gemspec b/sdk/cli/arvados-cli.gemspec
index 0eeee57e7ffb36a4881b6029942b644297e5502e..651ebf20b0e96f13b4b32bb1b55855f0ac1076a9 100644
@@ -28,7 +28,7 @@ Gem::Specification.new do |s|
   # Our google-api-client dependency used to be < 0.9, but that could be
   # satisfied by the buggy 0.9.pre*.  https://dev.arvados.org/issues/9213
   s.add_runtime_dependency 'google-api-client', '~> 0.6', '>= 0.6.3', '<0.8.9'
-  s.add_runtime_dependency 'activesupport', '~> 3.2', '>= 3.2.13'
+  s.add_runtime_dependency 'activesupport', '>= 3.2.13', '< 5'
   s.add_runtime_dependency 'json', '~> 1.7', '>= 1.7.7'
   s.add_runtime_dependency 'trollop', '~> 2.0'
   s.add_runtime_dependency 'andand', '~> 1.3', '>= 1.3.3'
diff --git a/sdk/cli/test/test_arv-collection-create.rb b/sdk/cli/test/test_arv-collection-create.rb
index f7a9dbe41a853412d84a1752ccae0637ef2250d4..a46d1e6ac0fba92d38acc16ec5b09c9893cf9b29 100644
@@ -1,5 +1,6 @@
 require 'minitest/autorun'
 require 'digest/md5'
+require 'active_support'
 require 'active_support/core_ext'
 require 'tempfile'
 
diff --git a/sdk/python/arvados/api.py b/sdk/python/arvados/api.py
index ccf16a5fcef3be02b6450bd4b527e6ff682c88d1..d1263e24f27b2e89b3ef386d3d35a28bb9341811 100644
@@ -15,6 +15,7 @@ from apiclient import errors as apiclient_errors
 import config
 import errors
 import util
+import cache
 
 _logger = logging.getLogger('arvados.api')
 
@@ -136,7 +137,7 @@ def http_cache(data_type):
         util.mkdir_dash_p(path)
     except OSError:
         path = None
-    return path
+    return cache.SafeHTTPCache(path, max_age=60*60*24*2)
 
 def api(version=None, cache=True, host=None, token=None, insecure=False, **kwargs):
     """Return an apiclient Resources object for an Arvados instance.
diff --git a/sdk/python/arvados/cache.py b/sdk/python/arvados/cache.py
new file mode 100644
index 0000000..08c19e4
--- /dev/null
+++ b/sdk/python/arvados/cache.py
@@ -0,0 +1,71 @@
+import errno
+import md5
+import os
+import tempfile
+import time
+
+class SafeHTTPCache(object):
+    """Thread-safe replacement for httplib2.FileCache"""
+
+    def __init__(self, path=None, max_age=None):
+        self._dir = path
+        if max_age is not None:
+            try:
+                self._clean(threshold=time.time() - max_age)
+            except:
+                pass
+
+    def _clean(self, threshold=0):
+        for ent in os.listdir(self._dir):
+            fnm = os.path.join(self._dir, ent)
+            if os.path.isdir(fnm) or not fnm.endswith('.tmp'):
+                continue
+            stat = os.lstat(fnm)
+            if stat.st_mtime < threshold:
+                try:
+                    os.unlink(fnm)
+                except OSError as err:
+                    if err.errno != errno.ENOENT:
+                        raise
+
+    def __str__(self):
+        return self._dir
+
+    def _filename(self, url):
+        return os.path.join(self._dir, md5.new(url).hexdigest()+'.tmp')
+
+    def get(self, url):
+        filename = self._filename(url)
+        try:
+            with open(filename, 'rb') as f:
+                return f.read()
+        except (IOError, OSError):
+            return None
+
+    def set(self, url, content):
+        try:
+            fd, tempname = tempfile.mkstemp(dir=self._dir)
+        except:
+            return None
+        try:
+            try:
+                f = os.fdopen(fd, 'w')
+            except:
+                os.close(fd)
+                raise
+            try:
+                f.write(content)
+            finally:
+                f.close()
+            os.rename(tempname, self._filename(url))
+            tempname = None
+        finally:
+            if tempname:
+                os.unlink(tempname)
+
+    def delete(self, url):
+        try:
+            os.unlink(self._filename(url))
+        except OSError as err:
+            if err.errno != errno.ENOENT:
+                raise
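A minimal usage sketch (assuming httplib2 is installed; the temporary directory is illustrative): SafeHTTPCache exposes the get/set/delete methods httplib2 expects of a cache object, so it can stand in for httplib2.FileCache.

import tempfile

import httplib2

from arvados.cache import SafeHTTPCache

# Entries older than two days are removed when the cache is constructed,
# matching the max_age that http_cache() in arvados/api.py passes in.
cache = SafeHTTPCache(tempfile.mkdtemp(), max_age=60*60*24*2)

# httplib2 accepts any object with get/set/delete methods as its cache.
http = httplib2.Http(cache=cache)
response, content = http.request('http://example.com/')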
diff --git a/sdk/python/tests/run_test_server.py b/sdk/python/tests/run_test_server.py
index 776ff728cb164113c056362bc28ed58e09d34320..d10e60c22fef1009179c90da126d098a2fdc9c56 100644
@@ -34,9 +34,19 @@ import arvados.config
 ARVADOS_DIR = os.path.realpath(os.path.join(MY_DIRNAME, '../../..'))
 SERVICES_SRC_DIR = os.path.join(ARVADOS_DIR, 'services')
 if 'GOPATH' in os.environ:
+    # Add all GOPATH bin dirs to PATH -- but insert them after the
+    # ruby gems bin dir, to ensure "bundle" runs the Ruby bundler
+    # command, not the golang.org/x/tools/cmd/bundle command.
     gopaths = os.environ['GOPATH'].split(':')
-    gobins = [os.path.join(path, 'bin') for path in gopaths]
-    os.environ['PATH'] = ':'.join(gobins) + ':' + os.environ['PATH']
+    addbins = [os.path.join(path, 'bin') for path in gopaths]
+    newbins = []
+    for path in os.environ['PATH'].split(':'):
+        newbins.append(path)
+        if os.path.exists(os.path.join(path, 'bundle')):
+            newbins += addbins
+            addbins = []
+    newbins += addbins
+    os.environ['PATH'] = ':'.join(newbins)
 
 TEST_TMPDIR = os.path.join(ARVADOS_DIR, 'tmp')
 if not os.path.exists(TEST_TMPDIR):
@@ -229,8 +239,9 @@ def run(leave_running_atexit=False):
     # This will clear cached docs that belong to other processes (like
     # concurrent test suites) even if they're still running. They should
     # be able to tolerate that.
-    for fn in glob.glob(os.path.join(arvados.http_cache('discovery'),
-                                     '*,arvados,v1,rest,*')):
+    for fn in glob.glob(os.path.join(
+            str(arvados.http_cache('discovery')),
+            '*,arvados,v1,rest,*')):
         os.unlink(fn)
 
     pid_file = _pidfile('api')
diff --git a/sdk/python/tests/test_cache.py b/sdk/python/tests/test_cache.py
new file mode 100644
index 0000000..baa60bf
--- /dev/null
+++ b/sdk/python/tests/test_cache.py
@@ -0,0 +1,82 @@
+from __future__ import print_function
+
+import md5
+import mock
+import shutil
+import random
+import sys
+import tempfile
+import threading
+import unittest
+
+import arvados.cache
+import run_test_server
+
+
+def _random(n):
+    return bytearray(random.getrandbits(8) for _ in xrange(n))
+
+
+class CacheTestThread(threading.Thread):
+    def __init__(self, dir):
+        super(CacheTestThread, self).__init__()
+        self._dir = dir
+
+    def run(self):
+        c = arvados.cache.SafeHTTPCache(self._dir)
+        url = 'http://example.com/foo'
+        self.ok = True
+        for x in range(16):
+            try:
+                data_in = _random(128)
+                data_in = md5.new(data_in).hexdigest() + "\n" + str(data_in)
+                c.set(url, data_in)
+                data_out = c.get(url)
+                digest, content = data_out.split("\n", 1)
+                if digest != md5.new(content).hexdigest():
+                    self.ok = False
+            except Exception as err:
+                self.ok = False
+                print("cache failed: {}".format(err), file=sys.stderr)
+
+
+class CacheTest(unittest.TestCase):
+    def setUp(self):
+        self._dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self._dir)
+
+    def test_cache_crud(self):
+        c = arvados.cache.SafeHTTPCache(self._dir, max_age=0)
+        url = 'https://example.com/foo?bar=baz'
+        data1 = _random(256)
+        data2 = _random(128)
+        self.assertEqual(None, c.get(url))
+        c.delete(url)
+        c.set(url, data1)
+        self.assertEqual(data1, c.get(url))
+        c.delete(url)
+        self.assertEqual(None, c.get(url))
+        c.set(url, data1)
+        c.set(url, data2)
+        self.assertEqual(data2, c.get(url))
+
+    def test_cache_threads(self):
+        threads = []
+        for _ in range(64):
+            t = CacheTestThread(dir=self._dir)
+            t.start()
+            threads.append(t)
+        for t in threads:
+            t.join()
+            self.assertTrue(t.ok)
+
+
+class CacheIntegrationTest(run_test_server.TestCaseWithServers):
+    MAIN_SERVER = {}
+
+    def test_cache_used_by_default_client(self):
+        with mock.patch('arvados.cache.SafeHTTPCache.get') as getter:
+            arvados.api('v1')._rootDesc.get('foobar')
+            getter.assert_called()
diff --git a/services/api/app/models/log.rb b/services/api/app/models/log.rb
index 3207d1f288f2f264c671d6709063d93140ce3fec..eedf06a976c74726d515b4ce9e8a46c402fe18aa 100644
@@ -1,3 +1,5 @@
+require 'audit_logs'
+
 class Log < ArvadosModel
   include HasUuid
   include KindAndEtag
@@ -5,6 +7,7 @@ class Log < ArvadosModel
   serialize :properties, Hash
   before_validation :set_default_event_at
   after_save :send_notify
+  after_commit { AuditLogs.tidy_in_background }
 
   api_accessible :user, extend: :common do |t|
     t.add :id
@@ -101,5 +104,4 @@ class Log < ArvadosModel
   def send_notify
     connection.execute "NOTIFY logs, '#{self.id}'"
   end
-
 end
diff --git a/services/api/config/application.default.yml b/services/api/config/application.default.yml
index cae6bbdf174468d719d2a0b2dc61d757a78f10bd..5241cb43788aa6b8d98a587887f5d6b88afec730 100644
@@ -232,6 +232,24 @@ common:
   # crunchstat logs from the logs table.
   clean_container_log_rows_after: <%= 30.days %>
 
+  # Time to keep audit logs, in seconds. (An audit log is a row added
+  # to the "logs" table in the PostgreSQL database each time an
+  # Arvados object is created, modified, or deleted.)
+  #
+  # Currently, websocket event notifications rely on audit logs, so
+  # this should not be set lower than 300 seconds (5 minutes).
+  max_audit_log_age: 1209600
+
+  # Maximum number of log rows to delete in a single SQL transaction.
+  #
+  # If max_audit_log_delete_batch is 0, log entries will never be
+  # deleted by Arvados. Cleanup can be done by an external process
+  # without affecting any Arvados system processes, as long as very
+  # recent (<5 minutes old) logs are not deleted.
+  #
+  # 100000 is a reasonable batch size for most sites.
+  max_audit_log_delete_batch: 0
+
   # The maximum number of compute nodes that can be in use simultaneously
   # If this limit is reduced, any existing nodes with slot number >= new limit
   # will not be counted against the new limit. In other words, the new limit
diff --git a/services/api/lib/audit_logs.rb b/services/api/lib/audit_logs.rb
new file mode 100644
index 0000000..ddbf2d0
--- /dev/null
+++ b/services/api/lib/audit_logs.rb
@@ -0,0 +1,65 @@
+require 'current_api_client'
+require 'db_current_time'
+
+module AuditLogs
+  extend CurrentApiClient
+  extend DbCurrentTime
+
+  def self.delete_old(max_age:, max_batch:)
+    act_as_system_user do
+      if !File.owned?(Rails.root.join('tmp'))
+        Rails.logger.warn("AuditLogs: not owner of #{Rails.root}/tmp, skipping")
+        return
+      end
+      lockfile = Rails.root.join('tmp', 'audit_logs.lock')
+      File.open(lockfile, File::RDWR|File::CREAT, 0600) do |f|
+        return unless f.flock(File::LOCK_NB|File::LOCK_EX)
+
+        sql = "select clock_timestamp() - interval '#{'%.9f' % max_age} seconds'"
+        threshold = ActiveRecord::Base.connection.select_value(sql).to_time.utc
+        Rails.logger.info "AuditLogs: deleting logs older than #{threshold}"
+
+        did_total = 0
+        loop do
+          sql = Log.unscoped.
+                select(:id).
+                order(:created_at).
+                where('event_type in (?)', ['create', 'update', 'destroy', 'delete']).
+                where('created_at < ?', threshold).
+                limit(max_batch).
+                to_sql
+          did = Log.unscoped.where("id in (#{sql})").delete_all
+          did_total += did
+
+          Rails.logger.info "AuditLogs: deleted batch of #{did}"
+          break if did == 0
+        end
+        Rails.logger.info "AuditLogs: deleted total #{did_total}"
+      end
+    end
+  end
+
+  def self.tidy_in_background
+    max_age = Rails.configuration.max_audit_log_age
+    max_batch = Rails.configuration.max_audit_log_delete_batch
+    return if max_age <= 0 || max_batch <= 0
+
+    exp = (max_age/14).seconds
+    need = false
+    Rails.cache.fetch('AuditLogs', expires_in: exp) do
+      need = true
+    end
+    return if !need
+
+    Thread.new do
+      Thread.current.abort_on_exception = false
+      begin
+        delete_old(max_age: max_age, max_batch: max_batch)
+      rescue => e
+        Rails.logger.error "#{e.class}: #{e}\n#{e.backtrace.join("\n\t")}"
+      ensure
+        ActiveRecord::Base.connection.close
+      end
+    end
+  end
+end
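AuditLogs.delete_old serializes cleanup with a non-blocking exclusive flock, so concurrent Rails processes skip the work instead of piling up behind it. A minimal Python sketch of the same pattern (the lockfile path and callback are illustrative):

import fcntl

def run_exclusively(lockfile_path, work):
    # Mirrors the f.flock(File::LOCK_NB|File::LOCK_EX) guard above: the
    # process that wins the lock does the work; everyone else returns.
    with open(lockfile_path, 'w') as f:
        try:
            fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            return False
        work()
        return True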
diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index 21843de67e64cc045c71d73e8085e4ca063f800e..bea1657de22b72c0d5296a4c571afcee3ffc0993 100644
@@ -963,8 +963,11 @@ class CrunchDispatch
   # An array of job_uuids in squeue
   def squeue_jobs
     if Rails.configuration.crunch_job_wrapper == :slurm_immediate
-      File.popen(['squeue', '-a', '-h', '-o', '%j']).readlines.map do |line|
-        line.strip
+      p = IO.popen(['squeue', '-a', '-h', '-o', '%j'])
+      begin
+        p.readlines.map {|line| line.strip}
+      ensure
+        p.close
       end
     else
       []
@@ -973,7 +976,9 @@ class CrunchDispatch
 
   def scancel slurm_name
     cmd = sudo_preface + ['scancel', '-n', slurm_name]
-    puts File.popen(cmd).read
+    IO.popen(cmd) do |scancel_pipe|
+      puts scancel_pipe.read
+    end
     if not $?.success?
       Rails.logger.error "scancel #{slurm_name.shellescape}: $?"
     end
diff --git a/services/api/test/fixtures/logs.yml b/services/api/test/fixtures/logs.yml
index d83cf967e5b4c46c42b3713b581c768612443a75..a39aff5843841faf50f79dab92bc6d4fcacd0fd8 100644
@@ -13,6 +13,7 @@ admin_changes_repository2: # admin changes repository2, which is owned by active
   object_uuid: zzzzz-2x53u-382brsig8rp3667 # repository foo
   object_owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz # active user
   event_at: <%= 2.minute.ago.to_s(:db) %>
+  event_type: update
 
 admin_changes_specimen: # admin changes specimen owned_by_spectator
   id: 3
@@ -21,6 +22,7 @@ admin_changes_specimen: # admin changes specimen owned_by_spectator
   object_uuid: zzzzz-2x53u-3b0xxwzlbzxq5yr # specimen owned_by_spectator
   object_owner_uuid: zzzzz-tpzed-l1s2piq4t4mps8r # spectator user
   event_at: <%= 3.minute.ago.to_s(:db) %>
+  event_type: update
 
 system_adds_foo_file: # foo collection added, readable by active through link
   id: 4
@@ -29,6 +31,7 @@ system_adds_foo_file: # foo collection added, readable by active through link
   object_uuid: zzzzz-4zz18-znfnqtbbv4spc3w # foo file
   object_owner_uuid: zzzzz-tpzed-000000000000000 # system user
   event_at: <%= 4.minute.ago.to_s(:db) %>
+  event_type: create
 
 system_adds_baz: # baz collection added, readable by active and spectator through group 'all users' group membership
   id: 5
@@ -37,6 +40,7 @@ system_adds_baz: # baz collection added, readable by active and spectator throug
   object_uuid: zzzzz-4zz18-y9vne9npefyxh8g # baz file
   object_owner_uuid: zzzzz-tpzed-000000000000000 # system user
   event_at: <%= 5.minute.ago.to_s(:db) %>
+  event_type: create
 
 log_owned_by_active:
   id: 6
diff --git a/services/api/test/unit/crunch_dispatch_test.rb b/services/api/test/unit/crunch_dispatch_test.rb
index 4646f7afd151c1b6a0467521f2ff08841791c199..d091847df2a20a19bbf08d55a5a9c8e511b8d7b9 100644
@@ -208,14 +208,14 @@ class CrunchDispatchTest < ActiveSupport::TestCase
     act_as_system_user do
       dispatch = CrunchDispatch.new
 
-      squeue_resp = File.popen("echo zzzzz-8i9sb-pshmckwoma9plh7\necho thisisnotvalidjobuuid\necho zzzzz-8i9sb-4cf0abc123e809j\n")
-      scancel_resp = File.popen("true")
+      squeue_resp = IO.popen("echo zzzzz-8i9sb-pshmckwoma9plh7\necho thisisnotvalidjobuuid\necho zzzzz-8i9sb-4cf0abc123e809j\n")
+      scancel_resp = IO.popen("true")
 
-      File.expects(:popen).
+      IO.expects(:popen).
         with(['squeue', '-a', '-h', '-o', '%j']).
         returns(squeue_resp)
 
-      File.expects(:popen).
+      IO.expects(:popen).
         with(dispatch.sudo_preface + ['scancel', '-n', 'zzzzz-8i9sb-4cf0abc123e809j']).
         returns(scancel_resp)
 
diff --git a/services/api/test/unit/fail_jobs_test.rb b/services/api/test/unit/fail_jobs_test.rb
index 8c6539e8dc62820e53076c108aee5c3dfdf32f52..311976501cda65792c23283b98b21882120b17c2 100644
@@ -38,12 +38,12 @@ class FailJobsTest < ActiveSupport::TestCase
   test 'cancel slurm jobs' do
     Rails.configuration.crunch_job_wrapper = :slurm_immediate
     Rails.configuration.crunch_job_user = 'foobar'
-    fake_squeue = File.popen("echo #{@job[:before_reboot].uuid}")
-    fake_scancel = File.popen("true")
-    File.expects(:popen).
+    fake_squeue = IO.popen("echo #{@job[:before_reboot].uuid}")
+    fake_scancel = IO.popen("true")
+    IO.expects(:popen).
       with(['squeue', '-a', '-h', '-o', '%j']).
       returns(fake_squeue)
-    File.expects(:popen).
+    IO.expects(:popen).
       with(includes('sudo', '-u', 'foobar', 'scancel', '-n', @job[:before_reboot].uuid)).
       returns(fake_scancel)
     @dispatch.fail_jobs(before: Time.at(BOOT_TIME).to_s)
diff --git a/services/api/test/unit/log_test.rb b/services/api/test/unit/log_test.rb
index a1a19cad4b6dd0ed3a980fafd75e95cecc0ca103..7376876bb450ec92b8988e2ed56d92ce3f995d7a 100644
@@ -1,4 +1,5 @@
 require 'test_helper'
+require 'audit_logs'
 
 class LogTest < ActiveSupport::TestCase
   include CurrentApiClient
@@ -311,4 +312,81 @@ class LogTest < ActiveSupport::TestCase
       end
     end
   end
+
+  def assert_no_logs_deleted
+    logs_before = Log.unscoped.all.count
+    yield
+    assert_equal logs_before, Log.unscoped.all.count
+  end
+
+  def remaining_audit_logs
+    Log.unscoped.where('event_type in (?)', %w(create update destroy delete))
+  end
+
+  # Default settings should not delete anything -- some sites rely on
+  # the original "keep everything forever" behavior.
+  test 'retain old audit logs with default settings' do
+    assert_no_logs_deleted do
+      AuditLogs.delete_old(
+        max_age: Rails.configuration.max_audit_log_age,
+        max_batch: Rails.configuration.max_audit_log_delete_batch)
+    end
+  end
+
+  # Batch size 0 should retain all logs -- even if max_age is very
+  # short, and even if the default settings (and associated test) have
+  # changed.
+  test 'retain old audit logs with max_audit_log_delete_batch=0' do
+    assert_no_logs_deleted do
+      AuditLogs.delete_old(max_age: 1, max_batch: 0)
+    end
+  end
+
+  # We recommend a more conservative age of 5 minutes for production,
+  # but 3 minutes suits our test data better (and is test-worthy in
+  # that it's expected to work correctly in production).
+  test 'delete old audit logs with production settings' do
+    initial_log_count = Log.unscoped.all.count
+    AuditLogs.delete_old(max_age: 180, max_batch: 100000)
+    assert_operator remaining_audit_logs.count, :<, initial_log_count
+  end
+
+  test 'delete all audit logs in multiple batches' do
+    AuditLogs.delete_old(max_age: 0.00001, max_batch: 2)
+    assert_equal [], remaining_audit_logs.collect(&:uuid)
+  end
+
+  test 'delete old audit logs in thread' do
+    begin
+      Rails.configuration.max_audit_log_age = 20
+      Rails.configuration.max_audit_log_delete_batch = 100000
+      Rails.cache.delete 'AuditLogs'
+      initial_log_count = Log.unscoped.all.count + 1
+      act_as_system_user do
+        Log.create!()
+        initial_log_count += 1
+      end
+      deadline = Time.now + 10
+      while remaining_audit_logs.count == initial_log_count
+        if Time.now > deadline
+          raise "timed out"
+        end
+        sleep 0.1
+      end
+      assert_operator remaining_audit_logs.count, :<, initial_log_count
+    ensure
+      # The test framework rolls back our transactions, but that
+      # doesn't undo the deletes we did from separate threads.
+      ActiveRecord::Base.connection.exec_query 'ROLLBACK'
+      Thread.new do
+        begin
+          dc = DatabaseController.new
+          dc.define_singleton_method :render do |*args| end
+          dc.reset
+        ensure
+          ActiveRecord::Base.connection.close
+        end
+      end.join
+    end
+  end
 end
diff --git a/services/arv-git-httpd/arvados-git-httpd.service b/services/arv-git-httpd/arvados-git-httpd.service
index c41a5f3465d61403959a366565a89ec671af236e..034c0f3db122a9558c192e2fecdfc3c88c383289 100644
@@ -3,11 +3,16 @@ Description=Arvados git server
 Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/git-httpd/git-httpd.yml
+# systemd<230
+StartLimitInterval=0
+# systemd>=230
+StartLimitIntervalSec=0
 
 [Service]
 Type=notify
 ExecStart=/usr/bin/arvados-git-httpd
 Restart=always
+RestartSec=1
 
 [Install]
 WantedBy=multi-user.target
diff --git a/services/crunch-dispatch-slurm/crunch-dispatch-slurm.service b/services/crunch-dispatch-slurm/crunch-dispatch-slurm.service
index 34ba80b5c2e08c085959caa9e84c84238faea85e..80e4fb977e8bd7f7d53a1f46784214d6737c135c 100644
@@ -3,11 +3,16 @@ Description=Arvados Crunch Dispatcher for SLURM
 Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/crunch-dispatch-slurm/crunch-dispatch-slurm.yml
+# systemd<230
+StartLimitInterval=0
+# systemd>=230
+StartLimitIntervalSec=0
 
 [Service]
 Type=notify
 ExecStart=/usr/bin/crunch-dispatch-slurm
 Restart=always
+RestartSec=1
 
 [Install]
 WantedBy=multi-user.target
diff --git a/services/crunch-run/crunchrun.go b/services/crunch-run/crunchrun.go
index 3b3cdf1c14a872dacbc76b5679db9970609907dd..062126d6ff42e710523c5779ac2441c109f77f65 100644
@@ -33,6 +33,7 @@ type IArvadosClient interface {
        Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
        Update(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error
        Call(method, resourceType, uuid, action string, parameters arvadosclient.Dict, output interface{}) error
+       CallRaw(method string, resourceType string, uuid string, action string, parameters arvadosclient.Dict) (reader io.ReadCloser, err error)
        Discovery(key string) (interface{}, error)
 }
 
@@ -91,8 +92,6 @@ type ContainerRunner struct {
        CleanupTempDir []string
        Binds          []string
        OutputPDH      *string
-       CancelLock     sync.Mutex
-       Cancelled      bool
        SigChan        chan os.Signal
        ArvMountExit   chan error
        finalState     string
@@ -114,6 +113,13 @@ type ContainerRunner struct {
        // parent to be X" feature even on sites where the "specify
        // cgroup parent" feature breaks.
        setCgroupParent string
+
+       cStateLock sync.Mutex
+       cStarted   bool // StartContainer() succeeded
+       cCancelled bool // StopContainer() invoked
+
+       enableNetwork string // one of "default" or "always"
+       networkMode   string // passed through to HostConfig.NetworkMode
 }
 
 // SetupSignals sets up signal handling to gracefully terminate the underlying
@@ -133,13 +139,13 @@ func (runner *ContainerRunner) SetupSignals() {
 
 // stop the underlying Docker container.
 func (runner *ContainerRunner) stop() {
-       runner.CancelLock.Lock()
-       defer runner.CancelLock.Unlock()
-       if runner.Cancelled {
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       if runner.cCancelled {
                return
        }
-       runner.Cancelled = true
-       if runner.ContainerID != "" {
+       runner.cCancelled = true
+       if runner.cStarted {
                err := runner.Docker.StopContainer(runner.ContainerID, 10)
                if err != nil {
                        log.Printf("StopContainer failed: %s", err)
@@ -504,6 +510,98 @@ func (runner *ContainerRunner) StartCrunchstat() {
        runner.statReporter.Start()
 }
 
+type infoCommand struct {
+       label string
+       cmd   []string
+}
+
+// Gather node information and store it in the log for debugging
+// purposes.
+func (runner *ContainerRunner) LogNodeInfo() (err error) {
+       w := runner.NewLogWriter("node-info")
+       logger := log.New(w, "node-info", 0)
+
+       commands := []infoCommand{
+               infoCommand{
+                       label: "Host Information",
+                       cmd:   []string{"uname", "-a"},
+               },
+               infoCommand{
+                       label: "CPU Information",
+                       cmd:   []string{"cat", "/proc/cpuinfo"},
+               },
+               infoCommand{
+                       label: "Memory Information",
+                       cmd:   []string{"cat", "/proc/meminfo"},
+               },
+               infoCommand{
+                       label: "Disk Space",
+                       cmd:   []string{"df", "-m", "/", os.TempDir()},
+               },
+               infoCommand{
+                       label: "Disk INodes",
+                       cmd:   []string{"df", "-i", "/", os.TempDir()},
+               },
+       }
+
+       // Run commands with informational output to be logged.
+       var out []byte
+       for _, command := range commands {
+               out, err = exec.Command(command.cmd[0], command.cmd[1:]...).CombinedOutput()
+               if err != nil {
+                       return fmt.Errorf("While running command %q: %v",
+                               command.cmd, err)
+               }
+               logger.Println(command.label)
+               for _, line := range strings.Split(string(out), "\n") {
+                       logger.Println(" ", line)
+               }
+       }
+
+       err = w.Close()
+       if err != nil {
+               return fmt.Errorf("While closing node-info logs: %v", err)
+       }
+       return nil
+}
+
+// Get and save the raw JSON container record from the API server
+func (runner *ContainerRunner) LogContainerRecord() (err error) {
+       w := &ArvLogWriter{
+               runner.ArvClient,
+               runner.Container.UUID,
+               "container",
+               runner.LogCollection.Open("container.json"),
+       }
+       // Get Container record JSON from the API Server
+       reader, err := runner.ArvClient.CallRaw("GET", "containers", runner.Container.UUID, "", nil)
+       if err != nil {
+               return fmt.Errorf("While retrieving container record from the API server: %v", err)
+       }
+       defer reader.Close()
+       // Read the API server response as []byte
+       json_bytes, err := ioutil.ReadAll(reader)
+       if err != nil {
+               return fmt.Errorf("While reading container record API server response: %v", err)
+       }
+       // Decode the JSON []byte
+       var cr map[string]interface{}
+       if err = json.Unmarshal(json_bytes, &cr); err != nil {
+               return fmt.Errorf("While decoding the container record JSON response: %v", err)
+       }
+       // Re-encode it using indentation to improve readability
+       enc := json.NewEncoder(w)
+       enc.SetIndent("", "    ")
+       if err = enc.Encode(cr); err != nil {
+               return fmt.Errorf("While logging the JSON container record: %v", err)
+       }
+       err = w.Close()
+       if err != nil {
+               return fmt.Errorf("While closing container.json log: %v", err)
+       }
+       return nil
+}
+
 // AttachLogs connects the docker container stdout and stderr logs to the
 // Arvados logger which logs to Keep and the API server logs table.
 func (runner *ContainerRunner) AttachStreams() (err error) {
@@ -563,6 +661,15 @@ func (runner *ContainerRunner) CreateContainer() error {
        for k, v := range runner.Container.Environment {
                runner.ContainerConfig.Env = append(runner.ContainerConfig.Env, k+"="+v)
        }
+
+       runner.HostConfig = dockerclient.HostConfig{
+               Binds:        runner.Binds,
+               CgroupParent: runner.setCgroupParent,
+               LogConfig: dockerclient.LogConfig{
+                       Type: "none",
+               },
+       }
+
        if wantAPI := runner.Container.RuntimeConstraints.API; wantAPI != nil && *wantAPI {
                tok, err := runner.ContainerToken()
                if err != nil {
@@ -573,9 +680,13 @@ func (runner *ContainerRunner) CreateContainer() error {
                        "ARVADOS_API_HOST="+os.Getenv("ARVADOS_API_HOST"),
                        "ARVADOS_API_HOST_INSECURE="+os.Getenv("ARVADOS_API_HOST_INSECURE"),
                )
-               runner.ContainerConfig.NetworkDisabled = false
+               runner.HostConfig.NetworkMode = runner.networkMode
        } else {
-               runner.ContainerConfig.NetworkDisabled = true
+               if runner.enableNetwork == "always" {
+                       runner.HostConfig.NetworkMode = runner.networkMode
+               } else {
+                       runner.HostConfig.NetworkMode = "none"
+               }
        }
 
        var err error
@@ -584,24 +695,22 @@ func (runner *ContainerRunner) CreateContainer() error {
                return fmt.Errorf("While creating container: %v", err)
        }
 
-       runner.HostConfig = dockerclient.HostConfig{
-               Binds:        runner.Binds,
-               CgroupParent: runner.setCgroupParent,
-               LogConfig: dockerclient.LogConfig{
-                       Type: "none",
-               },
-       }
-
        return runner.AttachStreams()
 }
 
 // StartContainer starts the docker container created by CreateContainer.
 func (runner *ContainerRunner) StartContainer() error {
        runner.CrunchLog.Printf("Starting Docker container id '%s'", runner.ContainerID)
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       if runner.cCancelled {
+               return ErrCancelled
+       }
        err := runner.Docker.StartContainer(runner.ContainerID, &runner.HostConfig)
        if err != nil {
                return fmt.Errorf("could not start container: %v", err)
        }
+       runner.cStarted = true
        return nil
 }
 
@@ -846,9 +955,9 @@ func (runner *ContainerRunner) CommitLogs() error {
 
 // UpdateContainerRunning updates the container state to "Running"
 func (runner *ContainerRunner) UpdateContainerRunning() error {
-       runner.CancelLock.Lock()
-       defer runner.CancelLock.Unlock()
-       if runner.Cancelled {
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       if runner.cCancelled {
                return ErrCancelled
        }
        return runner.ArvClient.Update("containers", runner.Container.UUID,
@@ -892,9 +1001,9 @@ func (runner *ContainerRunner) UpdateContainerFinal() error {
 
 // IsCancelled returns the value of Cancelled, with goroutine safety.
 func (runner *ContainerRunner) IsCancelled() bool {
-       runner.CancelLock.Lock()
-       defer runner.CancelLock.Unlock()
-       return runner.Cancelled
+       runner.cStateLock.Lock()
+       defer runner.cStateLock.Unlock()
+       return runner.cCancelled
 }
 
 // NewArvLogWriter creates an ArvLogWriter
@@ -990,6 +1099,17 @@ func (runner *ContainerRunner) Run() (err error) {
                return
        }
 
+       // Gather and record node information
+       err = runner.LogNodeInfo()
+       if err != nil {
+               return
+       }
+       // Save container.json record on log collection
+       err = runner.LogContainerRecord()
+       if err != nil {
+               return
+       }
+
        runner.StartCrunchstat()
 
        if runner.IsCancelled() {
@@ -1037,6 +1157,14 @@ func main() {
        cgroupParent := flag.String("cgroup-parent", "docker", "name of container's parent cgroup (ignored if -cgroup-parent-subsystem is used)")
        cgroupParentSubsystem := flag.String("cgroup-parent-subsystem", "", "use current cgroup for given subsystem as parent cgroup for container")
        caCertsPath := flag.String("ca-certs", "", "Path to TLS root certificates")
+       enableNetwork := flag.String("container-enable-networking", "default",
+               `Specify if networking should be enabled for container.  One of 'default', 'always':
+       default: only enable networking if container requests it.
+       always:  containers always have networking enabled
+       `)
+       networkMode := flag.String("container-network-mode", "default",
+               `Set networking mode for container.  Corresponds to Docker network mode (--net).
+       `)
        flag.Parse()
 
        containerId := flag.Arg(0)
@@ -1068,6 +1196,8 @@ func main() {
        cr.statInterval = *statInterval
        cr.cgroupRoot = *cgroupRoot
        cr.expectCgroupParent = *cgroupParent
+       cr.enableNetwork = *enableNetwork
+       cr.networkMode = *networkMode
        if *cgroupParentSubsystem != "" {
                p := findCgroup(*cgroupParentSubsystem)
                cr.setCgroupParent = p
diff --git a/services/crunch-run/crunchrun_test.go b/services/crunch-run/crunchrun_test.go
index 36ddd36d31525533e604d3d9d521de404206c9d8..7224c4f1b3d622051e1506da7bc1116f443ac003 100644
@@ -11,6 +11,7 @@ import (
        "os"
        "os/exec"
        "path/filepath"
+       "runtime/pprof"
        "sort"
        "strings"
        "sync"
@@ -183,6 +184,21 @@ func (client *ArvTestClient) Call(method, resourceType, uuid, action string, par
        }
 }
 
+func (client *ArvTestClient) CallRaw(method, resourceType, uuid, action string,
+       parameters arvadosclient.Dict) (reader io.ReadCloser, err error) {
+       j := []byte(`{
+               "command": ["sleep", "1"],
+               "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+               "cwd": ".",
+               "environment": {},
+               "mounts": {"/tmp": {"kind": "tmp"} },
+               "output_path": "/tmp",
+               "priority": 1,
+               "runtime_constraints": {}
+       }`)
+       return ioutil.NopCloser(bytes.NewReader(j)), nil
+}
+
 func (client *ArvTestClient) Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error {
        if resourceType == "collections" {
                if uuid == hwPDH {
@@ -321,6 +337,11 @@ func (ArvErrorTestClient) Call(method, resourceType, uuid, action string, parame
        return errors.New("ArvError")
 }
 
+func (ArvErrorTestClient) CallRaw(method, resourceType, uuid, action string,
+       parameters arvadosclient.Dict) (reader io.ReadCloser, err error) {
+       return nil, errors.New("ArvError")
+}
+
 func (ArvErrorTestClient) Get(resourceType string, uuid string, parameters arvadosclient.Dict, output interface{}) error {
        return errors.New("ArvError")
 }
@@ -525,7 +546,7 @@ func (s *TestSuite) TestUpdateContainerCancelled(c *C) {
        api := &ArvTestClient{}
        kc := &KeepTestClient{}
        cr := NewContainerRunner(api, kc, nil, "zzzzz-zzzzz-zzzzzzzzzzzzzzz")
-       cr.Cancelled = true
+       cr.cCancelled = true
        cr.finalState = "Cancelled"
 
        err := cr.UpdateContainerFinal()
@@ -650,6 +671,56 @@ func (s *TestSuite) TestCrunchstat(c *C) {
        c.Check(api.Logs["crunchstat"].String(), Matches, `(?ms).*cgroup stats files never appeared for abcde\n`)
 }
 
+func (s *TestSuite) TestNodeInfoLog(c *C) {
+       api, _, _ := FullRunHelper(c, `{
+               "command": ["sleep", "1"],
+               "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+               "cwd": ".",
+               "environment": {},
+               "mounts": {"/tmp": {"kind": "tmp"} },
+               "output_path": "/tmp",
+               "priority": 1,
+               "runtime_constraints": {}
+       }`, nil, func(t *TestDockerClient) {
+               time.Sleep(time.Second)
+               t.logWriter.Close()
+               t.finish <- dockerclient.WaitResult{}
+       })
+
+       c.Check(api.CalledWith("container.exit_code", 0), NotNil)
+       c.Check(api.CalledWith("container.state", "Complete"), NotNil)
+
+       c.Assert(api.Logs["node-info"], NotNil)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Host Information.*`)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*CPU Information.*`)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Memory Information.*`)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Disk Space.*`)
+       c.Check(api.Logs["node-info"].String(), Matches, `(?ms).*Disk INodes.*`)
+}
+
+func (s *TestSuite) TestContainerRecordLog(c *C) {
+       api, _, _ := FullRunHelper(c, `{
+               "command": ["sleep", "1"],
+               "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+               "cwd": ".",
+               "environment": {},
+               "mounts": {"/tmp": {"kind": "tmp"} },
+               "output_path": "/tmp",
+               "priority": 1,
+               "runtime_constraints": {}
+       }`, nil, func(t *TestDockerClient) {
+               time.Sleep(time.Second)
+               t.logWriter.Close()
+               t.finish <- dockerclient.WaitResult{}
+       })
+
+       c.Check(api.CalledWith("container.exit_code", 0), NotNil)
+       c.Check(api.CalledWith("container.state", "Complete"), NotNil)
+
+       c.Assert(api.Logs["container"], NotNil)
+       c.Check(api.Logs["container"].String(), Matches, `(?ms).*container_image.*`)
+}
+
 func (s *TestSuite) TestFullRunStderr(c *C) {
        api, _, _ := FullRunHelper(c, `{
     "command": ["/bin/sh", "-c", "echo hello ; echo world 1>&2 ; exit 1"],
@@ -722,7 +793,7 @@ func (s *TestSuite) TestFullRunSetCwd(c *C) {
 func (s *TestSuite) TestStopOnSignal(c *C) {
        s.testStopContainer(c, func(cr *ContainerRunner) {
                go func() {
-                       for cr.ContainerID == "" {
+                       for !cr.cStarted {
                                time.Sleep(time.Millisecond)
                        }
                        cr.SigChan <- syscall.SIGINT
@@ -776,6 +847,7 @@ func (s *TestSuite) testStopContainer(c *C, setup func(cr *ContainerRunner)) {
        }()
        select {
        case <-time.After(20 * time.Second):
+               pprof.Lookup("goroutine").WriteTo(os.Stderr, 1)
                c.Fatal("timed out")
        case err = <-done:
                c.Check(err, IsNil)
diff --git a/services/dockercleaner/arvados-docker-cleaner.service b/services/dockercleaner/arvados-docker-cleaner.service
index 031058242a21179f15a0ef36ce9456f657811338..91b6b9973d8c946f1e04e48a010c428e9b4c555e 100644
@@ -3,6 +3,10 @@ Description=Arvados Docker Image Cleaner
 Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/docker-cleaner/docker-cleaner.json
+# systemd<230
+StartLimitInterval=0
+# systemd>=230
+StartLimitIntervalSec=0
 
 [Service]
 Type=simple
diff --git a/services/keep-balance/keep-balance.service b/services/keep-balance/keep-balance.service
index a6f5b6e349a69e2c12b297f048f5e4e7d3d2decf..9f1cee3d903b2e8387439b8ae35432a35897fa2e 100644
@@ -3,6 +3,10 @@ Description=Arvados Keep Balance
 Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/keep-balance/keep-balance.yml
+# systemd<230
+StartLimitInterval=0
+# systemd>=230
+StartLimitIntervalSec=0
 
 [Service]
 Type=simple
diff --git a/services/keep-web/keep-web.service b/services/keep-web/keep-web.service
index 24be7714d5881cf2897cb61b9ddbc751ed215392..3f6c41743e7a1f964efe5f33afeae75f31b7b8ec 100644
@@ -3,11 +3,16 @@ Description=Arvados Keep web gateway
 Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/keep-web/keep-web.yml
+# systemd<230
+StartLimitInterval=0
+# systemd>=230
+StartLimitIntervalSec=0
 
 [Service]
 Type=notify
 ExecStart=/usr/bin/keep-web
 Restart=always
+RestartSec=1
 
 [Install]
 WantedBy=multi-user.target
diff --git a/services/keepproxy/keepproxy.service b/services/keepproxy/keepproxy.service
index c340fabc0e961c541af82ce9fca8d1dc04b292ca..887df22634e67f9240a23b2029d8bf67725a43b4 100644
@@ -3,11 +3,16 @@ Description=Arvados Keep Proxy
 Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/keepproxy/keepproxy.yml
+# systemd<230
+StartLimitInterval=0
+# systemd>=230
+StartLimitIntervalSec=0
 
 [Service]
 Type=notify
 ExecStart=/usr/bin/keepproxy
 Restart=always
+RestartSec=1
 
 [Install]
 WantedBy=multi-user.target
diff --git a/services/keepstore/keepstore.service b/services/keepstore/keepstore.service
index b9e2793e1f7874842e063544365545d4bef44ee4..2ba19c6ba0b5982d52c6cbcef3ed3d2b74f71000 100644
@@ -3,11 +3,16 @@ Description=Arvados Keep Storage Daemon
 Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/keepstore/keepstore.yml
+# systemd<230
+StartLimitInterval=0
+# systemd>=230
+StartLimitIntervalSec=0
 
 [Service]
 Type=notify
 ExecStart=/usr/bin/keepstore
 Restart=always
+RestartSec=1
 
 [Install]
 WantedBy=multi-user.target
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index fc3ff05777ab2a550263554ca4225c961b140a53..71f9083c01a3e99b35020c69cf2dee301f7e848e 100644
@@ -348,10 +348,6 @@ class ComputeNodeMonitorActor(config.actor_class):
         if self.arvados_node is None:
             return 'unpaired'
 
-        # This node is indicated as non-functioning by the cloud
-        if self._cloud.broken(self.cloud_node):
-            return 'down'
-
         state = self.arvados_node['crunch_worker_state']
 
         # If state information is not available because it is missing or the
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index b4f17849f1e77b296ee08ed632b2dab47b1e173b..f23b2615e29876aeb5689c79959fe605d09d7e11 100644
@@ -232,7 +232,7 @@ class NodeManagerDaemonActor(actor_class):
     def try_pairing(self):
         for record in self.cloud_nodes.unpaired():
             for arv_rec in self.arvados_nodes.unpaired():
-                if record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
+                if record.actor is not None and record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
                     self._pair_nodes(record, arv_rec.arvados_node)
                     break
 
@@ -426,16 +426,25 @@ class NodeManagerDaemonActor(actor_class):
 
     @_check_poll_freshness
     def node_can_shutdown(self, node_actor):
-        if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
-            self._begin_node_shutdown(node_actor, cancellable=True)
-        elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
-            # Node is unpaired, which means it probably exceeded its booting
-            # grace period without a ping, so shut it down so we can boot a new
-            # node in its place.
-            self._begin_node_shutdown(node_actor, cancellable=False)
-        elif node_actor.in_state('down').get():
-            # Node is down and unlikely to come back.
-            self._begin_node_shutdown(node_actor, cancellable=False)
+        try:
+            if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
+                self._begin_node_shutdown(node_actor, cancellable=True)
+            elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
+                # Node is unpaired, which means it probably exceeded its booting
+                # grace period without a ping, so shut it down so we can boot a new
+                # node in its place.
+                self._begin_node_shutdown(node_actor, cancellable=False)
+            elif node_actor.in_state('down').get():
+                # Node is down and unlikely to come back.
+                self._begin_node_shutdown(node_actor, cancellable=False)
+        except pykka.ActorDeadError as e:
+            # The monitor actor sends shutdown suggestions every time the
+            # node's state is updated, and these go into the daemon actor's
+            # message queue.  It's possible that the node has already been shut
+            # down (which shuts down the node monitor actor).  In that case,
+            # this message is stale and we'll get ActorDeadError when we try to
+            # access node_actor.  Log the error.
+            self._logger.debug("ActorDeadError in node_can_shutdown: %s", e)
 
     def node_finished_shutdown(self, shutdown_actor):
         try:
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index f1d168c1e647d132d179fd179862a3d97c121ed2..e49fc39eed3dad01be48004047341ec650a81a5e 100644
@@ -529,27 +529,6 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.daemon.node_finished_shutdown(self.last_shutdown).get(self.TIMEOUT)
         self.assertEqual(0, self.alive_monitor_count())
 
-    def test_broken_node_not_counted(self):
-        size = testutil.MockSize(8)
-        cloud_node = testutil.cloud_node_mock(8, size=size)
-        wishlist = [size]
-        self.make_daemon([cloud_node], [testutil.arvados_node_mock(8)],
-                         wishlist, avail_sizes=[(size, {"cores":1})])
-        self.assertEqual(1, self.alive_monitor_count())
-        self.assertFalse(self.node_setup.start.called)
-        monitor = self.monitor_list()[0].proxy()
-        shutdown_proxy = self.node_shutdown.start().proxy
-        shutdown_proxy().cloud_node.get.return_value = cloud_node
-        shutdown_proxy().success.get.return_value = False
-        self.cloud_factory().broken.return_value = True
-        self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.daemon.node_finished_shutdown(shutdown_proxy()).get(self.TIMEOUT)
-        self.daemon.update_cloud_nodes([cloud_node]).get(self.TIMEOUT)
-        self.daemon.update_server_wishlist(wishlist).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertEqual(1, self.node_setup.start.call_count)
-
     def test_nodes_shutting_down_replaced_below_max_nodes(self):
         size = testutil.MockSize(6)
         cloud_node = testutil.cloud_node_mock(6, size=size)
diff --git a/services/ws/arvados-ws.service b/services/ws/arvados-ws.service
index ebccf0c89d2aa942da613848a99698ae12f6504c..9d37f31170cbe02bd97484cd59c97391e38d1a11 100644
@@ -3,11 +3,16 @@ Description=Arvados websocket server
 Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/ws/ws.yml
+# systemd<230
+StartLimitInterval=0
+# systemd>=230
+StartLimitIntervalSec=0
 
 [Service]
 Type=notify
 ExecStart=/usr/bin/arvados-ws
 Restart=always
+RestartSec=1
 
 [Install]
 WantedBy=multi-user.target