17574: Merge branch 'main'
author    Tom Clegg <tom@tomclegg.ca>
          Wed, 4 Aug 2021 14:02:15 +0000 (10:02 -0400)
committer Tom Clegg <tom@tomclegg.ca>
          Wed, 4 Aug 2021 14:02:15 +0000 (10:02 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

72 files changed:
build/package-build-dockerfiles/Makefile
build/package-build-dockerfiles/debian11/Dockerfile [moved from build/package-build-dockerfiles/ubuntu1604/Dockerfile with 78% similarity]
build/package-test-dockerfiles/Makefile
build/package-test-dockerfiles/debian11/Dockerfile [moved from build/package-test-dockerfiles/ubuntu1604/Dockerfile with 69% similarity]
build/package-test-dockerfiles/ubuntu1604/etc-apt-preferences.d-arvados [deleted file]
build/package-testing/test-packages-debian11.sh [moved from build/package-testing/test-packages-debian8.sh with 100% similarity]
build/package-testing/test-packages-debian9.sh [deleted symlink]
build/package-testing/test-packages-ubuntu1404.sh [deleted symlink]
build/package-testing/test-packages-ubuntu1604.sh [deleted symlink]
build/run-build-packages-one-target.sh
build/run-build-packages.sh
build/run-library.sh
cmd/arvados-server/arvados-controller.service
cmd/arvados-server/arvados-dispatch-cloud.service
cmd/arvados-server/arvados-dispatch-lsf.service [new file with mode: 0644]
cmd/arvados-server/arvados-ws.service
cmd/arvados-server/cmd.go
doc/_config.yml
doc/_includes/_install_ruby_and_bundler.liquid
doc/api/keep-web-urls.html.textile.liquid
doc/install/crunch2-lsf/install-dispatch.html.textile.liquid [new file with mode: 0644]
doc/install/crunch2-slurm/install-dispatch.html.textile.liquid
doc/install/install-keep-web.html.textile.liquid
doc/install/install-manual-prerequisites.html.textile.liquid
doc/install/packages.html.textile.liquid
lib/config/config.default.yml
lib/config/export.go
lib/config/generated_config.go
lib/controller/integration_test.go
lib/crunchrun/crunchrun.go
lib/crunchrun/crunchrun_test.go
lib/crunchrun/docker.go
lib/crunchrun/executor.go
lib/crunchrun/executor_test.go
lib/crunchrun/singularity.go
lib/lsf/dispatch.go [new file with mode: 0644]
lib/lsf/dispatch_test.go [new file with mode: 0644]
lib/lsf/lsfcli.go [new file with mode: 0644]
lib/lsf/lsfqueue.go [new file with mode: 0644]
sdk/cwl/fpm-info.sh
sdk/go/arvados/config.go
sdk/go/arvados/container.go
sdk/go/arvadostest/fixtures.go
sdk/go/dispatch/dispatch.go
sdk/go/dispatch/dispatch_test.go
sdk/go/health/aggregator_test.go
sdk/go/httpserver/id_generator.go
sdk/go/keepclient/keepclient.go
sdk/go/keepclient/keepclient_test.go
sdk/go/keepclient/support.go
services/api/app/controllers/application_controller.rb
services/api/app/models/container.rb
services/api/config/initializers/request_id_middleware.rb [new file with mode: 0644]
services/api/test/functional/application_controller_test.rb
services/api/test/integration/errors_test.rb
services/arv-git-httpd/arvados-git-httpd.service
services/crunch-dispatch-local/crunch-dispatch-local.go
services/crunch-dispatch-local/crunch-dispatch-local.service
services/crunch-dispatch-local/crunch-dispatch-local_test.go
services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
services/crunch-dispatch-slurm/crunch-dispatch-slurm.service
services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
services/dockercleaner/arvados-docker-cleaner.service
services/dockercleaner/fpm-info.sh
services/health/arvados-health.service
services/keep-balance/keep-balance.service
services/keep-web/keep-web.service
services/keepproxy/keepproxy.service
services/keepproxy/keepproxy_test.go
services/keepstore/keepstore.service
services/login-sync/bin/arvados-login-sync
tools/salt-install/config_examples/multi_host/aws/pillars/arvados.sls

index 161a50406998f0d486aa721f429397a5993f5003..80f7c12c101ceb3910fdde2a17b793d4841069cb 100644 (file)
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-all: centos7/generated debian10/generated ubuntu1604/generated ubuntu1804/generated ubuntu2004/generated
+all: centos7/generated debian10/generated debian11/generated ubuntu1804/generated ubuntu2004/generated
 
 centos7/generated: common-generated-all
        test -d centos7/generated || mkdir centos7/generated
@@ -12,9 +12,9 @@ debian10/generated: common-generated-all
        test -d debian10/generated || mkdir debian10/generated
        cp -f -rlt debian10/generated common-generated/*
 
-ubuntu1604/generated: common-generated-all
-       test -d ubuntu1604/generated || mkdir ubuntu1604/generated
-       cp -f -rlt ubuntu1604/generated common-generated/*
+debian11/generated: common-generated-all
+       test -d debian11/generated || mkdir debian11/generated
+       cp -f -rlt debian11/generated common-generated/*
 
 ubuntu1804/generated: common-generated-all
        test -d ubuntu1804/generated || mkdir ubuntu1804/generated
similarity index 78%
rename from build/package-build-dockerfiles/ubuntu1604/Dockerfile
rename to build/package-build-dockerfiles/debian11/Dockerfile
index 77df54dcbd11f45c31f062a0ee8bc10619bfc824..99effa3cca6c922b7a6f237b92bb895082b3ed46 100644 (file)
@@ -2,13 +2,14 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-FROM ubuntu:xenial
+## Don't use debian:11 here: the codename 'bullseye' is needed to select rvm precompiled binaries
+FROM debian:bullseye
 MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 
 ENV DEBIAN_FRONTEND noninteractive
 
 # Install dependencies.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python3 python-setuptools python3-setuptools python3-pip libcurl4-gnutls-dev libgnutls-dev curl git libattr1-dev libfuse-dev libpq-dev unzip tzdata python3-venv python3-dev libpam-dev
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python3 python3-setuptools python3-pip libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev unzip python3-venv python3-dev libpam-dev equivs
 
 # Install virtualenv
 RUN /usr/bin/pip3 install 'virtualenv<20'
@@ -21,6 +22,7 @@ RUN gpg --import --no-tty /tmp/mpapis.asc && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.5 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.5 && \
+    echo "gem: --no-document" >> /etc/gemrc && \
     /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.2.19 && \
     /usr/local/rvm/bin/rvm-exec default gem install fpm --version 1.10.2
 
@@ -35,4 +37,4 @@ RUN ln -s /usr/local/node-v10.23.1-linux-x64/bin/* /usr/local/bin/
 RUN git clone --depth 1 git://git.arvados.org/arvados.git /tmp/arvados && cd /tmp/arvados/services/api && /usr/local/rvm/bin/rvm-exec default bundle && cd /tmp/arvados/apps/workbench && /usr/local/rvm/bin/rvm-exec default bundle
 
 ENV WORKSPACE /arvados
-CMD ["/usr/local/rvm/bin/rvm-exec", "default", "bash", "/jenkins/run-build-packages.sh", "--target", "ubuntu1604"]
+CMD ["/usr/local/rvm/bin/rvm-exec", "default", "bash", "/jenkins/run-build-packages.sh", "--target", "debian11"]
index 227b74bbab35faa2f7c12fe939e03fc51d2487de..849decb9a5016bb2676ea4a9e0e92ba639edd6d9 100644 (file)
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-all: centos7/generated debian10/generated ubuntu1604/generated ubuntu1804/generated ubuntu2004/generated
+all: centos7/generated debian10/generated debian11/generated ubuntu1804/generated ubuntu2004/generated
 
 centos7/generated: common-generated-all
        test -d centos7/generated || mkdir centos7/generated
@@ -12,9 +12,9 @@ debian10/generated: common-generated-all
        test -d debian10/generated || mkdir debian10/generated
        cp -f -rlt debian10/generated common-generated/*
 
-ubuntu1604/generated: common-generated-all
-       test -d ubuntu1604/generated || mkdir ubuntu1604/generated
-       cp -f -rlt ubuntu1604/generated common-generated/*
+debian11/generated: common-generated-all
+       test -d debian11/generated || mkdir debian11/generated
+       cp -f -rlt debian11/generated common-generated/*
 
 ubuntu1804/generated: common-generated-all
        test -d ubuntu1804/generated || mkdir ubuntu1804/generated
similarity index 69%
rename from build/package-test-dockerfiles/ubuntu1604/Dockerfile
rename to build/package-test-dockerfiles/debian11/Dockerfile
index 0b03e412a120f22bcf999a8aa17ad077e01e4a5c..7cc543cf0d67619f7107d9dacd07497b06c07b41 100644 (file)
@@ -2,14 +2,14 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-FROM ubuntu:xenial
+FROM debian:bullseye
 MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 
 ENV DEBIAN_FRONTEND noninteractive
 
 # Install dependencies
 RUN apt-get update && \
-    apt-get -y install --no-install-recommends curl ca-certificates
+    apt-get -y install --no-install-recommends curl ca-certificates gpg procps gpg-agent
 
 # Install RVM
 ADD generated/mpapis.asc /tmp/
@@ -19,14 +19,10 @@ RUN gpg --import --no-tty /tmp/mpapis.asc && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.5 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.5 && \
+    echo "gem: --no-document" >> /etc/gemrc && \
     /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.2.19
 
 # udev daemon can't start in a container, so don't try.
 RUN mkdir -p /etc/udev/disabled
 
-RUN echo "deb file:///arvados/packages/ubuntu1604/ /" >>/etc/apt/sources.list
-
-# Add preferences file for the Arvados packages. This pins Arvados
-# packages at priority 501, so that older python dependency versions
-# are preferred in those cases where we need them
-ADD etc-apt-preferences.d-arvados /etc/apt/preferences.d/arvados
+RUN echo "deb file:///arvados/packages/debian11/ /" >>/etc/apt/sources.list
diff --git a/build/package-test-dockerfiles/ubuntu1604/etc-apt-preferences.d-arvados b/build/package-test-dockerfiles/ubuntu1604/etc-apt-preferences.d-arvados
deleted file mode 100644 (file)
index 9e24695..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-Package: *
-Pin: release o=Arvados
-Pin-Priority: 501
diff --git a/build/package-testing/test-packages-debian9.sh b/build/package-testing/test-packages-debian9.sh
deleted file mode 120000 (symlink)
index 54ce94c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-deb-common-test-packages.sh
\ No newline at end of file
diff --git a/build/package-testing/test-packages-ubuntu1404.sh b/build/package-testing/test-packages-ubuntu1404.sh
deleted file mode 120000 (symlink)
index 54ce94c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-deb-common-test-packages.sh
\ No newline at end of file
diff --git a/build/package-testing/test-packages-ubuntu1604.sh b/build/package-testing/test-packages-ubuntu1604.sh
deleted file mode 120000 (symlink)
index 54ce94c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-deb-common-test-packages.sh
\ No newline at end of file
index 8365fecadbd29705924623374ba21763f934185e..81aac9c616c11ea2894482b240c08495a577511d 100755 (executable)
@@ -106,6 +106,9 @@ while [ $# -gt 0 ]; do
             elif ! [[ "$2" =~ (.*)-(.*) ]]; then
                 echo >&2 "FATAL: --build-version '$2' does not include an iteration. Try '${2}-1'?"
                 exit 1
+            elif ! [[ "$2" =~ ^[0-9]+\.[0-9]+\.[0-9]+(\.[0-9]+|)(~rc[0-9]+|~dev[0-9]+|)-[0-9]+$ ]]; then
+                echo >&2 "FATAL: --build-version '$2' is invalid, must match pattern ^[0-9]+\.[0-9]+\.[0-9]+(\.[0-9]+|)(~rc[0-9]+|~dev[0-9]+|)-[0-9]+$"
+                exit 1
             else
                 ARVADOS_BUILDING_VERSION="${BASH_REMATCH[1]}"
                 ARVADOS_BUILDING_ITERATION="${BASH_REMATCH[2]}"
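For reference, a minimal Go sketch (not part of this change; the sample version strings are hypothetical) of the same version/iteration pattern that run-build-packages-one-target.sh now enforces for --build-version:

package main

import (
	"fmt"
	"regexp"
)

// buildVersionRe mirrors the shell check above: X.Y.Z[.N][~rcN|~devN]-ITERATION
var buildVersionRe = regexp.MustCompile(`^[0-9]+\.[0-9]+\.[0-9]+(\.[0-9]+|)(~rc[0-9]+|~dev[0-9]+|)-[0-9]+$`)

func main() {
	// Hypothetical sample inputs: the first two match, the last lacks an iteration.
	for _, v := range []string{"2.2.1-1", "2.3.0~rc2-1", "2.2.1"} {
		fmt.Printf("%-12s valid=%v\n", v, buildVersionRe.MatchString(v))
	}
}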
@@ -194,6 +197,7 @@ if test -z "$packages" ; then
         arvados-client
         arvados-controller
         arvados-dispatch-cloud
+        arvados-dispatch-lsf
         arvados-docker-cleaner
         arvados-git-httpd
         arvados-health
index d46c246da7c9fadb0fe96fbe7edce0f4c623c351..7829c8c6cd61792535960a153bb20baf1b7e1622 100755 (executable)
@@ -277,6 +277,8 @@ package_go_binary cmd/arvados-server arvados-controller \
     "Arvados cluster controller daemon"
 package_go_binary cmd/arvados-server arvados-dispatch-cloud \
     "Arvados cluster cloud dispatch"
+package_go_binary cmd/arvados-server arvados-dispatch-lsf \
+    "Dispatch Arvados containers to an LSF cluster"
 package_go_binary services/arv-git-httpd arvados-git-httpd \
     "Provide authenticated http access to Arvados-hosted git repositories"
 package_go_binary services/crunch-dispatch-local crunch-dispatch-local \
index 0c3cbde8d86cafdba972d0f45cd245033b49a3b4..37a3d4b513794397c83b94686acc2cb79c7309cd 100755 (executable)
@@ -337,7 +337,7 @@ test_package_presence() {
     elif [[ "$FORMAT" == "deb" ]]; then
       declare -A dd
       dd[debian10]=buster
-      dd[ubuntu1604]=xenial
+      dd[debian11]=bullseye
       dd[ubuntu1804]=bionic
       dd[ubuntu2004]=focal
       D=${dd[$TARGET]}
index 1a43d7cd3a72bb31a1e1ef2152e152c75fe2c6aa..420cbb035a7e7177f84ef7a9ca07117d70e37e5f 100644 (file)
@@ -8,9 +8,6 @@ Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/config.yml
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index d3f476b7df725c2597f3c972e476d4a51ff86165..8d57e8a1612ca49ecc9d98b44a716eb1485e2640 100644 (file)
@@ -8,9 +8,6 @@ Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/config.yml
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
diff --git a/cmd/arvados-server/arvados-dispatch-lsf.service b/cmd/arvados-server/arvados-dispatch-lsf.service
new file mode 100644 (file)
index 0000000..65d8786
--- /dev/null
@@ -0,0 +1,27 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+[Unit]
+Description=arvados-dispatch-lsf
+Documentation=https://doc.arvados.org/
+After=network.target
+AssertPathExists=/etc/arvados/config.yml
+
+# systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
+StartLimitIntervalSec=0
+
+[Service]
+Type=notify
+EnvironmentFile=-/etc/arvados/environment
+ExecStart=/usr/bin/arvados-dispatch-lsf
+# Set a reasonable default for the open file limit
+LimitNOFILE=65536
+Restart=always
+RestartSec=1
+
+# systemd<=219 (centos:7, debian:8, ubuntu:trusty) obeys StartLimitInterval in the [Service] section
+StartLimitInterval=0
+
+[Install]
+WantedBy=multi-user.target
index aebc56a79f333b19f061f5f0aadce793e799529c..f73db5d08032369c619e42429ddf7a68550b8551 100644 (file)
@@ -8,9 +8,6 @@ Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/config.yml
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index d0aa9da94df537bf80a3ed232c2a0ae2c3a0e1d6..4b94a7813869915c38c14ec7927a8a2662e30475 100644 (file)
@@ -15,6 +15,7 @@ import (
        "git.arvados.org/arvados.git/lib/crunchrun"
        "git.arvados.org/arvados.git/lib/dispatchcloud"
        "git.arvados.org/arvados.git/lib/install"
+       "git.arvados.org/arvados.git/lib/lsf"
        "git.arvados.org/arvados.git/lib/recovercollection"
        "git.arvados.org/arvados.git/services/ws"
 )
@@ -33,6 +34,7 @@ var (
                "controller":         controller.Command,
                "crunch-run":         crunchrun.Command,
                "dispatch-cloud":     dispatchcloud.Command,
+               "dispatch-lsf":       lsf.DispatchCommand,
                "install":            install.Command,
                "init":               install.InitCommand,
                "recover-collection": recovercollection.Command,
index 39fe22fde3e1379205a26e0e5f28ef398c3d3980..b18607ebb7490622d38e119ad6a0f0383fdb35ba 100644 (file)
@@ -254,6 +254,8 @@ navbar:
       - install/crunch2-slurm/configure-slurm.html.textile.liquid
       - install/crunch2-slurm/install-compute-node.html.textile.liquid
       - install/crunch2-slurm/install-test.html.textile.liquid
+    - Containers API (lsf):
+      - install/crunch2-lsf/install-dispatch.html.textile.liquid
     - Additional configuration:
       - install/container-shell-access.html.textile.liquid
     - External dependencies:
index 387f023f24ddb9f6feb6ba9af0f012a355029c39..fe7714c62feae18bd12fa6c85890ec7ba14182a3 100644 (file)
@@ -22,8 +22,6 @@ The Ruby version shipped with Centos 7 is too old.  Use "RVM":#rvm to install Ru
 
 h3. Debian and Ubuntu
 
-Ubuntu 16.04 (xenial) ships with Ruby 2.3, which is not supported by Arvados.  Use "RVM":#rvm to install Ruby 2.5 or later.
-
 Debian 10 (buster) and Ubuntu 18.04 (bionic) and later ship with Ruby 2.5, which is supported by Arvados.
 
 <notextile>
index 1770a259b7b0cc3c213bf6ca52f89d3d54413093..ed0bde8137334f0850b980e71742399b93943000 100644 (file)
@@ -14,7 +14,7 @@ Files served by @keep-web@ can be rendered directly in the browser, or @keep-web
 
 When serving files that will render directly in the browser, it is important to properly configure the keep-web service to mitigate cross-site-scripting (XSS) attacks.  An HTML page can be stored in a collection.  If an attacker causes a victim to visit that page through Workbench, the HTML will be rendered by the browser.  If all collections are served at the same domain, the browser will consider collections as coming from the same origin, which will grant access to the same browsing data (cookies and local storage).  This would enable malicious Javascript on that page to access Arvados on behalf of the victim.
 
-This can be mitigated by having separate domains for each collection, or limiting preview to circumstances where the collection is not accessed with the user's regular full-access token.  For cluster administrators that understand the risks, this protection can also be turned off.
+This can be mitigated by having separate domains for each collection, or limiting preview to circumstances where the collection is not accessed with the user's regular full-access token.  For clusters where this risk is acceptable, this protection can also be turned off by setting the @Collections/TrustAllContent@ configuration flag to true; see the "configuration reference":../admin/config.html for more detail.
 
 The following "same origin" URL patterns are supported for public collections and collections shared anonymously via secret links (i.e., collections which can be served by keep-web without making use of any implicit credentials like cookies). See "Same-origin URLs" below.
 
@@ -82,4 +82,4 @@ When a client passes a token in the URL, keep-web sends a redirect response plac
 
 This mainly affects Workbench's ability to show inline content, so it should be taken into account when configuring both services' URL schemes.
 
-You can read more about the definition of a _same-site_ request at the "RFC 6265bis-03 page":https://tools.ietf.org/html/draft-ietf-httpbis-rfc6265bis-03#section-5.2
\ No newline at end of file
+You can read more about the definition of a _same-site_ request at the "RFC 6265bis-03 page":https://tools.ietf.org/html/draft-ietf-httpbis-rfc6265bis-03#section-5.2
diff --git a/doc/install/crunch2-lsf/install-dispatch.html.textile.liquid b/doc/install/crunch2-lsf/install-dispatch.html.textile.liquid
new file mode 100644 (file)
index 0000000..66b562d
--- /dev/null
@@ -0,0 +1,112 @@
+---
+layout: default
+navsection: installguide
+title: Install the LSF dispatcher
+...
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
+
+{% include 'notebox_begin_warning' %}
+arvados-dispatch-lsf is only relevant for on-premises clusters that will spool jobs to LSF. Skip this section if you are installing a cloud cluster.
+{% include 'notebox_end' %}
+
+Containers can be dispatched to an LSF cluster.  The dispatcher sends work to the cluster using LSF's @bsub@ command, so it works in a variety of LSF configurations.
+
+*LSF support is currently considered experimental.*
+
+Limitations include:
+* Arvados container priority is not propagated to LSF job priority. This can cause inefficient use of compute resources, and even deadlock if there are fewer compute nodes than concurrent Arvados workflows.
+* Combining LSF with Docker may not work, depending on LSF configuration and user/group IDs (if LSF only sets up the configured user's primary group ID when executing the crunch-run process on a compute node, crunch-run may not have permission to connect to the Docker daemon).
+
+In order to run containers, you must choose a user that has permission to set up FUSE mounts and run Singularity/Docker containers on each compute node.  This install guide refers to this user as the @crunch@ user.  We recommend you create this user on each compute node with the same UID and GID, and add it to the @fuse@ and @docker@ system groups to grant it the necessary permissions.  However, you can run the dispatcher under any account with sufficient permissions across the cluster.
+
+Set up all of your compute nodes "as you would for a SLURM cluster":../crunch2-slurm/install-compute-node.html.
+
+
+h2(#update-config). Update config.yml
+
+Arvados-dispatch-lsf reads the common configuration file at @/etc/arvados/config.yml@.
+
+Review the following configuration parameters and adjust as needed.
+
+
+h3(#BsubSudoUser). Containers.LSF.BsubSudoUser
+
+arvados-dispatch-lsf uses @sudo@ to execute @bsub@, for example @sudo -E -u crunch bsub [...]@. This means the @crunch@ account must exist on the hosts where LSF jobs run ("execution hosts"), as well as on the host where you are installing the Arvados LSF dispatcher (the "submission host"). To use a user account other than @crunch@, configure @BsubSudoUser@:
+
+<notextile>
+<pre>    Containers:
+      LSF:
+        <code class="userinput">BsubSudoUser: <b>lsfuser</b>
+</code></pre>
+</notextile>
+
+Alternatively, you can arrange for the arvados-dispatch-lsf process to run as an unprivileged user that has a corresponding account on all compute nodes, and disable the use of @sudo@ by specifying an empty string:
+
+<notextile>
+<pre>    Containers:
+      LSF:
+        # Don't use sudo
+        <code class="userinput">BsubSudoUser: <b>""</b>
+</code></pre>
+</notextile>
+
+
+h3(#BsubArgumentsList). Containers.LSF.BsubArgumentsList
+
+When arvados-dispatch-lsf invokes @bsub@, you can add arguments to the command by specifying @BsubArgumentsList@.  You can use this to send the jobs to specific cluster partitions or add resource requests.  Set @BsubArgumentsList@ to an array of strings.  For example:
+
+<notextile>
+<pre>    Containers:
+      LSF:
+        <code class="userinput">BsubArgumentsList: <b>["-C", "0"]</b></code>
+</pre>
+</notextile>
+
+
+h3(#PollPeriod). Containers.PollInterval
+
+arvados-dispatch-lsf polls the API server periodically for new containers to run.  The @PollInterval@ option controls how often this poll happens.  Set this to a number suffixed with one of the time units @s@, @m@, or @h@.  For example:
+
+<notextile>
+<pre>    Containers:
+      <code class="userinput">PollInterval: <b>10s</b>
+</code></pre>
+</notextile>
+
+
+h3(#ReserveExtraRAM). Containers.ReserveExtraRAM: Extra RAM for jobs
+
+Extra RAM to reserve (in bytes) on each LSF job submitted by Arvados, which is added to the amount specified in the container's @runtime_constraints@.  If not provided, the default value is zero.
+
+Supports suffixes @KB@, @KiB@, @MB@, @MiB@, @GB@, @GiB@, @TB@, @TiB@, @PB@, @PiB@, @EB@, @EiB@ (where @KB@ is 10[^3^], @KiB@ is 2[^10^], @MB@ is 10[^6^], @MiB@ is 2[^20^] and so forth).
+
+<notextile>
+<pre>    Containers:
+      <code class="userinput">ReserveExtraRAM: <b>256MiB</b></code>
+</pre>
+</notextile>
+
+
+h3(#CrunchRunCommand-network). Containers.CrunchRunArgumentsList: Using host networking for containers
+
+Older Linux kernels (prior to 3.18) have bugs in network namespace handling which can lead to compute node lockups.  This is indicated by blocked kernel tasks in "Workqueue: netns cleanup_net".  If you are experiencing this problem, as a workaround you can disable Docker's use of network namespaces across the cluster.  Be aware this reduces container isolation, which may be a security risk.
+
+<notextile>
+<pre>    Containers:
+      <code class="userinput">CrunchRunArgumentsList:
+        - <b>"-container-enable-networking=always"</b>
+        - <b>"-container-network-mode=host"</b></code>
+</pre>
+</notextile>
+
+{% assign arvados_component = 'arvados-dispatch-lsf' %}
+
+{% include 'install_packages' %}
+
+{% include 'start_service' %}
+
+{% include 'restart_api' %}
index 3996cc7930a70a44ace17a8bd55cade99876bd7c..5b5b868e57611fe0262b0e16e708289f1a001f95 100644 (file)
@@ -44,7 +44,7 @@ crunch-dispatch-slurm polls the API server periodically for new containers to ru
 
 h3(#ReserveExtraRAM). Containers.ReserveExtraRAM: Extra RAM for jobs
 
-Extra RAM to reserve (in bytes) on each Slurm job submitted by Arvados, which is added to the amount specified in the container's @runtime_constraints@.  If not provided, the default value is zero.  Helpful when using @-cgroup-parent-subsystem@, where @crunch-run@ and @arv-mount@ share the control group memory limit with the user process.  In this situation, at least 256MiB is recommended to accomodate each container's @crunch-run@ and @arv-mount@ processes.
+Extra RAM to reserve (in bytes) on each Slurm job submitted by Arvados, which is added to the amount specified in the container's @runtime_constraints@.  If not provided, the default value is zero.  Helpful when using @-cgroup-parent-subsystem@, where @crunch-run@ and @arv-mount@ share the control group memory limit with the user process.  In this situation, at least 256MiB is recommended to accommodate each container's @crunch-run@ and @arv-mount@ processes.
 
 Supports suffixes @KB@, @KiB@, @MB@, @MiB@, @GB@, @GiB@, @TB@, @TiB@, @PB@, @PiB@, @EB@, @EiB@ (where @KB@ is 10[^3^], @KiB@ is 2[^10^], @MB@ is 10[^6^], @MiB@ is 2[^20^] and so forth).
 
index 5ff9f44194fa5996f90de4ab75ebfb55a542dc6f..9f63d1bcfcf8ad3def1968700c58930c4e7a3ccb 100644 (file)
@@ -90,7 +90,7 @@ Note the trailing slash.
 {% include 'notebox_begin' %}
 Whether you choose to serve collections from their own subdomain or from a single domain, it's important to keep in mind that they should be served from the same _site_ as Workbench for the inline previews to work.
 
-Please check "keep-web's URL pattern guide":/api/keep-web-urls.html#same-site to learn more.
+Please check "keep-web's URL pattern guide":../api/keep-web-urls.html#same-site to learn more.
 {% include 'notebox_end' %}
 
 h2. Set InternalURLs
index 1f0186e33aba5a257da8ce5afb9886dd5f9e9ce3..084f32e029c4c3a99e4db207b5b07a4f51374e36 100644 (file)
@@ -29,6 +29,7 @@ h2(#supportedlinux). Supported GNU/Linux distributions
 table(table table-bordered table-condensed).
 |_. Distribution|_. State|_. Last supported version|
 |CentOS 7|Supported|Latest|
+|Debian 11 ("bullseye")|Supported|Latest|
 |Debian 10 ("buster")|Supported|Latest|
 |Ubuntu 20.04 ("focal")|Supported|Latest|
 |Ubuntu 18.04 ("bionic")|Supported|Latest|
@@ -141,26 +142,73 @@ You may also use a different method to pick the cluster identifier. The cluster
 
 h2(#dnstls). DNS entries and TLS certificates
 
-The following services are normally public-facing and require DNS entries and corresponding TLS certificates.  Get certificates from your preferred TLS certificate provider.  We recommend using "Let's Encrypt":https://letsencrypt.org/.  You can run several services on same node, but each distinct hostname requires its own TLS certificate.
+The following services are normally public-facing and require DNS entries and corresponding TLS certificates.  Get certificates from your preferred TLS certificate provider.  We recommend using "Let's Encrypt":https://letsencrypt.org/.  You can run several services on the same node, but each distinct DNS name requires a valid, matching TLS certificate.
 
-This guide uses the following hostname conventions.  A later part of this guide will describe how to set up Nginx virtual hosts.
+This guide uses the following DNS name conventions.  A later part of this guide will describe how to set up Nginx virtual hosts.
 
 <div class="offset1">
 table(table table-bordered table-condensed).
-|_. Function|_. Hostname|
+|_. Function|_. DNS name|
 |Arvados API|@ClusterID.example.com@|
 |Arvados Git server|git.@ClusterID.example.com@|
+|Arvados Webshell|webshell.@ClusterID.example.com@|
 |Arvados Websockets endpoint|ws.@ClusterID.example.com@|
 |Arvados Workbench|workbench.@ClusterID.example.com@|
 |Arvados Workbench 2|workbench2.@ClusterID.example.com@|
 |Arvados Keepproxy server|keep.@ClusterID.example.com@|
 |Arvados Keep-web server|download.@ClusterID.example.com@
 _and_
-*.collections.@ClusterID.example.com@ or
-*<notextile>--</notextile>collections.@ClusterID.example.com@ or
+*.collections.@ClusterID.example.com@ _or_
+*<notextile>--</notextile>collections.@ClusterID.example.com@ _or_
 collections.@ClusterID.example.com@ (see the "keep-web install docs":install-keep-web.html)|
 </div>
 
+Setting up Arvados is easiest when wildcard TLS and wildcard DNS are available. It is also possible to set up Arvados without wildcard TLS and DNS, but not having a wildcard for @keep-web@ (i.e. not having *.collections.@ClusterID.example.com@) comes with a tradeoff: it disables some features that allow users to view Arvados-hosted data in their browsers. This tradeoff is a consequence of the CORS rules applied by modern browsers; more information is available in the "keep-web URL pattern guide":../api/keep-web-urls.html.
+
+The table below lists the required TLS certificates and DNS names in each scenario.
+
+<div class="offset1">
+table(table table-bordered table-condensed).
+||_. Wildcard TLS and DNS available|_. Wildcard TLS available|_. Other|
+|TLS|*.@ClusterID.example.com@
+@ClusterID.example.com@
+*.collections.@ClusterID.example.com@|*.@ClusterID.example.com@
+@ClusterID.example.com@|@ClusterID.example.com@
+git.@ClusterID.example.com@
+webshell.@ClusterID.example.com@
+ws.@ClusterID.example.com@
+workbench.@ClusterID.example.com@
+workbench2.@ClusterID.example.com@
+keep.@ClusterID.example.com@
+download.@ClusterID.example.com@
+collections.@ClusterID.example.com@|
+|DNS|@ClusterID.example.com@
+git.@ClusterID.example.com@
+webshell.@ClusterID.example.com@
+ws.@ClusterID.example.com@
+workbench.@ClusterID.example.com@
+workbench2.@ClusterID.example.com@
+keep.@ClusterID.example.com@
+download.@ClusterID.example.com@
+*.collections.@ClusterID.example.com@|@ClusterID.example.com@
+git.@ClusterID.example.com@
+webshell.@ClusterID.example.com@
+ws.@ClusterID.example.com@
+workbench.@ClusterID.example.com@
+workbench2.@ClusterID.example.com@
+keep.@ClusterID.example.com@
+download.@ClusterID.example.com@
+collections.@ClusterID.example.com@|@ClusterID.example.com@
+git.@ClusterID.example.com@
+webshell.@ClusterID.example.com@
+ws.@ClusterID.example.com@
+workbench.@ClusterID.example.com@
+workbench2.@ClusterID.example.com@
+keep.@ClusterID.example.com@
+download.@ClusterID.example.com@
+collections.@ClusterID.example.com@|
+</div>
+
 {% include 'notebox_begin' %}
 It is also possible to create your own certificate authority, issue server certificates, and install a custom root certificate in the browser.  This is out of scope for this guide.
 {% include 'notebox_end' %}
index 5d74e4a7e30271923bf3992d100119cea824e724..fb296ad5ad47019024d49fe6e0bf61f30a48b0d5 100644 (file)
@@ -41,6 +41,7 @@ As root, add the Arvados package repository to your sources.  This command depen
 
 table(table table-bordered table-condensed).
 |_. OS version|_. Command|
+|Debian 11 ("bullseye")|<notextile><code><span class="userinput">echo "deb http://apt.arvados.org/buster bullseye main" &#x7c; tee /etc/apt/sources.list.d/arvados.list</span></code></notextile>|
 |Debian 10 ("buster")|<notextile><code><span class="userinput">echo "deb http://apt.arvados.org/buster buster main" &#x7c; tee /etc/apt/sources.list.d/arvados.list</span></code></notextile>|
 |Ubuntu 20.04 ("focal")[1]|<notextile><code><span class="userinput">echo "deb http://apt.arvados.org/focal focal main" &#x7c; tee /etc/apt/sources.list.d/arvados.list</span></code></notextile>|
 |Ubuntu 18.04 ("bionic")[1]|<notextile><code><span class="userinput">echo "deb http://apt.arvados.org/bionic bionic main" &#x7c; tee /etc/apt/sources.list.d/arvados.list</span></code></notextile>|
index c5bc1c8e87f6e8c6598eadc93f2f3766d222e703..66f508b5adf2c3c10f2bca8ce2d72f0e44a0f327 100644 (file)
@@ -52,6 +52,9 @@ Clusters:
       DispatchCloud:
         InternalURLs: {SAMPLE: {}}
         ExternalURL: "-"
+      DispatchLSF:
+        InternalURLs: {SAMPLE: {}}
+        ExternalURL: "-"
       Keepproxy:
         InternalURLs: {SAMPLE: {}}
         ExternalURL: ""
@@ -524,10 +527,10 @@ Clusters:
       # WebDAV would have to expose XSS vulnerabilities in order to
       # handle the redirect (see discussion on Services.WebDAV).
       #
-      # This setting has no effect in the recommended configuration,
-      # where the WebDAV is configured to have a separate domain for
-      # every collection; in this case XSS protection is provided by
-      # browsers' same-origin policy.
+      # This setting has no effect in the recommended configuration, where the
+      # WebDAV service is configured to have a separate domain for every
+      # collection and XSS protection is provided by browsers' same-origin
+      # policy.
       #
       # The default setting (false) is appropriate for a multi-user site.
       TrustAllContent: false
@@ -1019,6 +1022,19 @@ Clusters:
           # (See http://ruby-doc.org/core-2.2.2/Kernel.html#method-i-format for more.)
           AssignNodeHostname: "compute%<slot_number>d"
 
+      LSF:
+        # Additional arguments to bsub when submitting Arvados
+        # containers as LSF jobs.
+        BsubArgumentsList: []
+
+        # Use sudo to switch to this user account when submitting LSF
+        # jobs.
+        #
+        # This account must exist on the hosts where LSF jobs run
+        # ("execution hosts"), as well as on the host where the
+        # Arvados LSF dispatcher runs ("submission host").
+        BsubSudoUser: "crunch"
+
       JobsAPI:
         # Enable the legacy 'jobs' API (crunch v1).  This value must be a string.
         #
index 2faacc85953ffb724015b427f29796cb4634002e..bbc5ea6c55b885244fc0c33e51a50f36c0f64ca1 100644 (file)
@@ -121,6 +121,7 @@ var whitelist = map[string]bool{
        "Containers.JobsAPI.GitInternalDir":                   false,
        "Containers.Logging":                                  false,
        "Containers.LogReuseDecisions":                        false,
+       "Containers.LSF":                                      false,
        "Containers.MaxComputeVMs":                            false,
        "Containers.MaxDispatchAttempts":                      false,
        "Containers.MaxRetryAttempts":                         true,
index fb9f888ebb87361b071136ab7226665f0a1e51c5..ee230841354522ad155e25ce794dbca6f549b58f 100644 (file)
@@ -58,6 +58,9 @@ Clusters:
       DispatchCloud:
         InternalURLs: {SAMPLE: {}}
         ExternalURL: "-"
+      DispatchLSF:
+        InternalURLs: {SAMPLE: {}}
+        ExternalURL: "-"
       Keepproxy:
         InternalURLs: {SAMPLE: {}}
         ExternalURL: ""
@@ -530,10 +533,10 @@ Clusters:
       # WebDAV would have to expose XSS vulnerabilities in order to
       # handle the redirect (see discussion on Services.WebDAV).
       #
-      # This setting has no effect in the recommended configuration,
-      # where the WebDAV is configured to have a separate domain for
-      # every collection; in this case XSS protection is provided by
-      # browsers' same-origin policy.
+      # This setting has no effect in the recommended configuration, where the
+      # WebDAV service is configured to have a separate domain for every
+      # collection and XSS protection is provided by browsers' same-origin
+      # policy.
       #
       # The default setting (false) is appropriate for a multi-user site.
       TrustAllContent: false
@@ -1025,6 +1028,19 @@ Clusters:
           # (See http://ruby-doc.org/core-2.2.2/Kernel.html#method-i-format for more.)
           AssignNodeHostname: "compute%<slot_number>d"
 
+      LSF:
+        # Additional arguments to bsub when submitting Arvados
+        # containers as LSF jobs.
+        BsubArgumentsList: []
+
+        # Use sudo to switch to this user account when submitting LSF
+        # jobs.
+        #
+        # This account must exist on the hosts where LSF jobs run
+        # ("execution hosts"), as well as on the host where the
+        # Arvados LSF dispatcher runs ("submission host").
+        BsubSudoUser: "crunch"
+
       JobsAPI:
         # Enable the legacy 'jobs' API (crunch v1).  This value must be a string.
         #
index 44c99bf30f8c3a6ae9aa70b8306268b7c4c8fb6d..26f0dbb0d1388da1886cea726fc644648b4d57e3 100644 (file)
@@ -26,6 +26,7 @@ import (
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/arvadostest"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "git.arvados.org/arvados.git/sdk/go/httpserver"
        check "gopkg.in/check.v1"
 )
 
@@ -432,6 +433,74 @@ func (s *IntegrationSuite) TestCreateContainerRequestWithBadToken(c *check.C) {
        }
 }
 
+func (s *IntegrationSuite) TestRequestIDHeader(c *check.C) {
+       conn1 := s.testClusters["z1111"].Conn()
+       rootctx1, _, _ := s.testClusters["z1111"].RootClients()
+       userctx1, ac1, _, _ := s.testClusters["z1111"].UserClients(rootctx1, c, conn1, "user@example.com", true)
+
+       coll, err := conn1.CollectionCreate(userctx1, arvados.CreateOptions{})
+       c.Check(err, check.IsNil)
+       specimen, err := conn1.SpecimenCreate(userctx1, arvados.CreateOptions{})
+       c.Check(err, check.IsNil)
+
+       tests := []struct {
+               path            string
+               reqIdProvided   bool
+               notFoundRequest bool
+       }{
+               {"/arvados/v1/collections", false, false},
+               {"/arvados/v1/collections", true, false},
+               {"/arvados/v1/nonexistant", false, true},
+               {"/arvados/v1/nonexistant", true, true},
+               {"/arvados/v1/collections/" + coll.UUID, false, false},
+               {"/arvados/v1/collections/" + coll.UUID, true, false},
+               {"/arvados/v1/specimens/" + specimen.UUID, false, false},
+               {"/arvados/v1/specimens/" + specimen.UUID, true, false},
+               {"/arvados/v1/collections/z1111-4zz18-0123456789abcde", false, true},
+               {"/arvados/v1/collections/z1111-4zz18-0123456789abcde", true, true},
+               {"/arvados/v1/specimens/z1111-j58dm-0123456789abcde", false, true},
+               {"/arvados/v1/specimens/z1111-j58dm-0123456789abcde", true, true},
+       }
+
+       for _, tt := range tests {
+               c.Log(c.TestName() + " " + tt.path)
+               req, err := http.NewRequest("GET", "https://"+ac1.APIHost+tt.path, nil)
+               c.Assert(err, check.IsNil)
+               customReqId := "abcdeG"
+               if !tt.reqIdProvided {
+                       c.Assert(req.Header.Get("X-Request-Id"), check.Equals, "")
+               } else {
+                       req.Header.Set("X-Request-Id", customReqId)
+               }
+               resp, err := ac1.Do(req)
+               c.Assert(err, check.IsNil)
+               if tt.notFoundRequest {
+                       c.Check(resp.StatusCode, check.Equals, http.StatusNotFound)
+               } else {
+                       c.Check(resp.StatusCode, check.Equals, http.StatusOK)
+               }
+               if !tt.reqIdProvided {
+                       c.Check(resp.Header.Get("X-Request-Id"), check.Matches, "^req-[0-9a-zA-Z]{20}$")
+                       if tt.notFoundRequest {
+                               var jresp httpserver.ErrorResponse
+                               err := json.NewDecoder(resp.Body).Decode(&jresp)
+                               c.Check(err, check.IsNil)
+                               c.Assert(jresp.Errors, check.HasLen, 1)
+                               c.Check(jresp.Errors[0], check.Matches, "^.*(req-[0-9a-zA-Z]{20}).*$")
+                       }
+               } else {
+                       c.Check(resp.Header.Get("X-Request-Id"), check.Equals, customReqId)
+                       if tt.notFoundRequest {
+                               var jresp httpserver.ErrorResponse
+                               err := json.NewDecoder(resp.Body).Decode(&jresp)
+                               c.Check(err, check.IsNil)
+                               c.Assert(jresp.Errors, check.HasLen, 1)
+                               c.Check(jresp.Errors[0], check.Matches, "^.*("+customReqId+").*$")
+                       }
+               }
+       }
+}
+
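As a rough illustration only (the id_generator.go changes themselves are not shown in this diff), a generator producing IDs in the req-[0-9a-zA-Z]{20} form that the test above asserts might look like this:

package main

import (
	"fmt"
	"math/rand"
)

const reqIDAlphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

// newRequestID returns an ID like "req-xxxxxxxxxxxxxxxxxxxx" (20 random
// alphanumeric characters), the shape the integration test expects when the
// client does not supply its own X-Request-Id header.
func newRequestID() string {
	b := make([]byte, 20)
	for i := range b {
		b[i] = reqIDAlphabet[rand.Intn(len(reqIDAlphabet))]
	}
	return "req-" + string(b)
}

func main() {
	fmt.Println(newRequestID())
}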
 // We test the direct access to the database
 // normally an integration test would not have a database access, but  in this case we need
 // to test tokens that are secret, so there is no API response that will give them back
index 412f1bbfbfa95027eb5c043c5e1fcf07449139b0..e15303a3155afe81d72e8ce61e881ce76d5282d7 100644 (file)
@@ -260,18 +260,16 @@ func (runner *ContainerRunner) LoadImage() (string, error) {
                return "", fmt.Errorf("cannot choose from multiple tar files in image collection: %v", tarfiles)
        }
        imageID := tarfiles[0][:len(tarfiles[0])-4]
-       imageFile := runner.ArvMountPoint + "/by_id/" + runner.Container.ContainerImage + "/" + tarfiles[0]
+       imageTarballPath := runner.ArvMountPoint + "/by_id/" + runner.Container.ContainerImage + "/" + imageID + ".tar"
        runner.CrunchLog.Printf("Using Docker image id %q", imageID)
 
-       if !runner.executor.ImageLoaded(imageID) {
-               runner.CrunchLog.Print("Loading Docker image from keep")
-               err = runner.executor.LoadImage(imageFile)
-               if err != nil {
-                       return "", err
-               }
-       } else {
-               runner.CrunchLog.Print("Docker image is available")
+       runner.CrunchLog.Print("Loading Docker image from keep")
+       err = runner.executor.LoadImage(imageID, imageTarballPath, runner.Container, runner.ArvMountPoint,
+               runner.containerClient)
+       if err != nil {
+               return "", err
        }
+
        return imageID, nil
 }
 
@@ -599,6 +597,7 @@ func (runner *ContainerRunner) SetupMounts() (map[string]bindmount, error) {
        } else {
                arvMountCmd = append(arvMountCmd, "--mount-by-id", "by_id")
        }
+       arvMountCmd = append(arvMountCmd, "--mount-by-id", "by_uuid")
        arvMountCmd = append(arvMountCmd, runner.ArvMountPoint)
 
        runner.ArvMount, err = runner.RunArvMount(arvMountCmd, token)
@@ -1201,12 +1200,14 @@ func (runner *ContainerRunner) CleanupDirs() {
                                }
                        }
                }
+               runner.ArvMount = nil
        }
 
        if runner.ArvMountPoint != "" {
                if rmerr := os.Remove(runner.ArvMountPoint); rmerr != nil {
                        runner.CrunchLog.Printf("While cleaning up arv-mount directory %s: %v", runner.ArvMountPoint, rmerr)
                }
+               runner.ArvMountPoint = ""
        }
 
        if rmerr := os.RemoveAll(runner.parentTemp); rmerr != nil {
@@ -1441,6 +1442,7 @@ func (runner *ContainerRunner) Run() (err error) {
                }
                checkErr("stopHoststat", runner.stopHoststat())
                checkErr("CommitLogs", runner.CommitLogs())
+               runner.CleanupDirs()
                checkErr("UpdateContainerFinal", runner.UpdateContainerFinal())
        }()
 
index bb7ffdf0306b26b2f5c56062aaaaaf7b256e5447..bb982cdee76c32cb9321ce88e8fa47fa0588f2f1 100644 (file)
@@ -112,8 +112,11 @@ type stubExecutor struct {
        exit        chan int
 }
 
-func (e *stubExecutor) ImageLoaded(imageID string) bool { return e.imageLoaded }
-func (e *stubExecutor) LoadImage(filename string) error { e.loaded = filename; return e.loadErr }
+func (e *stubExecutor) LoadImage(imageId string, tarball string, container arvados.Container, keepMount string,
+       containerClient *arvados.Client) error {
+       e.loaded = tarball
+       return e.loadErr
+}
 func (e *stubExecutor) Create(spec containerSpec) error { e.created = spec; return e.createErr }
 func (e *stubExecutor) Start() error                    { e.exit = make(chan int, 1); go e.runFunc(); return e.startErr }
 func (e *stubExecutor) CgroupID() string                { return "cgroupid" }
@@ -403,16 +406,6 @@ func (s *TestSuite) TestLoadImage(c *C) {
        imageID, err = s.runner.LoadImage()
        c.Check(err, ErrorMatches, "image collection does not include a \\.tar image file")
        c.Check(s.executor.loaded, Equals, "")
-
-       // if executor reports image is already loaded, LoadImage should not be called
-       s.runner.Container.ContainerImage = arvadostest.DockerImage112PDH
-       s.executor.imageLoaded = true
-       s.executor.loaded = ""
-       s.executor.loadErr = nil
-       imageID, err = s.runner.LoadImage()
-       c.Check(err, IsNil)
-       c.Check(s.executor.loaded, Equals, "")
-       c.Check(imageID, Equals, strings.TrimSuffix(arvadostest.DockerImage112Filename, ".tar"))
 }
 
 type ArvErrorTestClient struct{}
@@ -1112,7 +1105,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "default", "--crunchstat-interval=5",
-                       "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{"/tmp": {realTemp + "/tmp2", false}})
                os.RemoveAll(cr.ArvMountPoint)
                cr.CleanupDirs()
@@ -1132,7 +1125,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "foo,bar", "--crunchstat-interval=5",
-                       "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{"/out": {realTemp + "/tmp2", false}, "/tmp": {realTemp + "/tmp3", false}})
                os.RemoveAll(cr.ArvMountPoint)
                cr.CleanupDirs()
@@ -1152,7 +1145,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "default", "--crunchstat-interval=5",
-                       "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{"/tmp": {realTemp + "/tmp2", false}, "/etc/arvados/ca-certificates.crt": {stubCertPath, true}})
                os.RemoveAll(cr.ArvMountPoint)
                cr.CleanupDirs()
@@ -1175,7 +1168,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "default", "--crunchstat-interval=5",
-                       "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{"/keeptmp": {realTemp + "/keep1/tmp0", false}})
                os.RemoveAll(cr.ArvMountPoint)
                cr.CleanupDirs()
@@ -1198,7 +1191,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "default", "--crunchstat-interval=5",
-                       "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{
                        "/keepinp": {realTemp + "/keep1/by_id/59389a8f9ee9d399be35462a0f92541c+53", true},
                        "/keepout": {realTemp + "/keep1/tmp0", false},
@@ -1225,7 +1218,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "default", "--crunchstat-interval=5",
-                       "--file-cache", "512", "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--file-cache", "512", "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{
                        "/keepinp": {realTemp + "/keep1/by_id/59389a8f9ee9d399be35462a0f92541c+53", true},
                        "/keepout": {realTemp + "/keep1/tmp0", false},
@@ -1308,7 +1301,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "default", "--crunchstat-interval=5",
-                       "--file-cache", "512", "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--file-cache", "512", "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{
                        "/tmp":     {realTemp + "/tmp2", false},
                        "/tmp/foo": {realTemp + "/keep1/tmp0", true},
index 861f8c8c1913f07bab8d7ea722dfa3c643678059..656061b77ec552a811c26dfe18be870b154c1b1e 100644 (file)
@@ -11,6 +11,7 @@ import (
        "strings"
        "time"
 
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        dockertypes "github.com/docker/docker/api/types"
        dockercontainer "github.com/docker/docker/api/types/container"
        dockerclient "github.com/docker/docker/client"
@@ -45,13 +46,15 @@ func newDockerExecutor(containerUUID string, logf func(string, ...interface{}),
        }, err
 }
 
-func (e *dockerExecutor) ImageLoaded(imageID string) bool {
+func (e *dockerExecutor) LoadImage(imageID string, imageTarballPath string, container arvados.Container, arvMountPoint string,
+       containerClient *arvados.Client) error {
        _, _, err := e.dockerclient.ImageInspectWithRaw(context.TODO(), imageID)
-       return err == nil
-}
+       if err == nil {
+               // already loaded
+               return nil
+       }
 
-func (e *dockerExecutor) LoadImage(filename string) error {
-       f, err := os.Open(filename)
+       f, err := os.Open(imageTarballPath)
        if err != nil {
                return err
        }
index f4feaa06c21447cc66b2e57a962e2d2c306e6de7..65bf7427b9601c465fb21d811c5cb79d2d41a0f8 100644 (file)
@@ -6,6 +6,7 @@ package crunchrun
 import (
        "io"
 
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "golang.org/x/net/context"
 )
 
@@ -33,13 +34,10 @@ type containerSpec struct {
 // containerExecutor is an interface to a container runtime
 // (docker/singularity).
 type containerExecutor interface {
-       // ImageLoaded determines whether the given image is already
-       // available to use without calling ImageLoad.
-       ImageLoaded(imageID string) bool
-
        // ImageLoad loads the image from the given tarball such that
        // it can be used to create/start a container.
-       LoadImage(filename string) error
+       LoadImage(imageID string, imageTarballPath string, container arvados.Container, keepMount string,
+               containerClient *arvados.Client) error
 
        // Wait for the container process to finish, and return its
        // exit code. If applicable, also remove the stopped container
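For illustration, a minimal stub satisfying the new LoadImage signature (a sketch within the crunchrun package, similar in spirit to the stubExecutor in crunchrun_test.go above; only LoadImage is shown, not the rest of the containerExecutor interface):

// nopExecutor is an illustrative stub that records the image tarball it was
// asked to load and otherwise does nothing.
type nopExecutor struct {
	loadedTarball string
}

func (e *nopExecutor) LoadImage(imageID string, imageTarballPath string, container arvados.Container, keepMount string,
	containerClient *arvados.Client) error {
	e.loadedTarball = imageTarballPath
	return nil
}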
index 5934c57b6c5f90bf971664c614a8348fb18b9e50..0f9901d6a1ff0d6ebb268c23b107f5ff5514244b 100644 (file)
@@ -13,6 +13,7 @@ import (
        "strings"
        "time"
 
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "golang.org/x/net/context"
        . "gopkg.in/check.v1"
 )
@@ -70,7 +71,7 @@ func (s *executorSuite) SetUpTest(c *C) {
                Stdout:      nopWriteCloser{&s.stdout},
                Stderr:      nopWriteCloser{&s.stderr},
        }
-       err := s.executor.LoadImage(busyboxDockerImage(c))
+       err := s.executor.LoadImage("", busyboxDockerImage(c), arvados.Container{}, "", nil)
        c.Assert(err, IsNil)
 }
 
index bcaff3bcc88300e51015f16a94751d20a39d5efe..741f542454e470ede35cc6f682c64c8a9b1bbf09 100644 (file)
@@ -5,12 +5,15 @@
 package crunchrun
 
 import (
+       "fmt"
        "io/ioutil"
        "os"
        "os/exec"
        "sort"
        "syscall"
+       "time"
 
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "golang.org/x/net/context"
 )
 
@@ -33,39 +36,179 @@ func newSingularityExecutor(logf func(string, ...interface{})) (*singularityExec
        }, nil
 }
 
-func (e *singularityExecutor) ImageLoaded(string) bool {
-       return false
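+// getOrCreateProject returns the project group with the given name owned by
+// ownerUuid, creating it if it does not already exist.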
+func (e *singularityExecutor) getOrCreateProject(ownerUuid string, name string, containerClient *arvados.Client) (*arvados.Group, error) {
+       var gp arvados.GroupList
+       err := containerClient.RequestAndDecode(&gp,
+               arvados.EndpointGroupList.Method,
+               arvados.EndpointGroupList.Path,
+               nil, arvados.ListOptions{Filters: []arvados.Filter{
+                       arvados.Filter{"owner_uuid", "=", ownerUuid},
+                       arvados.Filter{"name", "=", name},
+                       arvados.Filter{"group_class", "=", "project"},
+               },
+                       Limit: 1})
+       if err != nil {
+               return nil, err
+       }
+       if len(gp.Items) == 1 {
+               return &gp.Items[0], nil
+       }
+
+       var rgroup arvados.Group
+       err = containerClient.RequestAndDecode(&rgroup,
+               arvados.EndpointGroupCreate.Method,
+               arvados.EndpointGroupCreate.Path,
+               nil, map[string]interface{}{
+                       "group": map[string]string{
+                               "owner_uuid":  ownerUuid,
+                               "name":        name,
+                               "group_class": "project",
+                       },
+               })
+       if err != nil {
+               return nil, err
+       }
+       return &rgroup, nil
+}
+
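+// checkImageCache finds (or creates) the collection used to cache the
+// converted Singularity image for dockerImageID, under an
+// "auto-generated singularity images" project inside the runtime user's
+// ".cache" project.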
+func (e *singularityExecutor) checkImageCache(dockerImageID string, container arvados.Container, arvMountPoint string,
+       containerClient *arvados.Client) (collection *arvados.Collection, err error) {
+
+       // Cache the image to keep
+       cacheGroup, err := e.getOrCreateProject(container.RuntimeUserUUID, ".cache", containerClient)
+       if err != nil {
+               return nil, fmt.Errorf("error getting '.cache' project: %v", err)
+       }
+       imageGroup, err := e.getOrCreateProject(cacheGroup.UUID, "auto-generated singularity images", containerClient)
+       if err != nil {
+               return nil, fmt.Errorf("error getting 'auto-generated singularity images' project: %s", err)
+       }
+
+       collectionName := fmt.Sprintf("singularity image for %v", dockerImageID)
+       var cl arvados.CollectionList
+       err = containerClient.RequestAndDecode(&cl,
+               arvados.EndpointCollectionList.Method,
+               arvados.EndpointCollectionList.Path,
+               nil, arvados.ListOptions{Filters: []arvados.Filter{
+                       arvados.Filter{"owner_uuid", "=", imageGroup.UUID},
+                       arvados.Filter{"name", "=", collectionName},
+               },
+                       Limit: 1})
+       if err != nil {
+               return nil, fmt.Errorf("error querying for collection '%v': %v", collectionName, err)
+       }
+       var imageCollection arvados.Collection
+       if len(cl.Items) == 1 {
+               imageCollection = cl.Items[0]
+       } else {
+               collectionName := collectionName + " " + time.Now().UTC().Format(time.RFC3339)
+               exp := time.Now().Add(24 * 7 * 2 * time.Hour)
+               err = containerClient.RequestAndDecode(&imageCollection,
+                       arvados.EndpointCollectionCreate.Method,
+                       arvados.EndpointCollectionCreate.Path,
+                       nil, map[string]interface{}{
+                               "collection": map[string]string{
+                                       "owner_uuid": imageGroup.UUID,
+                                       "name":       collectionName,
+                                       "trash_at":   exp.UTC().Format(time.RFC3339),
+                               },
+                       })
+               if err != nil {
+                       return nil, fmt.Errorf("error creating '%v' collection: %s", collectionName, err)
+               }
+
+       }
+
+       return &imageCollection, nil
 }
 
 // LoadImage satisfies the ContainerExecutor interface by transforming
 // containerImage into a SIF file for later use.
-func (e *singularityExecutor) LoadImage(imageTarballPath string) error {
-       e.logf("building singularity image")
-       // "singularity build" does not accept a
-       // docker-archive://... filename containing a ":" character,
-       // as in "/path/to/sha256:abcd...1234.tar". Workaround: make a
-       // symlink that doesn't have ":" chars.
-       err := os.Symlink(imageTarballPath, e.tmpdir+"/image.tar")
+func (e *singularityExecutor) LoadImage(dockerImageID string, imageTarballPath string, container arvados.Container, arvMountPoint string,
+       containerClient *arvados.Client) error {
+
+       var imageFilename string
+       var sifCollection *arvados.Collection
+       var err error
+       if containerClient != nil {
+               sifCollection, err = e.checkImageCache(dockerImageID, container, arvMountPoint, containerClient)
+               if err != nil {
+                       return err
+               }
+               imageFilename = fmt.Sprintf("%s/by_uuid/%s/image.sif", arvMountPoint, sifCollection.UUID)
+       } else {
+               imageFilename = e.tmpdir + "/image.sif"
+       }
+
+       if _, err := os.Stat(imageFilename); os.IsNotExist(err) {
+               e.logf("building singularity image")
+               // "singularity build" does not accept a
+               // docker-archive://... filename containing a ":" character,
+               // as in "/path/to/sha256:abcd...1234.tar". Workaround: make a
+               // symlink that doesn't have ":" chars.
+               err := os.Symlink(imageTarballPath, e.tmpdir+"/image.tar")
+               if err != nil {
+                       return err
+               }
+
+               build := exec.Command("singularity", "build", imageFilename, "docker-archive://"+e.tmpdir+"/image.tar")
+               e.logf("%v", build.Args)
+               out, err := build.CombinedOutput()
+               // INFO:    Starting build...
+               // Getting image source signatures
+               // Copying blob ab15617702de done
+               // Copying config 651e02b8a2 done
+               // Writing manifest to image destination
+               // Storing signatures
+               // 2021/04/22 14:42:14  info unpack layer: sha256:21cbfd3a344c52b197b9fa36091e66d9cbe52232703ff78d44734f85abb7ccd3
+               // INFO:    Creating SIF file...
+               // INFO:    Build complete: arvados-jobs.latest.sif
+               e.logf("%s", out)
+               if err != nil {
+                       return err
+               }
+       }
+
+       if containerClient == nil {
+               e.imageFilename = imageFilename
+               return nil
+       }
+
+       // update TTL to now + two weeks
+       exp := time.Now().Add(24 * 7 * 2 * time.Hour)
+
+       uuidPath, err := containerClient.PathForUUID("update", sifCollection.UUID)
        if err != nil {
-               return err
+               e.logf("error PathForUUID: %v", err)
+               return nil
+       }
+       var imageCollection arvados.Collection
+       err = containerClient.RequestAndDecode(&imageCollection,
+               arvados.EndpointCollectionUpdate.Method,
+               uuidPath,
+               nil, map[string]interface{}{
+                       "collection": map[string]string{
+                               "name":     fmt.Sprintf("singularity image for %v", dockerImageID),
+                               "trash_at": exp.UTC().Format(time.RFC3339),
+                       },
+               })
+       if err == nil {
+               // If we just wrote the image to the cache, the
+               // response also returns the updated PDH
+               e.imageFilename = fmt.Sprintf("%s/by_id/%s/image.sif", arvMountPoint, imageCollection.PortableDataHash)
+               return nil
        }
-       e.imageFilename = e.tmpdir + "/image.sif"
-       build := exec.Command("singularity", "build", e.imageFilename, "docker-archive://"+e.tmpdir+"/image.tar")
-       e.logf("%v", build.Args)
-       out, err := build.CombinedOutput()
-       // INFO:    Starting build...
-       // Getting image source signatures
-       // Copying blob ab15617702de done
-       // Copying config 651e02b8a2 done
-       // Writing manifest to image destination
-       // Storing signatures
-       // 2021/04/22 14:42:14  info unpack layer: sha256:21cbfd3a344c52b197b9fa36091e66d9cbe52232703ff78d44734f85abb7ccd3
-       // INFO:    Creating SIF file...
-       // INFO:    Build complete: arvados-jobs.latest.sif
-       e.logf("%s", out)
+
+       e.logf("error updating/renaming collection for cached sif image: %v", err)
+       // The update failed, possibly because we lost a race with
+       // another process that cached a collection in the same place,
+       // so check the cache again.
+       sifCollection, err = e.checkImageCache(dockerImageID, container, arvMountPoint, containerClient)
        if err != nil {
                return err
        }
+       e.imageFilename = fmt.Sprintf("%s/by_id/%s/image.sif", arvMountPoint, sifCollection.PortableDataHash)
+
        return nil
 }
 
@@ -92,8 +235,6 @@ func (e *singularityExecutor) Start() error {
                mount := e.spec.BindMounts[path]
                args = append(args, "--bind", mount.HostPath+":"+path+":"+readonlyflag[mount.ReadOnly])
        }
-       args = append(args, e.imageFilename)
-       args = append(args, e.spec.Command...)
 
        // This is for singularity 3.5.2. There are some behaviors
        // that will change in singularity 3.6, please see:
@@ -101,9 +242,17 @@ func (e *singularityExecutor) Start() error {
        // https://sylabs.io/guides/3.5/user-guide/environment_and_metadata.html
        env := make([]string, 0, len(e.spec.Env))
        for k, v := range e.spec.Env {
-               env = append(env, "SINGULARITYENV_"+k+"="+v)
+               if k == "HOME" {
+                       // $HOME is a special case: pass it to
+                       // singularity via --home instead of
+                       // SINGULARITYENV_HOME
+                       args = append(args, "--home="+v)
+               } else {
+                       env = append(env, "SINGULARITYENV_"+k+"="+v)
+               }
        }
 
+       args = append(args, e.imageFilename)
+       args = append(args, e.spec.Command...)
+
        path, err := exec.LookPath(args[0])
        if err != nil {
                return err
diff --git a/lib/lsf/dispatch.go b/lib/lsf/dispatch.go
new file mode 100644 (file)
index 0000000..7461597
--- /dev/null
@@ -0,0 +1,322 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lsf
+
+import (
+       "context"
+       "errors"
+       "fmt"
+       "math"
+       "net/http"
+       "regexp"
+       "strings"
+       "sync"
+       "time"
+
+       "git.arvados.org/arvados.git/lib/cmd"
+       "git.arvados.org/arvados.git/lib/dispatchcloud"
+       "git.arvados.org/arvados.git/lib/service"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/arvadosclient"
+       "git.arvados.org/arvados.git/sdk/go/auth"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "git.arvados.org/arvados.git/sdk/go/dispatch"
+       "git.arvados.org/arvados.git/sdk/go/health"
+       "github.com/julienschmidt/httprouter"
+       "github.com/prometheus/client_golang/prometheus"
+       "github.com/prometheus/client_golang/prometheus/promhttp"
+       "github.com/sirupsen/logrus"
+)
+
+var DispatchCommand cmd.Handler = service.Command(arvados.ServiceNameDispatchLSF, newHandler)
+
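+// newHandler returns a dispatcher for the given cluster and starts it
+// in the background.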
+func newHandler(ctx context.Context, cluster *arvados.Cluster, token string, reg *prometheus.Registry) service.Handler {
+       ac, err := arvados.NewClientFromConfig(cluster)
+       if err != nil {
+               return service.ErrorHandler(ctx, cluster, fmt.Errorf("error initializing client from cluster config: %s", err))
+       }
+       d := &dispatcher{
+               Cluster:   cluster,
+               Context:   ctx,
+               ArvClient: ac,
+               AuthToken: token,
+               Registry:  reg,
+       }
+       go d.Start()
+       return d
+}
+
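+// dispatcher polls the Arvados container queue and submits, tracks,
+// and cancels the corresponding LSF jobs.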
+type dispatcher struct {
+       Cluster   *arvados.Cluster
+       Context   context.Context
+       ArvClient *arvados.Client
+       AuthToken string
+       Registry  *prometheus.Registry
+
+       logger        logrus.FieldLogger
+       lsfcli        lsfcli
+       lsfqueue      lsfqueue
+       arvDispatcher *dispatch.Dispatcher
+       httpHandler   http.Handler
+
+       initOnce sync.Once
+       stop     chan struct{}
+       stopped  chan struct{}
+}
+
+// Start starts the dispatcher. Start can be called multiple times
+// with no ill effect.
+func (disp *dispatcher) Start() {
+       disp.initOnce.Do(func() {
+               disp.init()
+               go func() {
+                       disp.checkLsfQueueForOrphans()
+                       err := disp.arvDispatcher.Run(disp.Context)
+                       if err != nil {
+                               disp.logger.Error(err)
+                               disp.Close()
+                       }
+               }()
+       })
+}
+
+// ServeHTTP implements service.Handler.
+func (disp *dispatcher) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+       disp.Start()
+       disp.httpHandler.ServeHTTP(w, r)
+}
+
+// CheckHealth implements service.Handler.
+func (disp *dispatcher) CheckHealth() error {
+       disp.Start()
+       select {
+       case <-disp.stopped:
+               return errors.New("stopped")
+       default:
+               return nil
+       }
+}
+
+// Done implements service.Handler.
+func (disp *dispatcher) Done() <-chan struct{} {
+       return disp.stopped
+}
+
+// Close stops dispatching containers and releases resources. Used by tests.
+func (disp *dispatcher) Close() {
+       disp.Start()
+       select {
+       case disp.stop <- struct{}{}:
+       default:
+       }
+       <-disp.stopped
+}
+
+func (disp *dispatcher) init() {
+       disp.logger = ctxlog.FromContext(disp.Context)
+       disp.lsfcli.logger = disp.logger
+       disp.lsfqueue = lsfqueue{
+               logger: disp.logger,
+               period: time.Duration(disp.Cluster.Containers.CloudVMs.PollInterval),
+               lsfcli: &disp.lsfcli,
+       }
+       disp.ArvClient.AuthToken = disp.AuthToken
+       disp.stop = make(chan struct{}, 1)
+       disp.stopped = make(chan struct{})
+
+       arv, err := arvadosclient.New(disp.ArvClient)
+       if err != nil {
+               disp.logger.Fatalf("Error making Arvados client: %v", err)
+       }
+       arv.Retries = 25
+       arv.ApiToken = disp.AuthToken
+       disp.arvDispatcher = &dispatch.Dispatcher{
+               Arv:            arv,
+               Logger:         disp.logger,
+               BatchSize:      disp.Cluster.API.MaxItemsPerResponse,
+               RunContainer:   disp.runContainer,
+               PollPeriod:     time.Duration(disp.Cluster.Containers.CloudVMs.PollInterval),
+               MinRetryPeriod: time.Duration(disp.Cluster.Containers.MinRetryPeriod),
+       }
+
+       if disp.Cluster.ManagementToken == "" {
+               disp.httpHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+                       http.Error(w, "Management API authentication is not configured", http.StatusForbidden)
+               })
+       } else {
+               mux := httprouter.New()
+               metricsH := promhttp.HandlerFor(disp.Registry, promhttp.HandlerOpts{
+                       ErrorLog: disp.logger,
+               })
+               mux.Handler("GET", "/metrics", metricsH)
+               mux.Handler("GET", "/metrics.json", metricsH)
+               mux.Handler("GET", "/_health/:check", &health.Handler{
+                       Token:  disp.Cluster.ManagementToken,
+                       Prefix: "/_health/",
+                       Routes: health.Routes{"ping": disp.CheckHealth},
+               })
+               disp.httpHandler = auth.RequireLiteralToken(disp.Cluster.ManagementToken, mux)
+       }
+}
+
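+// runContainer submits the container to LSF (unless a prior
+// invocation already did), then monitors dispatcher updates until the
+// container finishes or its job disappears from the LSF queue,
+// cancelling the LSF job when the container's priority drops to zero.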
+func (disp *dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) error {
+       ctx, cancel := context.WithCancel(disp.Context)
+       defer cancel()
+
+       if ctr.State != dispatch.Locked {
+               // already started by prior invocation
+       } else if _, ok := disp.lsfqueue.JobID(ctr.UUID); !ok {
+               disp.logger.Printf("Submitting container %s to LSF", ctr.UUID)
+               cmd := []string{disp.Cluster.Containers.CrunchRunCommand}
+               cmd = append(cmd, "--runtime-engine="+disp.Cluster.Containers.RuntimeEngine)
+               cmd = append(cmd, disp.Cluster.Containers.CrunchRunArgumentsList...)
+               err := disp.submit(ctr, cmd)
+               if err != nil {
+                       return err
+               }
+       }
+
+       disp.logger.Printf("Start monitoring container %v in state %q", ctr.UUID, ctr.State)
+       defer disp.logger.Printf("Done monitoring container %s", ctr.UUID)
+
+       // If the container disappears from the lsf queue, there is
+       // no point in waiting for further dispatch updates: just
+       // clean up and return.
+       go func(uuid string) {
+               for ctx.Err() == nil {
+                       if _, ok := disp.lsfqueue.JobID(uuid); !ok {
+                               disp.logger.Printf("container %s job disappeared from LSF queue", uuid)
+                               cancel()
+                               return
+                       }
+               }
+       }(ctr.UUID)
+
+       for done := false; !done; {
+               select {
+               case <-ctx.Done():
+                       // Disappeared from lsf queue
+                       if err := disp.arvDispatcher.Arv.Get("containers", ctr.UUID, nil, &ctr); err != nil {
+                               disp.logger.Printf("error getting final container state for %s: %s", ctr.UUID, err)
+                       }
+                       switch ctr.State {
+                       case dispatch.Running:
+                               disp.arvDispatcher.UpdateState(ctr.UUID, dispatch.Cancelled)
+                       case dispatch.Locked:
+                               disp.arvDispatcher.Unlock(ctr.UUID)
+                       }
+                       return nil
+               case updated, ok := <-status:
+                       if !ok {
+                               // status channel is closed, which is
+                               // how arvDispatcher tells us to stop
+                               // touching the container record, kill
+                               // off any remaining LSF processes,
+                               // etc.
+                               done = true
+                               break
+                       }
+                       if updated.State != ctr.State {
+                               disp.logger.Infof("container %s changed state from %s to %s", ctr.UUID, ctr.State, updated.State)
+                       }
+                       ctr = updated
+                       if ctr.Priority < 1 {
+                               disp.logger.Printf("container %s has state %s, priority %d: cancel lsf job", ctr.UUID, ctr.State, ctr.Priority)
+                               disp.bkill(ctr)
+                       } else {
+                               disp.lsfqueue.SetPriority(ctr.UUID, int64(ctr.Priority))
+                       }
+               }
+       }
+       disp.logger.Printf("container %s is done", ctr.UUID)
+
+       // Try "bkill" every few seconds until the LSF job disappears
+       // from the queue.
+       ticker := time.NewTicker(5 * time.Second)
+       defer ticker.Stop()
+       for jobid, ok := disp.lsfqueue.JobID(ctr.UUID); ok; _, ok = disp.lsfqueue.JobID(ctr.UUID) {
+               err := disp.lsfcli.Bkill(jobid)
+               if err != nil {
+                       disp.logger.Warnf("%s: bkill(%d): %s", ctr.UUID, jobid, err)
+               }
+               <-ticker.C
+       }
+       return nil
+}
+
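+// submit wraps crunch-run (plus the container UUID) in a short shell
+// script and submits it to LSF via bsub.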
+func (disp *dispatcher) submit(container arvados.Container, crunchRunCommand []string) error {
+       // Start with an empty slice here to ensure append() doesn't
+       // modify crunchRunCommand's underlying array
+       var crArgs []string
+       crArgs = append(crArgs, crunchRunCommand...)
+       crArgs = append(crArgs, container.UUID)
+       crScript := execScript(crArgs)
+
+       bsubArgs, err := disp.bsubArgs(container)
+       if err != nil {
+               return err
+       }
+       return disp.lsfcli.Bsub(crScript, bsubArgs, disp.ArvClient)
+}
+
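+// bkill cancels the LSF job for the given container, if it is still
+// in the queue.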
+func (disp *dispatcher) bkill(ctr arvados.Container) {
+       if jobid, ok := disp.lsfqueue.JobID(ctr.UUID); !ok {
+               disp.logger.Debugf("bkill(%s): redundant, job not in queue", ctr.UUID)
+       } else if err := disp.lsfcli.Bkill(jobid); err != nil {
+               disp.logger.Warnf("%s: bkill(%d): %s", ctr.UUID, jobid, err)
+       }
+}
+
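+// bsubArgs builds the bsub command line: configured extra arguments,
+// the job name (container UUID), resource constraints, and an
+// optional sudo wrapper.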
+func (disp *dispatcher) bsubArgs(container arvados.Container) ([]string, error) {
+       args := []string{"bsub"}
+       args = append(args, disp.Cluster.Containers.LSF.BsubArgumentsList...)
+       args = append(args, "-J", container.UUID)
+       args = append(args, disp.bsubConstraintArgs(container)...)
+       if u := disp.Cluster.Containers.LSF.BsubSudoUser; u != "" {
+               args = append([]string{"sudo", "-E", "-u", u}, args...)
+       }
+       return args, nil
+}
+
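+// bsubConstraintArgs translates the container's runtime constraints
+// into a bsub resource request: memory and scratch space in MiB, and
+// CPU core affinity.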
+func (disp *dispatcher) bsubConstraintArgs(container arvados.Container) []string {
+       // TODO: propagate container.SchedulingParameters.Partitions
+       tmp := int64(math.Ceil(float64(dispatchcloud.EstimateScratchSpace(&container)) / 1048576))
+       vcpus := container.RuntimeConstraints.VCPUs
+       mem := int64(math.Ceil(float64(container.RuntimeConstraints.RAM+
+               container.RuntimeConstraints.KeepCacheRAM+
+               int64(disp.Cluster.Containers.ReserveExtraRAM)) / 1048576))
+       return []string{
+               "-R", fmt.Sprintf("rusage[mem=%dMB:tmp=%dMB] affinity[core(%d)]", mem, tmp, vcpus),
+       }
+}
+
+// Check the next bjobs report, and invoke TrackContainer for all the
+// containers in the report. This gives us a chance to cancel existing
+// Arvados LSF jobs (started by a previous dispatch process) that
+// never released their LSF job allocations even though their
+// container states are Cancelled or Complete. See
+// https://dev.arvados.org/issues/10979
+func (disp *dispatcher) checkLsfQueueForOrphans() {
+       containerUuidPattern := regexp.MustCompile(`^[a-z0-9]{5}-dz642-[a-z0-9]{15}$`)
+       for _, uuid := range disp.lsfqueue.All() {
+               if !containerUuidPattern.MatchString(uuid) || !strings.HasPrefix(uuid, disp.Cluster.ClusterID) {
+                       continue
+               }
+               err := disp.arvDispatcher.TrackContainer(uuid)
+               if err != nil {
+                       disp.logger.Warnf("checkLsfQueueForOrphans: TrackContainer(%s): %s", uuid, err)
+               }
+       }
+}
+
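+// execScript returns a shell script that execs the given command with
+// each argument single-quoted, e.g. execScript([]string{"sh", "-c",
+// "echo ok"}) yields "#!/bin/sh\nexec 'sh' '-c' 'echo ok'\n".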
+func execScript(args []string) []byte {
+       s := "#!/bin/sh\nexec"
+       for _, w := range args {
+               s += ` '`
+               s += strings.Replace(w, `'`, `'\''`, -1)
+               s += `'`
+       }
+       return []byte(s + "\n")
+}
diff --git a/lib/lsf/dispatch_test.go b/lib/lsf/dispatch_test.go
new file mode 100644 (file)
index 0000000..7cf6df6
--- /dev/null
@@ -0,0 +1,156 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lsf
+
+import (
+       "context"
+       "fmt"
+       "math/rand"
+       "os/exec"
+       "strconv"
+       "sync"
+       "testing"
+       "time"
+
+       "git.arvados.org/arvados.git/lib/config"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/arvadostest"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "github.com/prometheus/client_golang/prometheus"
+       "gopkg.in/check.v1"
+)
+
+func Test(t *testing.T) {
+       check.TestingT(t)
+}
+
+var _ = check.Suite(&suite{})
+
+type suite struct {
+       disp *dispatcher
+}
+
+func (s *suite) TearDownTest(c *check.C) {
+       arvados.NewClientFromEnv().RequestAndDecode(nil, "POST", "database/reset", nil, nil)
+}
+
+func (s *suite) SetUpTest(c *check.C) {
+       cfg, err := config.NewLoader(nil, ctxlog.TestLogger(c)).Load()
+       c.Assert(err, check.IsNil)
+       cluster, err := cfg.GetCluster("")
+       c.Assert(err, check.IsNil)
+       cluster.Containers.CloudVMs.PollInterval = arvados.Duration(time.Second)
+       s.disp = newHandler(context.Background(), cluster, arvadostest.Dispatch1Token, prometheus.NewRegistry()).(*dispatcher)
+       s.disp.lsfcli.stubCommand = func(string, ...string) *exec.Cmd {
+               return exec.Command("bash", "-c", "echo >&2 unimplemented stub; false")
+       }
+}
+
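+// lsfstub fakes the bsub, bjobs, and bkill command line tools,
+// maintaining an in-memory job queue for tests.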
+type lsfstub struct {
+       sudoUser  string
+       errorRate float64
+}
+
+func (stub lsfstub) stubCommand(c *check.C) func(prog string, args ...string) *exec.Cmd {
+       mtx := sync.Mutex{}
+       nextjobid := 100
+       fakejobq := map[int]string{}
+       return func(prog string, args ...string) *exec.Cmd {
+               c.Logf("stubCommand: %q %q", prog, args)
+               if rand.Float64() < stub.errorRate {
+                       return exec.Command("bash", "-c", "echo >&2 'stub random failure' && false")
+               }
+               if stub.sudoUser != "" && len(args) > 3 &&
+                       prog == "sudo" &&
+                       args[0] == "-E" &&
+                       args[1] == "-u" &&
+                       args[2] == stub.sudoUser {
+                       prog, args = args[3], args[4:]
+               }
+               switch prog {
+               case "bsub":
+                       c.Assert(args, check.HasLen, 4)
+                       c.Check(args[0], check.Equals, "-J")
+                       switch args[1] {
+                       case arvadostest.LockedContainerUUID:
+                               c.Check(args, check.DeepEquals, []string{"-J", arvadostest.LockedContainerUUID, "-R", "rusage[mem=11701MB:tmp=0MB] affinity[core(4)]"})
+                               mtx.Lock()
+                               fakejobq[nextjobid] = args[1]
+                               nextjobid++
+                               mtx.Unlock()
+                       case arvadostest.QueuedContainerUUID:
+                               c.Check(args, check.DeepEquals, []string{"-J", arvadostest.QueuedContainerUUID, "-R", "rusage[mem=11701MB:tmp=45777MB] affinity[core(4)]"})
+                               mtx.Lock()
+                               fakejobq[nextjobid] = args[1]
+                               nextjobid++
+                               mtx.Unlock()
+                       default:
+                               c.Errorf("unexpected uuid passed to bsub: args %q", args)
+                               return exec.Command("false")
+                       }
+                       return exec.Command("echo", "submitted job")
+               case "bjobs":
+                       c.Check(args, check.DeepEquals, []string{"-u", "all", "-noheader", "-o", "jobid stat job_name:30"})
+                       out := ""
+                       for jobid, uuid := range fakejobq {
+                               out += fmt.Sprintf(`%d %s %s\n`, jobid, "RUN", uuid)
+                       }
+                       c.Logf("bjobs out: %q", out)
+                       return exec.Command("printf", out)
+               case "bkill":
+                       killid, _ := strconv.Atoi(args[0])
+                       if uuid, ok := fakejobq[killid]; !ok {
+                               return exec.Command("bash", "-c", fmt.Sprintf("printf >&2 'Job <%d>: No matching job found\n'", killid))
+                       } else if uuid == "" {
+                               return exec.Command("bash", "-c", fmt.Sprintf("printf >&2 'Job <%d>: Job has already finished\n'", killid))
+                       } else {
+                               go func() {
+                                       time.Sleep(time.Millisecond)
+                                       mtx.Lock()
+                                       delete(fakejobq, killid)
+                                       mtx.Unlock()
+                               }()
+                               return exec.Command("bash", "-c", fmt.Sprintf("printf 'Job <%d> is being terminated\n'", killid))
+                       }
+               default:
+                       return exec.Command("bash", "-c", fmt.Sprintf("echo >&2 'stub: command not found: %+q'", prog))
+               }
+       }
+}
+
+func (s *suite) TestSubmit(c *check.C) {
+       s.disp.lsfcli.stubCommand = lsfstub{
+               errorRate: 0.1,
+               sudoUser:  s.disp.Cluster.Containers.LSF.BsubSudoUser,
+       }.stubCommand(c)
+       s.disp.Start()
+       deadline := time.Now().Add(20 * time.Second)
+       for range time.NewTicker(time.Second).C {
+               if time.Now().After(deadline) {
+                       c.Error("timed out")
+                       break
+               }
+               // "queuedcontainer" should be running
+               if _, ok := s.disp.lsfqueue.JobID(arvadostest.QueuedContainerUUID); !ok {
+                       continue
+               }
+               // "lockedcontainer" should be cancelled because it
+               // has priority 0 (no matching container requests)
+               if _, ok := s.disp.lsfqueue.JobID(arvadostest.LockedContainerUUID); ok {
+                       continue
+               }
+               var ctr arvados.Container
+               if err := s.disp.arvDispatcher.Arv.Get("containers", arvadostest.LockedContainerUUID, nil, &ctr); err != nil {
+                       c.Logf("error getting container state for %s: %s", arvadostest.LockedContainerUUID, err)
+                       continue
+               }
+               if ctr.State != arvados.ContainerStateQueued {
+                       c.Logf("LockedContainer is not in the LSF queue but its arvados record has not been updated to state==Queued (state is %q)", ctr.State)
+                       continue
+               }
+               c.Log("reached desired state")
+               break
+       }
+}
diff --git a/lib/lsf/lsfcli.go b/lib/lsf/lsfcli.go
new file mode 100644 (file)
index 0000000..9d712ee
--- /dev/null
@@ -0,0 +1,92 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lsf
+
+import (
+       "bytes"
+       "fmt"
+       "os"
+       "os/exec"
+       "strings"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "github.com/sirupsen/logrus"
+)
+
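+// bjobsEntry is one parsed line of bjobs output: numeric job ID,
+// status, and job name (the container UUID).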
+type bjobsEntry struct {
+       id   int
+       name string
+       stat string
+}
+
+type lsfcli struct {
+       logger logrus.FieldLogger
+       // (for testing) if non-nil, call stubCommand() instead of
+       // exec.Command() when running lsf command line programs.
+       stubCommand func(string, ...string) *exec.Cmd
+}
+
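+// command returns an exec.Cmd for prog with the given args, using the
+// stubCommand hook instead when one is installed (tests).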
+func (cli lsfcli) command(prog string, args ...string) *exec.Cmd {
+       if f := cli.stubCommand; f != nil {
+               return f(prog, args...)
+       } else {
+               return exec.Command(prog, args...)
+       }
+}
+
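+// Bsub submits the given script on stdin to the bsub command
+// specified by args, with the Arvados API host and token in the
+// submission environment.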
+func (cli lsfcli) Bsub(script []byte, args []string, arv *arvados.Client) error {
+       cli.logger.Infof("bsub command %q script %q", args, script)
+       cmd := cli.command(args[0], args[1:]...)
+       cmd.Env = append([]string(nil), os.Environ()...)
+       cmd.Env = append(cmd.Env, "ARVADOS_API_HOST="+arv.APIHost)
+       cmd.Env = append(cmd.Env, "ARVADOS_API_TOKEN="+arv.AuthToken)
+       if arv.Insecure {
+               cmd.Env = append(cmd.Env, "ARVADOS_API_HOST_INSECURE=1")
+       }
+       cmd.Stdin = bytes.NewReader(script)
+       out, err := cmd.Output()
+       cli.logger.WithField("stdout", string(out)).Infof("bsub finished")
+       return errWithStderr(err)
+}
+
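+// Bjobs returns the current LSF queue contents, one entry per
+// parseable line of bjobs output.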
+func (cli lsfcli) Bjobs() ([]bjobsEntry, error) {
+       cli.logger.Debugf("Bjobs()")
+       cmd := cli.command("bjobs", "-u", "all", "-noheader", "-o", "jobid stat job_name:30")
+       buf, err := cmd.Output()
+       if err != nil {
+               return nil, errWithStderr(err)
+       }
+       var bjobs []bjobsEntry
+       for _, line := range strings.Split(string(buf), "\n") {
+               if line == "" {
+                       continue
+               }
+               var ent bjobsEntry
+               if _, err := fmt.Sscan(line, &ent.id, &ent.stat, &ent.name); err != nil {
+                       cli.logger.Warnf("ignoring unparsed line in bjobs output: %q", line)
+                       continue
+               }
+               bjobs = append(bjobs, ent)
+       }
+       return bjobs, nil
+}
+
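+// Bkill cancels the LSF job with the given ID. A "job has already
+// finished" response is treated as success.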
+func (cli lsfcli) Bkill(id int) error {
+       cli.logger.Infof("Bkill(%d)", id)
+       cmd := cli.command("bkill", fmt.Sprintf("%d", id))
+       buf, err := cmd.CombinedOutput()
+       if err == nil || strings.Contains(string(buf), "already finished") {
+               return nil
+       } else {
+               return fmt.Errorf("%s (%q)", err, buf)
+       }
+}
+
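+// errWithStderr annotates an exec.ExitError with the command's
+// captured stderr output.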
+func errWithStderr(err error) error {
+       if err, ok := err.(*exec.ExitError); ok {
+               return fmt.Errorf("%s (%q)", err, err.Stderr)
+       }
+       return err
+}
diff --git a/lib/lsf/lsfqueue.go b/lib/lsf/lsfqueue.go
new file mode 100644 (file)
index 0000000..3c4fc4c
--- /dev/null
@@ -0,0 +1,108 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lsf
+
+import (
+       "sync"
+       "time"
+
+       "github.com/sirupsen/logrus"
+)
+
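+// lsfqueue polls the LSF queue with bjobs at regular intervals and
+// caches the latest results for lookup by job name (container UUID).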
+type lsfqueue struct {
+       logger logrus.FieldLogger
+       period time.Duration
+       lsfcli *lsfcli
+
+       initOnce  sync.Once
+       mutex     sync.Mutex
+       nextReady chan (<-chan struct{})
+       updated   *sync.Cond
+       latest    map[string]bjobsEntry
+}
+
+// JobID waits for the next queue update (so even a job that was only
+// submitted a nanosecond ago will show up) and then returns the LSF
+// job ID corresponding to the given container UUID.
+func (q *lsfqueue) JobID(uuid string) (int, bool) {
+       ent, ok := q.getNext()[uuid]
+       return ent.id, ok
+}
+
+// All waits for the next queue update, then returns the names of all
+// jobs in the queue. Used by checkLsfQueueForOrphans().
+func (q *lsfqueue) All() []string {
+       latest := q.getNext()
+       names := make([]string, 0, len(latest))
+       for name := range latest {
+               names = append(names, name)
+       }
+       return names
+}
+
+func (q *lsfqueue) SetPriority(uuid string, priority int64) {
+       q.initOnce.Do(q.init)
+       q.logger.Debug("SetPriority is not implemented")
+}
+
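+// getNext blocks until the next bjobs poll completes, then returns
+// the latest mapping from job name (container UUID) to queue entry.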
+func (q *lsfqueue) getNext() map[string]bjobsEntry {
+       q.initOnce.Do(q.init)
+       <-(<-q.nextReady)
+       q.mutex.Lock()
+       defer q.mutex.Unlock()
+       return q.latest
+}
+
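+// init starts the background goroutine that polls bjobs and
+// publishes each update to waiting callers.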
+func (q *lsfqueue) init() {
+       q.updated = sync.NewCond(&q.mutex)
+       q.nextReady = make(chan (<-chan struct{}))
+       ticker := time.NewTicker(time.Second)
+       go func() {
+               for range ticker.C {
+                       // Send a new "next update ready" channel to
+                       // the next goroutine that wants one (and any
+                       // others that have already queued up since
+                       // the first one started waiting).
+                       //
+                       // Below, when we get a new update, we'll
+                       // signal that to the other goroutines by
+                       // closing the ready chan.
+                       ready := make(chan struct{})
+                       q.nextReady <- ready
+                       for {
+                               select {
+                               case q.nextReady <- ready:
+                                       continue
+                               default:
+                               }
+                               break
+                       }
+                       // Run bjobs repeatedly if needed, until we
+                       // get valid output.
+                       var ents []bjobsEntry
+                       for {
+                               q.logger.Debug("running bjobs")
+                               var err error
+                               ents, err = q.lsfcli.Bjobs()
+                               if err == nil {
+                                       break
+                               }
+                               q.logger.Warnf("bjobs: %s", err)
+                               <-ticker.C
+                       }
+                       next := make(map[string]bjobsEntry, len(ents))
+                       for _, ent := range ents {
+                               next[ent.name] = ent
+                       }
+                       // Replace q.latest and notify all the
+                       // goroutines that the "next update" they
+                       // asked for is now ready.
+                       q.mutex.Lock()
+                       q.latest = next
+                       q.mutex.Unlock()
+                       close(ready)
+               }
+       }()
+}
index 9a52ee70214d7f9474086818768c6cc8fbdafc17..5c17a2fd14cb2429a52d17cdae1c0a7eb7438fe5 100644 (file)
@@ -5,9 +5,6 @@
 fpm_depends+=(nodejs)
 
 case "$TARGET" in
-    ubuntu1604)
-        fpm_depends+=(libcurl3-gnutls)
-        ;;
     debian* | ubuntu*)
         fpm_depends+=(libcurl3-gnutls python3-distutils)
         ;;
index dcffcd25e73051985c1242441cce0372ed0eb7dd..9e7eb521eec079a145c94a840a14b35e502b6f18 100644 (file)
@@ -330,6 +330,7 @@ type Services struct {
        Composer       Service
        Controller     Service
        DispatchCloud  Service
+       DispatchLSF    Service
        GitHTTP        Service
        GitSSH         Service
        Health         Service
@@ -462,6 +463,10 @@ type ContainersConfig struct {
                        AssignNodeHostname     string
                }
        }
+       LSF struct {
+               BsubSudoUser      string
+               BsubArgumentsList []string
+       }
 }
 
 type CloudVMsConfig struct {
@@ -598,6 +603,7 @@ const (
        ServiceNameRailsAPI      ServiceName = "arvados-api-server"
        ServiceNameController    ServiceName = "arvados-controller"
        ServiceNameDispatchCloud ServiceName = "arvados-dispatch-cloud"
+       ServiceNameDispatchLSF   ServiceName = "arvados-dispatch-lsf"
        ServiceNameHealth        ServiceName = "arvados-health"
        ServiceNameWorkbench1    ServiceName = "arvados-workbench1"
        ServiceNameWorkbench2    ServiceName = "arvados-workbench2"
@@ -615,6 +621,7 @@ func (svcs Services) Map() map[ServiceName]Service {
                ServiceNameRailsAPI:      svcs.RailsAPI,
                ServiceNameController:    svcs.Controller,
                ServiceNameDispatchCloud: svcs.DispatchCloud,
+               ServiceNameDispatchLSF:   svcs.DispatchLSF,
                ServiceNameHealth:        svcs.Health,
                ServiceNameWorkbench1:    svcs.Workbench1,
                ServiceNameWorkbench2:    svcs.Workbench2,
index b57dc849442f4934f10611acd0248539af3a827e..384bebb5997ee86b1b1be2396498f1554ee32ecc 100644 (file)
@@ -33,6 +33,9 @@ type Container struct {
        GatewayAddress            string                 `json:"gateway_address"`
        InteractiveSessionStarted bool                   `json:"interactive_session_started"`
        OutputStorageClasses      []string               `json:"output_storage_classes"`
+       RuntimeUserUUID           string                 `json:"runtime_user_uuid"`
+       RuntimeAuthScopes         []string               `json:"runtime_auth_scopes"`
+       RuntimeToken              string                 `json:"runtime_token"`
 }
 
 // ContainerRequest is an arvados#container_request resource.
index d770ca76d1876ed56274a89435c4322db536fbfa..9281f51d0cf0ee2b46ca97c2e59fde9f68051d4d 100644 (file)
@@ -48,6 +48,8 @@ const (
        QueuedContainerRequestUUID = "zzzzz-xvhdp-cr4queuedcontnr"
        QueuedContainerUUID        = "zzzzz-dz642-queuedcontainer"
 
+       LockedContainerUUID = "zzzzz-dz642-lockedcontainer"
+
        RunningContainerUUID = "zzzzz-dz642-runningcontainr"
 
        CompletedContainerUUID         = "zzzzz-dz642-compltcontainer"
index df43c2b10d9778d7c62befc8a3f7e71babacb168..00c75154f656a70e0b42deed7ef0e34fa7a01d7d 100644 (file)
@@ -7,11 +7,13 @@
 package dispatch
 
 import (
+       "bytes"
        "context"
        "fmt"
        "sync"
        "time"
 
+       "git.arvados.org/arvados.git/lib/dispatchcloud"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/arvadosclient"
        "github.com/sirupsen/logrus"
@@ -66,7 +68,7 @@ type Dispatcher struct {
 // running, and return.
 //
 // The DispatchFunc should not return until the container is finished.
-type DispatchFunc func(*Dispatcher, arvados.Container, <-chan arvados.Container)
+type DispatchFunc func(*Dispatcher, arvados.Container, <-chan arvados.Container) error
 
 // Run watches the API server's queue for containers that are either
 // ready to run and available to lock, or are already locked by this
@@ -170,9 +172,34 @@ func (d *Dispatcher) start(c arvados.Container) *runTracker {
        }
        tracker.updates <- c
        go func() {
-               d.RunContainer(d, c, tracker.updates)
-               // RunContainer blocks for the lifetime of the container.  When
-               // it returns, the tracker should delete itself.
+               err := d.RunContainer(d, c, tracker.updates)
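+               // RunContainer blocks for the lifetime of the
+               // container. If it reports an error, record the error
+               // in a "dispatch" log entry on the container (for
+               // ConstraintsNotSatisfiableError, also cancel the
+               // container), and unlock it.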
+               if err != nil {
+                       text := fmt.Sprintf("Error running container %s: %s", c.UUID, err)
+                       if err, ok := err.(dispatchcloud.ConstraintsNotSatisfiableError); ok {
+                               var logBuf bytes.Buffer
+                               fmt.Fprintf(&logBuf, "cannot run container %s: %s\n", c.UUID, err)
+                               if len(err.AvailableTypes) == 0 {
+                                       fmt.Fprint(&logBuf, "No instance types are configured.\n")
+                               } else {
+                                       fmt.Fprint(&logBuf, "Available instance types:\n")
+                                       for _, t := range err.AvailableTypes {
+                                               fmt.Fprintf(&logBuf,
+                                                       "Type %q: %d VCPUs, %d RAM, %d Scratch, %f Price\n",
+                                                       t.Name, t.VCPUs, t.RAM, t.Scratch, t.Price)
+                                       }
+                               }
+                               text = logBuf.String()
+                               d.UpdateState(c.UUID, Cancelled)
+                       }
+                       d.Logger.Printf("%s", text)
+                       lr := arvadosclient.Dict{"log": arvadosclient.Dict{
+                               "object_uuid": c.UUID,
+                               "event_type":  "dispatch",
+                               "properties":  map[string]string{"text": text}}}
+                       d.Arv.Create("logs", lr, nil)
+                       d.Unlock(c.UUID)
+               }
+
                d.mtx.Lock()
                delete(d.trackers, c.UUID)
                d.mtx.Unlock()
index 25a4d2b87902531b913cc2203c8439a748469753..4b115229b403bc69aadd275a0177b00d4d171f79 100644 (file)
@@ -35,11 +35,12 @@ func (s *suite) TestTrackContainer(c *C) {
        time.AfterFunc(10*time.Second, func() { done <- false })
        d := &Dispatcher{
                Arv: arv,
-               RunContainer: func(dsp *Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
+               RunContainer: func(dsp *Dispatcher, ctr arvados.Container, status <-chan arvados.Container) error {
                        for ctr := range status {
                                c.Logf("%#v", ctr)
                        }
                        done <- true
+                       return nil
                },
        }
        d.TrackContainer(arvadostest.QueuedContainerUUID)
index 2acf3e59ab81ae10ff816577f5f33fdaea8b9922..04106caa442cfb52fbc098a516112e11b55643bb 100644 (file)
@@ -153,6 +153,7 @@ func (s *AggregatorSuite) setAllServiceURLs(listen string) {
        for _, svc := range []*arvados.Service{
                &svcs.Controller,
                &svcs.DispatchCloud,
+               &svcs.DispatchLSF,
                &svcs.Keepbalance,
                &svcs.Keepproxy,
                &svcs.Keepstore,
index 14d89873b60f7d902a39a6b337eea78e8040d0c3..5b661ae9f71ac4984c3a22a6dfe8a58bbffbdb45 100644 (file)
@@ -54,6 +54,7 @@ func AddRequestIDs(h http.Handler) http.Handler {
                        }
                        req.Header.Set(HeaderRequestID, gen.Next())
                }
+               w.Header().Set("X-Request-Id", req.Header.Get("X-Request-Id"))
                h.ServeHTTP(w, req)
        })
 }
index 2cd6bb4d43ec5e675f5eca46963e2f9d620de0f6..3bc6f4afcddf7cea0e5b77b140ac1b7c6b3a7a62 100644 (file)
@@ -69,11 +69,11 @@ type ErrNotFound struct {
        multipleResponseError
 }
 
-type InsufficientReplicasError error
+type InsufficientReplicasError struct{ error }
 
-type OversizeBlockError error
+type OversizeBlockError struct{ error }
 
-var ErrOversizeBlock = OversizeBlockError(errors.New("Exceeded maximum block size (" + strconv.Itoa(BLOCKSIZE) + ")"))
+var ErrOversizeBlock = OversizeBlockError{error: errors.New("Exceeded maximum block size (" + strconv.Itoa(BLOCKSIZE) + ")")}
 var MissingArvadosApiHost = errors.New("Missing required environment variable ARVADOS_API_HOST")
 var MissingArvadosApiToken = errors.New("Missing required environment variable ARVADOS_API_TOKEN")
 var InvalidLocatorError = errors.New("Invalid locator")
index c52e07b8f6ea3b6ea417f9843049e3d99b986fa5..62268fa463e6dee07083a815e9b97536e83a5c40 100644 (file)
@@ -8,7 +8,6 @@ import (
        "bytes"
        "context"
        "crypto/md5"
-       "errors"
        "fmt"
        "io"
        "io/ioutil"
@@ -586,7 +585,7 @@ func (s *StandaloneSuite) TestPutWithTooManyFail(c *C) {
 
        _, replicas, err := kc.PutB([]byte("foo"))
 
-       c.Check(err, FitsTypeOf, InsufficientReplicasError(errors.New("")))
+       c.Check(err, FitsTypeOf, InsufficientReplicasError{})
        c.Check(replicas, Equals, 1)
        c.Check(<-st.handled, Equals, ks1[0].url)
 }
@@ -1109,7 +1108,7 @@ func (s *StandaloneSuite) TestPutProxyInsufficientReplicas(c *C) {
        _, replicas, err := kc.PutB([]byte("foo"))
        <-st.handled
 
-       c.Check(err, FitsTypeOf, InsufficientReplicasError(errors.New("")))
+       c.Check(err, FitsTypeOf, InsufficientReplicasError{})
        c.Check(replicas, Equals, 2)
 }
 
@@ -1187,7 +1186,7 @@ func (s *StandaloneSuite) TestPutBWant2ReplicasWithOnlyOneWritableLocalRoot(c *C
 
        _, replicas, err := kc.PutB([]byte("foo"))
 
-       c.Check(err, FitsTypeOf, InsufficientReplicasError(errors.New("")))
+       c.Check(err, FitsTypeOf, InsufficientReplicasError{})
        c.Check(replicas, Equals, 1)
 
        c.Check(<-st.handled, Equals, localRoots[fmt.Sprintf("zzzzz-bi6l4-fakefakefake%03d", 0)])
@@ -1225,7 +1224,7 @@ func (s *StandaloneSuite) TestPutBWithNoWritableLocalRoots(c *C) {
 
        _, replicas, err := kc.PutB([]byte("foo"))
 
-       c.Check(err, FitsTypeOf, InsufficientReplicasError(errors.New("")))
+       c.Check(err, FitsTypeOf, InsufficientReplicasError{})
        c.Check(replicas, Equals, 0)
 }
 
index a8c82aac0e70370dced92b3dd3f5bae249cb100c..633ec1896858bd484d6740b8e9dea074c12d82c9 100644 (file)
@@ -255,7 +255,7 @@ func (kc *KeepClient) BlockWrite(ctx context.Context, req arvados.BlockWriteOpti
                                                        msg += resp + "; "
                                                }
                                                msg = msg[:len(msg)-2]
-                                               return resp, InsufficientReplicasError(errors.New(msg))
+                                               return resp, InsufficientReplicasError{error: errors.New(msg)}
                                        }
                                        break
                                }
index fc33dde4477b45d059db2cbd7a63f919eb67e167..c39bdde4b878a26446404979b08e8c3bd08e2b75 100644 (file)
@@ -196,7 +196,7 @@ class ApplicationController < ActionController::Base
     end
     err[:errors] ||= args
     err[:errors].map! do |err|
-      err += " (" + Thread.current[:request_id] + ")"
+      err += " (#{request.request_id})"
     end
     err[:error_token] = [Time.now.utc.to_i, "%08x" % rand(16 ** 8)].join("+")
     status = err.delete(:status) || 422
@@ -419,17 +419,9 @@ class ApplicationController < ActionController::Base
   end
 
   def set_current_request_id
-    req_id = request.headers['X-Request-Id']
-    if !req_id || req_id.length < 1 || req_id.length > 1024
-      # Client-supplied ID is either missing or too long to be
-      # considered friendly.
-      req_id = "req-" + Random::DEFAULT.rand(2**128).to_s(36)[0..19]
-    end
-    response.headers['X-Request-Id'] = Thread.current[:request_id] = req_id
-    Rails.logger.tagged(req_id) do
+    Rails.logger.tagged(request.request_id) do
       yield
     end
-    Thread.current[:request_id] = nil
   end
 
   def append_info_to_payload(payload)
index ddae4581892dd8f1bbe727ff0b67b04addb4c0a0..af058494b2356628c73d9adb502a325d569e87ed 100644 (file)
@@ -21,7 +21,7 @@ class Container < ArvadosModel
   # already know how to properly treat them.
   attribute :secret_mounts, :jsonbHash, default: {}
   attribute :runtime_status, :jsonbHash, default: {}
-  attribute :runtime_auth_scopes, :jsonbHash, default: {}
+  attribute :runtime_auth_scopes, :jsonbArray, default: []
   attribute :output_storage_classes, :jsonbArray, default: ["default"]
 
   serialize :environment, Hash
diff --git a/services/api/config/initializers/request_id_middleware.rb b/services/api/config/initializers/request_id_middleware.rb
new file mode 100644 (file)
index 0000000..e215880
--- /dev/null
@@ -0,0 +1,25 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+module CustomRequestId
+  def make_request_id(req_id)
+    if !req_id || req_id.length < 1 || req_id.length > 1024
+      # Client-supplied ID is either missing or too long to be
+      # considered friendly.
+      internal_request_id
+    else
+      req_id
+    end
+  end
+
+  def internal_request_id
+    "req-" + Random::DEFAULT.rand(2**128).to_s(36)[0..19]
+  end
+end
+
+class ActionDispatch::RequestId
+  # Instead of using the default UUID-like format for X-Request-Id headers,
+  # use our own.
+  prepend CustomRequestId
+end
\ No newline at end of file
index 2cfa054448c29fcbbe3beb0b80edc37af514eb2e..af7882141e31973c7e28d0f42da16abb088ed88c 100644 (file)
@@ -24,9 +24,6 @@ class ApplicationControllerTest < ActionController::TestCase
     token_time = token.split('+', 2).first.to_i
     assert_operator(token_time, :>=, @start_stamp, "error token too old")
     assert_operator(token_time, :<=, now_timestamp, "error token too new")
-    json_response['errors'].each do |err|
-      assert_match(/req-[a-z0-9]{20}/, err, "X-Request-Id value missing on error message")
-    end
   end
 
   def check_404(errmsg="Path not found")
@@ -56,28 +53,6 @@ class ApplicationControllerTest < ActionController::TestCase
     check_error_token
   end
 
-  test "X-Request-Id header" do
-    authorize_with :spectator
-    get(:index)
-    assert_match /^req-[0-9a-zA-Z]{20}$/, response.headers['X-Request-Id']
-  end
-
-  # The response header is the one that gets logged, so this test also
-  # ensures we log the ID supplied in the request, if any.
-  test "X-Request-Id given by client" do
-    authorize_with :spectator
-    @request.headers['X-Request-Id'] = 'abcdefG'
-    get(:index)
-    assert_equal 'abcdefG', response.headers['X-Request-Id']
-  end
-
-  test "X-Request-Id given by client is ignored if too long" do
-    authorize_with :spectator
-    @request.headers['X-Request-Id'] = 'abcdefG' * 1000
-    get(:index)
-    assert_match /^req-[0-9a-zA-Z]{20}$/, response.headers['X-Request-Id']
-  end
-
   ['foo', '', 'FALSE', 'TRUE', nil, [true], {a:true}, '"true"'].each do |bogus|
     test "bogus boolean parameter #{bogus.inspect} returns error" do
       @controller = Arvados::V1::GroupsController.new
index d04e3838318dd8d67ed86c560cd907e032be4bb8..e3224f49127e83bf9b76f8887b83b65bf1733bc0 100644 (file)
@@ -14,6 +14,7 @@ class ErrorsTest < ActionDispatch::IntegrationTest
       assert_nil assigns(:object)
       assert_not_nil json_response['errors']
       assert_response 404
+      assert_match /^req-[0-9a-zA-Z]{20}$/, response.headers['X-Request-Id']
     end
   end
 
@@ -28,4 +29,30 @@ class ErrorsTest < ActionDispatch::IntegrationTest
                    "Unexpected new route: #{route.path.spec}")
     end
   end
+
+  test "X-Request-Id header" do
+    get "/", headers: auth(:spectator)
+    assert_match /^req-[0-9a-zA-Z]{20}$/, response.headers['X-Request-Id']
+  end
+
+  test "X-Request-Id header on non-existent object URL" do
+    get "/arvados/v1/container_requests/invalid",
+      params: {:format => :json}, headers: auth(:active)
+    assert_response 404
+    assert_match /^req-[0-9a-zA-Z]{20}$/, response.headers['X-Request-Id']
+  end
+
+  # The response header is the one that gets logged, so this test also
+  # ensures we log the ID supplied in the request, if any.
+  test "X-Request-Id given by client" do
+    get "/", headers: auth(:spectator).merge({'X-Request-Id': 'abcdefG'})
+    assert_equal 'abcdefG', response.headers['X-Request-Id']
+  end
+
+  test "X-Request-Id given by client is ignored if too long" do
+    authorize_with :spectator
+    long_req_id = 'abcdefG' * 1000
+    get "/", headers: auth(:spectator).merge({'X-Request-Id': long_req_id})
+    assert_match /^req-[0-9a-zA-Z]{20}$/, response.headers['X-Request-Id']
+  end
 end
index 7ac160eea33da990b976910de4693f87d97bdbda..6c2b3bc58d8efc12e2f93787800332196478cb32 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados git server
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index c202e683f2810e85ab6fddda40793f021b3e0eff..a3cb1341a4677e7ecdc7c03976da7483e47c1aa5 100644 (file)
@@ -169,7 +169,7 @@ type LocalRun struct {
 // crunch-run terminates, mark the container as Cancelled.
 func (lr *LocalRun) run(dispatcher *dispatch.Dispatcher,
        container arvados.Container,
-       status <-chan arvados.Container) {
+       status <-chan arvados.Container) error {
 
        uuid := container.UUID
 
@@ -179,7 +179,7 @@ func (lr *LocalRun) run(dispatcher *dispatch.Dispatcher,
                case lr.concurrencyLimit <- true:
                        break
                case <-lr.ctx.Done():
-                       return
+                       return lr.ctx.Err()
                }
 
                defer func() { <-lr.concurrencyLimit }()
@@ -270,4 +270,5 @@ Finish:
        }
 
        dispatcher.Logger.Printf("finalized container %v", uuid)
+       return nil
 }
index 692d81e5701b349330009d5b359da6acb830e3c8..e3dd113c710cd8cf5b1cede361d794ad7fd67839 100644 (file)
@@ -6,9 +6,6 @@ Description=Arvados Crunch Dispatcher for LOCAL service
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index d976bf0812950488b796cb063def8c960d128849..92b8d2adcd6fe22e20c66afc1d4f803521ccd545 100644 (file)
@@ -83,9 +83,9 @@ func (s *TestSuite) TestIntegration(c *C) {
 
        cl := arvados.Cluster{Containers: arvados.ContainersConfig{RuntimeEngine: "docker"}}
 
-       dispatcher.RunContainer = func(d *dispatch.Dispatcher, c arvados.Container, s <-chan arvados.Container) {
-               (&LocalRun{startCmd, make(chan bool, 8), ctx, &cl}).run(d, c, s)
-               cancel()
+       dispatcher.RunContainer = func(d *dispatch.Dispatcher, c arvados.Container, s <-chan arvados.Container) error {
+               defer cancel()
+               return (&LocalRun{startCmd, make(chan bool, 8), ctx, &cl}).run(d, c, s)
        }
 
        err = dispatcher.Run(ctx)
@@ -188,9 +188,9 @@ func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubRespon
 
        cl := arvados.Cluster{Containers: arvados.ContainersConfig{RuntimeEngine: "docker"}}
 
-       dispatcher.RunContainer = func(d *dispatch.Dispatcher, c arvados.Container, s <-chan arvados.Container) {
-               (&LocalRun{startCmd, make(chan bool, 8), ctx, &cl}).run(d, c, s)
-               cancel()
+       dispatcher.RunContainer = func(d *dispatch.Dispatcher, c arvados.Container, s <-chan arvados.Container) error {
+               defer cancel()
+               return (&LocalRun{startCmd, make(chan bool, 8), ctx, &cl}).run(d, c, s)
        }
 
        re := regexp.MustCompile(`(?ms).*` + expected + `.*`)
index 2f2f013c714a0be6bf863cbf8329efae62e616b6..584db38edf7e93ac57ad8929ca31e04de907b78d 100644 (file)
@@ -7,7 +7,6 @@ package main
 // Dispatcher service for Crunch that submits containers to the slurm queue.
 
 import (
-       "bytes"
        "context"
        "flag"
        "fmt"
@@ -271,7 +270,7 @@ func (disp *Dispatcher) submit(container arvados.Container, crunchRunCommand []s
 // already in the queue).  Cancel the slurm job if the container's
 // priority changes to zero or its state indicates it's no longer
 // running.
-func (disp *Dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
+func (disp *Dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) error {
        ctx, cancel := context.WithCancel(context.Background())
        defer cancel()
 
@@ -279,38 +278,9 @@ func (disp *Dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Contain
                log.Printf("Submitting container %s to slurm", ctr.UUID)
                cmd := []string{disp.cluster.Containers.CrunchRunCommand}
                cmd = append(cmd, disp.cluster.Containers.CrunchRunArgumentsList...)
-               if err := disp.submit(ctr, cmd); err != nil {
-                       var text string
-                       switch err := err.(type) {
-                       case dispatchcloud.ConstraintsNotSatisfiableError:
-                               var logBuf bytes.Buffer
-                               fmt.Fprintf(&logBuf, "cannot run container %s: %s\n", ctr.UUID, err)
-                               if len(err.AvailableTypes) == 0 {
-                                       fmt.Fprint(&logBuf, "No instance types are configured.\n")
-                               } else {
-                                       fmt.Fprint(&logBuf, "Available instance types:\n")
-                                       for _, t := range err.AvailableTypes {
-                                               fmt.Fprintf(&logBuf,
-                                                       "Type %q: %d VCPUs, %d RAM, %d Scratch, %f Price\n",
-                                                       t.Name, t.VCPUs, t.RAM, t.Scratch, t.Price,
-                                               )
-                                       }
-                               }
-                               text = logBuf.String()
-                               disp.UpdateState(ctr.UUID, dispatch.Cancelled)
-                       default:
-                               text = fmt.Sprintf("Error submitting container %s to slurm: %s", ctr.UUID, err)
-                       }
-                       log.Print(text)
-
-                       lr := arvadosclient.Dict{"log": arvadosclient.Dict{
-                               "object_uuid": ctr.UUID,
-                               "event_type":  "dispatch",
-                               "properties":  map[string]string{"text": text}}}
-                       disp.Arv.Create("logs", lr, nil)
-
-                       disp.Unlock(ctr.UUID)
-                       return
+               err := disp.submit(ctr, cmd)
+               if err != nil {
+                       return err
                }
        }
 
@@ -339,7 +309,7 @@ func (disp *Dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Contain
                        case dispatch.Locked:
                                disp.Unlock(ctr.UUID)
                        }
-                       return
+                       return nil
                case updated, ok := <-status:
                        if !ok {
                                log.Printf("container %s is done: cancel slurm job", ctr.UUID)
index 2af56c8d0c1e62b3c2a1d3eb5f0a5c1a65be7b4a..86830f3a7f67364d0a3dd783e598458984bff3a0 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados Crunch Dispatcher for SLURM
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index 480434de65d291fad8ac2ac8ff8fbb6092ece327..e7a89db23c8743b3e2934a5c26f12a446a5bd6a9 100644 (file)
@@ -104,7 +104,7 @@ func (sf *slurmFake) Cancel(name string) error {
 
 func (s *IntegrationSuite) integrationTest(c *C,
        expectBatch [][]string,
-       runContainer func(*dispatch.Dispatcher, arvados.Container)) arvados.Container {
+       runContainer func(*dispatch.Dispatcher, arvados.Container)) (arvados.Container, error) {
        arvadostest.ResetEnv()
 
        arv, err := arvadosclient.MakeArvadosClient()
@@ -123,18 +123,21 @@ func (s *IntegrationSuite) integrationTest(c *C,
 
        ctx, cancel := context.WithCancel(context.Background())
        doneRun := make(chan struct{})
+       doneDispatch := make(chan error)
 
        s.disp.Dispatcher = &dispatch.Dispatcher{
                Arv:        arv,
                PollPeriod: time.Second,
-               RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
+               RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) error {
                        go func() {
                                runContainer(disp, ctr)
                                s.slurm.queue = ""
                                doneRun <- struct{}{}
                        }()
-                       s.disp.runContainer(disp, ctr, status)
+                       err := s.disp.runContainer(disp, ctr, status)
                        cancel()
+                       doneDispatch <- err
+                       return nil
                },
        }
 
@@ -148,6 +151,7 @@ func (s *IntegrationSuite) integrationTest(c *C,
        err = s.disp.Dispatcher.Run(ctx)
        <-doneRun
        c.Assert(err, Equals, context.Canceled)
+       errDispatch := <-doneDispatch
 
        s.disp.sqCheck.Stop()
 
@@ -162,12 +166,12 @@ func (s *IntegrationSuite) integrationTest(c *C,
        var container arvados.Container
        err = arv.Get("containers", "zzzzz-dz642-queuedcontainer", nil, &container)
        c.Check(err, IsNil)
-       return container
+       return container, errDispatch
 }
 
 func (s *IntegrationSuite) TestNormal(c *C) {
        s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
-       container := s.integrationTest(c,
+       container, _ := s.integrationTest(c,
                nil,
                func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
                        dispatcher.UpdateState(container.UUID, dispatch.Running)
@@ -181,7 +185,7 @@ func (s *IntegrationSuite) TestCancel(c *C) {
        s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
        readyToCancel := make(chan bool)
        s.slurm.onCancel = func() { <-readyToCancel }
-       container := s.integrationTest(c,
+       container, _ := s.integrationTest(c,
                nil,
                func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
                        dispatcher.UpdateState(container.UUID, dispatch.Running)
@@ -199,7 +203,7 @@ func (s *IntegrationSuite) TestCancel(c *C) {
 }
 
 func (s *IntegrationSuite) TestMissingFromSqueue(c *C) {
-       container := s.integrationTest(c,
+       container, _ := s.integrationTest(c,
                [][]string{{
                        fmt.Sprintf("--job-name=%s", "zzzzz-dz642-queuedcontainer"),
                        fmt.Sprintf("--nice=%d", 10000),
@@ -218,24 +222,14 @@ func (s *IntegrationSuite) TestMissingFromSqueue(c *C) {
 
 func (s *IntegrationSuite) TestSbatchFail(c *C) {
        s.slurm = slurmFake{errBatch: errors.New("something terrible happened")}
-       container := s.integrationTest(c,
+       container, err := s.integrationTest(c,
                [][]string{{"--job-name=zzzzz-dz642-queuedcontainer", "--nice=10000", "--no-requeue", "--mem=11445", "--cpus-per-task=4", "--tmp=45777"}},
                func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
                        dispatcher.UpdateState(container.UUID, dispatch.Running)
                        dispatcher.UpdateState(container.UUID, dispatch.Complete)
                })
        c.Check(container.State, Equals, arvados.ContainerStateComplete)
-
-       arv, err := arvadosclient.MakeArvadosClient()
-       c.Assert(err, IsNil)
-
-       var ll arvados.LogList
-       err = arv.List("logs", arvadosclient.Dict{"filters": [][]string{
-               {"object_uuid", "=", container.UUID},
-               {"event_type", "=", "dispatch"},
-       }}, &ll)
-       c.Assert(err, IsNil)
-       c.Assert(len(ll.Items), Equals, 1)
+       c.Check(err, ErrorMatches, `something terrible happened`)
 }
 
 type StubbedSuite struct {
@@ -280,7 +274,7 @@ func (s *StubbedSuite) testWithServerStub(c *C, apiStubResponses map[string]arva
        dispatcher := dispatch.Dispatcher{
                Arv:        arv,
                PollPeriod: time.Second,
-               RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
+               RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) error {
                        go func() {
                                time.Sleep(time.Second)
                                disp.UpdateState(ctr.UUID, dispatch.Running)
@@ -288,6 +282,7 @@ func (s *StubbedSuite) testWithServerStub(c *C, apiStubResponses map[string]arva
                        }()
                        s.disp.runContainer(disp, ctr, status)
                        cancel()
+                       return nil
                },
        }
 
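With the hook now returning an error, the integration test funnels it out through the doneDispatch channel and returns it from integrationTest, so TestSbatchFail can assert directly on the returned message (ErrorMatches `something terrible happened`) instead of, as before, listing the logs table for a dispatch event.
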
index 7e049144ae1ef49a700ad71580d50a48b0df144f..2aab42b2a37c7c4be9a6ff6907a6b6c38c3373cf 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados Docker Image Cleaner
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index ccb7a467af45906f60710c94e0dfe98bf84bf034..31580433035692ee1d4110f06aec56e036f8d961 100644 (file)
@@ -3,9 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 case "$TARGET" in
-    ubuntu1604)
-        fpm_depends+=()
-        ;;
     debian* | ubuntu*)
         fpm_depends+=(python3-distutils)
         ;;
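
The dropped ubuntu1604 case deliberately added no dependencies, apparently because python3-distutils is not packaged separately on Ubuntu 16.04; with that target gone, every remaining Debian/Ubuntu build falls through to the generic branch and pulls in python3-distutils.
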
index ca3744c28db265832229ac5a0635114396164b35..4b8745d1549c6950e14cf74cdbc09edc12f1650d 100644 (file)
@@ -8,9 +8,6 @@ Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/config.yml
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index 0a38597e6caf28a3f1a4ba45a9b568c2920d56c8..859d70724106f815ab9a990e0a9615c09a4a5189 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados Keep Balance
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index 80978227770998fa2ac1b69d5149cfd4e2ca9530..cb5fdf84fc5f3362d34f3b2d56e729f2e74a6911 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados Keep web gateway
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index 4c63161a0d7a260ce7dd349f62214d6e7d162f36..9548cb219b1f2bd7efa2490590b30bcb52b85fdc 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados Keep Proxy
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index c49fbe0bb368b90733f985990acadb651323f7fe..4bdb4202026e3aeb2705fa79faa0ef3e7a296307 100644 (file)
@@ -7,7 +7,6 @@ package main
 import (
        "bytes"
        "crypto/md5"
-       "errors"
        "fmt"
        "io/ioutil"
        "math/rand"
@@ -265,12 +264,16 @@ func (s *ServerRequiredSuite) TestDesiredReplicas(c *C) {
        content := []byte("TestDesiredReplicas")
        hash := fmt.Sprintf("%x", md5.Sum(content))
 
-       for _, kc.Want_replicas = range []int{0, 1, 2} {
+       for _, kc.Want_replicas = range []int{0, 1, 2, 3} {
                locator, rep, err := kc.PutB(content)
-               c.Check(err, Equals, nil)
-               c.Check(rep, Equals, kc.Want_replicas)
-               if rep > 0 {
-                       c.Check(locator, Matches, fmt.Sprintf(`^%s\+%d(\+.+)?$`, hash, len(content)))
+               if kc.Want_replicas < 3 {
+                       c.Check(err, Equals, nil)
+                       c.Check(rep, Equals, kc.Want_replicas)
+                       if rep > 0 {
+                               c.Check(locator, Matches, fmt.Sprintf(`^%s\+%d(\+.+)?$`, hash, len(content)))
+                       }
+               } else {
+                       c.Check(err, ErrorMatches, ".*503.*")
                }
        }
 }
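
The loop above now also asks for three replicas, presumably more than the test's Keep services can provide, and expects an error mentioning 503 rather than a silently lower replica count. A minimal usage sketch of the same client calls, assuming ARVADOS_API_HOST/ARVADOS_API_TOKEN are set as in the integration tests:

package main

import (
	"fmt"
	"log"

	"git.arvados.org/arvados.git/sdk/go/arvadosclient"
	"git.arvados.org/arvados.git/sdk/go/keepclient"
)

// putWithReplicas writes a block asking Keep for `want` replicas and reports
// how many were actually stored; an unsatisfiable replica count now surfaces
// as an error instead of a short rep value.
func putWithReplicas(content []byte, want int) (locator string, rep int, err error) {
	arv, err := arvadosclient.MakeArvadosClient()
	if err != nil {
		return "", 0, err
	}
	kc, err := keepclient.MakeKeepClient(arv)
	if err != nil {
		return "", 0, err
	}
	kc.Want_replicas = want
	return kc.PutB(content)
}

func main() {
	loc, rep, err := putWithReplicas([]byte("example data"), 2)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(loc, rep)
}
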
@@ -438,7 +441,7 @@ func (s *ServerRequiredSuite) TestPutAskGetForbidden(c *C) {
        hash2, rep, err := kc.PutB([]byte("bar"))
        c.Check(hash2, Equals, "")
        c.Check(rep, Equals, 0)
-       c.Check(err, FitsTypeOf, keepclient.InsufficientReplicasError(errors.New("")))
+       c.Check(err, FitsTypeOf, keepclient.InsufficientReplicasError{})
 
        blocklen, _, err := kc.Ask(hash)
        c.Check(err, FitsTypeOf, &keepclient.ErrNotFound{})
@@ -491,7 +494,7 @@ func testPermission(c *C, admin bool, perm arvados.UploadDownloadPermission) {
                } else {
                        c.Check(hash2, Equals, "")
                        c.Check(rep, Equals, 0)
-                       c.Check(err, FitsTypeOf, keepclient.InsufficientReplicasError(errors.New("")))
+                       c.Check(err, FitsTypeOf, keepclient.InsufficientReplicasError{})
                }
                logbuf.Reset()
        }
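
These two checks change InsufficientReplicasError from a converted error value to a composite literal, i.e. the keepclient error is now a struct type. A hedged sketch of how calling code can detect it (errors.As works as long as the type still satisfies the error interface, which the FitsTypeOf checks above rely on; no assumption is made about the struct's fields):

package example

import (
	"errors"
	"log"

	"git.arvados.org/arvados.git/sdk/go/keepclient"
)

// reportPutFailure distinguishes "not enough Keep services accepted the block"
// from other PutB failures.
func reportPutFailure(err error) {
	var insufficient keepclient.InsufficientReplicasError
	if errors.As(err, &insufficient) {
		log.Printf("replication target not met: %v", err)
		return
	}
	if err != nil {
		log.Printf("put failed: %v", err)
	}
}
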
index 7047f0e6b970a0c0a9598d6e680d5589425c08c1..1f14c3f464c4b3a701a1f800e99534865a058908 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados Keep Storage Daemon
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index d8836f19b32704258a77a6a3b1446f7fda2fc416..8e5c6deb5dc8ca47a08dd169157116141aeb0518 100755 (executable)
@@ -144,7 +144,7 @@ begin
       if existing_groups.index(addgroup).nil?
         # User should be in group, but isn't, so add them.
         STDERR.puts "Add user #{username} to #{addgroup} group"
-        system("adduser", username, addgroup)
+        system("usermod", "-aG", addgroup, username)
       end
     end
 
@@ -152,7 +152,7 @@ begin
       if groups.index(removegroup).nil?
         # User is in a group, but shouldn't be, so remove them.
         STDERR.puts "Remove user #{username} from #{removegroup} group"
-        system("deluser", username, removegroup)
+        system("gpasswd", "-d", username, removegroup)
       end
     end
 
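usermod -aG and gpasswd -d come from shadow-utils and exist on both the Debian/Ubuntu and Red Hat package families, whereas the adduser and deluser calls they replace are Debian-specific wrappers, so the same postinst group management now appears to work across all supported targets.
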
index f7052efc105abcce54b1e50aa6b294debacf13b8..a7784fd7beced037199a5216f58ed9782235223c 100644 (file)
@@ -196,69 +196,59 @@ arvados:
         ProviderType: t3.small
         VCPUs: 2
         RAM: 2GiB
-        IncludedScratch: 50GB
         AddedScratch: 50GB
         Price: 0.0208
       c5large:
         ProviderType: c5.large
         VCPUs: 2
         RAM: 4GiB
-        IncludedScratch: 50GB
         AddedScratch: 50GB
         Price: 0.085
       m5large:
         ProviderType: m5.large
         VCPUs: 2
         RAM: 8GiB
-        IncludedScratch: 50GB
         AddedScratch: 50GB
         Price: 0.096
       c5xlarge:
         ProviderType: c5.xlarge
         VCPUs: 4
         RAM: 8GiB
-        IncludedScratch: 100GB
         AddedScratch: 100GB
         Price: 0.17
       m5xlarge:
         ProviderType: m5.xlarge
         VCPUs: 4
         RAM: 16GiB
-        IncludedScratch: 100GB
         AddedScratch: 100GB
         Price: 0.192
       m5xlarge_extradisk:
         ProviderType: m5.xlarge
         VCPUs: 4
         RAM: 16GiB
-        IncludedScratch: 400GB
         AddedScratch: 400GB
         Price: 0.193
       c52xlarge:
         ProviderType: c5.2xlarge
         VCPUs: 8
         RAM: 16GiB
-        IncludedScratch: 200GB
         AddedScratch: 200GB
         Price: 0.34
       m52xlarge:
         ProviderType: m5.2xlarge
         VCPUs: 8
         RAM: 32GiB
-        IncludedScratch: 200GB
         AddedScratch: 200GB
         Price: 0.384
       c54xlarge:
         ProviderType: c5.4xlarge
         VCPUs: 16
         RAM: 32GiB
-        IncludedScratch: 400GB
         AddedScratch: 400GB
         Price: 0.68
       m54xlarge:
         ProviderType: m5.4xlarge
         VCPUs: 16
         RAM: 64GiB
-        IncludedScratch: 400GB
         AddedScratch: 400GB
         Price: 0.768
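
Dropping IncludedScratch from each example entry leaves AddedScratch as the only scratch figure; the two fields are additive, so listing both with the same size overstated each instance's scratch space. A small sketch of the relationship using the InstanceType type from the Go SDK (field names mirror the YAML keys above; treat the exact struct layout as an assumption):

package example

import "git.arvados.org/arvados.git/sdk/go/arvados"

// totalScratch combines the two figures: IncludedScratch is space that comes
// with the instance type itself, AddedScratch is extra space the dispatcher
// attaches (e.g. an EBS volume), and the total available scratch is their sum.
func totalScratch(t arvados.InstanceType) arvados.ByteSize {
	return t.IncludedScratch + t.AddedScratch
}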