Merge branch '17535-test-provision-jenkins'
authorJavier Bértoli <jbertoli@curii.com>
Fri, 20 Aug 2021 00:57:19 +0000 (21:57 -0300)
committerJavier Bértoli <jbertoli@curii.com>
Fri, 20 Aug 2021 00:57:19 +0000 (21:57 -0300)
closes #17535
Arvados-DCO-1.1-Signed-off-by: Javier Bértoli <jbertoli@curii.com>

146 files changed:
apps/workbench/app/views/layouts/body.html.erb
apps/workbench/test/integration/application_layout_test.rb
build/package-build-dockerfiles/Makefile
build/package-build-dockerfiles/centos7/Dockerfile
build/package-build-dockerfiles/debian10/Dockerfile
build/package-build-dockerfiles/debian11/Dockerfile [moved from build/package-build-dockerfiles/ubuntu1604/Dockerfile with 77% similarity]
build/package-build-dockerfiles/ubuntu1804/Dockerfile
build/package-build-dockerfiles/ubuntu2004/Dockerfile
build/package-test-dockerfiles/Makefile
build/package-test-dockerfiles/centos7/Dockerfile
build/package-test-dockerfiles/debian10/Dockerfile
build/package-test-dockerfiles/debian11/Dockerfile [moved from build/package-test-dockerfiles/ubuntu1604/Dockerfile with 68% similarity]
build/package-test-dockerfiles/ubuntu1604/etc-apt-preferences.d-arvados [deleted file]
build/package-test-dockerfiles/ubuntu1804/Dockerfile
build/package-test-dockerfiles/ubuntu2004/Dockerfile
build/package-testing/test-packages-debian11.sh [moved from build/package-testing/test-packages-debian8.sh with 100% similarity]
build/package-testing/test-packages-debian9.sh [deleted symlink]
build/package-testing/test-packages-ubuntu1404.sh [deleted symlink]
build/package-testing/test-packages-ubuntu1604.sh [deleted symlink]
build/rails-package-scripts/arvados-api-server.sh
build/rails-package-scripts/postinst.sh
build/run-build-packages-one-target.sh
build/run-build-packages.sh
build/run-library.sh
build/run-tests.sh
cmd/arvados-server/arvados-controller.service
cmd/arvados-server/arvados-dispatch-cloud.service
cmd/arvados-server/arvados-dispatch-lsf.service [new file with mode: 0644]
cmd/arvados-server/arvados-ws.service
cmd/arvados-server/cmd.go
doc/_config.yml
doc/_includes/_create_superuser_token.liquid
doc/_includes/_install_rails_command.liquid
doc/_includes/_install_ruby_and_bundler.liquid
doc/admin/config-migration.html.textile.liquid
doc/admin/storage-classes.html.textile.liquid
doc/admin/token-expiration-policy.html.textile.liquid
doc/admin/upgrading.html.textile.liquid
doc/api/keep-web-urls.html.textile.liquid
doc/install/crunch2-lsf/install-dispatch.html.textile.liquid [new file with mode: 0644]
doc/install/crunch2-slurm/install-dispatch.html.textile.liquid
doc/install/install-arv-git-httpd.html.textile.liquid
doc/install/install-keep-balance.html.textile.liquid
doc/install/install-keep-web.html.textile.liquid
doc/install/install-manual-prerequisites.html.textile.liquid
doc/install/packages.html.textile.liquid
doc/install/singularity.html.textile.liquid [new file with mode: 0644]
doc/user/cwl/cwl-extensions.html.textile.liquid
doc/user/cwl/cwl-run-options.html.textile.liquid
doc/user/topics/arv-copy.html.textile.liquid
doc/user/topics/arv-docker.html.textile.liquid
doc/user/topics/storage-classes.html.textile.liquid
lib/boot/passenger.go
lib/boot/supervisor.go
lib/config/config.default.yml
lib/config/export.go
lib/config/generated_config.go
lib/config/load.go
lib/config/load_test.go
lib/controller/integration_test.go
lib/crunchrun/crunchrun.go
lib/crunchrun/crunchrun_test.go
lib/crunchrun/docker.go
lib/crunchrun/executor.go
lib/crunchrun/executor_test.go
lib/crunchrun/singularity.go
lib/install/deps.go
lib/lsf/dispatch.go [new file with mode: 0644]
lib/lsf/dispatch_test.go [new file with mode: 0644]
lib/lsf/lsfcli.go [new file with mode: 0644]
lib/lsf/lsfqueue.go [new file with mode: 0644]
lib/service/cmd.go
sdk/cwl/arvados_cwl/__init__.py
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
sdk/cwl/arvados_cwl/arvcontainer.py
sdk/cwl/arvados_cwl/arvtool.py
sdk/cwl/arvados_cwl/context.py
sdk/cwl/arvados_cwl/executor.py
sdk/cwl/fpm-info.sh
sdk/cwl/tests/test_container.py
sdk/cwl/tests/test_submit.py
sdk/cwl/tests/wf/submit_storage_class_wf.cwl [new file with mode: 0644]
sdk/go/arvados/collection.go
sdk/go/arvados/config.go
sdk/go/arvados/container.go
sdk/go/arvados/fs_base.go
sdk/go/arvadostest/fixtures.go
sdk/go/dispatch/dispatch.go
sdk/go/dispatch/dispatch_test.go
sdk/go/health/aggregator_test.go
sdk/go/httpserver/id_generator.go
sdk/go/keepclient/keepclient.go
sdk/go/keepclient/keepclient_test.go
sdk/go/keepclient/support.go
sdk/python/arvados/commands/arv_copy.py
sdk/python/arvados/commands/keepdocker.py
sdk/python/arvados/commands/put.py
sdk/python/tests/test_arv_copy.py
sdk/python/tests/test_arv_put.py
services/api/app/controllers/application_controller.rb
services/api/app/mailers/user_notifier.rb
services/api/app/models/collection.rb
services/api/app/models/container.rb
services/api/app/models/container_request.rb
services/api/config/arvados_config.rb
services/api/config/initializers/request_id_middleware.rb [new file with mode: 0644]
services/api/lib/tasks/manage_long_lived_tokens.rake
services/api/test/functional/application_controller_test.rb
services/api/test/integration/errors_test.rb
services/api/test/unit/collection_test.rb
services/api/test/unit/container_request_test.rb
services/api/test/unit/user_notifier_test.rb
services/arv-git-httpd/arvados-git-httpd.service
services/crunch-dispatch-local/crunch-dispatch-local.go
services/crunch-dispatch-local/crunch-dispatch-local.service
services/crunch-dispatch-local/crunch-dispatch-local_test.go
services/crunch-dispatch-slurm/crunch-dispatch-slurm.go
services/crunch-dispatch-slurm/crunch-dispatch-slurm.service
services/crunch-dispatch-slurm/crunch-dispatch-slurm_test.go
services/dockercleaner/arvados-docker-cleaner.service
services/dockercleaner/fpm-info.sh
services/fuse/arvados_fuse/fusedir.py
services/health/arvados-health.service
services/keep-balance/balance.go
services/keep-balance/balance_run_test.go
services/keep-balance/balance_test.go
services/keep-balance/block_state.go
services/keep-balance/block_state_test.go [new file with mode: 0644]
services/keep-balance/collection.go
services/keep-balance/collection_test.go
services/keep-balance/integration_test.go
services/keep-balance/keep-balance.service
services/keep-balance/main.go
services/keep-balance/main_test.go
services/keep-balance/server.go
services/keep-web/keep-web.service
services/keepproxy/keepproxy.service
services/keepproxy/keepproxy_test.go
services/keepstore/handler_test.go
services/keepstore/handlers.go
services/keepstore/keepstore.service
services/keepstore/volume.go
services/login-sync/bin/arvados-login-sync
tools/salt-install/config_examples/multi_host/aws/pillars/arvados.sls

index 9da55cbeb3fedd36a3add088ab8a9d0e29607ed1..ed19d51d2574a0b7bc044e923e4d3c66d5ca4e0b 100644 (file)
@@ -161,15 +161,9 @@ SPDX-License-Identifier: AGPL-3.0 %>
                   <li role="menuitem"><a href="/groups">
                       <i class="fa fa-lg fa-users fa-fw"></i> Groups
                   </a></li>
-                  <li role="menuitem"><a href="/nodes">
-                      <i class="fa fa-lg fa-cloud fa-fw"></i> Compute nodes
-                  </a></li>
                   <li role="menuitem"><a href="/keep_services">
                       <i class="fa fa-lg fa-exchange fa-fw"></i> Keep services
                   </a></li>
-                  <li role="menuitem"><a href="/keep_disks">
-                      <i class="fa fa-lg fa-hdd-o fa-fw"></i> Keep disks
-                  </a></li>
                 </ul>
               </li>
             <% end %>
index 7d34c43deb5103d72ac986f90a49dd24c48a5e73..35a1415213db789d441c4973b1f544ff16b6797c 100644 (file)
@@ -251,9 +251,7 @@ class ApplicationLayoutTest < ActionDispatch::IntegrationTest
     ['SSH keys', nil, 'public_key'],
     ['Links', nil, 'link_class'],
     ['Groups', nil, 'All users'],
-    ['Compute nodes', nil, 'ping_secret'],
     ['Keep services', nil, 'service_ssl_flag'],
-    ['Keep disks', nil, 'bytes_free'],
   ].each do |page_name, add_button_text, look_for|
     test "test system menu #{page_name} link" do
       visit page_with_token('admin')
index 161a50406998f0d486aa721f429397a5993f5003..80f7c12c101ceb3910fdde2a17b793d4841069cb 100644 (file)
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-all: centos7/generated debian10/generated ubuntu1604/generated ubuntu1804/generated ubuntu2004/generated
+all: centos7/generated debian10/generated debian11/generated ubuntu1804/generated ubuntu2004/generated
 
 centos7/generated: common-generated-all
        test -d centos7/generated || mkdir centos7/generated
@@ -12,9 +12,9 @@ debian10/generated: common-generated-all
        test -d debian10/generated || mkdir debian10/generated
        cp -f -rlt debian10/generated common-generated/*
 
-ubuntu1604/generated: common-generated-all
-       test -d ubuntu1604/generated || mkdir ubuntu1604/generated
-       cp -f -rlt ubuntu1604/generated common-generated/*
+debian11/generated: common-generated-all
+       test -d debian11/generated || mkdir debian11/generated
+       cp -f -rlt debian11/generated common-generated/*
 
 ubuntu1804/generated: common-generated-all
        test -d ubuntu1804/generated || mkdir ubuntu1804/generated
index b2d32b6761a78231955bb14d1b691dfbccc0d79a..332a8dc83003d4512da61e27613b6f2121eaa426 100644 (file)
@@ -16,7 +16,7 @@ RUN gpg --import --no-tty /tmp/mpapis.asc && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.5 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.5 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.0.2 && \
+    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.2.19 && \
     /usr/local/rvm/bin/rvm-exec default gem install fpm --version 1.10.2
 
 # Install Bash 4.4.12 // see https://dev.arvados.org/issues/15612
index 90833f011d3d1029d2e89221beb966a51757cde4..ad8198fa0db0461fefcf49aca0c7aff9b77db734 100644 (file)
@@ -22,7 +22,7 @@ RUN gpg --import --no-tty /tmp/mpapis.asc && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.5 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.5 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.0.2 && \
+    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.2.19 && \
     /usr/local/rvm/bin/rvm-exec default gem install fpm --version 1.10.2
 
 # Install golang binary
similarity index 77%
rename from build/package-build-dockerfiles/ubuntu1604/Dockerfile
rename to build/package-build-dockerfiles/debian11/Dockerfile
index 880576f2bf480f4bbde7eb925fd6643dab5158af..99effa3cca6c922b7a6f237b92bb895082b3ed46 100644 (file)
@@ -2,13 +2,14 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-FROM ubuntu:xenial
+## Don't use debian:11 here, since the word 'bullseye' is used for rvm precompiled binaries
+FROM debian:bullseye
 MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 
 ENV DEBIAN_FRONTEND noninteractive
 
 # Install dependencies.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python3 python-setuptools python3-setuptools python3-pip libcurl4-gnutls-dev libgnutls-dev curl git libattr1-dev libfuse-dev libpq-dev unzip tzdata python3-venv python3-dev libpam-dev
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python3 python3-setuptools python3-pip libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev unzip python3-venv python3-dev libpam-dev equivs
 
 # Install virtualenv
 RUN /usr/bin/pip3 install 'virtualenv<20'
@@ -21,7 +22,8 @@ RUN gpg --import --no-tty /tmp/mpapis.asc && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.5 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.5 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.0.2 && \
+    echo "gem: --no-document" >> /etc/gemrc && \
+    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.2.19 && \
     /usr/local/rvm/bin/rvm-exec default gem install fpm --version 1.10.2
 
 # Install golang binary
@@ -35,4 +37,4 @@ RUN ln -s /usr/local/node-v10.23.1-linux-x64/bin/* /usr/local/bin/
 RUN git clone --depth 1 git://git.arvados.org/arvados.git /tmp/arvados && cd /tmp/arvados/services/api && /usr/local/rvm/bin/rvm-exec default bundle && cd /tmp/arvados/apps/workbench && /usr/local/rvm/bin/rvm-exec default bundle
 
 ENV WORKSPACE /arvados
-CMD ["/usr/local/rvm/bin/rvm-exec", "default", "bash", "/jenkins/run-build-packages.sh", "--target", "ubuntu1604"]
+CMD ["/usr/local/rvm/bin/rvm-exec", "default", "bash", "/jenkins/run-build-packages.sh", "--target", "debian11"]
index fd18f3e202a81ecf29a29335221c8b84d1b40dc4..65c477a91684feaa19acb940b816013377848190 100644 (file)
@@ -21,7 +21,7 @@ RUN gpg --import --no-tty /tmp/mpapis.asc && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.5 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.5 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.0.2 && \
+    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.2.19 && \
     /usr/local/rvm/bin/rvm-exec default gem install fpm --version 1.10.2
 
 # Install golang binary
index 0e11b8738e1f9a6a4d8e825f1a6a8a2ccfee8437..ba6c57df3880797e22218893ab1e42d6660227c4 100644 (file)
@@ -21,7 +21,7 @@ RUN gpg --import --no-tty /tmp/mpapis.asc && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.5 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.5 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.0.2 && \
+    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.2.19 && \
     /usr/local/rvm/bin/rvm-exec default gem install fpm --version 1.10.2
 
 # Install golang binary
index 227b74bbab35faa2f7c12fe939e03fc51d2487de..849decb9a5016bb2676ea4a9e0e92ba639edd6d9 100644 (file)
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-all: centos7/generated debian10/generated ubuntu1604/generated ubuntu1804/generated ubuntu2004/generated
+all: centos7/generated debian10/generated debian11/generated ubuntu1804/generated ubuntu2004/generated
 
 centos7/generated: common-generated-all
        test -d centos7/generated || mkdir centos7/generated
@@ -12,9 +12,9 @@ debian10/generated: common-generated-all
        test -d debian10/generated || mkdir debian10/generated
        cp -f -rlt debian10/generated common-generated/*
 
-ubuntu1604/generated: common-generated-all
-       test -d ubuntu1604/generated || mkdir ubuntu1604/generated
-       cp -f -rlt ubuntu1604/generated common-generated/*
+debian11/generated: common-generated-all
+       test -d debian11/generated || mkdir debian11/generated
+       cp -f -rlt debian11/generated common-generated/*
 
 ubuntu1804/generated: common-generated-all
        test -d ubuntu1804/generated || mkdir ubuntu1804/generated
index eebeac0c83b993c4cbd61f5e0453849586f4e1e1..f83941824e2e05c192f042ac3bb2cd4347762120 100644 (file)
@@ -17,7 +17,7 @@ RUN touch /var/lib/rpm/* && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.5 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.5 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.0.2
+    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.2.19
 
 # Install Bash 4.4.12  // see https://dev.arvados.org/issues/15612
 RUN cd /usr/local/src \
index 32996e4a54eb395252730198d87d49f08eb78b70..3f9393ee55e592f3deb93bc2a2cf15c6881d245d 100644 (file)
@@ -19,7 +19,7 @@ RUN gpg --import --no-tty /tmp/mpapis.asc && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.5 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.5 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.0.2
+    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.2.19
 
 # udev daemon can't start in a container, so don't try.
 RUN mkdir -p /etc/udev/disabled
similarity index 68%
rename from build/package-test-dockerfiles/ubuntu1604/Dockerfile
rename to build/package-test-dockerfiles/debian11/Dockerfile
index e0432c20eed471f728bc69b320dbb57849df0b16..7cc543cf0d67619f7107d9dacd07497b06c07b41 100644 (file)
@@ -2,14 +2,14 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-FROM ubuntu:xenial
+FROM debian:bullseye
 MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 
 ENV DEBIAN_FRONTEND noninteractive
 
 # Install dependencies
 RUN apt-get update && \
-    apt-get -y install --no-install-recommends curl ca-certificates
+    apt-get -y install --no-install-recommends curl ca-certificates gpg procps gpg-agent
 
 # Install RVM
 ADD generated/mpapis.asc /tmp/
@@ -19,14 +19,10 @@ RUN gpg --import --no-tty /tmp/mpapis.asc && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.5 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.5 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.0.2
+    echo "gem: --no-document" >> /etc/gemrc && \
+    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.2.19
 
 # udev daemon can't start in a container, so don't try.
 RUN mkdir -p /etc/udev/disabled
 
-RUN echo "deb file:///arvados/packages/ubuntu1604/ /" >>/etc/apt/sources.list
-
-# Add preferences file for the Arvados packages. This pins Arvados
-# packages at priority 501, so that older python dependency versions
-# are preferred in those cases where we need them
-ADD etc-apt-preferences.d-arvados /etc/apt/preferences.d/arvados
+RUN echo "deb file:///arvados/packages/debian11/ /" >>/etc/apt/sources.list
diff --git a/build/package-test-dockerfiles/ubuntu1604/etc-apt-preferences.d-arvados b/build/package-test-dockerfiles/ubuntu1604/etc-apt-preferences.d-arvados
deleted file mode 100644 (file)
index 9e24695..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-Package: *
-Pin: release o=Arvados
-Pin-Priority: 501
index 2d4189879ec6c86ec38e7f80d49b1a15569c65f3..7347f32c8fe7c5afa36c903dff25671f8c2650c2 100644 (file)
@@ -19,7 +19,7 @@ RUN gpg --import --no-tty /tmp/mpapis.asc && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.5 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.5 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.0.2
+    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.2.19
 
 # udev daemon can't start in a container, so don't try.
 RUN mkdir -p /etc/udev/disabled
index 0a3bda8f147654e7c07e2737deec30fd0bc5142e..061c8848ee39e9f6a409a7d25e4fc1333cb79bfe 100644 (file)
@@ -19,7 +19,7 @@ RUN gpg --import --no-tty /tmp/mpapis.asc && \
     curl -L https://get.rvm.io | bash -s stable && \
     /usr/local/rvm/bin/rvm install 2.5 && \
     /usr/local/rvm/bin/rvm alias create default ruby-2.5 && \
-    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.0.2
+    /usr/local/rvm/bin/rvm-exec default gem install bundler --version 2.2.19
 
 # udev daemon can't start in a container, so don't try.
 RUN mkdir -p /etc/udev/disabled
diff --git a/build/package-testing/test-packages-debian9.sh b/build/package-testing/test-packages-debian9.sh
deleted file mode 120000 (symlink)
index 54ce94c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-deb-common-test-packages.sh
\ No newline at end of file
diff --git a/build/package-testing/test-packages-ubuntu1404.sh b/build/package-testing/test-packages-ubuntu1404.sh
deleted file mode 120000 (symlink)
index 54ce94c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-deb-common-test-packages.sh
\ No newline at end of file
diff --git a/build/package-testing/test-packages-ubuntu1604.sh b/build/package-testing/test-packages-ubuntu1604.sh
deleted file mode 120000 (symlink)
index 54ce94c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-deb-common-test-packages.sh
\ No newline at end of file
index 027383ab4f62294aa9661b5e9ce1651457f3bf1a..a513c3ad09f885dc18c7187822418d77e535e201 100644 (file)
@@ -21,7 +21,7 @@ setup_before_nginx_restart() {
   # initialize git_internal_dir
   # usually /var/lib/arvados/internal.git (set in application.default.yml )
   if [ "$APPLICATION_READY" = "1" ]; then
-      GIT_INTERNAL_DIR=$($COMMAND_PREFIX bundle exec rake config:dump 2>&1 | grep GitInternalDir | awk '{ print $2 }' |tr -d '"')
+      GIT_INTERNAL_DIR=$($COMMAND_PREFIX bin/rake config:dump 2>&1 | grep GitInternalDir | awk '{ print $2 }' |tr -d '"')
       if [ ! -e "$GIT_INTERNAL_DIR" ]; then
         run_and_report "Creating git_internal_dir '$GIT_INTERNAL_DIR'" \
           mkdir -p "$GIT_INTERNAL_DIR"
index bcd7a27c85125c729fb304ef7a2e70d105950044..f6ae48c0fc4e9373be9d3756698016f28f64493d 100644 (file)
@@ -125,17 +125,17 @@ setup_conffile() {
 }
 
 prepare_database() {
-  DB_MIGRATE_STATUS=`$COMMAND_PREFIX bundle exec rake db:migrate:status 2>&1 || true`
+  DB_MIGRATE_STATUS=`$COMMAND_PREFIX bin/rake db:migrate:status 2>&1 || true`
   if echo "$DB_MIGRATE_STATUS" | grep -qF 'Schema migrations table does not exist yet.'; then
       # The database exists, but the migrations table doesn't.
-      run_and_report "Setting up database" $COMMAND_PREFIX bundle exec \
-                     rake "$RAILSPKG_DATABASE_LOAD_TASK" db:seed
+      run_and_report "Setting up database" $COMMAND_PREFIX bin/rake \
+                     "$RAILSPKG_DATABASE_LOAD_TASK" db:seed
   elif echo "$DB_MIGRATE_STATUS" | grep -q '^database: '; then
       run_and_report "Running db:migrate" \
-                     $COMMAND_PREFIX bundle exec rake db:migrate
+                     $COMMAND_PREFIX bin/rake db:migrate
   elif echo "$DB_MIGRATE_STATUS" | grep -q 'database .* does not exist'; then
       if ! run_and_report "Running db:setup" \
-           $COMMAND_PREFIX bundle exec rake db:setup 2>/dev/null; then
+           $COMMAND_PREFIX bin/rake db:setup 2>/dev/null; then
           echo "Warning: unable to set up database." >&2
           DATABASE_READY=0
       fi
@@ -198,12 +198,15 @@ configure_version() {
   cd "$RELEASE_PATH"
   export RAILS_ENV=production
 
-  if ! $COMMAND_PREFIX bundle --version >/dev/null; then
-      run_and_report "Installing bundler" $COMMAND_PREFIX gem install bundler --version 1.17.3
+  if ! $COMMAND_PREFIX bundle --version >/dev/null 2>&1; then
+      run_and_report "Installing bundler" $COMMAND_PREFIX gem install bundler --version 2.2.19 --no-document
   fi
 
+  run_and_report "Running bundle config set --local path $SHARED_PATH/vendor_bundle" \
+      $COMMAND_PREFIX bin/bundle config set --local path $SHARED_PATH/vendor_bundle
+
   run_and_report "Running bundle install" \
-      $COMMAND_PREFIX bundle install --path $SHARED_PATH/vendor_bundle --local --quiet
+      $COMMAND_PREFIX bin/bundle install --local --quiet
 
   echo -n "Ensuring directory and file permissions ..."
   # Ensure correct ownership of a few files
@@ -230,7 +233,7 @@ configure_version() {
       # warn about config errors (deprecated/removed keys from
       # previous version, etc)
       run_and_report "Checking configuration for completeness" \
-                     $COMMAND_PREFIX bundle exec rake config:check || APPLICATION_READY=0
+                     $COMMAND_PREFIX bin/rake config:check || APPLICATION_READY=0
   else
       APPLICATION_READY=0
   fi
index 8365fecadbd29705924623374ba21763f934185e..81aac9c616c11ea2894482b240c08495a577511d 100755 (executable)
@@ -106,6 +106,9 @@ while [ $# -gt 0 ]; do
             elif ! [[ "$2" =~ (.*)-(.*) ]]; then
                 echo >&2 "FATAL: --build-version '$2' does not include an iteration. Try '${2}-1'?"
                 exit 1
+            elif ! [[ "$2" =~ ^[0-9]+\.[0-9]+\.[0-9]+(\.[0-9]+|)(~rc[0-9]+|~dev[0-9]+|)-[0-9]+$ ]]; then
+                echo >&2 "FATAL: --build-version '$2' is invalid, must match pattern ^[0-9]+\.[0-9]+\.[0-9]+(\.[0-9]+|)(~rc[0-9]+|~dev[0-9]+|)-[0-9]+$"
+                exit 1
             else
                 ARVADOS_BUILDING_VERSION="${BASH_REMATCH[1]}"
                 ARVADOS_BUILDING_ITERATION="${BASH_REMATCH[2]}"
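The guard added above rejects malformed --build-version values before any packages are built. As a rough standalone illustration of which strings the pattern accepts — written in Go rather than the script's bash, purely for explanation and not part of the change:

    // Hedged sketch: exercises the same pattern the build script checks in
    // bash. Go's RE2 syntax accepts this pattern unchanged.
    package main

    import (
        "fmt"
        "regexp"
    )

    var buildVersion = regexp.MustCompile(`^[0-9]+\.[0-9]+\.[0-9]+(\.[0-9]+|)(~rc[0-9]+|~dev[0-9]+|)-[0-9]+$`)

    func main() {
        for _, v := range []string{"2.2.1-1", "2.2.1.2~rc3-1", "2.2.1"} {
            // "2.2.1" fails: it lacks the required "-<iteration>" suffix.
            fmt.Printf("%-16s %v\n", v, buildVersion.MatchString(v))
        }
    }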
@@ -194,6 +197,7 @@ if test -z "$packages" ; then
         arvados-client
         arvados-controller
         arvados-dispatch-cloud
+        arvados-dispatch-lsf
         arvados-docker-cleaner
         arvados-git-httpd
         arvados-health
index e231a83df8d22e291a7d048bc8fa3d00c136a109..7829c8c6cd61792535960a153bb20baf1b7e1622 100755 (executable)
@@ -277,6 +277,8 @@ package_go_binary cmd/arvados-server arvados-controller \
     "Arvados cluster controller daemon"
 package_go_binary cmd/arvados-server arvados-dispatch-cloud \
     "Arvados cluster cloud dispatch"
+package_go_binary cmd/arvados-server arvados-dispatch-lsf \
+    "Dispatch Arvados containers to an LSF cluster"
 package_go_binary services/arv-git-httpd arvados-git-httpd \
     "Provide authenticated http access to Arvados-hosted git repositories"
 package_go_binary services/crunch-dispatch-local crunch-dispatch-local \
@@ -401,8 +403,8 @@ if [[ "$?" == "0" ]] ; then
       mv /tmp/x /etc/arvados/config.yml
       perl -p -i -e 'BEGIN{undef $/;} s/WebDAV(.*?):\n( *)ExternalURL: ""/WebDAV$1:\n$2ExternalURL: "example.com"/g' /etc/arvados/config.yml
 
-      ARVADOS_CONFIG=none RAILS_ENV=production RAILS_GROUPS=assets bundle exec rake npm:install >"$STDOUT_IF_DEBUG"
-      ARVADOS_CONFIG=none RAILS_ENV=production RAILS_GROUPS=assets bundle exec rake assets:precompile >"$STDOUT_IF_DEBUG"
+      ARVADOS_CONFIG=none RAILS_ENV=production RAILS_GROUPS=assets bin/rake npm:install >"$STDOUT_IF_DEBUG"
+      ARVADOS_CONFIG=none RAILS_ENV=production RAILS_GROUPS=assets bin/rake assets:precompile >"$STDOUT_IF_DEBUG"
 
       # Remove generated configuration files so they don't go in the package.
       rm -rf /etc/arvados/
index 0c3cbde8d86cafdba972d0f45cd245033b49a3b4..22bb065872e357aae2db7fc0437203d949c452df 100755 (executable)
@@ -337,7 +337,7 @@ test_package_presence() {
     elif [[ "$FORMAT" == "deb" ]]; then
       declare -A dd
       dd[debian10]=buster
-      dd[ubuntu1604]=xenial
+      dd[debian11]=bullseye
       dd[ubuntu1804]=bionic
       dd[ubuntu2004]=focal
       D=${dd[$TARGET]}
@@ -621,6 +621,11 @@ fpm_build_virtualenv () {
   LICENSE_STRING=`grep license $WORKSPACE/$PKG_DIR/setup.py|cut -f2 -d=|sed -e "s/[',\\"]//g"`
   COMMAND_ARR+=('--license' "$LICENSE_STRING")
 
+  if [[ "$FORMAT" == "rpm" ]]; then
+    # Make sure to conflict with the old rh-python36 packages we used to publish
+    COMMAND_ARR+=('--conflicts' "rh-python36-python-$PKG")
+  fi
+
   if [[ "$DEBUG" != "0" ]]; then
     COMMAND_ARR+=('--verbose' '--log' 'info')
   fi
@@ -685,8 +690,8 @@ fpm_build_virtualenv () {
   fi
 
   # the python3-arvados-cwl-runner package comes with cwltool, expose that version
-  if [[ -e "$WORKSPACE/$PKG_DIR/dist/build/usr/share/$python/dist/python-arvados-cwl-runner/bin/cwltool" ]]; then
-    COMMAND_ARR+=("usr/share/$python/dist/python-arvados-cwl-runner/bin/cwltool=/usr/bin/")
+  if [[ -e "$WORKSPACE/$PKG_DIR/dist/build/usr/share/$python/dist/$PYTHON_PKG/bin/cwltool" ]]; then
+    COMMAND_ARR+=("usr/share/$python/dist/$PYTHON_PKG/bin/cwltool=/usr/bin/")
   fi
 
   COMMAND_ARR+=(".")
index e309d24011d4f01079ae7707c2e6b1f0bbd45832..71da30ce43be5bc0b9e53fc2598fdc9face4e8c7 100755 (executable)
@@ -517,10 +517,10 @@ setup_ruby_environment() {
             || fatal 'rvm gemset setup'
 
         rvm env
-        (bundle version | grep -q 2.0.2) || gem install bundler -v 2.0.2
+        (bundle version | grep -q 2.2.19) || gem install bundler -v 2.2.19
         bundle="$(which bundle)"
         echo "$bundle"
-        "$bundle" version | grep 2.0.2 || fatal 'install bundler'
+        "$bundle" version | grep 2.2.19 || fatal 'install bundler'
     else
         # When our "bundle install"s need to install new gems to
         # satisfy dependencies, we want them to go where "gem install
@@ -550,7 +550,7 @@ setup_ruby_environment() {
         (
             export HOME=$GEMHOME
             bundlers="$(gem list --details bundler)"
-            versions=(1.16.6 1.17.3 2.0.2)
+            versions=(2.2.19)
             for v in ${versions[@]}; do
                 if ! echo "$bundlers" | fgrep -q "($v)"; then
                     gem install --user $(for v in ${versions[@]}; do echo bundler:${v}; done)
index 1a43d7cd3a72bb31a1e1ef2152e152c75fe2c6aa..420cbb035a7e7177f84ef7a9ca07117d70e37e5f 100644 (file)
@@ -8,9 +8,6 @@ Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/config.yml
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index d3f476b7df725c2597f3c972e476d4a51ff86165..8d57e8a1612ca49ecc9d98b44a716eb1485e2640 100644 (file)
@@ -8,9 +8,6 @@ Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/config.yml
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
diff --git a/cmd/arvados-server/arvados-dispatch-lsf.service b/cmd/arvados-server/arvados-dispatch-lsf.service
new file mode 100644 (file)
index 0000000..65d8786
--- /dev/null
@@ -0,0 +1,27 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+[Unit]
+Description=arvados-dispatch-lsf
+Documentation=https://doc.arvados.org/
+After=network.target
+AssertPathExists=/etc/arvados/config.yml
+
+# systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
+StartLimitIntervalSec=0
+
+[Service]
+Type=notify
+EnvironmentFile=-/etc/arvados/environment
+ExecStart=/usr/bin/arvados-dispatch-lsf
+# Set a reasonable default for the open file limit
+LimitNOFILE=65536
+Restart=always
+RestartSec=1
+
+# systemd<=219 (centos:7, debian:8, ubuntu:trusty) obeys StartLimitInterval in the [Service] section
+StartLimitInterval=0
+
+[Install]
+WantedBy=multi-user.target
index aebc56a79f333b19f061f5f0aadce793e799529c..f73db5d08032369c619e42429ddf7a68550b8551 100644 (file)
@@ -8,9 +8,6 @@ Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/config.yml
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index d0aa9da94df537bf80a3ed232c2a0ae2c3a0e1d6..4b94a7813869915c38c14ec7927a8a2662e30475 100644 (file)
@@ -15,6 +15,7 @@ import (
        "git.arvados.org/arvados.git/lib/crunchrun"
        "git.arvados.org/arvados.git/lib/dispatchcloud"
        "git.arvados.org/arvados.git/lib/install"
+       "git.arvados.org/arvados.git/lib/lsf"
        "git.arvados.org/arvados.git/lib/recovercollection"
        "git.arvados.org/arvados.git/services/ws"
 )
@@ -33,6 +34,7 @@ var (
                "controller":         controller.Command,
                "crunch-run":         crunchrun.Command,
                "dispatch-cloud":     dispatchcloud.Command,
+               "dispatch-lsf":       lsf.DispatchCommand,
                "install":            install.Command,
                "init":               install.InitCommand,
                "recover-collection": recovercollection.Command,
index 39fe22fde3e1379205a26e0e5f28ef398c3d3980..ff924e2f378381f4c8f4e586f3c2c3570d8202e2 100644 (file)
@@ -254,7 +254,10 @@ navbar:
       - install/crunch2-slurm/configure-slurm.html.textile.liquid
       - install/crunch2-slurm/install-compute-node.html.textile.liquid
       - install/crunch2-slurm/install-test.html.textile.liquid
+    - Containers API (lsf):
+      - install/crunch2-lsf/install-dispatch.html.textile.liquid
     - Additional configuration:
+      - install/singularity.html.textile.liquid
       - install/container-shell-access.html.textile.liquid
     - External dependencies:
       - install/install-postgresql.html.textile.liquid
index 07d8a4ae40ac7e0caabed2713bda5fb7a139379f..ed085ea105b5fa48009200b59f1bdfd553cf63b4 100644 (file)
@@ -8,7 +8,7 @@ On the <strong>API server</strong>, use the following commands:
 
 <notextile>
 <pre><code>~$ <span class="userinput">cd /var/www/arvados-api/current</span>
-$ <span class="userinput">sudo -u <b>webserver-user</b> RAILS_ENV=production bundle exec script/create_superuser_token.rb</span>
+$ <span class="userinput">sudo -u <b>webserver-user</b> RAILS_ENV=production bin/bundle exec script/create_superuser_token.rb</span>
 zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
 </code></pre>
 </notextile>
index 10c17a0c2d4bdbe703b3cf91d7ea434ca5e1e4e5..b648e92161e63ea485f0585242caf7947da3bb78 100644 (file)
@@ -8,7 +8,7 @@ SPDX-License-Identifier: CC-BY-SA-3.0
 This template recognizes four variables:
 * railshost: The hostname included in the prompt, to let the user know where to run the command.  If this is the empty string, no hostname will be displayed.  Default "apiserver".
 * railsdir: The directory included in the prompt, to let the user know where to run the command.  Default "/var/www/arvados-api/current".
-* railscmd: The full command to run.  Default "bundle exec rails console".
+* railscmd: The full command to run.  Default "bin/rails console".
 * railsout: The expected output of the command, if any.
 {% endcomment %} Change *@webserver-user@* to the user that runs your web server process.  If you install Phusion Passenger as we recommend, this is *@www-data@* on Debian-based systems, and *@nginx@* on Red Hat-based systems.
 
@@ -25,7 +25,7 @@ This template recognizes four variables:
 {% endunless %}
 
 {% unless railscmd %}
-  {% assign railscmd = "bundle exec rails console" %}
+  {% assign railscmd = "bin/rails console" %}
 {% endunless %}
 
 <notextile>
index 387f023f24ddb9f6feb6ba9af0f012a355029c39..fe7714c62feae18bd12fa6c85890ec7ba14182a3 100644 (file)
@@ -22,8 +22,6 @@ The Ruby version shipped with Centos 7 is too old.  Use "RVM":#rvm to install Ru
 
 h3. Debian and Ubuntu
 
-Ubuntu 16.04 (xenial) ships with Ruby 2.3, which is not supported by Arvados.  Use "RVM":#rvm to install Ruby 2.5 or later.
-
 Debian 10 (buster) and Ubuntu 18.04 (bionic) and later ship with Ruby 2.5, which is supported by Arvados.
 
 <notextile>
index a1f7872df4bff34e520d26c2f33680fdaec031a6..e36655a0f9eaecb4949f47334fa597225c5e57d5 100644 (file)
@@ -23,7 +23,7 @@ The legacy API server configuration is stored in @config/application.yml@ and @c
 Change to the API server directory and use the following commands:
 
 <pre>
-$ RAILS_ENV=production bundle exec rake config:migrate > config.yml
+$ RAILS_ENV=production bin/rake config:migrate > config.yml
 $ cp config.yml /etc/arvados/config.yml
 </pre>
 
@@ -32,7 +32,7 @@ This will print the contents of @config.yml@ after merging the legacy @applicati
 If you wish to update @config.yml@ configuration by hand, or check that everything has been migrated, use @config:diff@ to print configuration items that differ between @application.yml@ and the system @config.yml@.
 
 <pre>
-$ RAILS_ENV=production bundle exec rake config:diff
+$ RAILS_ENV=production bin/rake config:diff
 </pre>
 
 This command will also report if no migrations are required.
@@ -44,7 +44,7 @@ The legacy workbench configuration is stored in @config/application.yml@.  After
 Change to the workbench server directory and use the following commands:
 
 <pre>
-$ RAILS_ENV=production bundle exec rake config:migrate > config.yml
+$ RAILS_ENV=production bin/rake config:migrate > config.yml
 $ cp config.yml /etc/arvados/config.yml
 </pre>
 
@@ -53,7 +53,7 @@ This will print the contents of @config.yml@ after merging the legacy @applicati
 If you wish to update @config.yml@ configuration by hand, or check that everything has been migrated, use @config:diff@ to print configuration items that differ between @application.yml@ and the system @config.yml@.
 
 <pre>
-$ RAILS_ENV=production bundle exec rake config:diff
+$ RAILS_ENV=production bin/rake config:diff
 </pre>
 
 This command will also report if no migrations are required.
index 3e17831b58d6d32f21b0c278253aea4394334fac..ade9633879cc3e667ab1fd9fd41d53a668d21cec 100644 (file)
@@ -12,21 +12,44 @@ SPDX-License-Identifier: CC-BY-SA-3.0
 
 Storage classes (alternately known as "storage tiers") allow you to control which volumes should be used to store particular collection data blocks.  This can be used to implement data storage policies such as moving data to archival storage.
 
-The storage classes for each volume are set in the per-volume "keepstore configuration":{{site.baseurl}}/install/install-keepstore.html
+In the default Arvados configuration, with no storage classes specified in the configuration file, all volumes belong to a single implicit storage class called "default". Apart from that, names of storage classes are internal to the cluster and decided by the administrator.  Other than the implicit "default" class, Arvados currently does not define any standard storage class names.
+
+To use multiple storage classes, update the @StorageClasses@ and @Volumes@ sections of your configuration file.
+* Every storage class you use (including "default") must be defined in the @StorageClasses@ section.
+* At least one storage class in the @StorageClasses@ section must be marked @Default: true@. When a client does not specify storage classes when creating a new collection, the default storage classes are used implicitly.
+* If some storage classes are faster or cheaper to access than others, assign a higher @Priority@ to the faster ones. When reading data, volumes with high priority storage classes are searched first.
+
+Example:
 
 <pre>
+    StorageClasses:
+
+      default:
+        # When reading a block that is stored on multiple volumes,
+        # prefer a volume with this class.
+        Priority: 20
+
+        # When a client does not specify a storage class when saving a
+        # new collection, use this one.
+        Default: true
+
+      archival:
+        Priority: 10
+
     Volumes:
+
       ClusterID-nyw5e-000000000000000:
         # This volume is in the "default" storage class.
         StorageClasses:
           default: true
+
       ClusterID-nyw5e-000000000000001:
-        # Specify this volume is in the "archival" storage class.
+        # This volume is in the "archival" storage class.
         StorageClasses:
           archival: true
 </pre>
 
-Names of storage classes are internal to the cluster and decided by the administrator.  Aside from "default", Arvados currently does not define any standard storage class names.
+Refer to the "configuration reference":{{site.baseurl}}/admin/config.html for more details.
 
 h3. Using storage classes
 
index c71d86c47f234b6cd3db8b2580faabcc46eb6a0c..5efbccbc19a0c4289643ba98632c24fda8b6f985 100644 (file)
@@ -113,7 +113,7 @@ If you have an existing Arvados installation and want to set a token lifetime po
 The @db:check_long_lived_tokens@ task will list which users have tokens with no expiration date.
 
 <notextile>
-<pre><code># <span class="userinput">bundle exec rake db:check_long_lived_tokens</span>
+<pre><code># <span class="userinput">bin/rake db:check_long_lived_tokens</span>
 Found 6 long-lived tokens from users:
 user2,user2@example.com,zzzzz-tpzed-5vzt5wc62k46p6r
 admin,admin@example.com,zzzzz-tpzed-6drplgwq9nm5cox
@@ -124,7 +124,7 @@ user1,user1@example.com,zzzzz-tpzed-ftz2tfurbpf7xox
 To apply the new policy to existing tokens, use the @db:fix_long_lived_tokens@ task.
 
 <notextile>
-<pre><code># <span class="userinput">bundle exec rake db:fix_long_lived_tokens</span>
+<pre><code># <span class="userinput">bin/rake db:fix_long_lived_tokens</span>
 Setting token expiration to: 2020-08-25 03:30:50 +0000
 6 tokens updated.
 </code></pre>
index 3c283c354cf2f6e48dd4f3f2dfb2e732276f169e..8435e2871f882fd6588b709711182443dcc8f2f3 100644 (file)
@@ -39,18 +39,26 @@ h2(#main). development main (as of 2021-07-15)
 
 "Upgrading from 2.2.0":#v2_2_0
 
-h3. crunch-dispatch-local now requires config.yml
+h3. Storage classes must be defined explicitly
 
-The @crunch-dispatch-local@ dispatcher now reads the API host and token from the system wide @/etc/arvados/config.yml@ .  It will fail to start that file is not found or not readable.
+If your configuration uses the StorageClasses attribute on any Keep volumes, you must add a new @StorageClasses@ section that lists all of your storage classes. Refer to the updated documentation about "configuring storage classes":{{site.baseurl}}/admin/storage-classes.html for details.
 
-h2(#v2_2_0). v2.2.0 (2021-06-03)
+h3. keep-balance requires access to PostgreSQL
 
-"Upgrading from 2.1.0":#v2_1_0
+Make sure the keep-balance process can connect to your PostgreSQL server using the settings in your config file. (In previous versions, keep-balance accessed the database through controller instead of connecting to the database server directly.)
+
+h3. crunch-dispatch-local now requires config.yml
+
+The @crunch-dispatch-local@ dispatcher now reads the API host and token from the system-wide @/etc/arvados/config.yml@. It will fail to start if that file is not found or not readable.
 
 h3. Multi-file docker image collections
 
 Typically a docker image collection contains a single @.tar@ file at the top level. Handling of atypical cases has changed. If a docker image collection contains files with extensions other than @.tar@, they will be ignored (previously they could cause errors). If a docker image collection contains multiple @.tar@ files, it will cause an error at runtime, "cannot choose from multiple tar files in image collection" (previously one of the @.tar@ files was selected). Subdirectories are ignored. The @arv keep docker@ command always creates a collection with a single @.tar@ file, and never uses subdirectories, so this change will not affect most users.
 
+h2(#v2_2_0). v2.2.0 (2021-06-03)
+
+"Upgrading from 2.1.0":#v2_1_0
+
 h3. New spelling of S3 credential configs
 
 If you use the S3 driver for Keep volumes and specify credentials in your configuration file (as opposed to using an IAM role), you should change the spelling of the @AccessKey@ and @SecretKey@ config keys to @AccessKeyID@ and @SecretAccessKey@. If you don't update them, the previous spellings will still be accepted, but warnings will be logged at server startup.
index 1770a259b7b0cc3c213bf6ca52f89d3d54413093..ed0bde8137334f0850b980e71742399b93943000 100644 (file)
@@ -14,7 +14,7 @@ Files served by @keep-web@ can be rendered directly in the browser, or @keep-web
 
 When serving files that will render directly in the browser, it is important to properly configure the keep-web service to mitigate cross-site-scripting (XSS) attacks.  An HTML page can be stored in a collection.  If an attacker causes a victim to visit that page through Workbench, the HTML will be rendered by the browser.  If all collections are served at the same domain, the browser will consider collections as coming from the same origin, which will grant access to the same browsing data (cookies and local storage).  This would enable malicious Javascript on that page to access Arvados on behalf of the victim.
 
-This can be mitigated by having separate domains for each collection, or limiting preview to circumstances where the collection is not accessed with the user's regular full-access token.  For cluster administrators that understand the risks, this protection can also be turned off.
+This can be mitigated by having separate domains for each collection, or limiting preview to circumstances where the collection is not accessed with the user's regular full-access token.  For clusters where this risk is acceptable, this protection can also be turned off by setting the @Collections/TrustAllContent@ configuration flag to true; see the "configuration reference":../admin/config.html for more detail.
 
 The following "same origin" URL patterns are supported for public collections and collections shared anonymously via secret links (i.e., collections which can be served by keep-web without making use of any implicit credentials like cookies). See "Same-origin URLs" below.
 
@@ -82,4 +82,4 @@ When a client passes a token in the URL, keep-web sends a redirect response plac
 
 This mainly affects Workbench's ability to show inline content, so it should be taken into account when configuring both services' URL schemes.
 
-You can read more about the definition of a _same-site_ request at the "RFC 6265bis-03 page":https://tools.ietf.org/html/draft-ietf-httpbis-rfc6265bis-03#section-5.2
\ No newline at end of file
+You can read more about the definition of a _same-site_ request at the "RFC 6265bis-03 page":https://tools.ietf.org/html/draft-ietf-httpbis-rfc6265bis-03#section-5.2
diff --git a/doc/install/crunch2-lsf/install-dispatch.html.textile.liquid b/doc/install/crunch2-lsf/install-dispatch.html.textile.liquid
new file mode 100644 (file)
index 0000000..66b562d
--- /dev/null
@@ -0,0 +1,112 @@
+---
+layout: default
+navsection: installguide
+title: Install the LSF dispatcher
+...
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
+
+{% include 'notebox_begin_warning' %}
+arvados-dispatch-lsf is only relevant for on-premises clusters that will spool jobs to LSF. Skip this section if you are installing a cloud cluster.
+{% include 'notebox_end' %}
+
+Containers can be dispatched to an LSF cluster.  The dispatcher sends work to the cluster using LSF's @bsub@ command, so it works in a variety of LSF configurations.
+
+*LSF support is currently considered experimental.*
+
+Limitations include:
+* Arvados container priority is not propagated to LSF job priority. This can cause inefficient use of compute resources, and even deadlock if there are fewer compute nodes than concurrent Arvados workflows.
+* Combining LSF with docker may not work, depending on LSF configuration and user/group IDs (if LSF only sets up the configured user's primary group ID when executing the crunch-run process on a compute node, it may not have permission to connect to the docker daemon).
+
+In order to run containers, you must choose a user that has permission to set up FUSE mounts and run Singularity/Docker containers on each compute node.  This install guide refers to this user as the @crunch@ user.  We recommend you create this user on each compute node with the same UID and GID, and add it to the @fuse@ and @docker@ system groups to grant it the necessary permissions.  However, you can run the dispatcher under any account with sufficient permissions across the cluster.
+
+Set up all of your compute nodes "as you would for a SLURM cluster":../crunch2-slurm/install-compute-node.html.
+
+
+h2(#update-config). Update config.yml
+
+Arvados-dispatch-lsf reads the common configuration file at @/etc/arvados/config.yml@.
+
+Review the following configuration parameters and adjust as needed.
+
+
+h3(#BsubSudoUser). Containers.LSF.BsubSudoUser
+
+arvados-dispatch-lsf uses @sudo@ to execute @bsub@, for example @sudo -E -u crunch bsub [...]@. This means the @crunch@ account must exist on the hosts where LSF jobs run ("execution hosts"), as well as on the host where you are installing the Arvados LSF dispatcher (the "submission host"). To use a user account other than @crunch@, configure @BsubSudoUser@:
+
+<notextile>
+<pre>    Containers:
+      LSF:
+        <code class="userinput">BsubSudoUser: <b>lsfuser</b>
+</code></pre>
+</notextile>
+
+Alternatively, you can arrange for the arvados-dispatch-lsf process to run as an unprivileged user that has a corresponding account on all compute nodes, and disable the use of @sudo@ by specifying an empty string:
+
+<notextile>
+<pre>    Containers:
+      LSF:
+        # Don't use sudo
+        <code class="userinput">BsubSudoUser: <b>""</b>
+</code></pre>
+</notextile>
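As a rough sketch of the @sudo -E -u crunch bsub [...]@ invocation described above (the function name is hypothetical; the real logic lives in @lib/lsf@):

    // Hedged sketch: build either "sudo -E -u <user> bsub <args...>" or a
    // plain "bsub <args...>" when BsubSudoUser is configured empty.
    package main

    import (
        "fmt"
        "os/exec"
    )

    func bsubCommand(sudoUser string, bsubArgs []string) *exec.Cmd {
        if sudoUser == "" {
            return exec.Command("bsub", bsubArgs...)
        }
        args := append([]string{"-E", "-u", sudoUser, "bsub"}, bsubArgs...)
        return exec.Command("sudo", args...)
    }

    func main() {
        fmt.Println(bsubCommand("crunch", []string{"-C", "0"}).Args)
        // Output: [sudo -E -u crunch bsub -C 0]
    }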
+
+
+h3(#SbatchArguments). Containers.LSF.BsubArgumentsList
+
+When arvados-dispatch-lsf invokes @bsub@, you can add arguments to the command by specifying @BsubArgumentsList@.  You can use this to send the jobs to specific cluster partitions or add resource requests.  Set @BsubArgumentsList@ to an array of strings.  For example:
+
+<notextile>
+<pre>    Containers:
+      LSF:
+        <code class="userinput">BsubArgumentsList: <b>["-C", "0"]</b></code>
+</pre>
+</notextile>
+
+
+h3(#PollPeriod). Containers.PollInterval
+
+arvados-dispatch-lsf polls the API server periodically for new containers to run.  The @PollInterval@ option controls how often this poll happens.  Set this to a number suffixed with one of the time units @s@, @m@, or @h@.  For example:
+
+<notextile>
+<pre>    Containers:
+      <code class="userinput">PollInterval: <b>10s</b>
+</code></pre>
+</notextile>
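Values like @10s@ follow Go-style duration syntax, so — assuming the config loader parses them with the standard library, which is an assumption here — they can be sanity-checked like this:

    // Rough illustration: parse a PollInterval-style value.
    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        d, err := time.ParseDuration("10s")
        if err != nil {
            panic(err)
        }
        fmt.Println("polling every", d) // polling every 10s
    }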
+
+
+h3(#ReserveExtraRAM). Containers.ReserveExtraRAM: Extra RAM for jobs
+
+Extra RAM to reserve (in bytes) on each LSF job submitted by Arvados, which is added to the amount specified in the container's @runtime_constraints@.  If not provided, the default value is zero.
+
+Supports suffixes @KB@, @KiB@, @MB@, @MiB@, @GB@, @GiB@, @TB@, @TiB@, @PB@, @PiB@, @EB@, @EiB@ (where @KB@ is 10[^3^], @KiB@ is 2[^10^], @MB@ is 10[^6^], @MiB@ is 2[^20^] and so forth).
+
+<notextile>
+<pre>    Containers:
+      <code class="userinput">ReserveExtraRAM: <b>256MiB</b></code>
+</pre>
+</notextile>
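The suffix arithmetic above can be sketched in a few lines of Go (illustrative only — it covers just a few suffixes and is not the Arvados config parser):

    // Hedged sketch: map size suffixes to multipliers, per the rules above
    // (KB = 10^3, KiB = 2^10, MB = 10^6, MiB = 2^20, ...).
    package main

    import (
        "fmt"
        "strconv"
        "strings"
    )

    var multipliers = map[string]int64{
        "KB": 1e3, "KiB": 1 << 10,
        "MB": 1e6, "MiB": 1 << 20,
        "GB": 1e9, "GiB": 1 << 30,
    }

    func parseByteSize(s string) (int64, error) {
        for suffix, m := range multipliers {
            if strings.HasSuffix(s, suffix) {
                n, err := strconv.ParseInt(strings.TrimSuffix(s, suffix), 10, 64)
                return n * m, err
            }
        }
        return strconv.ParseInt(s, 10, 64) // plain byte count
    }

    func main() {
        n, _ := parseByteSize("256MiB")
        fmt.Println(n) // 268435456
    }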
+
+
+h3(#CrunchRunCommand-network). Containers.CrunchRunArgumentList: Using host networking for containers
+
+Older Linux kernels (prior to 3.18) have bugs in network namespace handling which can lead to compute node lockups.  This is indicated by blocked kernel tasks in "Workqueue: netns cleanup_net".  If you are experiencing this problem, as a workaround you can disable use of network namespaces by Docker across the cluster.  Be aware this reduces container isolation, which may be a security risk.
+
+<notextile>
+<pre>    Containers:
+      <code class="userinput">CrunchRunArgumentsList:
+        - <b>"-container-enable-networking=always"</b>
+        - <b>"-container-network-mode=host"</b></code>
+</pre>
+</notextile>
+
+{% assign arvados_component = 'arvados-dispatch-lsf' %}
+
+{% include 'install_packages' %}
+
+{% include 'start_service' %}
+
+{% include 'restart_api' %}
index 3996cc7930a70a44ace17a8bd55cade99876bd7c..5b5b868e57611fe0262b0e16e708289f1a001f95 100644 (file)
@@ -44,7 +44,7 @@ crunch-dispatch-slurm polls the API server periodically for new containers to ru
 
 h3(#ReserveExtraRAM). Containers.ReserveExtraRAM: Extra RAM for jobs
 
-Extra RAM to reserve (in bytes) on each Slurm job submitted by Arvados, which is added to the amount specified in the container's @runtime_constraints@.  If not provided, the default value is zero.  Helpful when using @-cgroup-parent-subsystem@, where @crunch-run@ and @arv-mount@ share the control group memory limit with the user process.  In this situation, at least 256MiB is recommended to accomodate each container's @crunch-run@ and @arv-mount@ processes.
+Extra RAM to reserve (in bytes) on each Slurm job submitted by Arvados, which is added to the amount specified in the container's @runtime_constraints@.  If not provided, the default value is zero.  Helpful when using @-cgroup-parent-subsystem@, where @crunch-run@ and @arv-mount@ share the control group memory limit with the user process.  In this situation, at least 256MiB is recommended to accommodate each container's @crunch-run@ and @arv-mount@ processes.
 
 Supports suffixes @KB@, @KiB@, @MB@, @MiB@, @GB@, @GiB@, @TB@, @TiB@, @PB@, @PiB@, @EB@, @EiB@ (where @KB@ is 10[^3^], @KiB@ is 2[^10^], @MB@ is 10[^6^], @MiB@ is 2[^20^] and so forth).
 
index 3d70fc4de9497e8bb20b48cec6ecdfa8f62ef2ca..b7589032561cb03046bd597bd973506a665278d9 100644 (file)
@@ -183,7 +183,7 @@ $ sudo chmod og-rwx /var/www/arvados-api/current/config/arvados-clients.yml
 
 h3. Test configuration
 
-notextile. <pre><code>$ <span class="userinput">sudo -u git -i bash -c 'cd /var/www/arvados-api/current && bundle exec script/arvados-git-sync.rb production'</span></code></pre>
+notextile. <pre><code>$ <span class="userinput">sudo -u git -i bash -c 'cd /var/www/arvados-api/current && bin/bundle exec script/arvados-git-sync.rb production'</span></code></pre>
 
 h3. Enable the synchronization script
 
@@ -192,7 +192,7 @@ The API server package includes a script that retrieves the current set of repos
 Create @/etc/cron.d/arvados-git-sync@ with the following content:
 
 <notextile>
-<pre><code><span class="userinput">*/5 * * * * git cd /var/www/arvados-api/current && bundle exec script/arvados-git-sync.rb production</span>
+<pre><code><span class="userinput">*/5 * * * * git cd /var/www/arvados-api/current && bin/bundle exec script/arvados-git-sync.rb production</span>
 </code></pre>
 </notextile>
 
index 1d9b654b25c285d13ba51b843fb0c53b4ad26b4a..bb4ae7b3d8ef1b9b5f810d2cfecfe8901f7a2be0 100644 (file)
@@ -18,13 +18,13 @@ h2(#introduction). Introduction
 
 Keep-balance deletes unreferenced and overreplicated blocks from Keep servers, makes additional copies of underreplicated blocks, and moves blocks into optimal locations as needed (e.g., after adding new servers). See "Balancing Keep servers":{{site.baseurl}}/admin/keep-balance.html for usage details.
 
-Keep-balance can be installed anywhere with network access to Keep services. Typically it runs on the same host as keepproxy.
+Keep-balance can be installed anywhere with network access to Keep services, arvados-controller, and PostgreSQL. Typically it runs on the same host as keepproxy.
 
 *A cluster should have only one instance of keep-balance running at a time.*
 
 {% include 'notebox_begin' %}
 
-If you are installing keep-balance on an existing system with valuable data, you can run keep-balance in "dry run" mode first and review its logs as a precaution. To do this, edit your keep-balance startup script to use the flags @-commit-pulls=false -commit-trash=false@.
+If you are installing keep-balance on an existing system with valuable data, you can run keep-balance in "dry run" mode first and review its logs as a precaution. To do this, edit your keep-balance startup script to use the flags @-commit-pulls=false -commit-trash=false -commit-confirmed-fields=false@.
 
 {% include 'notebox_end' %}
 
index 033efe63f1b76786e058ab9adf385f406adbd31b..9f63d1bcfcf8ad3def1968700c58930c4e7a3ccb 100644 (file)
@@ -90,7 +90,7 @@ Note the trailing slash.
 {% include 'notebox_begin' %}
 Whether you choose to serve collections from their own subdomain or from a single domain, it's important to keep in mind that they should be served from the same _site_ as Workbench for the inline previews to work.
 
-Please check "keep-web's URL pattern guide":/api/keep-web-urls.html#same-site to learn more.
+Please check "keep-web's URL pattern guide":../api/keep-web-urls.html#same-site to learn more.
 {% include 'notebox_end' %}
 
 h2. Set InternalURLs
@@ -105,7 +105,7 @@ h2. Set InternalURLs
 
 h2(#update-config). Configure anonymous user token
 
-{% assign railscmd = "bundle exec ./script/get_anonymous_user_token.rb --get" %}
+{% assign railscmd = "bin/bundle exec ./script/get_anonymous_user_token.rb --get" %}
 {% assign railsout = "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz" %}
 If you intend to use Keep-web to serve public data to anonymous clients, configure it with an anonymous token.
 
index 1f0186e33aba5a257da8ce5afb9886dd5f9e9ce3..084f32e029c4c3a99e4db207b5b07a4f51374e36 100644 (file)
@@ -29,6 +29,7 @@ h2(#supportedlinux). Supported GNU/Linux distributions
 table(table table-bordered table-condensed).
 |_. Distribution|_. State|_. Last supported version|
 |CentOS 7|Supported|Latest|
+|Debian 11 ("bullseye")|Supported|Latest|
 |Debian 10 ("buster")|Supported|Latest|
 |Ubuntu 20.04 ("focal")|Supported|Latest|
 |Ubuntu 18.04 ("bionic")|Supported|Latest|
@@ -141,26 +142,73 @@ You may also use a different method to pick the cluster identifier. The cluster
 
 h2(#dnstls). DNS entries and TLS certificates
 
-The following services are normally public-facing and require DNS entries and corresponding TLS certificates.  Get certificates from your preferred TLS certificate provider.  We recommend using "Let's Encrypt":https://letsencrypt.org/.  You can run several services on same node, but each distinct hostname requires its own TLS certificate.
+The following services are normally public-facing and require DNS entries and corresponding TLS certificates.  Get certificates from your preferred TLS certificate provider.  We recommend using "Let's Encrypt":https://letsencrypt.org/.  You can run several services on the same node, but each distinct DNS name requires a valid, matching TLS certificate.
 
-This guide uses the following hostname conventions.  A later part of this guide will describe how to set up Nginx virtual hosts.
+This guide uses the following DNS name conventions.  A later part of this guide will describe how to set up Nginx virtual hosts.
 
 <div class="offset1">
 table(table table-bordered table-condensed).
-|_. Function|_. Hostname|
+|_. Function|_. DNS name|
 |Arvados API|@ClusterID.example.com@|
 |Arvados Git server|git.@ClusterID.example.com@|
+|Arvados Webshell|webshell.@ClusterID.example.com@|
 |Arvados Websockets endpoint|ws.@ClusterID.example.com@|
 |Arvados Workbench|workbench.@ClusterID.example.com@|
 |Arvados Workbench 2|workbench2.@ClusterID.example.com@|
 |Arvados Keepproxy server|keep.@ClusterID.example.com@|
 |Arvados Keep-web server|download.@ClusterID.example.com@
 _and_
-*.collections.@ClusterID.example.com@ or
-*<notextile>--</notextile>collections.@ClusterID.example.com@ or
+*.collections.@ClusterID.example.com@ _or_
+*<notextile>--</notextile>collections.@ClusterID.example.com@ _or_
 collections.@ClusterID.example.com@ (see the "keep-web install docs":install-keep-web.html)|
 </div>
 
+Setting up Arvados is easiest when wildcard TLS certificates and wildcard DNS are available. It is also possible to set up Arvados without them, but not having a wildcard for @keep-web@ (i.e. not having *.collections.@ClusterID.example.com@) comes with a tradeoff: it disables some features that allow users to view Arvados-hosted data in their browsers. More information on this tradeoff, which stems from the CORS rules applied by modern browsers, is available in the "keep-web URL pattern guide":../api/keep-web-urls.html.
+
+The table below lists the required TLS certificates and DNS names in each scenario.
+
+<div class="offset1">
+table(table table-bordered table-condensed).
+||_. Wildcard TLS and DNS available|_. Wildcard TLS available|_. Other|
+|TLS|*.@ClusterID.example.com@
+@ClusterID.example.com@
+*.collections.@ClusterID.example.com@|*.@ClusterID.example.com@
+@ClusterID.example.com@|@ClusterID.example.com@
+git.@ClusterID.example.com@
+webshell.@ClusterID.example.com@
+ws.@ClusterID.example.com@
+workbench.@ClusterID.example.com@
+workbench2.@ClusterID.example.com@
+keep.@ClusterID.example.com@
+download.@ClusterID.example.com@
+collections.@ClusterID.example.com@|
+|DNS|@ClusterID.example.com@
+git.@ClusterID.example.com@
+webshell.@ClusterID.example.com@
+ws.@ClusterID.example.com@
+workbench.@ClusterID.example.com@
+workbench2.@ClusterID.example.com@
+keep.@ClusterID.example.com@
+download.@ClusterID.example.com@
+*.collections.@ClusterID.example.com@|@ClusterID.example.com@
+git.@ClusterID.example.com@
+webshell.@ClusterID.example.com@
+ws.@ClusterID.example.com@
+workbench.@ClusterID.example.com@
+workbench2.@ClusterID.example.com@
+keep.@ClusterID.example.com@
+download.@ClusterID.example.com@
+collections.@ClusterID.example.com@|@ClusterID.example.com@
+git.@ClusterID.example.com@
+webshell.@ClusterID.example.com@
+ws.@ClusterID.example.com@
+workbench.@ClusterID.example.com@
+workbench2.@ClusterID.example.com@
+keep.@ClusterID.example.com@
+download.@ClusterID.example.com@
+collections.@ClusterID.example.com@|
+</div>
+
 {% include 'notebox_begin' %}
 It is also possible to create your own certificate authority, issue server certificates, and install a custom root certificate in the browser.  This is out of scope for this guide.
 {% include 'notebox_end' %}
index 5d74e4a7e30271923bf3992d100119cea824e724..fb296ad5ad47019024d49fe6e0bf61f30a48b0d5 100644 (file)
@@ -41,6 +41,7 @@ As root, add the Arvados package repository to your sources.  This command depen
 
 table(table table-bordered table-condensed).
 |_. OS version|_. Command|
+|Debian 11 ("bullseye")|<notextile><code><span class="userinput">echo "deb http://apt.arvados.org/bullseye bullseye main" &#x7c; tee /etc/apt/sources.list.d/arvados.list</span></code></notextile>|
 |Debian 10 ("buster")|<notextile><code><span class="userinput">echo "deb http://apt.arvados.org/buster buster main" &#x7c; tee /etc/apt/sources.list.d/arvados.list</span></code></notextile>|
 |Ubuntu 20.04 ("focal")[1]|<notextile><code><span class="userinput">echo "deb http://apt.arvados.org/focal focal main" &#x7c; tee /etc/apt/sources.list.d/arvados.list</span></code></notextile>|
 |Ubuntu 18.04 ("bionic")[1]|<notextile><code><span class="userinput">echo "deb http://apt.arvados.org/bionic bionic main" &#x7c; tee /etc/apt/sources.list.d/arvados.list</span></code></notextile>|
diff --git a/doc/install/singularity.html.textile.liquid b/doc/install/singularity.html.textile.liquid
new file mode 100644 (file)
index 0000000..1f38253
--- /dev/null
@@ -0,0 +1,42 @@
+---
+layout: default
+navsection: installguide
+title: Singularity container runtime
+...
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
+
+Arvados can be configured to use "Singularity":https://sylabs.io/singularity/ instead of Docker to execute containers on cloud nodes or a SLURM/LSF cluster. Singularity may be preferable due to its simpler installation and because it requires no long-running daemon process or special system users/groups.
+
+Please note:
+* *Singularity support is currently considered experimental.*
+* Even when using the Singularity runtime, users' container images are expected to be saved in Docker format using @arv keep docker@. Arvados converts the Docker image to Singularity format (@.sif@) at runtime as needed. Specifying a @.sif@ file as an image when submitting a container request is not yet supported.
+* Singularity does not limit the amount of memory available in a container. Each container will have access to all memory on the host where it runs, unless memory use is restricted by SLURM/LSF.
+* Programs running in containers may behave differently due to differences between Singularity and Docker.
+** The root (image) filesystem is read-only in a Singularity container. Programs that attempt to write outside a designated output or temporary directory are likely to fail.
+** The Docker ENTRYPOINT instruction is ignored.
+* Arvados is currently tested with Singularity version 3.5.2.
+
+To use Singularity, first make sure "Singularity is installed":https://sylabs.io/guides/3.5/user-guide/quick_start.html on your cloud worker image or SLURM/LSF compute nodes as applicable. Note that @squashfs-tools@ is required.
+
+<notextile>
+<pre><code>$ <span class="userinput">singularity version</span>
+3.5.2
+$ <span class="userinput">mksquashfs -version</span>
+mksquashfs version 4.3-git (2014/06/09)
+[...]
+</code></pre>
+</notextile>
+
+Then update @Containers.RuntimeEngine@ in your cluster configuration:
+
+<notextile>
+<pre><code>      # Container runtime: "docker" (default) or "singularity" (experimental)
+      RuntimeEngine: singularity
+</code></pre>
+</notextile>
+
+Restart your dispatcher (@crunch-dispatch-slurm@, @arvados-dispatch-cloud@, or @arvados-dispatch-lsf@) after updating your configuration file.
index 09a553becfbff4d65ecbc9c82467b2cf557c8640..206b75d58f20695c34f322236ee249ee5f3fefab 100644 (file)
@@ -24,29 +24,34 @@ For portability, most Arvados extensions should go into the @hints@ section of y
 {% codeblock as yaml %}
 hints:
   arv:RunInSingleContainer: {}
+
   arv:RuntimeConstraints:
     keep_cache: 123456
     outputDirType: keep_output_dir
+
   arv:PartitionRequirement:
     partition: dev_partition
+
   arv:APIRequirement: {}
-  cwltool:LoadListingRequirement:
-    loadListing: shallow_listing
+
   arv:IntermediateOutput:
     outputTTL: 3600
-  arv:ReuseRequirement:
-    enableReuse: false
+
   cwltool:Secrets:
     secrets: [input1, input2]
-  cwltool:TimeLimit:
-    timelimit: 14400
+
   arv:WorkflowRunnerResources:
     ramMin: 2048
     coresMin: 2
     keep_cache: 512
+
   arv:ClusterTarget:
     cluster_id: clsr1
     project_uuid: clsr1-j7d0g-qxc4jcji7n4lafx
+
+  arv:OutputStorageClass:
+    intermediateStorageClass: fast_storage
+    finalStorageClass: robust_storage
 {% endcodeblock %}
 
 h2(#RunInSingleContainer). arv:RunInSingleContainer
@@ -89,7 +94,7 @@ Specify desired handling of intermediate output collections.
 table(table table-bordered table-condensed).
 |_. Field |_. Type |_. Description |
 |outputTTL|int|If the value is greater than zero, intermediate output collections are considered temporary and will be automatically trashed @outputTTL@ seconds after creation.  A value of zero means intermediate output should be retained indefinitely (this is the default behavior).
-Note: arvados-cwl-runner currently does not take workflow dependencies into account when setting the TTL on an intermediate output collection. If the TTL is too short, it is possible for a collection to be trashed before downstream steps that consume it are started.  The recommended minimum value for TTL is the expected duration of the entire the workflow.|
+Note: arvados-cwl-runner currently does not take workflow dependencies into account when setting the TTL on an intermediate output collection. If the TTL is too short, it is possible for a collection to be trashed before downstream steps that consume it are started.  The recommended minimum value for TTL is the expected duration of the entire workflow.|
 
 h2. cwltool:Secrets
 
@@ -120,6 +125,15 @@ table(table table-bordered table-condensed).
 |cluster_id|string|The five-character alphanumeric cluster id (uuid prefix) where a container or subworkflow will execute.  May be an expression.|
 |project_uuid|string|The uuid of the project which will own container request and output of the container.  May be an expression.|
 
+h2(#OutputStorageClass). arv:OutputStorageClass
+
+Specify the "storage class":{{site.baseurl}}/user/topics/storage-classes.html to use for intermediate and final outputs.
+
+table(table table-bordered table-condensed).
+|_. Field |_. Type |_. Description |
+|intermediateStorageClass|string or array of strings|The storage class for output of intermediate steps.  For example, faster "hot" storage.|
+|finalStorageClass|string or array of strings|The storage class for the final output.|
+
 h2. arv:dockerCollectionPDH
 
 This is an optional extension field appearing on the standard @DockerRequirement@.  It specifies the portable data hash of the Arvados collection containing the Docker image.  If present, it takes precedence over @dockerPull@ or @dockerImageId@.
@@ -135,6 +149,16 @@ h1. Deprecated extensions
 
 The following extensions are deprecated because equivalent features are part of the CWL v1.1 standard.
 
+{% codeblock as yaml %}
+hints:
+  cwltool:LoadListingRequirement:
+    loadListing: shallow_listing
+  arv:ReuseRequirement:
+    enableReuse: false
+  cwltool:TimeLimit:
+    timelimit: 14400
+{% endcodeblock %}
+
 h2. cwltool:LoadListingRequirement
 
 For CWL v1.1 scripts, this is deprecated in favor of "loadListing":https://www.commonwl.org/v1.1/CommandLineTool.html#CommandInputParameter or "LoadListingRequirement":https://www.commonwl.org/v1.1/CommandLineTool.html#LoadListingRequirement
index 761d198ee4f504bc477b6575d9d1cde0c5b25085..a1c102593a4f1d790f1d1921627e6f5ec8bc07a3 100644 (file)
@@ -47,19 +47,19 @@ table(table table-bordered table-condensed).
 |==--no-wait==|             Submit workflow runner and exit.|
 |==--log-timestamps==|      Prefix logging lines with timestamp|
 |==--no-log-timestamps==|   No timestamp on logging lines|
-|==--api== {containers}|Select work submission API.  Only supports 'containers'|
 |==--compute-checksum==|    Compute checksum of contents while collecting outputs|
 |==--submit-runner-ram== SUBMIT_RUNNER_RAM|RAM (in MiB) required for the workflow runner (default 1024)|
 |==--submit-runner-image== SUBMIT_RUNNER_IMAGE|Docker image for workflow runner|
 |==--always-submit-runner==|When invoked with --submit --wait, always submit a runner to manage the workflow, even when only running a single CommandLineTool|
-|==--submit-request-uuid== UUID|Update and commit to supplied container request instead of creating a new one (containers API only).|
-|==--submit-runner-cluster== CLUSTER_ID|Submit workflow runner to a remote cluster (containers API only)|
+|==--submit-request-uuid== UUID|Update and commit to supplied container request instead of creating a new one.|
+|==--submit-runner-cluster== CLUSTER_ID|Submit workflow runner to a remote cluster|
 |==--name NAME==|Name to use for workflow execution instance.|
 |==--on-error== {stop,continue}|Desired workflow behavior when a step fails.  One of 'stop' (do not submit any more steps) or 'continue' (may submit other steps that are not downstream from the error). Default is 'continue'.|
 |==--enable-dev==|Enable loading and running development versions of CWL spec.|
-|==--storage-classes== STORAGE_CLASSES|Specify comma separated list of storage classes to be used when saving workflow output to Keep.|
+|==--storage-classes== STORAGE_CLASSES|Specify comma separated list of storage classes to be used when saving the final workflow output to Keep.|
+|==--intermediate-storage-classes== STORAGE_CLASSES|Specify comma separated list of storage classes to be used when saving intermediate workflow output to Keep.|
 |==--intermediate-output-ttl== N|If N > 0, intermediate output collections will be trashed N seconds after creation. Default is 0 (don't trash).|
-|==--priority== PRIORITY|Workflow priority (range 1..1000, higher has precedence over lower, containers api only)|
+|==--priority== PRIORITY|Workflow priority (range 1..1000, higher has precedence over lower)|
 |==--thread-count== THREAD_COUNT|Number of threads to use for container submit and output collection.|
 |==--http-timeout== HTTP_TIMEOUT|API request timeout in seconds. Default is 300 seconds (5 minutes).|
 |==--trash-intermediate==|Immediately trash intermediate outputs on workflow success.|
index d35df4fcec9f92f12e2b9dc3020667be5f4153f0..6e41a4f237d0bcf7d133007c96c742a2c7fb75d9 100644 (file)
@@ -57,12 +57,14 @@ arvados.arv-copy[1234] INFO: Success: created copy with uuid dstcl-4zz18-xxxxxxx
 
 The output of arv-copy displays the uuid of the collection generated in the destination cluster. By default, the output is placed in your home project in the destination cluster. If you want to place your collection in an existing project, you can specify that project with the @--project-uuid@ option followed by the project uuid.
 
-For example, this will copy the collection to project dstcl-j7d0g-a894213ukjhal12 in the destination cluster.
+For example, this will copy the collection to project @dstcl-j7d0g-a894213ukjhal12@ in the destination cluster.
 
 <notextile> <pre><code>~$ <span class="userinput">arv-copy --src pirca --dst dstcl --project-uuid dstcl-j7d0g-a894213ukjhal12 jutro-4zz18-tv416l321i4r01e
 </code></pre>
 </notextile>
 
+Additionally, if you need to specify the storage classes that should be used to save the copied data on the destination cluster, you can do so with the @--storage-classes LIST@ argument, where @LIST@ is a comma-separated list of storage class names.
+
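+For example, the following command saves the copied data in the @hot_storage@ storage class on the destination cluster (@hot_storage@ is a placeholder; the class must exist in the destination cluster's configuration):
+
+<notextile> <pre><code>~$ <span class="userinput">arv-copy --src pirca --dst dstcl --storage-classes hot_storage jutro-4zz18-tv416l321i4r01e</span>
+</code></pre>
+</notextile>
+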
 h3. How to copy a workflow
 
 We will use the uuid @jutro-7fd4e-mkmmq53m1ze6apx@ as an example workflow.
index bb1c7dd53e8cdaffd88d83b95b2177ae571fa55a..8a97df6e162187fc546d15f56b79b624c7adb648 100644 (file)
@@ -1,7 +1,7 @@
 ---
 layout: default
 navsection: userguide
-title: "Working with Docker images"
+title: "Working with container images"
 ...
 {% comment %}
 Copyright (C) The Arvados Authors. All rights reserved.
@@ -9,7 +9,9 @@ Copyright (C) The Arvados Authors. All rights reserved.
 SPDX-License-Identifier: CC-BY-SA-3.0
 {% endcomment %}
 
-This page describes how to set up the runtime environment (e.g., the programs, libraries, and other dependencies needed to run a job) that a workflow step will be run in using "Docker.":https://www.docker.com/  Docker is a tool for building and running containers that isolate applications from other applications running on the same node.  For detailed information about Docker, see the "Docker User Guide.":https://docs.docker.com/userguide/
+This page describes how to set up the runtime environment (e.g., the programs, libraries, and other dependencies needed to run a job) in which a workflow step will run, using "Docker":https://www.docker.com/ or "Singularity":https://sylabs.io/singularity/.  Docker and Singularity are tools for building and running containers that isolate applications from other applications running on the same node.  For detailed information, see the "Docker User Guide":https://docs.docker.com/userguide/ and the "Introduction to Singularity":https://sylabs.io/guides/3.5/user-guide/introduction.html.
+
+Note that Arvados always works with Docker images, even when it is configured to use Singularity to run containers. There are some differences between the two runtimes that can affect your containers. See the "Singularity container runtime":{{site.baseurl}}/install/singularity.html page for details.
 
 This page describes:
 
@@ -19,7 +21,7 @@ This page describes:
 
 {% include 'tutorial_expectations_workstation' %}
 
-You also need ensure that "Docker is installed,":https://docs.docker.com/installation/ the Docker daemon is running, and you have permission to access Docker.  You can test this by running @docker version@.  If you receive a permission denied error, your user account may need to be added to the @docker@ group.  If you have root access, you can add yourself to the @docker@ group using @$ sudo addgroup $USER docker@ then log out and log back in again; otherwise consult your local sysadmin.
+You also need to ensure that "Docker is installed,":https://docs.docker.com/installation/ the Docker daemon is running, and you have permission to access Docker.  You can test this by running @docker version@.  If you receive a permission denied error, your user account may need to be added to the @docker@ group.  If you have root access, you can add yourself to the @docker@ group using @$ sudo addgroup $USER docker@ then log out and log back in again; otherwise consult your local sysadmin.
 
 h2(#create). Create a custom image using a Dockerfile
 
index 99556af10aecfce48a9ac07c37e07959c44e169a..06fd4d811c2b0e927f13e4fbb1af2c3f8395b386 100644 (file)
@@ -10,9 +10,11 @@ Copyright (C) The Arvados Authors. All rights reserved.
 SPDX-License-Identifier: CC-BY-SA-3.0
 {% endcomment %}
 
-Storage classes (alternately known as "storage tiers") allow you to control which volumes should be used to store particular collection data blocks.  This can be used to implement data storage policies such as moving data to archival storage.
+Storage classes (sometimes known as "storage tiers") allow you to control which back-end storage volumes should be used to store the data blocks of a particular collection.  This can be used to implement data storage policies such as assigning data collections to "fast", "robust" or "archival" storage.
 
-Names of storage classes are internal to the cluster and decided by the administrator.  Aside from "default", Arvados currently does not define any standard storage class names.
+Names of storage classes are internal to the cluster and decided by the administrator.  Aside from "default", Arvados currently does not define any standard storage class names.  Consult your cluster administrator for guidance on what storage classes are available to use on your specific Arvados instance.
+
+Note that a change to an existing collection's storage class does not take effect immediately: the data blocks are asynchronously copied to the new storage class and removed from the old one.  The collection field "storage_classes_confirmed" is updated to reflect when data blocks have been successfully copied.
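+
+For example, after requesting a storage class change you can check whether it has taken effect by inspecting the collection record (a sketch: the uuid is a placeholder, and @jq@ is optional, used here only for readability):
+
+<pre>
+$ arv collection get --uuid zzzzz-4zz18-0123456789abcde | jq '{desired: .storage_classes_desired, confirmed: .storage_classes_confirmed}'
+</pre>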
 
 h3. arv-put
 
@@ -32,14 +34,12 @@ $ arv-mount --storage-classes=transient --mount-tmp=scratch keep
 
 h3. arvados-cwl-runner
 
-You may also specify the desired storage class for the final output collection produced by @arvados-cwl-runner@:
+You may specify the desired storage class for the intermediate and final output collections produced by @arvados-cwl-runner@ on the command line or using the "arv:OutputStorageClass hint":{{site.baseurl}}/user/cwl/cwl-extensions.html#OutputStorageClass .
 
 <pre>
-$ arvados-cwl-runner --storage-classes=hot myworkflow.cwl myinput.yml
+$ arvados-cwl-runner --intermediate-storage-classes=hot_storage --storage-classes=robust_storage myworkflow.cwl myinput.yml
 </pre>
 
-(Note: intermediate collections produced by a workflow run will have "default" storage class.)
-
 h3. arv command line
 
 You may set the storage class on an existing collection by setting the "storage_classes_desired" field of a Collection.  For example, at the command line:
@@ -52,8 +52,6 @@ By setting "storage_classes_desired" to "archival", the blocks that make up the
 
 h3. Storage class notes
 
-Collection blocks will be in the "default" storage class if not otherwise specified.
+Collection blocks will be in the cluster's configured default storage class(es) if not otherwise specified.
 
 Any user with write access to a collection may set any storage class on that collection.
-
-Names of storage classes are internal to the cluster and decided by the administrator.  Aside from "default", Arvados currently does not define any standard storage class names.
index d77a4d76e7e101f828a8695e8ead5b2f8685ed14..f0cd02946f376719c4ab8907d01d5533903a24d4 100644 (file)
@@ -54,9 +54,9 @@ func (runner installPassenger) Run(ctx context.Context, fail func(error), super
        if err != nil {
                return err
        }
-       for _, version := range []string{"1.16.6", "1.17.3", "2.0.2"} {
+       for _, version := range []string{"2.2.19"} {
                if !strings.Contains(buf.String(), "("+version+")") {
-                       err = super.RunProgram(ctx, runner.src, runOptions{}, "gem", "install", "--user", "--conservative", "--no-document", "bundler:1.16.6", "bundler:1.17.3", "bundler:2.0.2")
+                       err = super.RunProgram(ctx, runner.src, runOptions{}, "gem", "install", "--user", "--conservative", "--no-document", "bundler:2.2.19")
                        if err != nil {
                                return err
                        }
index 4e009f45ab55ad6353944bbea5cb7ca5b09811ac..2026b8c843fc16d12cc4eef6a0a547f3a1b9b164 100644 (file)
@@ -748,6 +748,11 @@ func (super *Supervisor) autofillConfig(cfg *arvados.Config) error {
                                },
                        }
                }
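+               // Test/dev boot config: define a default storage class
+               // plus two extras ("foo", "bar") so storage-class-aware
+               // behavior can be exercised.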
+               cluster.StorageClasses = map[string]arvados.StorageClassConfig{
+                       "default": {Default: true},
+                       "foo":     {},
+                       "bar":     {},
+               }
        }
        if super.OwnTemporaryDatabase {
                cluster.PostgreSQL.Connection = arvados.PostgreSQLConnection{
index e28d5cbb7f0cd09b3ad559f6cab0c9a9967c10d5..ec32613905f8ad9b40bde6820199413f5d30fe0d 100644 (file)
@@ -52,6 +52,9 @@ Clusters:
       DispatchCloud:
         InternalURLs: {SAMPLE: {}}
         ExternalURL: "-"
+      DispatchLSF:
+        InternalURLs: {SAMPLE: {}}
+        ExternalURL: "-"
       Keepproxy:
         InternalURLs: {SAMPLE: {}}
         ExternalURL: ""
@@ -270,6 +273,7 @@ Clusters:
       AdminNotifierEmailFrom: arvados@example.com
       EmailSubjectPrefix: "[ARVADOS] "
       UserNotifierEmailFrom: arvados@example.com
+      UserNotifierEmailBcc: {}
       NewUserNotificationRecipients: {}
       NewInactiveUserNotificationRecipients: {}
 
@@ -456,6 +460,13 @@ Clusters:
       # long-running balancing operation.
       BalanceTimeout: 6h
 
+      # Maximum number of replication_confirmed /
+      # storage_classes_confirmed updates to write to the database
+      # after a rebalancing run. When many updates are needed, this
+      # spreads them over a few runs rather than applying them all at
+      # once.
+      BalanceUpdateLimit: 100000
+
       # Default lifetime for ephemeral collections: 2 weeks. This must not
       # be less than BlobSigningTTL.
       DefaultTrashLifetime: 336h
@@ -517,10 +528,10 @@ Clusters:
       # WebDAV would have to expose XSS vulnerabilities in order to
       # handle the redirect (see discussion on Services.WebDAV).
       #
-      # This setting has no effect in the recommended configuration,
-      # where the WebDAV is configured to have a separate domain for
-      # every collection; in this case XSS protection is provided by
-      # browsers' same-origin policy.
+      # This setting has no effect in the recommended configuration, where the
+      # WebDAV service is configured to have a separate domain for every
+      # collection and XSS protection is provided by browsers' same-origin
+      # policy.
       #
       # The default setting (false) is appropriate for a multi-user site.
       TrustAllContent: false
@@ -1012,6 +1023,19 @@ Clusters:
           # (See http://ruby-doc.org/core-2.2.2/Kernel.html#method-i-format for more.)
           AssignNodeHostname: "compute%<slot_number>d"
 
+      LSF:
+        # Additional arguments to bsub when submitting Arvados
+        # containers as LSF jobs.
+        BsubArgumentsList: []
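+        #
+        # For example (a sketch, not a shipped default; "-o" is the
+        # standard bsub option for writing job output to a file, and
+        # %J is LSF's job id placeholder):
+        #   BsubArgumentsList: ["-o", "/tmp/crunch-run.%J.out"]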
+
+        # Use sudo to switch to this user account when submitting LSF
+        # jobs.
+        #
+        # This account must exist on the hosts where LSF jobs run
+        # ("execution hosts"), as well as on the host where the
+        # Arvados LSF dispatcher runs ("submission host").
+        BsubSudoUser: "crunch"
+
       JobsAPI:
         # Enable the legacy 'jobs' API (crunch v1).  This value must be a string.
         #
@@ -1205,6 +1229,29 @@ Clusters:
         Price: 0.1
         Preemptible: false
 
+    StorageClasses:
+
+      # If you use multiple storage classes, specify them here, using
+      # the storage class name as the key (in place of "SAMPLE" in
+      # this sample entry).
+      #
+      # Further info/examples:
+      # https://doc.arvados.org/admin/storage-classes.html
+      SAMPLE:
+
+        # Priority determines the order volumes should be searched
+        # when reading data, in cases where a keepstore server has
+        # access to multiple volumes with different storage classes.
+        Priority: 0
+
+        # Default determines which storage class(es) should be used
+        # when a user/client writes data or saves a new collection
+        # without specifying storage classes.
+        #
+        # If any StorageClasses are configured, at least one of them
+        # must have Default: true.
+        Default: true
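+
+      # For example (a sketch, not part of the shipped defaults), a
+      # cluster with fast scratch storage and cheaper long-term
+      # storage might replace the SAMPLE entry above with:
+      #
+      #   hot:
+      #     Priority: 10
+      #     Default: true
+      #   archival:
+      #     Priority: 0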
+
     Volumes:
       SAMPLE:
         # AccessViaHosts specifies which keepstore processes can read
@@ -1228,7 +1275,9 @@ Clusters:
         ReadOnly: false
         Replication: 1
         StorageClasses:
-          default: true
+          # If you have configured storage classes (see StorageClasses
+          # section above), add an entry here for each storage class
+          # satisfied by this volume.
           SAMPLE: true
         Driver: S3
         DriverParameters:
index bb939321c9ce17e220bb031f041efae53c79fa46..065011cc2e8d31d2fb133f6546a32a8f506861b2 100644 (file)
@@ -84,6 +84,7 @@ var whitelist = map[string]bool{
        "Collections.BalanceCollectionBuffers":                false,
        "Collections.BalancePeriod":                           false,
        "Collections.BalanceTimeout":                          false,
+       "Collections.BalanceUpdateLimit":                      false,
        "Collections.BlobDeleteConcurrency":                   false,
        "Collections.BlobMissingReport":                       false,
        "Collections.BlobReplicateConcurrency":                false,
@@ -120,6 +121,7 @@ var whitelist = map[string]bool{
        "Containers.JobsAPI.GitInternalDir":                   false,
        "Containers.Logging":                                  false,
        "Containers.LogReuseDecisions":                        false,
+       "Containers.LSF":                                      false,
        "Containers.MaxComputeVMs":                            false,
        "Containers.MaxDispatchAttempts":                      false,
        "Containers.MaxRetryAttempts":                         true,
@@ -203,6 +205,10 @@ var whitelist = map[string]bool{
        "Services.*":                                          true,
        "Services.*.ExternalURL":                              true,
        "Services.*.InternalURLs":                             false,
+       "StorageClasses":                                      true,
+       "StorageClasses.*":                                    true,
+       "StorageClasses.*.Default":                            true,
+       "StorageClasses.*.Priority":                           true,
        "SystemLogs":                                          false,
        "SystemRootToken":                                     false,
        "TLS":                                                 false,
@@ -221,6 +227,7 @@ var whitelist = map[string]bool{
        "Users.NewUsersAreActive":                             false,
        "Users.PreferDomainForUsername":                       false,
        "Users.UserNotifierEmailFrom":                         false,
+       "Users.UserNotifierEmailBcc":                          false,
        "Users.UserProfileNotificationAddress":                false,
        "Users.UserSetupMailText":                             false,
        "Volumes":                                             true,
index b15bf7eebc29facda6f1a0e2670c5482f83055cf..36a21d4c357165628aa321ef8d97b67c1ea894bd 100644 (file)
@@ -58,6 +58,9 @@ Clusters:
       DispatchCloud:
         InternalURLs: {SAMPLE: {}}
         ExternalURL: "-"
+      DispatchLSF:
+        InternalURLs: {SAMPLE: {}}
+        ExternalURL: "-"
       Keepproxy:
         InternalURLs: {SAMPLE: {}}
         ExternalURL: ""
@@ -276,6 +279,7 @@ Clusters:
       AdminNotifierEmailFrom: arvados@example.com
       EmailSubjectPrefix: "[ARVADOS] "
       UserNotifierEmailFrom: arvados@example.com
+      UserNotifierEmailBcc: {}
       NewUserNotificationRecipients: {}
       NewInactiveUserNotificationRecipients: {}
 
@@ -462,6 +466,13 @@ Clusters:
       # long-running balancing operation.
       BalanceTimeout: 6h
 
+      # Maximum number of replication_confirmed /
+      # storage_classes_confirmed updates to write to the database
+      # after a rebalancing run. When many updates are needed, this
+      # spreads them over a few runs rather than applying them all at
+      # once.
+      BalanceUpdateLimit: 100000
+
       # Default lifetime for ephemeral collections: 2 weeks. This must not
       # be less than BlobSigningTTL.
       DefaultTrashLifetime: 336h
@@ -523,10 +534,10 @@ Clusters:
       # WebDAV would have to expose XSS vulnerabilities in order to
       # handle the redirect (see discussion on Services.WebDAV).
       #
-      # This setting has no effect in the recommended configuration,
-      # where the WebDAV is configured to have a separate domain for
-      # every collection; in this case XSS protection is provided by
-      # browsers' same-origin policy.
+      # This setting has no effect in the recommended configuration, where the
+      # WebDAV service is configured to have a separate domain for every
+      # collection and XSS protection is provided by browsers' same-origin
+      # policy.
       #
       # The default setting (false) is appropriate for a multi-user site.
       TrustAllContent: false
@@ -1018,6 +1029,19 @@ Clusters:
           # (See http://ruby-doc.org/core-2.2.2/Kernel.html#method-i-format for more.)
           AssignNodeHostname: "compute%<slot_number>d"
 
+      LSF:
+        # Additional arguments to bsub when submitting Arvados
+        # containers as LSF jobs.
+        BsubArgumentsList: []
+
+        # Use sudo to switch to this user account when submitting LSF
+        # jobs.
+        #
+        # This account must exist on the hosts where LSF jobs run
+        # ("execution hosts"), as well as on the host where the
+        # Arvados LSF dispatcher runs ("submission host").
+        BsubSudoUser: "crunch"
+
       JobsAPI:
         # Enable the legacy 'jobs' API (crunch v1).  This value must be a string.
         #
@@ -1211,6 +1235,29 @@ Clusters:
         Price: 0.1
         Preemptible: false
 
+    StorageClasses:
+
+      # If you use multiple storage classes, specify them here, using
+      # the storage class name as the key (in place of "SAMPLE" in
+      # this sample entry).
+      #
+      # Further info/examples:
+      # https://doc.arvados.org/admin/storage-classes.html
+      SAMPLE:
+
+        # Priority determines the order volumes should be searched
+        # when reading data, in cases where a keepstore server has
+        # access to multiple volumes with different storage classes.
+        Priority: 0
+
+        # Default determines which storage class(es) should be used
+        # when a user/client writes data or saves a new collection
+        # without specifying storage classes.
+        #
+        # If any StorageClasses are configured, at least one of them
+        # must have Default: true.
+        Default: true
+
     Volumes:
       SAMPLE:
         # AccessViaHosts specifies which keepstore processes can read
@@ -1234,7 +1281,9 @@ Clusters:
         ReadOnly: false
         Replication: 1
         StorageClasses:
-          default: true
+          # If you have configured storage classes (see StorageClasses
+          # section above), add an entry here for each storage class
+          # satisfied by this volume.
           SAMPLE: true
         Driver: S3
         DriverParameters:
index 169b252a0e8fbbe44b0e6722e7fe1fe745ca01b6..248960beb99f875620dca5ee7738f8575877c57d 100644 (file)
@@ -269,6 +269,7 @@ func (ldr *Loader) Load() (*arvados.Config, error) {
                        ldr.loadOldKeepBalanceConfig,
                )
        }
+       loadFuncs = append(loadFuncs, ldr.setImplicitStorageClasses)
        for _, f := range loadFuncs {
                err = f(&cfg)
                if err != nil {
@@ -296,6 +297,7 @@ func (ldr *Loader) Load() (*arvados.Config, error) {
                        checkKeyConflict(fmt.Sprintf("Clusters.%s.PostgreSQL.Connection", id), cc.PostgreSQL.Connection),
                        ldr.checkEmptyKeepstores(cc),
                        ldr.checkUnlistedKeepstores(cc),
+                       ldr.checkStorageClasses(cc),
                        // TODO: check non-empty Rendezvous on
                        // services other than Keepstore
                } {
@@ -336,6 +338,57 @@ func (ldr *Loader) checkToken(label, token string) error {
        return nil
 }
 
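+// setImplicitStorageClasses fills in a "default" storage class for
+// any cluster (and its volumes) that has no explicit StorageClasses
+// configuration at all.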
+func (ldr *Loader) setImplicitStorageClasses(cfg *arvados.Config) error {
+cluster:
+       for id, cc := range cfg.Clusters {
+               if len(cc.StorageClasses) > 0 {
+                       continue cluster
+               }
+               for _, vol := range cc.Volumes {
+                       if len(vol.StorageClasses) > 0 {
+                               continue cluster
+                       }
+               }
+               // No explicit StorageClasses config info at all; fill
+               // in implicit defaults.
+               for id, vol := range cc.Volumes {
+                       vol.StorageClasses = map[string]bool{"default": true}
+                       cc.Volumes[id] = vol
+               }
+               cc.StorageClasses = map[string]arvados.StorageClassConfig{"default": {Default: true}}
+               cfg.Clusters[id] = cc
+       }
+       return nil
+}
+
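+// checkStorageClasses returns an error if any volume refers to a
+// storage class that is not defined in StorageClasses, or if no
+// configured storage class has Default: true.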
+func (ldr *Loader) checkStorageClasses(cc arvados.Cluster) error {
+       classOnVolume := map[string]bool{}
+       for volid, vol := range cc.Volumes {
+               if len(vol.StorageClasses) == 0 {
+                       return fmt.Errorf("%s: volume has no StorageClasses listed", volid)
+               }
+               for classid := range vol.StorageClasses {
+                       if _, ok := cc.StorageClasses[classid]; !ok {
+                               return fmt.Errorf("%s: volume refers to storage class %q that is not defined in StorageClasses", volid, classid)
+                       }
+                       classOnVolume[classid] = true
+               }
+       }
+       haveDefault := false
+       for classid, sc := range cc.StorageClasses {
+               if !classOnVolume[classid] && len(cc.Volumes) > 0 {
+                       ldr.Logger.Warnf("there are no volumes providing storage class %q", classid)
+               }
+               if sc.Default {
+                       haveDefault = true
+               }
+       }
+       if !haveDefault {
+               return fmt.Errorf("there is no default storage class (at least one entry in StorageClasses must have Default: true)")
+       }
+       return nil
+}
+
 func checkKeyConflict(label string, m map[string]string) error {
        saw := map[string]bool{}
        for k := range m {
index 396faca48461b30d7d5708a55f05bafcf73159ac..d4896c39cb19c2aa782a36389d9d92af728f04f5 100644 (file)
@@ -29,6 +29,8 @@ func Test(t *testing.T) {
 
 var _ = check.Suite(&LoadSuite{})
 
+var emptyConfigYAML = `Clusters: {"z1111": {}}`
+
 // Return a new Loader that reads cluster config from configdata
 // (instead of the usual default /etc/arvados/config.yml), and logs to
 // logdst or (if that's nil) c.Log.
@@ -59,7 +61,7 @@ func (s *LoadSuite) TestEmpty(c *check.C) {
 }
 
 func (s *LoadSuite) TestNoConfigs(c *check.C) {
-       cfg, err := testLoader(c, `Clusters: {"z1111": {}}`, nil).Load()
+       cfg, err := testLoader(c, emptyConfigYAML, nil).Load()
        c.Assert(err, check.IsNil)
        c.Assert(cfg.Clusters, check.HasLen, 1)
        cc, err := cfg.GetCluster("z1111")
@@ -79,7 +81,7 @@ func (s *LoadSuite) TestMungeLegacyConfigArgs(c *check.C) {
        f, err = ioutil.TempFile("", "")
        c.Check(err, check.IsNil)
        defer os.Remove(f.Name())
-       io.WriteString(f, "Clusters: {aaaaa: {}}\n")
+       io.WriteString(f, emptyConfigYAML)
        newfile := f.Name()
 
        for _, trial := range []struct {
@@ -562,11 +564,122 @@ func (s *LoadSuite) TestListKeys(c *check.C) {
                c.Errorf("Should have produced an error")
        }
 
-       var logbuf bytes.Buffer
-       loader := testLoader(c, string(DefaultYAML), &logbuf)
+       loader := testLoader(c, string(DefaultYAML), nil)
        cfg, err := loader.Load()
        c.Assert(err, check.IsNil)
        if err := checkListKeys("", cfg); err != nil {
                c.Error(err)
        }
 }
+
+func (s *LoadSuite) TestImplicitStorageClasses(c *check.C) {
+       // If StorageClasses and Volumes.*.StorageClasses are all
+       // empty, there is a default storage class named "default".
+       ldr := testLoader(c, `{"Clusters":{"z1111":{}}}`, nil)
+       cfg, err := ldr.Load()
+       c.Assert(err, check.IsNil)
+       cc, err := cfg.GetCluster("z1111")
+       c.Assert(err, check.IsNil)
+       c.Check(cc.StorageClasses, check.HasLen, 1)
+       c.Check(cc.StorageClasses["default"].Default, check.Equals, true)
+       c.Check(cc.StorageClasses["default"].Priority, check.Equals, 0)
+
+       // The implicit "default" storage class is used by all
+       // volumes.
+       ldr = testLoader(c, `
+Clusters:
+ z1111:
+  Volumes:
+   z: {}`, nil)
+       cfg, err = ldr.Load()
+       c.Assert(err, check.IsNil)
+       cc, err = cfg.GetCluster("z1111")
+       c.Assert(err, check.IsNil)
+       c.Check(cc.StorageClasses, check.HasLen, 1)
+       c.Check(cc.StorageClasses["default"].Default, check.Equals, true)
+       c.Check(cc.StorageClasses["default"].Priority, check.Equals, 0)
+       c.Check(cc.Volumes["z"].StorageClasses["default"], check.Equals, true)
+
+       // The "default" storage class isn't implicit if any classes
+       // are configured explicitly.
+       ldr = testLoader(c, `
+Clusters:
+ z1111:
+  StorageClasses:
+   local:
+    Default: true
+    Priority: 111
+  Volumes:
+   z:
+    StorageClasses:
+     local: true`, nil)
+       cfg, err = ldr.Load()
+       c.Assert(err, check.IsNil)
+       cc, err = cfg.GetCluster("z1111")
+       c.Assert(err, check.IsNil)
+       c.Check(cc.StorageClasses, check.HasLen, 1)
+       c.Check(cc.StorageClasses["local"].Default, check.Equals, true)
+       c.Check(cc.StorageClasses["local"].Priority, check.Equals, 111)
+
+       // It is an error for a volume to refer to a storage class
+       // that isn't listed in StorageClasses.
+       ldr = testLoader(c, `
+Clusters:
+ z1111:
+  StorageClasses:
+   local:
+    Default: true
+    Priority: 111
+  Volumes:
+   z:
+    StorageClasses:
+     nx: true`, nil)
+       _, err = ldr.Load()
+       c.Assert(err, check.ErrorMatches, `z: volume refers to storage class "nx" that is not defined.*`)
+
+       // It is an error for a volume to refer to a storage class
+       // that isn't listed in StorageClasses ... even if it's
+       // "default", which would exist implicitly if it weren't
+       // referenced explicitly by a volume.
+       ldr = testLoader(c, `
+Clusters:
+ z1111:
+  Volumes:
+   z:
+    StorageClasses:
+     default: true`, nil)
+       _, err = ldr.Load()
+       c.Assert(err, check.ErrorMatches, `z: volume refers to storage class "default" that is not defined.*`)
+
+       // If the "default" storage class is configured explicitly, it
+       // is not used implicitly by any volumes, even if it's the
+       // only storage class.
+       var logbuf bytes.Buffer
+       ldr = testLoader(c, `
+Clusters:
+ z1111:
+  StorageClasses:
+   default:
+    Default: true
+    Priority: 111
+  Volumes:
+   z: {}`, &logbuf)
+       _, err = ldr.Load()
+       c.Assert(err, check.ErrorMatches, `z: volume has no StorageClasses listed`)
+
+       // If StorageClasses are configured explicitly, there must be
+       // at least one with Default: true. (Calling one "default" is
+       // not sufficient.)
+       ldr = testLoader(c, `
+Clusters:
+ z1111:
+  StorageClasses:
+   default:
+    Priority: 111
+  Volumes:
+   z:
+    StorageClasses:
+     default: true`, nil)
+       _, err = ldr.Load()
+       c.Assert(err, check.ErrorMatches, `there is no default storage class.*`)
+}
index 44c99bf30f8c3a6ae9aa70b8306268b7c4c8fb6d..26f0dbb0d1388da1886cea726fc644648b4d57e3 100644 (file)
@@ -26,6 +26,7 @@ import (
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/arvadostest"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "git.arvados.org/arvados.git/sdk/go/httpserver"
        check "gopkg.in/check.v1"
 )
 
@@ -432,6 +433,74 @@ func (s *IntegrationSuite) TestCreateContainerRequestWithBadToken(c *check.C) {
        }
 }
 
+func (s *IntegrationSuite) TestRequestIDHeader(c *check.C) {
+       conn1 := s.testClusters["z1111"].Conn()
+       rootctx1, _, _ := s.testClusters["z1111"].RootClients()
+       userctx1, ac1, _, _ := s.testClusters["z1111"].UserClients(rootctx1, c, conn1, "user@example.com", true)
+
+       coll, err := conn1.CollectionCreate(userctx1, arvados.CreateOptions{})
+       c.Check(err, check.IsNil)
+       specimen, err := conn1.SpecimenCreate(userctx1, arvados.CreateOptions{})
+       c.Check(err, check.IsNil)
+
+       tests := []struct {
+               path            string
+               reqIdProvided   bool
+               notFoundRequest bool
+       }{
+               {"/arvados/v1/collections", false, false},
+               {"/arvados/v1/collections", true, false},
+               {"/arvados/v1/nonexistent", false, true},
+               {"/arvados/v1/nonexistent", true, true},
+               {"/arvados/v1/collections/" + coll.UUID, false, false},
+               {"/arvados/v1/collections/" + coll.UUID, true, false},
+               {"/arvados/v1/specimens/" + specimen.UUID, false, false},
+               {"/arvados/v1/specimens/" + specimen.UUID, true, false},
+               {"/arvados/v1/collections/z1111-4zz18-0123456789abcde", false, true},
+               {"/arvados/v1/collections/z1111-4zz18-0123456789abcde", true, true},
+               {"/arvados/v1/specimens/z1111-j58dm-0123456789abcde", false, true},
+               {"/arvados/v1/specimens/z1111-j58dm-0123456789abcde", true, true},
+       }
+
+       for _, tt := range tests {
+               c.Log(c.TestName() + " " + tt.path)
+               req, err := http.NewRequest("GET", "https://"+ac1.APIHost+tt.path, nil)
+               c.Assert(err, check.IsNil)
+               customReqId := "abcdeG"
+               if !tt.reqIdProvided {
+                       c.Assert(req.Header.Get("X-Request-Id"), check.Equals, "")
+               } else {
+                       req.Header.Set("X-Request-Id", customReqId)
+               }
+               resp, err := ac1.Do(req)
+               c.Assert(err, check.IsNil)
+               if tt.notFoundRequest {
+                       c.Check(resp.StatusCode, check.Equals, http.StatusNotFound)
+               } else {
+                       c.Check(resp.StatusCode, check.Equals, http.StatusOK)
+               }
+               if !tt.reqIdProvided {
+                       c.Check(resp.Header.Get("X-Request-Id"), check.Matches, "^req-[0-9a-zA-Z]{20}$")
+                       if tt.notFoundRequest {
+                               var jresp httpserver.ErrorResponse
+                               err := json.NewDecoder(resp.Body).Decode(&jresp)
+                               c.Check(err, check.IsNil)
+                               c.Assert(jresp.Errors, check.HasLen, 1)
+                               c.Check(jresp.Errors[0], check.Matches, "^.*(req-[0-9a-zA-Z]{20}).*$")
+                       }
+               } else {
+                       c.Check(resp.Header.Get("X-Request-Id"), check.Equals, customReqId)
+                       if tt.notFoundRequest {
+                               var jresp httpserver.ErrorResponse
+                               err := json.NewDecoder(resp.Body).Decode(&jresp)
+                               c.Check(err, check.IsNil)
+                               c.Assert(jresp.Errors, check.HasLen, 1)
+                               c.Check(jresp.Errors[0], check.Matches, "^.*("+customReqId+").*$")
+                       }
+               }
+       }
+}
+
 // We test the direct access to the database
 // normally an integration test would not have a database access, but  in this case we need
 // to test tokens that are secret, so there is no API response that will give them back
index 412f1bbfbfa95027eb5c043c5e1fcf07449139b0..e15303a3155afe81d72e8ce61e881ce76d5282d7 100644 (file)
@@ -260,18 +260,16 @@ func (runner *ContainerRunner) LoadImage() (string, error) {
                return "", fmt.Errorf("cannot choose from multiple tar files in image collection: %v", tarfiles)
        }
        imageID := tarfiles[0][:len(tarfiles[0])-4]
-       imageFile := runner.ArvMountPoint + "/by_id/" + runner.Container.ContainerImage + "/" + tarfiles[0]
+       imageTarballPath := runner.ArvMountPoint + "/by_id/" + runner.Container.ContainerImage + "/" + imageID + ".tar"
        runner.CrunchLog.Printf("Using Docker image id %q", imageID)
 
-       if !runner.executor.ImageLoaded(imageID) {
-               runner.CrunchLog.Print("Loading Docker image from keep")
-               err = runner.executor.LoadImage(imageFile)
-               if err != nil {
-                       return "", err
-               }
-       } else {
-               runner.CrunchLog.Print("Docker image is available")
+       runner.CrunchLog.Print("Loading Docker image from keep")
+       err = runner.executor.LoadImage(imageID, imageTarballPath, runner.Container, runner.ArvMountPoint,
+               runner.containerClient)
+       if err != nil {
+               return "", err
        }
+
        return imageID, nil
 }
 
@@ -599,6 +597,7 @@ func (runner *ContainerRunner) SetupMounts() (map[string]bindmount, error) {
        } else {
                arvMountCmd = append(arvMountCmd, "--mount-by-id", "by_id")
        }
+       arvMountCmd = append(arvMountCmd, "--mount-by-id", "by_uuid")
        arvMountCmd = append(arvMountCmd, runner.ArvMountPoint)
 
        runner.ArvMount, err = runner.RunArvMount(arvMountCmd, token)
@@ -1201,12 +1200,14 @@ func (runner *ContainerRunner) CleanupDirs() {
                                }
                        }
                }
+               runner.ArvMount = nil
        }
 
        if runner.ArvMountPoint != "" {
                if rmerr := os.Remove(runner.ArvMountPoint); rmerr != nil {
                        runner.CrunchLog.Printf("While cleaning up arv-mount directory %s: %v", runner.ArvMountPoint, rmerr)
                }
+               runner.ArvMountPoint = ""
        }
 
        if rmerr := os.RemoveAll(runner.parentTemp); rmerr != nil {
@@ -1441,6 +1442,7 @@ func (runner *ContainerRunner) Run() (err error) {
                }
                checkErr("stopHoststat", runner.stopHoststat())
                checkErr("CommitLogs", runner.CommitLogs())
+               runner.CleanupDirs()
                checkErr("UpdateContainerFinal", runner.UpdateContainerFinal())
        }()
 
index bb7ffdf0306b26b2f5c56062aaaaaf7b256e5447..bb982cdee76c32cb9321ce88e8fa47fa0588f2f1 100644 (file)
@@ -112,8 +112,11 @@ type stubExecutor struct {
        exit        chan int
 }
 
-func (e *stubExecutor) ImageLoaded(imageID string) bool { return e.imageLoaded }
-func (e *stubExecutor) LoadImage(filename string) error { e.loaded = filename; return e.loadErr }
+func (e *stubExecutor) LoadImage(imageId string, tarball string, container arvados.Container, keepMount string,
+       containerClient *arvados.Client) error {
+       e.loaded = tarball
+       return e.loadErr
+}
 func (e *stubExecutor) Create(spec containerSpec) error { e.created = spec; return e.createErr }
 func (e *stubExecutor) Start() error                    { e.exit = make(chan int, 1); go e.runFunc(); return e.startErr }
 func (e *stubExecutor) CgroupID() string                { return "cgroupid" }
@@ -403,16 +406,6 @@ func (s *TestSuite) TestLoadImage(c *C) {
        imageID, err = s.runner.LoadImage()
        c.Check(err, ErrorMatches, "image collection does not include a \\.tar image file")
        c.Check(s.executor.loaded, Equals, "")
-
-       // if executor reports image is already loaded, LoadImage should not be called
-       s.runner.Container.ContainerImage = arvadostest.DockerImage112PDH
-       s.executor.imageLoaded = true
-       s.executor.loaded = ""
-       s.executor.loadErr = nil
-       imageID, err = s.runner.LoadImage()
-       c.Check(err, IsNil)
-       c.Check(s.executor.loaded, Equals, "")
-       c.Check(imageID, Equals, strings.TrimSuffix(arvadostest.DockerImage112Filename, ".tar"))
 }
 
 type ArvErrorTestClient struct{}
@@ -1112,7 +1105,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "default", "--crunchstat-interval=5",
-                       "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{"/tmp": {realTemp + "/tmp2", false}})
                os.RemoveAll(cr.ArvMountPoint)
                cr.CleanupDirs()
@@ -1132,7 +1125,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "foo,bar", "--crunchstat-interval=5",
-                       "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{"/out": {realTemp + "/tmp2", false}, "/tmp": {realTemp + "/tmp3", false}})
                os.RemoveAll(cr.ArvMountPoint)
                cr.CleanupDirs()
@@ -1152,7 +1145,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "default", "--crunchstat-interval=5",
-                       "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{"/tmp": {realTemp + "/tmp2", false}, "/etc/arvados/ca-certificates.crt": {stubCertPath, true}})
                os.RemoveAll(cr.ArvMountPoint)
                cr.CleanupDirs()
@@ -1175,7 +1168,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "default", "--crunchstat-interval=5",
-                       "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{"/keeptmp": {realTemp + "/keep1/tmp0", false}})
                os.RemoveAll(cr.ArvMountPoint)
                cr.CleanupDirs()
@@ -1198,7 +1191,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "default", "--crunchstat-interval=5",
-                       "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{
                        "/keepinp": {realTemp + "/keep1/by_id/59389a8f9ee9d399be35462a0f92541c+53", true},
                        "/keepout": {realTemp + "/keep1/tmp0", false},
@@ -1225,7 +1218,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "default", "--crunchstat-interval=5",
-                       "--file-cache", "512", "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--file-cache", "512", "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{
                        "/keepinp": {realTemp + "/keep1/by_id/59389a8f9ee9d399be35462a0f92541c+53", true},
                        "/keepout": {realTemp + "/keep1/tmp0", false},
@@ -1308,7 +1301,7 @@ func (s *TestSuite) TestSetupMounts(c *C) {
                c.Check(err, IsNil)
                c.Check(am.Cmd, DeepEquals, []string{"--foreground", "--allow-other",
                        "--read-write", "--storage-classes", "default", "--crunchstat-interval=5",
-                       "--file-cache", "512", "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", realTemp + "/keep1"})
+                       "--file-cache", "512", "--mount-tmp", "tmp0", "--mount-by-pdh", "by_id", "--mount-by-id", "by_uuid", realTemp + "/keep1"})
                c.Check(bindmounts, DeepEquals, map[string]bindmount{
                        "/tmp":     {realTemp + "/tmp2", false},
                        "/tmp/foo": {realTemp + "/keep1/tmp0", true},
index 861f8c8c1913f07bab8d7ea722dfa3c643678059..656061b77ec552a811c26dfe18be870b154c1b1e 100644 (file)
@@ -11,6 +11,7 @@ import (
        "strings"
        "time"
 
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        dockertypes "github.com/docker/docker/api/types"
        dockercontainer "github.com/docker/docker/api/types/container"
        dockerclient "github.com/docker/docker/client"
@@ -45,13 +46,15 @@ func newDockerExecutor(containerUUID string, logf func(string, ...interface{}),
        }, err
 }
 
-func (e *dockerExecutor) ImageLoaded(imageID string) bool {
+func (e *dockerExecutor) LoadImage(imageID string, imageTarballPath string, container arvados.Container, arvMountPoint string,
+       containerClient *arvados.Client) error {
        _, _, err := e.dockerclient.ImageInspectWithRaw(context.TODO(), imageID)
-       return err == nil
-}
+       if err == nil {
+               // already loaded
+               return nil
+       }
 
-func (e *dockerExecutor) LoadImage(filename string) error {
-       f, err := os.Open(filename)
+       f, err := os.Open(imageTarballPath)
        if err != nil {
                return err
        }
index f4feaa06c21447cc66b2e57a962e2d2c306e6de7..65bf7427b9601c465fb21d811c5cb79d2d41a0f8 100644 (file)
@@ -6,6 +6,7 @@ package crunchrun
 import (
        "io"
 
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "golang.org/x/net/context"
 )
 
@@ -33,13 +34,10 @@ type containerSpec struct {
 // containerExecutor is an interface to a container runtime
 // (docker/singularity).
 type containerExecutor interface {
-       // ImageLoaded determines whether the given image is already
-       // available to use without calling ImageLoad.
-       ImageLoaded(imageID string) bool
-
        // LoadImage loads the image from the given tarball such that
        // it can be used to create/start a container.
-       LoadImage(filename string) error
+       LoadImage(imageID string, imageTarballPath string, container arvados.Container, keepMount string,
+               containerClient *arvados.Client) error
 
        // Wait for the container process to finish, and return its
        // exit code. If applicable, also remove the stopped container
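
For orientation, here is a minimal sketch of a type satisfying the widened LoadImage signature; stubExecutor is a hypothetical name and does not appear in this commit:

    package main

    import "git.arvados.org/arvados.git/sdk/go/arvados"

    // stubExecutor: hypothetical minimal implementation of the
    // containerExecutor image-loading method shown above.
    type stubExecutor struct{ loaded string }

    func (e *stubExecutor) LoadImage(imageID string, imageTarballPath string,
    	container arvados.Container, keepMount string,
    	containerClient *arvados.Client) error {
    	// A nil containerClient means "no Keep-backed cache"; both the
    	// docker and singularity executors then load locally.
    	e.loaded = imageTarballPath
    	return nil
    }
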
index 5934c57b6c5f90bf971664c614a8348fb18b9e50..0f9901d6a1ff0d6ebb268c23b107f5ff5514244b 100644 (file)
@@ -13,6 +13,7 @@ import (
        "strings"
        "time"
 
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "golang.org/x/net/context"
        . "gopkg.in/check.v1"
 )
@@ -70,7 +71,7 @@ func (s *executorSuite) SetUpTest(c *C) {
                Stdout:      nopWriteCloser{&s.stdout},
                Stderr:      nopWriteCloser{&s.stderr},
        }
-       err := s.executor.LoadImage(busyboxDockerImage(c))
+       err := s.executor.LoadImage("", busyboxDockerImage(c), arvados.Container{}, "", nil)
        c.Assert(err, IsNil)
 }
 
index bcaff3bcc88300e51015f16a94751d20a39d5efe..741f542454e470ede35cc6f682c64c8a9b1bbf09 100644 (file)
@@ -5,12 +5,15 @@
 package crunchrun
 
 import (
+       "fmt"
        "io/ioutil"
        "os"
        "os/exec"
        "sort"
        "syscall"
+       "time"
 
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "golang.org/x/net/context"
 )
 
@@ -33,39 +36,179 @@ func newSingularityExecutor(logf func(string, ...interface{})) (*singularityExec
        }, nil
 }
 
-func (e *singularityExecutor) ImageLoaded(string) bool {
-       return false
+func (e *singularityExecutor) getOrCreateProject(ownerUuid string, name string, containerClient *arvados.Client) (*arvados.Group, error) {
+       var gp arvados.GroupList
+       err := containerClient.RequestAndDecode(&gp,
+               arvados.EndpointGroupList.Method,
+               arvados.EndpointGroupList.Path,
+               nil, arvados.ListOptions{Filters: []arvados.Filter{
+                       arvados.Filter{"owner_uuid", "=", ownerUuid},
+                       arvados.Filter{"name", "=", name},
+                       arvados.Filter{"group_class", "=", "project"},
+               },
+                       Limit: 1})
+       if err != nil {
+               return nil, err
+       }
+       if len(gp.Items) == 1 {
+               return &gp.Items[0], nil
+       }
+
+       var rgroup arvados.Group
+       err = containerClient.RequestAndDecode(&rgroup,
+               arvados.EndpointGroupCreate.Method,
+               arvados.EndpointGroupCreate.Path,
+               nil, map[string]interface{}{
+                       "group": map[string]string{
+                               "owner_uuid":  ownerUuid,
+                               "name":        name,
+                               "group_class": "project",
+                       },
+               })
+       if err != nil {
+               return nil, err
+       }
+       return &rgroup, nil
+}
+
+func (e *singularityExecutor) checkImageCache(dockerImageID string, container arvados.Container, arvMountPoint string,
+       containerClient *arvados.Client) (collection *arvados.Collection, err error) {
+
+       // Cache the image to keep
+       // Cache the image in Keep
+       if err != nil {
+               return nil, fmt.Errorf("error getting '.cache' project: %v", err)
+       }
+       imageGroup, err := e.getOrCreateProject(cacheGroup.UUID, "auto-generated singularity images", containerClient)
+       if err != nil {
+               return nil, fmt.Errorf("error getting 'auto-generated singularity images' project: %s", err)
+       }
+
+       collectionName := fmt.Sprintf("singularity image for %v", dockerImageID)
+       var cl arvados.CollectionList
+       err = containerClient.RequestAndDecode(&cl,
+               arvados.EndpointCollectionList.Method,
+               arvados.EndpointCollectionList.Path,
+               nil, arvados.ListOptions{Filters: []arvados.Filter{
+                       arvados.Filter{"owner_uuid", "=", imageGroup.UUID},
+                       arvados.Filter{"name", "=", collectionName},
+               },
+                       Limit: 1})
+       if err != nil {
+               return nil, fmt.Errorf("error querying for collection '%v': %v", collectionName, err)
+       }
+       var imageCollection arvados.Collection
+       if len(cl.Items) == 1 {
+               imageCollection = cl.Items[0]
+       } else {
+               collectionName := collectionName + " " + time.Now().UTC().Format(time.RFC3339)
+               exp := time.Now().Add(24 * 7 * 2 * time.Hour)
+               err = containerClient.RequestAndDecode(&imageCollection,
+                       arvados.EndpointCollectionCreate.Method,
+                       arvados.EndpointCollectionCreate.Path,
+                       nil, map[string]interface{}{
+                               "collection": map[string]string{
+                                       "owner_uuid": imageGroup.UUID,
+                                       "name":       collectionName,
+                                       "trash_at":   exp.UTC().Format(time.RFC3339),
+                               },
+                       })
+               if err != nil {
+                       return nil, fmt.Errorf("error creating '%v' collection: %s", collectionName, err)
+               }
+
+       }
+
+       return &imageCollection, nil
 }
 
 // LoadImage will satisfy the containerExecutor interface, transforming
 // containerImage into a sif file for later use.
-func (e *singularityExecutor) LoadImage(imageTarballPath string) error {
-       e.logf("building singularity image")
-       // "singularity build" does not accept a
-       // docker-archive://... filename containing a ":" character,
-       // as in "/path/to/sha256:abcd...1234.tar". Workaround: make a
-       // symlink that doesn't have ":" chars.
-       err := os.Symlink(imageTarballPath, e.tmpdir+"/image.tar")
+func (e *singularityExecutor) LoadImage(dockerImageID string, imageTarballPath string, container arvados.Container, arvMountPoint string,
+       containerClient *arvados.Client) error {
+
+       var imageFilename string
+       var sifCollection *arvados.Collection
+       var err error
+       if containerClient != nil {
+               sifCollection, err = e.checkImageCache(dockerImageID, container, arvMountPoint, containerClient)
+               if err != nil {
+                       return err
+               }
+               imageFilename = fmt.Sprintf("%s/by_uuid/%s/image.sif", arvMountPoint, sifCollection.UUID)
+       } else {
+               imageFilename = e.tmpdir + "/image.sif"
+       }
+
+       if _, err := os.Stat(imageFilename); os.IsNotExist(err) {
+               e.logf("building singularity image")
+               // "singularity build" does not accept a
+               // docker-archive://... filename containing a ":" character,
+               // as in "/path/to/sha256:abcd...1234.tar". Workaround: make a
+               // symlink that doesn't have ":" chars.
+               err := os.Symlink(imageTarballPath, e.tmpdir+"/image.tar")
+               if err != nil {
+                       return err
+               }
+
+               build := exec.Command("singularity", "build", imageFilename, "docker-archive://"+e.tmpdir+"/image.tar")
+               e.logf("%v", build.Args)
+               out, err := build.CombinedOutput()
+               // INFO:    Starting build...
+               // Getting image source signatures
+               // Copying blob ab15617702de done
+               // Copying config 651e02b8a2 done
+               // Writing manifest to image destination
+               // Storing signatures
+               // 2021/04/22 14:42:14  info unpack layer: sha256:21cbfd3a344c52b197b9fa36091e66d9cbe52232703ff78d44734f85abb7ccd3
+               // INFO:    Creating SIF file...
+               // INFO:    Build complete: arvados-jobs.latest.sif
+               e.logf("%s", out)
+               if err != nil {
+                       return err
+               }
+       }
+
+       if containerClient == nil {
+               e.imageFilename = imageFilename
+               return nil
+       }
+
+       // update TTL to now + two weeks
+       exp := time.Now().Add(24 * 7 * 2 * time.Hour)
+
+       uuidPath, err := containerClient.PathForUUID("update", sifCollection.UUID)
        if err != nil {
-               return err
+               e.logf("error PathForUUID: %v", err)
+               return nil
+       }
+       var imageCollection arvados.Collection
+       err = containerClient.RequestAndDecode(&imageCollection,
+               arvados.EndpointCollectionUpdate.Method,
+               uuidPath,
+               nil, map[string]interface{}{
+                       "collection": map[string]string{
+                               "name":     fmt.Sprintf("singularity image for %v", dockerImageID),
+                               "trash_at": exp.UTC().Format(time.RFC3339),
+                       },
+               })
+       if err == nil {
+               // If we just wrote the image to the cache, the
+               // response also returns the updated PDH
+               e.imageFilename = fmt.Sprintf("%s/by_id/%s/image.sif", arvMountPoint, imageCollection.PortableDataHash)
+               return nil
        }
-       e.imageFilename = e.tmpdir + "/image.sif"
-       build := exec.Command("singularity", "build", e.imageFilename, "docker-archive://"+e.tmpdir+"/image.tar")
-       e.logf("%v", build.Args)
-       out, err := build.CombinedOutput()
-       // INFO:    Starting build...
-       // Getting image source signatures
-       // Copying blob ab15617702de done
-       // Copying config 651e02b8a2 done
-       // Writing manifest to image destination
-       // Storing signatures
-       // 2021/04/22 14:42:14  info unpack layer: sha256:21cbfd3a344c52b197b9fa36091e66d9cbe52232703ff78d44734f85abb7ccd3
-       // INFO:    Creating SIF file...
-       // INFO:    Build complete: arvados-jobs.latest.sif
-       e.logf("%s", out)
+
+       e.logf("error updating/renaming collection for cached sif image: %v", err)
+       // The update failed, but maybe we lost a race with another
+       // process that already cached an image at the same location,
+       // so check the cache again.
+       sifCollection, err = e.checkImageCache(dockerImageID, container, arvMountPoint, containerClient)
        if err != nil {
                return err
        }
+       e.imageFilename = fmt.Sprintf("%s/by_id/%s/image.sif", arvMountPoint, sifCollection.PortableDataHash)
+
        return nil
 }
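
The expiry arithmetic for the cached collection, shown standalone as a sketch (values match the code above):

    package main

    import (
    	"fmt"
    	"time"
    )

    func main() {
    	// 24h * 7 * 2 = 336h, i.e. two weeks: the TTL used when the
    	// cached image collection is created and later refreshed.
    	exp := time.Now().Add(24 * 7 * 2 * time.Hour)
    	// trash_at is sent as an RFC 3339 UTC timestamp, as above.
    	fmt.Println(exp.UTC().Format(time.RFC3339))
    }
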
 
@@ -92,8 +235,6 @@ func (e *singularityExecutor) Start() error {
                mount := e.spec.BindMounts[path]
                args = append(args, "--bind", mount.HostPath+":"+path+":"+readonlyflag[mount.ReadOnly])
        }
-       args = append(args, e.imageFilename)
-       args = append(args, e.spec.Command...)
 
        // This is for singularity 3.5.2. There are some behaviors
        // that will change in singularity 3.6, please see:
@@ -101,9 +242,17 @@ func (e *singularityExecutor) Start() error {
        // https://sylabs.io/guides/3.5/user-guide/environment_and_metadata.html
        env := make([]string, 0, len(e.spec.Env))
        for k, v := range e.spec.Env {
-               env = append(env, "SINGULARITYENV_"+k+"="+v)
+               if k == "HOME" {
+                       // $HOME is a special case
+                       args = append(args, "--home="+v)
+               } else {
+                       env = append(env, "SINGULARITYENV_"+k+"="+v)
+               }
        }
 
+       args = append(args, e.imageFilename)
+       args = append(args, e.spec.Command...)
+
        path, err := exec.LookPath(args[0])
        if err != nil {
                return err
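
The environment handling above splits on HOME; a self-contained sketch of the same mapping (the function name is illustrative):

    package main

    import "fmt"

    // singularityEnvArgs mirrors the loop above: HOME becomes --home=
    // because singularity 3.5 treats it specially; everything else is
    // passed via the SINGULARITYENV_ prefix.
    func singularityEnvArgs(env map[string]string) (args, cmdEnv []string) {
    	for k, v := range env {
    		if k == "HOME" {
    			args = append(args, "--home="+v)
    		} else {
    			cmdEnv = append(cmdEnv, "SINGULARITYENV_"+k+"="+v)
    		}
    	}
    	return
    }

    func main() {
    	args, cmdEnv := singularityEnvArgs(map[string]string{"HOME": "/root", "PATH": "/bin"})
    	fmt.Println(args, cmdEnv) // [--home=/root] [SINGULARITYENV_PATH=/bin]
    }
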
index 77a64df895e584fd38ee5b2cb516ac5891eeb8e6..73aecd01e634ed844e1b4298fd2314fba3bb596e 100644 (file)
@@ -490,7 +490,7 @@ make -C ./builddir install
                                {"mkdir", "-p", "log", "tmp", ".bundle", "/var/www/.gem", "/var/www/.bundle", "/var/www/.passenger"},
                                {"touch", "log/production.log"},
                                {"chown", "-R", "--from=root", "www-data:www-data", "/var/www/.gem", "/var/www/.bundle", "/var/www/.passenger", "log", "tmp", ".bundle", "Gemfile.lock", "config.ru", "config/environment.rb"},
-                               {"sudo", "-u", "www-data", "/var/lib/arvados/bin/gem", "install", "--user", "--conservative", "--no-document", "bundler:1.16.6", "bundler:1.17.3", "bundler:2.0.2"},
+                               {"sudo", "-u", "www-data", "/var/lib/arvados/bin/gem", "install", "--user", "--conservative", "--no-document", "bundler:2.2.19"},
                                {"sudo", "-u", "www-data", "/var/lib/arvados/bin/bundle", "install", "--deployment", "--jobs", "8", "--path", "/var/www/.gem"},
                                {"sudo", "-u", "www-data", "/var/lib/arvados/bin/bundle", "exec", "passenger-config", "build-native-support"},
                                {"sudo", "-u", "www-data", "/var/lib/arvados/bin/bundle", "exec", "passenger-config", "install-standalone-runtime"},
diff --git a/lib/lsf/dispatch.go b/lib/lsf/dispatch.go
new file mode 100644 (file)
index 0000000..7461597
--- /dev/null
@@ -0,0 +1,322 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lsf
+
+import (
+       "context"
+       "errors"
+       "fmt"
+       "math"
+       "net/http"
+       "regexp"
+       "strings"
+       "sync"
+       "time"
+
+       "git.arvados.org/arvados.git/lib/cmd"
+       "git.arvados.org/arvados.git/lib/dispatchcloud"
+       "git.arvados.org/arvados.git/lib/service"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/arvadosclient"
+       "git.arvados.org/arvados.git/sdk/go/auth"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "git.arvados.org/arvados.git/sdk/go/dispatch"
+       "git.arvados.org/arvados.git/sdk/go/health"
+       "github.com/julienschmidt/httprouter"
+       "github.com/prometheus/client_golang/prometheus"
+       "github.com/prometheus/client_golang/prometheus/promhttp"
+       "github.com/sirupsen/logrus"
+)
+
+var DispatchCommand cmd.Handler = service.Command(arvados.ServiceNameDispatchLSF, newHandler)
+
+func newHandler(ctx context.Context, cluster *arvados.Cluster, token string, reg *prometheus.Registry) service.Handler {
+       ac, err := arvados.NewClientFromConfig(cluster)
+       if err != nil {
+               return service.ErrorHandler(ctx, cluster, fmt.Errorf("error initializing client from cluster config: %s", err))
+       }
+       d := &dispatcher{
+               Cluster:   cluster,
+               Context:   ctx,
+               ArvClient: ac,
+               AuthToken: token,
+               Registry:  reg,
+       }
+       go d.Start()
+       return d
+}
+
+type dispatcher struct {
+       Cluster   *arvados.Cluster
+       Context   context.Context
+       ArvClient *arvados.Client
+       AuthToken string
+       Registry  *prometheus.Registry
+
+       logger        logrus.FieldLogger
+       lsfcli        lsfcli
+       lsfqueue      lsfqueue
+       arvDispatcher *dispatch.Dispatcher
+       httpHandler   http.Handler
+
+       initOnce sync.Once
+       stop     chan struct{}
+       stopped  chan struct{}
+}
+
+// Start starts the dispatcher. Start can be called multiple times
+// with no ill effect.
+func (disp *dispatcher) Start() {
+       disp.initOnce.Do(func() {
+               disp.init()
+               go func() {
+                       disp.checkLsfQueueForOrphans()
+                       err := disp.arvDispatcher.Run(disp.Context)
+                       if err != nil {
+                               disp.logger.Error(err)
+                               disp.Close()
+                       }
+               }()
+       })
+}
+
+// ServeHTTP implements service.Handler.
+func (disp *dispatcher) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+       disp.Start()
+       disp.httpHandler.ServeHTTP(w, r)
+}
+
+// CheckHealth implements service.Handler.
+func (disp *dispatcher) CheckHealth() error {
+       disp.Start()
+       select {
+       case <-disp.stopped:
+               return errors.New("stopped")
+       default:
+               return nil
+       }
+}
+
+// Done implements service.Handler.
+func (disp *dispatcher) Done() <-chan struct{} {
+       return disp.stopped
+}
+
+// Stop dispatching containers and release resources. Used by tests.
+func (disp *dispatcher) Close() {
+       disp.Start()
+       select {
+       case disp.stop <- struct{}{}:
+       default:
+       }
+       <-disp.stopped
+}
+
+func (disp *dispatcher) init() {
+       disp.logger = ctxlog.FromContext(disp.Context)
+       disp.lsfcli.logger = disp.logger
+       disp.lsfqueue = lsfqueue{
+               logger: disp.logger,
+               period: time.Duration(disp.Cluster.Containers.CloudVMs.PollInterval),
+               lsfcli: &disp.lsfcli,
+       }
+       disp.ArvClient.AuthToken = disp.AuthToken
+       disp.stop = make(chan struct{}, 1)
+       disp.stopped = make(chan struct{})
+
+       arv, err := arvadosclient.New(disp.ArvClient)
+       if err != nil {
+               disp.logger.Fatalf("Error making Arvados client: %v", err)
+       }
+       arv.Retries = 25
+       arv.ApiToken = disp.AuthToken
+       disp.arvDispatcher = &dispatch.Dispatcher{
+               Arv:            arv,
+               Logger:         disp.logger,
+               BatchSize:      disp.Cluster.API.MaxItemsPerResponse,
+               RunContainer:   disp.runContainer,
+               PollPeriod:     time.Duration(disp.Cluster.Containers.CloudVMs.PollInterval),
+               MinRetryPeriod: time.Duration(disp.Cluster.Containers.MinRetryPeriod),
+       }
+
+       if disp.Cluster.ManagementToken == "" {
+               disp.httpHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+                       http.Error(w, "Management API authentication is not configured", http.StatusForbidden)
+               })
+       } else {
+               mux := httprouter.New()
+               metricsH := promhttp.HandlerFor(disp.Registry, promhttp.HandlerOpts{
+                       ErrorLog: disp.logger,
+               })
+               mux.Handler("GET", "/metrics", metricsH)
+               mux.Handler("GET", "/metrics.json", metricsH)
+               mux.Handler("GET", "/_health/:check", &health.Handler{
+                       Token:  disp.Cluster.ManagementToken,
+                       Prefix: "/_health/",
+                       Routes: health.Routes{"ping": disp.CheckHealth},
+               })
+               disp.httpHandler = auth.RequireLiteralToken(disp.Cluster.ManagementToken, mux)
+       }
+}
+
+func (disp *dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) error {
+       ctx, cancel := context.WithCancel(disp.Context)
+       defer cancel()
+
+       if ctr.State != dispatch.Locked {
+               // already started by prior invocation
+       } else if _, ok := disp.lsfqueue.JobID(ctr.UUID); !ok {
+               disp.logger.Printf("Submitting container %s to LSF", ctr.UUID)
+               cmd := []string{disp.Cluster.Containers.CrunchRunCommand}
+               cmd = append(cmd, "--runtime-engine="+disp.Cluster.Containers.RuntimeEngine)
+               cmd = append(cmd, disp.Cluster.Containers.CrunchRunArgumentsList...)
+               err := disp.submit(ctr, cmd)
+               if err != nil {
+                       return err
+               }
+       }
+
+       disp.logger.Printf("Start monitoring container %v in state %q", ctr.UUID, ctr.State)
+       defer disp.logger.Printf("Done monitoring container %s", ctr.UUID)
+
+       // If the container disappears from the lsf queue, there is
+       // no point in waiting for further dispatch updates: just
+       // clean up and return.
+       go func(uuid string) {
+               for ctx.Err() == nil {
+                       if _, ok := disp.lsfqueue.JobID(uuid); !ok {
+                               disp.logger.Printf("container %s job disappeared from LSF queue", uuid)
+                               cancel()
+                               return
+                       }
+               }
+       }(ctr.UUID)
+
+       for done := false; !done; {
+               select {
+               case <-ctx.Done():
+                       // Disappeared from lsf queue
+                       if err := disp.arvDispatcher.Arv.Get("containers", ctr.UUID, nil, &ctr); err != nil {
+                               disp.logger.Printf("error getting final container state for %s: %s", ctr.UUID, err)
+                       }
+                       switch ctr.State {
+                       case dispatch.Running:
+                               disp.arvDispatcher.UpdateState(ctr.UUID, dispatch.Cancelled)
+                       case dispatch.Locked:
+                               disp.arvDispatcher.Unlock(ctr.UUID)
+                       }
+                       return nil
+               case updated, ok := <-status:
+                       if !ok {
+                               // status channel is closed, which is
+                               // how arvDispatcher tells us to stop
+                               // touching the container record, kill
+                               // off any remaining LSF processes,
+                               // etc.
+                               done = true
+                               break
+                       }
+                       if updated.State != ctr.State {
+                               disp.logger.Infof("container %s changed state from %s to %s", ctr.UUID, ctr.State, updated.State)
+                       }
+                       ctr = updated
+                       if ctr.Priority < 1 {
+                               disp.logger.Printf("container %s has state %s, priority %d: cancel lsf job", ctr.UUID, ctr.State, ctr.Priority)
+                               disp.bkill(ctr)
+                       } else {
+                               disp.lsfqueue.SetPriority(ctr.UUID, int64(ctr.Priority))
+                       }
+               }
+       }
+       disp.logger.Printf("container %s is done", ctr.UUID)
+
+       // Try "bkill" every few seconds until the LSF job disappears
+       // from the queue.
+       ticker := time.NewTicker(5 * time.Second)
+       defer ticker.Stop()
+       for jobid, ok := disp.lsfqueue.JobID(ctr.UUID); ok; _, ok = disp.lsfqueue.JobID(ctr.UUID) {
+               err := disp.lsfcli.Bkill(jobid)
+               if err != nil {
+                       disp.logger.Warnf("%s: bkill(%d): %s", ctr.UUID, jobid, err)
+               }
+               <-ticker.C
+       }
+       return nil
+}
+
+func (disp *dispatcher) submit(container arvados.Container, crunchRunCommand []string) error {
+       // Start with an empty slice here to ensure append() doesn't
+       // modify crunchRunCommand's underlying array
+       var crArgs []string
+       crArgs = append(crArgs, crunchRunCommand...)
+       crArgs = append(crArgs, container.UUID)
+       crScript := execScript(crArgs)
+
+       bsubArgs, err := disp.bsubArgs(container)
+       if err != nil {
+               return err
+       }
+       return disp.lsfcli.Bsub(crScript, bsubArgs, disp.ArvClient)
+}
+
+func (disp *dispatcher) bkill(ctr arvados.Container) {
+       if jobid, ok := disp.lsfqueue.JobID(ctr.UUID); !ok {
+               disp.logger.Debugf("bkill(%s): redundant, job not in queue", ctr.UUID)
+       } else if err := disp.lsfcli.Bkill(jobid); err != nil {
+               disp.logger.Warnf("%s: bkill(%d): %s", ctr.UUID, jobid, err)
+       }
+}
+
+func (disp *dispatcher) bsubArgs(container arvados.Container) ([]string, error) {
+       args := []string{"bsub"}
+       args = append(args, disp.Cluster.Containers.LSF.BsubArgumentsList...)
+       args = append(args, "-J", container.UUID)
+       args = append(args, disp.bsubConstraintArgs(container)...)
+       if u := disp.Cluster.Containers.LSF.BsubSudoUser; u != "" {
+               args = append([]string{"sudo", "-E", "-u", u}, args...)
+       }
+       return args, nil
+}
+
+func (disp *dispatcher) bsubConstraintArgs(container arvados.Container) []string {
+       // TODO: propagate container.SchedulingParameters.Partitions
+       tmp := int64(math.Ceil(float64(dispatchcloud.EstimateScratchSpace(&container)) / 1048576))
+       vcpus := container.RuntimeConstraints.VCPUs
+       mem := int64(math.Ceil(float64(container.RuntimeConstraints.RAM+
+               container.RuntimeConstraints.KeepCacheRAM+
+               int64(disp.Cluster.Containers.ReserveExtraRAM)) / 1048576))
+       return []string{
+               "-R", fmt.Sprintf("rusage[mem=%dMB:tmp=%dMB] affinity[core(%d)]", mem, tmp, vcpus),
+       }
+}
+
+// Check the next bjobs report, and invoke TrackContainer for all the
+// containers in the report. This gives us a chance to cancel existing
+// Arvados LSF jobs (started by a previous dispatch process) that
+// never released their LSF job allocations even though their
+// container states are Cancelled or Complete. See
+// https://dev.arvados.org/issues/10979
+func (disp *dispatcher) checkLsfQueueForOrphans() {
+       containerUuidPattern := regexp.MustCompile(`^[a-z0-9]{5}-dz642-[a-z0-9]{15}$`)
+       for _, uuid := range disp.lsfqueue.All() {
+               if !containerUuidPattern.MatchString(uuid) || !strings.HasPrefix(uuid, disp.Cluster.ClusterID) {
+                       continue
+               }
+               err := disp.arvDispatcher.TrackContainer(uuid)
+               if err != nil {
+                       disp.logger.Warnf("checkLsfQueueForOrphans: TrackContainer(%s): %s", uuid, err)
+               }
+       }
+}
+
+func execScript(args []string) []byte {
+       s := "#!/bin/sh\nexec"
+       for _, w := range args {
+               s += ` '`
+               s += strings.Replace(w, `'`, `'\''`, -1)
+               s += `'`
+       }
+       return []byte(s + "\n")
+}
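
To see what execScript emits, a runnable sketch reproducing the quoting (the container UUID is a sample value):

    package main

    import (
    	"fmt"
    	"strings"
    )

    // Same quoting as execScript above: each word is single-quoted and
    // embedded quotes become '\'' so the shell passes it through verbatim.
    func execScript(args []string) []byte {
    	s := "#!/bin/sh\nexec"
    	for _, w := range args {
    		s += ` '` + strings.Replace(w, `'`, `'\''`, -1) + `'`
    	}
    	return []byte(s + "\n")
    }

    func main() {
    	fmt.Printf("%s", execScript([]string{"crunch-run", "--runtime-engine=singularity", "zzzzz-dz642-queuedcontainer"}))
    	// Output:
    	// #!/bin/sh
    	// exec 'crunch-run' '--runtime-engine=singularity' 'zzzzz-dz642-queuedcontainer'
    }
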
diff --git a/lib/lsf/dispatch_test.go b/lib/lsf/dispatch_test.go
new file mode 100644 (file)
index 0000000..7cf6df6
--- /dev/null
@@ -0,0 +1,156 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lsf
+
+import (
+       "context"
+       "fmt"
+       "math/rand"
+       "os/exec"
+       "strconv"
+       "sync"
+       "testing"
+       "time"
+
+       "git.arvados.org/arvados.git/lib/config"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/arvadostest"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "github.com/prometheus/client_golang/prometheus"
+       "gopkg.in/check.v1"
+)
+
+func Test(t *testing.T) {
+       check.TestingT(t)
+}
+
+var _ = check.Suite(&suite{})
+
+type suite struct {
+       disp *dispatcher
+}
+
+func (s *suite) TearDownTest(c *check.C) {
+       arvados.NewClientFromEnv().RequestAndDecode(nil, "POST", "database/reset", nil, nil)
+}
+
+func (s *suite) SetUpTest(c *check.C) {
+       cfg, err := config.NewLoader(nil, ctxlog.TestLogger(c)).Load()
+       c.Assert(err, check.IsNil)
+       cluster, err := cfg.GetCluster("")
+       c.Assert(err, check.IsNil)
+       cluster.Containers.CloudVMs.PollInterval = arvados.Duration(time.Second)
+       s.disp = newHandler(context.Background(), cluster, arvadostest.Dispatch1Token, prometheus.NewRegistry()).(*dispatcher)
+       s.disp.lsfcli.stubCommand = func(string, ...string) *exec.Cmd {
+               return exec.Command("bash", "-c", "echo >&2 unimplemented stub; false")
+       }
+}
+
+type lsfstub struct {
+       sudoUser  string
+       errorRate float64
+}
+
+func (stub lsfstub) stubCommand(c *check.C) func(prog string, args ...string) *exec.Cmd {
+       mtx := sync.Mutex{}
+       nextjobid := 100
+       fakejobq := map[int]string{}
+       return func(prog string, args ...string) *exec.Cmd {
+               c.Logf("stubCommand: %q %q", prog, args)
+               if rand.Float64() < stub.errorRate {
+                       return exec.Command("bash", "-c", "echo >&2 'stub random failure' && false")
+               }
+               if stub.sudoUser != "" && len(args) > 3 &&
+                       prog == "sudo" &&
+                       args[0] == "-E" &&
+                       args[1] == "-u" &&
+                       args[2] == stub.sudoUser {
+                       prog, args = args[3], args[4:]
+               }
+               switch prog {
+               case "bsub":
+                       c.Assert(args, check.HasLen, 4)
+                       c.Check(args[0], check.Equals, "-J")
+                       switch args[1] {
+                       case arvadostest.LockedContainerUUID:
+                               c.Check(args, check.DeepEquals, []string{"-J", arvadostest.LockedContainerUUID, "-R", "rusage[mem=11701MB:tmp=0MB] affinity[core(4)]"})
+                               mtx.Lock()
+                               fakejobq[nextjobid] = args[1]
+                               nextjobid++
+                               mtx.Unlock()
+                       case arvadostest.QueuedContainerUUID:
+                               c.Check(args, check.DeepEquals, []string{"-J", arvadostest.QueuedContainerUUID, "-R", "rusage[mem=11701MB:tmp=45777MB] affinity[core(4)]"})
+                               mtx.Lock()
+                               fakejobq[nextjobid] = args[1]
+                               nextjobid++
+                               mtx.Unlock()
+                       default:
+                               c.Errorf("unexpected uuid passed to bsub: args %q", args)
+                               return exec.Command("false")
+                       }
+                       return exec.Command("echo", "submitted job")
+               case "bjobs":
+                       c.Check(args, check.DeepEquals, []string{"-u", "all", "-noheader", "-o", "jobid stat job_name:30"})
+                       out := ""
+                       for jobid, uuid := range fakejobq {
+                               out += fmt.Sprintf(`%d %s %s\n`, jobid, "RUN", uuid)
+                       }
+                       c.Logf("bjobs out: %q", out)
+                       return exec.Command("printf", out)
+               case "bkill":
+                       killid, _ := strconv.Atoi(args[0])
+                       if uuid, ok := fakejobq[killid]; !ok {
+                               return exec.Command("bash", "-c", fmt.Sprintf("printf >&2 'Job <%d>: No matching job found\n'", killid))
+                       } else if uuid == "" {
+                               return exec.Command("bash", "-c", fmt.Sprintf("printf >&2 'Job <%d>: Job has already finished\n'", killid))
+                       } else {
+                               go func() {
+                                       time.Sleep(time.Millisecond)
+                                       mtx.Lock()
+                                       delete(fakejobq, killid)
+                                       mtx.Unlock()
+                               }()
+                               return exec.Command("bash", "-c", fmt.Sprintf("printf 'Job <%d> is being terminated\n'", killid))
+                       }
+               default:
+                       return exec.Command("bash", "-c", fmt.Sprintf("echo >&2 'stub: command not found: %+q'", prog))
+               }
+       }
+}
+
+func (s *suite) TestSubmit(c *check.C) {
+       s.disp.lsfcli.stubCommand = lsfstub{
+               errorRate: 0.1,
+               sudoUser:  s.disp.Cluster.Containers.LSF.BsubSudoUser,
+       }.stubCommand(c)
+       s.disp.Start()
+       deadline := time.Now().Add(20 * time.Second)
+       for range time.NewTicker(time.Second).C {
+               if time.Now().After(deadline) {
+                       c.Error("timed out")
+                       break
+               }
+               // "queuedcontainer" should be running
+               if _, ok := s.disp.lsfqueue.JobID(arvadostest.QueuedContainerUUID); !ok {
+                       continue
+               }
+               // "lockedcontainer" should be cancelled because it
+               // has priority 0 (no matching container requests)
+               if _, ok := s.disp.lsfqueue.JobID(arvadostest.LockedContainerUUID); ok {
+                       continue
+               }
+               var ctr arvados.Container
+               if err := s.disp.arvDispatcher.Arv.Get("containers", arvadostest.LockedContainerUUID, nil, &ctr); err != nil {
+                       c.Logf("error getting container state for %s: %s", arvadostest.LockedContainerUUID, err)
+                       continue
+               }
+               if ctr.State != arvados.ContainerStateQueued {
+                       c.Logf("LockedContainer is not in the LSF queue but its arvados record has not been updated to state==Queued (state is %q)", ctr.State)
+                       continue
+               }
+               c.Log("reached desired state")
+               break
+       }
+}
diff --git a/lib/lsf/lsfcli.go b/lib/lsf/lsfcli.go
new file mode 100644 (file)
index 0000000..9d712ee
--- /dev/null
@@ -0,0 +1,92 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lsf
+
+import (
+       "bytes"
+       "fmt"
+       "os"
+       "os/exec"
+       "strings"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "github.com/sirupsen/logrus"
+)
+
+type bjobsEntry struct {
+       id   int
+       name string
+       stat string
+}
+
+type lsfcli struct {
+       logger logrus.FieldLogger
+       // (for testing) if non-nil, call stubCommand() instead of
+       // exec.Command() when running lsf command line programs.
+       stubCommand func(string, ...string) *exec.Cmd
+}
+
+func (cli lsfcli) command(prog string, args ...string) *exec.Cmd {
+       if f := cli.stubCommand; f != nil {
+               return f(prog, args...)
+       } else {
+               return exec.Command(prog, args...)
+       }
+}
+
+func (cli lsfcli) Bsub(script []byte, args []string, arv *arvados.Client) error {
+       cli.logger.Infof("bsub command %q script %q", args, script)
+       cmd := cli.command(args[0], args[1:]...)
+       cmd.Env = append([]string(nil), os.Environ()...)
+       cmd.Env = append(cmd.Env, "ARVADOS_API_HOST="+arv.APIHost)
+       cmd.Env = append(cmd.Env, "ARVADOS_API_TOKEN="+arv.AuthToken)
+       if arv.Insecure {
+               cmd.Env = append(cmd.Env, "ARVADOS_API_HOST_INSECURE=1")
+       }
+       cmd.Stdin = bytes.NewReader(script)
+       out, err := cmd.Output()
+       cli.logger.WithField("stdout", string(out)).Infof("bsub finished")
+       return errWithStderr(err)
+}
+
+func (cli lsfcli) Bjobs() ([]bjobsEntry, error) {
+       cli.logger.Debugf("Bjobs()")
+       cmd := cli.command("bjobs", "-u", "all", "-noheader", "-o", "jobid stat job_name:30")
+       buf, err := cmd.Output()
+       if err != nil {
+               return nil, errWithStderr(err)
+       }
+       var bjobs []bjobsEntry
+       for _, line := range strings.Split(string(buf), "\n") {
+               if line == "" {
+                       continue
+               }
+               var ent bjobsEntry
+               if _, err := fmt.Sscan(line, &ent.id, &ent.stat, &ent.name); err != nil {
+                       cli.logger.Warnf("ignoring unparsed line in bjobs output: %q", line)
+                       continue
+               }
+               bjobs = append(bjobs, ent)
+       }
+       return bjobs, nil
+}
+
+func (cli lsfcli) Bkill(id int) error {
+       cli.logger.Infof("Bkill(%d)", id)
+       cmd := cli.command("bkill", fmt.Sprintf("%d", id))
+       buf, err := cmd.CombinedOutput()
+       if err == nil || strings.Index(string(buf), "already finished") >= 0 {
+               return nil
+       } else {
+               return fmt.Errorf("%s (%q)", err, buf)
+       }
+}
+
+func errWithStderr(err error) error {
+       if err, ok := err.(*exec.ExitError); ok {
+               return fmt.Errorf("%s (%q)", err, err.Stderr)
+       }
+       return err
+}
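
The Bjobs parsing above relies on fmt.Sscan's whitespace splitting; a standalone sketch with a sample output line:

    package main

    import "fmt"

    func main() {
    	// One line of `bjobs -u all -noheader -o "jobid stat job_name:30"`
    	// output (sample values, not real cluster data):
    	line := "100 RUN zzzzz-dz642-queuedcontainer"
    	var id int
    	var stat, name string
    	if _, err := fmt.Sscan(line, &id, &stat, &name); err != nil {
    		fmt.Println("unparsed:", err) // lsfcli logs and skips such lines
    		return
    	}
    	fmt.Println(id, stat, name) // 100 RUN zzzzz-dz642-queuedcontainer
    }
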
diff --git a/lib/lsf/lsfqueue.go b/lib/lsf/lsfqueue.go
new file mode 100644 (file)
index 0000000..3c4fc4c
--- /dev/null
@@ -0,0 +1,108 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lsf
+
+import (
+       "sync"
+       "time"
+
+       "github.com/sirupsen/logrus"
+)
+
+type lsfqueue struct {
+       logger logrus.FieldLogger
+       period time.Duration
+       lsfcli *lsfcli
+
+       initOnce  sync.Once
+       mutex     sync.Mutex
+       nextReady chan (<-chan struct{})
+       updated   *sync.Cond
+       latest    map[string]bjobsEntry
+}
+
+// JobID waits for the next queue update (so even a job that was only
+// submitted a nanosecond ago will show up) and then returns the LSF
+// job ID corresponding to the given container UUID.
+func (q *lsfqueue) JobID(uuid string) (int, bool) {
+       ent, ok := q.getNext()[uuid]
+       return ent.id, ok
+}
+
+// All waits for the next queue update, then returns the names of all
+// jobs in the queue. Used by checkLsfQueueForOrphans().
+func (q *lsfqueue) All() []string {
+       latest := q.getNext()
+       names := make([]string, 0, len(latest))
+       for name := range latest {
+               names = append(names, name)
+       }
+       return names
+}
+
+func (q *lsfqueue) SetPriority(uuid string, priority int64) {
+       q.initOnce.Do(q.init)
+       q.logger.Debug("SetPriority is not implemented")
+}
+
+func (q *lsfqueue) getNext() map[string]bjobsEntry {
+       q.initOnce.Do(q.init)
+       <-(<-q.nextReady)
+       q.mutex.Lock()
+       defer q.mutex.Unlock()
+       return q.latest
+}
+
+func (q *lsfqueue) init() {
+       q.updated = sync.NewCond(&q.mutex)
+       q.nextReady = make(chan (<-chan struct{}))
+       ticker := time.NewTicker(time.Second)
+       go func() {
+               for range ticker.C {
+                       // Send a new "next update ready" channel to
+                       // the next goroutine that wants one (and any
+                       // others that have already queued up since
+                       // the first one started waiting).
+                       //
+                       // Below, when we get a new update, we'll
+                       // signal that to the other goroutines by
+                       // closing the ready chan.
+                       ready := make(chan struct{})
+                       q.nextReady <- ready
+                       for {
+                               select {
+                               case q.nextReady <- ready:
+                                       continue
+                               default:
+                               }
+                               break
+                       }
+                       // Run bjobs repeatedly if needed, until we
+                       // get valid output.
+                       var ents []bjobsEntry
+                       for {
+                               q.logger.Debug("running bjobs")
+                               var err error
+                               ents, err = q.lsfcli.Bjobs()
+                               if err == nil {
+                                       break
+                               }
+                               q.logger.Warnf("bjobs: %s", err)
+                               <-ticker.C
+                       }
+                       next := make(map[string]bjobsEntry, len(ents))
+                       for _, ent := range ents {
+                               next[ent.name] = ent
+                       }
+                       // Replace q.latest and notify all the
+                       // goroutines that the "next update" they
+                       // asked for is now ready.
+                       q.mutex.Lock()
+                       q.latest = next
+                       q.mutex.Unlock()
+                       close(ready)
+               }
+       }()
+}
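
The init() loop above implements a broadcast: one "ready" channel is handed to every waiter, then closed once fresh bjobs data is installed, waking all of them at once. A condensed, self-contained sketch of that pattern:

    package main

    import (
    	"fmt"
    	"sync"
    )

    func main() {
    	var mu sync.Mutex
    	latest := map[string]int{}
    	ready := make(chan struct{})

    	// Producer: install the new snapshot under the mutex, then
    	// close(ready) so every goroutine blocked on <-ready wakes.
    	go func() {
    		mu.Lock()
    		latest["zzzzz-dz642-queuedcontainer"] = 100
    		mu.Unlock()
    		close(ready)
    	}()

    	<-ready // getNext() style: block until the next complete update
    	mu.Lock()
    	fmt.Println(latest)
    	mu.Unlock()
    }
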
index 9ca24312582060d42f6ed878f600c780ae9d5872..40db4f9c7c7f80f744ab3c44da874794d925c9c1 100644 (file)
@@ -12,6 +12,7 @@ import (
        "io"
        "net"
        "net/http"
+       _ "net/http/pprof"
        "net/url"
        "os"
        "strings"
@@ -70,6 +71,7 @@ func (c *command) RunCommand(prog string, args []string, stdin io.Reader, stdout
        loader := config.NewLoader(stdin, log)
        loader.SetupFlags(flags)
        versionFlag := flags.Bool("version", false, "Write version information to stdout and exit 0")
+       pprofAddr := flags.String("pprof", "", "Serve Go profile data at `[addr]:port`")
        err = flags.Parse(args)
        if err == flag.ErrHelp {
                err = nil
@@ -80,6 +82,12 @@ func (c *command) RunCommand(prog string, args []string, stdin io.Reader, stdout
                return cmd.Version.RunCommand(prog, args, stdin, stdout, stderr)
        }
 
+       if *pprofAddr != "" {
+               go func() {
+                       log.Println(http.ListenAndServe(*pprofAddr, nil))
+               }()
+       }
+
        if strings.HasSuffix(prog, "controller") {
                // Some config-loader checks try to make API calls via
                // controller. Those can't be expected to work if this
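
The new -pprof flag works because the blank net/http/pprof import registers its handlers on http.DefaultServeMux, which ListenAndServe(addr, nil) then serves. A standalone equivalent:

    package main

    import (
    	"log"
    	"net/http"
    	_ "net/http/pprof" // registers /debug/pprof/ on DefaultServeMux
    )

    func main() {
    	// Comparable to passing -pprof localhost:6060 to a service
    	// binary after this change; profiles can then be read with e.g.
    	//   go tool pprof http://localhost:6060/debug/pprof/heap
    	log.Println(http.ListenAndServe("localhost:6060", nil))
    }
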
index 04db611fbef349692bca2dd38a4988225d980917..7bbb3b29e8b6a27181df115f29b719304f51cb56 100644 (file)
@@ -22,6 +22,7 @@ import cwltool.main
 import cwltool.workflow
 import cwltool.process
 import cwltool.argparser
+from cwltool.errors import WorkflowException
 from cwltool.process import shortname, UnsupportedRequirement, use_custom_schema
 from cwltool.utils import adjustFileObjs, adjustDirObjs, get_listing
 
@@ -178,7 +179,9 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
                         help="Enable loading and running development versions "
                              "of the CWL standards.", default=False)
     parser.add_argument('--storage-classes', default="default",
-                        help="Specify comma separated list of storage classes to be used when saving workflow output to Keep.")
+                        help="Specify comma-separated list of storage classes to be used when saving final workflow output to Keep.")
+    parser.add_argument('--intermediate-storage-classes', default="default",
+                        help="Specify comma-separated list of storage classes to be used when saving intermediate workflow output to Keep.")
 
     parser.add_argument("--intermediate-output-ttl", type=int, metavar="N",
                         help="If N > 0, intermediate output collections will be trashed N seconds after creation.  Default is 0 (don't trash).",
@@ -245,7 +248,8 @@ def add_arv_hints():
         "http://commonwl.org/cwltool#LoadListingRequirement",
         "http://arvados.org/cwl#IntermediateOutput",
         "http://arvados.org/cwl#ReuseRequirement",
-        "http://arvados.org/cwl#ClusterTarget"
+        "http://arvados.org/cwl#ClusterTarget",
+        "http://arvados.org/cwl#OutputStorageClass"
     ])
 
 def exit_signal_handler(sigcode, frame):
@@ -259,10 +263,6 @@ def main(args, stdout, stderr, api_client=None, keep_client=None,
     job_order_object = None
     arvargs = parser.parse_args(args)
 
-    if len(arvargs.storage_classes.strip().split(',')) > 1:
-        logger.error(str(u"Multiple storage classes are not supported currently."))
-        return 1
-
     arvargs.use_container = True
     arvargs.relax_path_checks = True
     arvargs.print_supported_versions = False
@@ -301,6 +301,9 @@ def main(args, stdout, stderr, api_client=None, keep_client=None,
         if keep_client is None:
             keep_client = arvados.keep.KeepClient(api_client=api_client, num_retries=4)
         executor = ArvCwlExecutor(api_client, arvargs, keep_client=keep_client, num_retries=4)
+    except WorkflowException as e:
+        logger.error(e, exc_info=(sys.exc_info()[1] if arvargs.debug else False))
+        return 1
     except Exception:
         logger.exception("Error creating the Arvados CWL Executor")
         return 1
index 8a3fa3173a9dd98baa5d8aa1ab74e19fe4bceb6d..2a2e857e073e1c14aa8e294a035182f03ab40549 100644 (file)
@@ -266,3 +266,32 @@ $graph:
     project_uuid:
       type: string?
       doc: The project that will own the container requests and intermediate collections
+
+
+- name: OutputStorageClass
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Specify the storage class to be used for intermediate and final output
+  fields:
+    class:
+      type: string
+      doc: "Always 'arv:OutputStorageClass'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    intermediateStorageClass:
+      type:
+        - "null"
+        - string
+        - type: array
+          items: string
+      doc: One or more storage classes
+    finalStorageClass:
+      type:
+        - "null"
+        - string
+        - type: array
+          items: string
+      doc: One or more storage classes
index 95ed0a75bc69bfe94929ce0e2ee80adf6dcec7e6..fb14a63e315fc3dd7e56d8f6b832df87aae243a7 100644 (file)
@@ -210,3 +210,31 @@ $graph:
     project_uuid:
       type: string?
       doc: The project that will own the container requests and intermediate collections
+
+- name: OutputStorageClass
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Specify the storage class to be used for intermediate and final output
+  fields:
+    class:
+      type: string
+      doc: "Always 'arv:OutputStorageClass'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    intermediateStorageClass:
+      type:
+        - "null"
+        - string
+        - type: array
+          items: string
+      doc: One or more storage classes
+    finalStorageClass:
+      type:
+        - "null"
+        - string
+        - type: array
+          items: string
+      doc: One or more storage classes
index 95ed0a75bc69bfe94929ce0e2ee80adf6dcec7e6..dd5919fc88033b44eefdcb851711ac1c447bc8c6 100644 (file)
@@ -210,3 +210,32 @@ $graph:
     project_uuid:
       type: string?
       doc: The project that will own the container requests and intermediate collections
+
+
+- name: OutputStorageClass
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Specify the storage class to be used for intermediate and final output
+  fields:
+    class:
+      type: string
+      doc: "Always 'arv:OutputStorageClass'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    intermediateStorageClass:
+      type:
+        - "null"
+        - string
+        - type: array
+          items: string
+      doc: One or more storage classes
+    finalStorageClass:
+      type:
+        - "null"
+        - string
+        - type: array
+          items: string
+      doc: One or more storage classes
index 72ef14f6731baf83de87df28534a5c76b5a7dc42..c9170c51b732deee8f5a8bc383746c527ae5def7 100644 (file)
@@ -273,6 +273,12 @@ class ArvadosContainer(JobBase):
         if self.output_ttl < 0:
             raise WorkflowException("Invalid value %d for output_ttl, cannot be less than zero" % container_request["output_ttl"])
 
+        storage_class_req, _ = self.get_requirement("http://arvados.org/cwl#OutputStorageClass")
+        if storage_class_req and storage_class_req.get("intermediateStorageClass"):
+            container_request["output_storage_classes"] = aslist(storage_class_req["intermediateStorageClass"])
+        else:
+            container_request["output_storage_classes"] = runtimeContext.intermediate_storage_classes.strip().split(",")
+
         if self.timelimit is not None and self.timelimit > 0:
             scheduling_parameters["max_run_time"] = self.timelimit
 
@@ -495,6 +501,9 @@ class RunnerContainer(Runner):
         if runtimeContext.storage_classes != "default":
             command.append("--storage-classes=" + runtimeContext.storage_classes)
 
+        if runtimeContext.intermediate_storage_classes != "default":
+            command.append("--intermediate-storage-classes=" + runtimeContext.intermediate_storage_classes)
+
         if self.on_error:
             command.append("--on-error=" + self.on_error)
 
index 89176923519ee4c737d9376032463dee08a80102..13664a8dfb0d57df0477d4c627928b9be17ad8d7 100644 (file)
@@ -19,9 +19,15 @@ def validate_cluster_target(arvrunner, runtimeContext):
     if runtimeContext.project_uuid:
         cluster_target = runtimeContext.submit_runner_cluster or arvrunner.api._rootDesc["uuidPrefix"]
         if not runtimeContext.project_uuid.startswith(cluster_target):
-            raise WorkflowException("Project uuid '%s' must be for target cluster '%s'" % (runtimeContext.project_uuid, cluster_target))
+            raise WorkflowException("Project uuid '%s' should start with the id of target cluster '%s'" % (runtimeContext.project_uuid, cluster_target))
+
         try:
-            arvrunner.api.groups().get(uuid=runtimeContext.project_uuid).execute()
+            if runtimeContext.project_uuid[5:12] == '-tpzed-':
+                arvrunner.api.users().get(uuid=runtimeContext.project_uuid).execute()
+            else:
+                proj = arvrunner.api.groups().get(uuid=runtimeContext.project_uuid).execute()
+                if proj["group_class"] != "project":
+                    raise Exception("not a project, group_class is '%s'" % (proj["group_class"]))
         except Exception as e:
             raise WorkflowException("Invalid project uuid '%s': %s" % (runtimeContext.project_uuid, e))
 
index 8cfe22ad7b6619f1f02d95eaf71153e44e52fd01..77d4027ccbabccf72e3fe5f60ad049726c1b99d1 100644 (file)
@@ -29,6 +29,7 @@ class ArvRuntimeContext(RuntimeContext):
         self.wait = True
         self.cwl_runner_job = None
         self.storage_classes = "default"
+        self.intermediate_storage_classes = "default"
         self.current_container = None
         self.http_timeout = 300
         self.submit_runner_cluster = None
index f60c480873b833dca11b0dba1a6cc853f4c29e2c..edb9d5b523c09bee4aa43f16705e27f2f15194d9 100644 (file)
@@ -42,7 +42,7 @@ from .context import ArvLoadingContext, ArvRuntimeContext
 from ._version import __version__
 
 from cwltool.process import shortname, UnsupportedRequirement, use_custom_schema
-from cwltool.utils import adjustFileObjs, adjustDirObjs, get_listing, visit_class
+from cwltool.utils import adjustFileObjs, adjustDirObjs, get_listing, visit_class, aslist
 from cwltool.command_line_tool import compute_checksums
 from cwltool.load_tool import load_tool
 
@@ -549,6 +549,12 @@ The 'jobs' API is no longer supported.
         if runtimeContext.submit_request_uuid and self.work_api != "containers":
             raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))
 
+        default_storage_classes = ",".join([k for k,v in self.api.config()["StorageClasses"].items() if v.get("Default") is True])
+        if runtimeContext.storage_classes == "default":
+            runtimeContext.storage_classes = default_storage_classes
+        if runtimeContext.intermediate_storage_classes == "default":
+            runtimeContext.intermediate_storage_classes = default_storage_classes
+
         if not runtimeContext.name:
             runtimeContext.name = self.name = updated_tool.tool.get("label") or updated_tool.metadata.get("label") or os.path.basename(updated_tool.tool["id"])
 
@@ -771,7 +777,13 @@ The 'jobs' API is no longer supported.
             if self.output_tags is None:
                 self.output_tags = ""
 
-            storage_classes = runtimeContext.storage_classes.strip().split(",")
+            storage_classes = ""
+            storage_class_req, _ = tool.get_requirement("http://arvados.org/cwl#OutputStorageClass")
+            if storage_class_req and storage_class_req.get("finalStorageClass"):
+                storage_classes = aslist(storage_class_req["finalStorageClass"])
+            else:
+                storage_classes = runtimeContext.storage_classes.strip().split(",")
+
             self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output)
             self.set_crunch_output()
 
index 9a52ee70214d7f9474086818768c6cc8fbdafc17..5c17a2fd14cb2429a52d17cdae1c0a7eb7438fe5 100644 (file)
@@ -5,9 +5,6 @@
 fpm_depends+=(nodejs)
 
 case "$TARGET" in
-    ubuntu1604)
-        fpm_depends+=(libcurl3-gnutls)
-        ;;
     debian* | ubuntu*)
         fpm_depends+=(libcurl3-gnutls python3-distutils)
         ;;
index 09983f87a2cea8f46b8790e6749c05a5bc57d8f8..2b46b89c604cafc0f91ee3966ecd1a3c068dbad7 100644 (file)
@@ -163,7 +163,8 @@ class TestContainer(unittest.TestCase):
                         'cwd': '/var/spool/cwl',
                         'scheduling_parameters': {},
                         'properties': {},
-                        'secret_mounts': {}
+                        'secret_mounts': {},
+                        'output_storage_classes': ["default"]
                     }))
 
     # The test passes some fields in builder.resources
@@ -250,7 +251,8 @@ class TestContainer(unittest.TestCase):
                 'partitions': ['blurb']
             },
             'properties': {},
-            'secret_mounts': {}
+            'secret_mounts': {},
+            'output_storage_classes': ["default"]
         }
 
         call_body = call_kwargs.get('body', None)
@@ -379,7 +381,8 @@ class TestContainer(unittest.TestCase):
             'scheduling_parameters': {
             },
             'properties': {},
-            'secret_mounts': {}
+            'secret_mounts': {},
+            'output_storage_classes': ["default"]
         }
 
         call_body = call_kwargs.get('body', None)
@@ -463,7 +466,8 @@ class TestContainer(unittest.TestCase):
                     'cwd': '/var/spool/cwl',
                     'scheduling_parameters': {},
                     'properties': {},
-                    'secret_mounts': {}
+                    'secret_mounts': {},
+                    'output_storage_classes': ["default"]
                 }))
 
     @mock.patch("arvados.collection.Collection")
@@ -696,7 +700,8 @@ class TestContainer(unittest.TestCase):
                     'cwd': '/var/spool/cwl',
                     'scheduling_parameters': {},
                     'properties': {},
-                    'secret_mounts': {}
+                    'secret_mounts': {},
+                    'output_storage_classes': ["default"]
                 }))
 
     # The test passes no builder.resources
@@ -791,7 +796,8 @@ class TestContainer(unittest.TestCase):
                             "content": "username: user\npassword: blorp\n",
                             "kind": "text"
                         }
-                    }
+                    },
+                    'output_storage_classes': ["default"]
                 }))
 
     # The test passes no builder.resources
@@ -835,6 +841,79 @@ class TestContainer(unittest.TestCase):
         self.assertEqual(42, kwargs['body']['scheduling_parameters'].get('max_run_time'))
 
 
+    # The test passes no builder.resources
+    # Hence the default resources will apply: {'cores': 1, 'ram': 1024, 'outdirSize': 1024, 'tmpdirSize': 1024}
+    @mock.patch("arvados.commands.keepdocker.list_images_in_arv")
+    def test_setting_storage_class(self, keepdocker):
+        arv_docker_clear_cache()
+
+        runner = mock.MagicMock()
+        runner.ignore_docker_for_reuse = False
+        runner.intermediate_output_ttl = 0
+        runner.secret_store = cwltool.secrets.SecretStore()
+
+        keepdocker.return_value = [("zzzzz-4zz18-zzzzzzzzzzzzzz3", "")]
+        runner.api.collections().get().execute.return_value = {
+            "portable_data_hash": "99999999999999999999999999999993+99"}
+
+        tool = cmap({
+            "inputs": [],
+            "outputs": [],
+            "baseCommand": "ls",
+            "arguments": [{"valueFrom": "$(runtime.outdir)"}],
+            "id": "#",
+            "class": "CommandLineTool",
+            "hints": [
+                {
+                    "class": "http://arvados.org/cwl#OutputStorageClass",
+                    "finalStorageClass": ["baz_sc", "qux_sc"],
+                    "intermediateStorageClass": ["foo_sc", "bar_sc"]
+                }
+            ]
+        })
+
+        loadingContext, runtimeContext = self.helper(runner, True)
+
+        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, loadingContext)
+        arvtool.formatgraph = None
+
+        for j in arvtool.job({}, mock.MagicMock(), runtimeContext):
+            j.run(runtimeContext)
+            runner.api.container_requests().create.assert_called_with(
+                body=JsonDiffMatcher({
+                    'environment': {
+                        'HOME': '/var/spool/cwl',
+                        'TMPDIR': '/tmp'
+                    },
+                    'name': 'test_run_True',
+                    'runtime_constraints': {
+                        'vcpus': 1,
+                        'ram': 1073741824
+                    },
+                    'use_existing': True,
+                    'priority': 500,
+                    'mounts': {
+                        '/tmp': {'kind': 'tmp',
+                                 "capacity": 1073741824
+                             },
+                        '/var/spool/cwl': {'kind': 'tmp',
+                                           "capacity": 1073741824 }
+                    },
+                    'state': 'Committed',
+                    'output_name': 'Output for step test_run_True',
+                    'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
+                    'output_path': '/var/spool/cwl',
+                    'output_ttl': 0,
+                    'container_image': '99999999999999999999999999999993+99',
+                    'command': ['ls', '/var/spool/cwl'],
+                    'cwd': '/var/spool/cwl',
+                    'scheduling_parameters': {},
+                    'properties': {},
+                    'secret_mounts': {},
+                    'output_storage_classes': ["foo_sc", "bar_sc"]
+                }))
+
+
 class TestWorkflow(unittest.TestCase):
     def setUp(self):
         cwltool.process._names = set()
@@ -972,7 +1051,8 @@ class TestWorkflow(unittest.TestCase):
                 "scheduling_parameters": {},
                 "secret_mounts": {},
                 "state": "Committed",
-                "use_existing": True
+                "use_existing": True,
+                'output_storage_classes': ["default"]
             }))
         mockc.open().__enter__().write.assert_has_calls([mock.call(subwf)])
         mockc.open().__enter__().write.assert_has_calls([mock.call(
@@ -1074,7 +1154,8 @@ class TestWorkflow(unittest.TestCase):
                 ],
                 'use_existing': True,
                 'output_name': u'Output for step echo-subwf',
-                'cwd': '/var/spool/cwl'
+                'cwd': '/var/spool/cwl',
+                'output_storage_classes': ["default"]
             }))
 
     def test_default_work_api(self):
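
In test_setting_storage_class above, the step's container request carries output_storage_classes ["foo_sc", "bar_sc"], i.e. the hint's intermediateStorageClass; finalStorageClass is applied to the workflow's final output collection instead. A hedged sketch of that mapping (classes_for and its fallback are illustrative, not the runner's actual API):

    def classes_for(hint, final_output):
        # Step containers get intermediateStorageClass; the final output
        # collection gets finalStorageClass; ["default"] is the fallback.
        key = "finalStorageClass" if final_output else "intermediateStorageClass"
        return list(hint.get(key) or ["default"])

    hint = {"finalStorageClass": ["baz_sc", "qux_sc"],
            "intermediateStorageClass": ["foo_sc", "bar_sc"]}
    assert classes_for(hint, final_output=False) == ["foo_sc", "bar_sc"]
    assert classes_for(hint, final_output=True) == ["baz_sc", "qux_sc"]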
index 12daf6b6702c906544a04c3d4ce034e2f7c01eb1..1b646a8e4459dfe5677fb18ca997a393248b6e8f 100644 (file)
@@ -95,6 +95,11 @@ def stubs(func):
         stubs.api.containers().current().execute.return_value = {
             "uuid": stubs.fake_container_uuid,
         }
+        stubs.api.config()["StorageClasses"].items.return_value = {
+            "default": {
+                "Default": True
+            }
+        }.items()
 
         class CollectionExecute(object):
             def __init__(self, exe):
@@ -342,14 +347,6 @@ class TestSubmit(unittest.TestCase):
         cwltool.process._names = set()
         arvados_cwl.arvdocker.arv_docker_clear_cache()
 
-    @stubs
-    def test_error_when_multiple_storage_classes_specified(self, stubs):
-        storage_classes = "foo,bar"
-        exited = arvados_cwl.main(
-                ["--debug", "--storage-classes", storage_classes,
-                 "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
-                sys.stdin, sys.stderr, api_client=stubs.api)
-        self.assertEqual(exited, 1)
 
     @mock.patch("time.sleep")
     @stubs
@@ -526,6 +523,27 @@ class TestSubmit(unittest.TestCase):
                          stubs.expect_container_request_uuid + '\n')
         self.assertEqual(exited, 0)
 
+    @stubs
+    def test_submit_multiple_storage_classes(self, stubs):
+        exited = arvados_cwl.main(
+            ["--debug", "--submit", "--no-wait", "--api=containers", "--storage-classes=foo,bar", "--intermediate-storage-classes=baz",
+                "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
+            stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
+
+        expect_container = copy.deepcopy(stubs.expect_container_spec)
+        expect_container["command"] = ['arvados-cwl-runner', '--local', '--api=containers',
+                                       '--no-log-timestamps', '--disable-validate', '--disable-color',
+                                       '--eval-timeout=20', '--thread-count=0',
+                                       '--enable-reuse', "--collection-cache-size=256", "--debug",
+                                       "--storage-classes=foo,bar", "--intermediate-storage-classes=baz", '--on-error=continue',
+                                       '/var/lib/cwl/workflow.json#main', '/var/lib/cwl/cwl.input.json']
+
+        stubs.api.container_requests().create.assert_called_with(
+            body=JsonDiffMatcher(expect_container))
+        self.assertEqual(stubs.capture_stdout.getvalue(),
+                         stubs.expect_container_request_uuid + '\n')
+        self.assertEqual(exited, 0)
+
     @mock.patch("cwltool.task_queue.TaskQueue")
     @mock.patch("arvados_cwl.arvworkflow.ArvadosWorkflow.job")
     @mock.patch("arvados_cwl.executor.ArvCwlExecutor.make_output_collection")
@@ -568,6 +586,27 @@ class TestSubmit(unittest.TestCase):
         make_output.assert_called_with(u'Output of submit_wf.cwl', ['default'], '', 'zzzzz-4zz18-zzzzzzzzzzzzzzzz')
         self.assertEqual(exited, 0)
 
+    @mock.patch("cwltool.task_queue.TaskQueue")
+    @mock.patch("arvados_cwl.arvworkflow.ArvadosWorkflow.job")
+    @mock.patch("arvados_cwl.executor.ArvCwlExecutor.make_output_collection")
+    @stubs
+    def test_storage_class_hint_to_make_output_collection(self, stubs, make_output, job, tq):
+        final_output_c = arvados.collection.Collection()
+        make_output.return_value = ({},final_output_c)
+
+        def set_final_output(job_order, output_callback, runtimeContext):
+            output_callback("zzzzz-4zz18-zzzzzzzzzzzzzzzz", "success")
+            return []
+        job.side_effect = set_final_output
+
+        exited = arvados_cwl.main(
+            ["--debug", "--local",
+                "tests/wf/submit_storage_class_wf.cwl", "tests/submit_test_job.json"],
+            stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
+
+        make_output.assert_called_with(u'Output of submit_storage_class_wf.cwl', ['foo', 'bar'], '', 'zzzzz-4zz18-zzzzzzzzzzzzzzzz')
+        self.assertEqual(exited, 0)
+
     @stubs
     def test_submit_container_output_ttl(self, stubs):
         exited = arvados_cwl.main(
@@ -853,6 +892,7 @@ class TestSubmit(unittest.TestCase):
     @stubs
     def test_submit_container_project(self, stubs):
         project_uuid = 'zzzzz-j7d0g-zzzzzzzzzzzzzzz'
+        stubs.api.groups().get().execute.return_value = {"group_class": "project"}
         exited = arvados_cwl.main(
             ["--submit", "--no-wait", "--api=containers", "--debug", "--project-uuid="+project_uuid,
                 "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
@@ -1265,12 +1305,14 @@ class TestSubmit(unittest.TestCase):
 
     @stubs
     def test_submit_validate_project_uuid(self, stubs):
+        # Fails with bad cluster prefix
         exited = arvados_cwl.main(
             ["--submit", "--no-wait", "--api=containers", "--debug", "--project-uuid=zzzzb-j7d0g-zzzzzzzzzzzzzzz",
              "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
             stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
         self.assertEqual(exited, 1)
 
+        # Project lookup fails
         stubs.api.groups().get().execute.side_effect = Exception("Bad project")
         exited = arvados_cwl.main(
             ["--submit", "--no-wait", "--api=containers", "--debug", "--project-uuid=zzzzz-j7d0g-zzzzzzzzzzzzzzx",
@@ -1278,6 +1320,14 @@ class TestSubmit(unittest.TestCase):
             stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
         self.assertEqual(exited, 1)
 
+        # This time it should succeed: the UUID refers to a user, and only the group lookup is stubbed out to fail.
+        exited = arvados_cwl.main(
+            ["--submit", "--no-wait", "--api=containers", "--debug", "--project-uuid=zzzzz-tpzed-zzzzzzzzzzzzzzx",
+             "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
+            stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
+        self.assertEqual(exited, 0)
+
+
     @mock.patch("arvados.collection.CollectionReader")
     @stubs
     def test_submit_uuid_inputs(self, stubs, collectionReader):
@@ -1382,6 +1432,7 @@ class TestCreateWorkflow(unittest.TestCase):
     @stubs
     def test_create(self, stubs):
         project_uuid = 'zzzzz-j7d0g-zzzzzzzzzzzzzzz'
+        stubs.api.groups().get().execute.return_value = {"group_class": "project"}
 
         exited = arvados_cwl.main(
             ["--create-workflow", "--debug",
@@ -1411,6 +1462,7 @@ class TestCreateWorkflow(unittest.TestCase):
     @stubs
     def test_create_name(self, stubs):
         project_uuid = 'zzzzz-j7d0g-zzzzzzzzzzzzzzz'
+        stubs.api.groups().get().execute.return_value = {"group_class": "project"}
 
         exited = arvados_cwl.main(
             ["--create-workflow", "--debug",
@@ -1486,6 +1538,7 @@ class TestCreateWorkflow(unittest.TestCase):
     @stubs
     def test_create_collection_per_tool(self, stubs):
         project_uuid = 'zzzzz-j7d0g-zzzzzzzzzzzzzzz'
+        stubs.api.groups().get().execute.return_value = {"group_class": "project"}
 
         exited = arvados_cwl.main(
             ["--create-workflow", "--debug",
@@ -1515,6 +1568,7 @@ class TestCreateWorkflow(unittest.TestCase):
     @stubs
     def test_create_with_imports(self, stubs):
         project_uuid = 'zzzzz-j7d0g-zzzzzzzzzzzzzzz'
+        stubs.api.groups().get().execute.return_value = {"group_class": "project"}
 
         exited = arvados_cwl.main(
             ["--create-workflow", "--debug",
@@ -1533,6 +1587,7 @@ class TestCreateWorkflow(unittest.TestCase):
     @stubs
     def test_create_with_no_input(self, stubs):
         project_uuid = 'zzzzz-j7d0g-zzzzzzzzzzzzzzz'
+        stubs.api.groups().get().execute.return_value = {"group_class": "project"}
 
         exited = arvados_cwl.main(
             ["--create-workflow", "--debug",
diff --git a/sdk/cwl/tests/wf/submit_storage_class_wf.cwl b/sdk/cwl/tests/wf/submit_storage_class_wf.cwl
new file mode 100644 (file)
index 0000000..c365d96
--- /dev/null
@@ -0,0 +1,30 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Test case for arvados-cwl-runner
+#
+# Used to test whether scanning a workflow file for dependencies
+# (e.g. submit_tool.cwl) and uploading to Keep works as intended.
+
+class: Workflow
+cwlVersion: v1.0
+$namespaces:
+  arv: "http://arvados.org/cwl#"
+hints:
+  arv:OutputStorageClass:
+    finalStorageClass: [foo, bar]
+inputs:
+  - id: x
+    type: File
+  - id: y
+    type: Directory
+  - id: z
+    type: Directory
+outputs: []
+steps:
+  - id: step1
+    in:
+      - { id: x, source: "#x" }
+    out: []
+    run: ../tool/submit_tool.cwl
index cec20279d1fdc81cb56936ed731dea6c6a4c0e8c..785c18d4a75090a9847be18122b418b392c45474 100644 (file)
@@ -5,11 +5,10 @@
 package arvados
 
 import (
-       "bufio"
+       "bytes"
        "crypto/md5"
        "fmt"
        "regexp"
-       "strings"
        "time"
 
        "git.arvados.org/arvados.git/sdk/go/blockdigest"
@@ -56,40 +55,40 @@ func (c Collection) resourceName() string {
 //
 // Zero-length blocks are not included.
 func (c *Collection) SizedDigests() ([]SizedDigest, error) {
-       manifestText := c.ManifestText
-       if manifestText == "" {
-               manifestText = c.UnsignedManifestText
+       manifestText := []byte(c.ManifestText)
+       if len(manifestText) == 0 {
+               manifestText = []byte(c.UnsignedManifestText)
        }
-       if manifestText == "" && c.PortableDataHash != "d41d8cd98f00b204e9800998ecf8427e+0" {
+       if len(manifestText) == 0 && c.PortableDataHash != "d41d8cd98f00b204e9800998ecf8427e+0" {
                // TODO: Check more subtle forms of corruption, too
                return nil, fmt.Errorf("manifest is missing")
        }
-       var sds []SizedDigest
-       scanner := bufio.NewScanner(strings.NewReader(manifestText))
-       scanner.Buffer(make([]byte, 1048576), len(manifestText))
-       for scanner.Scan() {
-               line := scanner.Text()
-               tokens := strings.Split(line, " ")
+       sds := make([]SizedDigest, 0, len(manifestText)/40)
+       for _, line := range bytes.Split(manifestText, []byte{'\n'}) {
+               if len(line) == 0 {
+                       continue
+               }
+               tokens := bytes.Split(line, []byte{' '})
                if len(tokens) < 3 {
                        return nil, fmt.Errorf("Invalid stream (<3 tokens): %q", line)
                }
                for _, token := range tokens[1:] {
-                       if !blockdigest.LocatorPattern.MatchString(token) {
+                       if !blockdigest.LocatorPattern.Match(token) {
                                // FIXME: ensure it's a file token
                                break
                        }
-                       if strings.HasPrefix(token, "d41d8cd98f00b204e9800998ecf8427e+0") {
+                       if bytes.HasPrefix(token, []byte("d41d8cd98f00b204e9800998ecf8427e+0")) {
                                // Exclude "empty block" placeholder
                                continue
                        }
                        // FIXME: shouldn't assume 32 char hash
-                       if i := strings.IndexRune(token[33:], '+'); i >= 0 {
+                       if i := bytes.IndexRune(token[33:], '+'); i >= 0 {
                                token = token[:33+i]
                        }
-                       sds = append(sds, SizedDigest(token))
+                       sds = append(sds, SizedDigest(string(token)))
                }
        }
-       return sds, scanner.Err()
+       return sds, nil
 }
 
 type CollectionList struct {
index 6e59828a3cbf5656fef1e6c7fc790ca9d3b6268f..4a7c18b3e06324ab0f37a2a8db54270aedcdc2c9 100644 (file)
@@ -138,6 +138,7 @@ type Cluster struct {
                BalanceCollectionBatch   int
                BalanceCollectionBuffers int
                BalanceTimeout           Duration
+               BalanceUpdateLimit       int
 
                WebDAVCache WebDAVCacheConfig
 
@@ -233,12 +234,14 @@ type Cluster struct {
                NewUserNotificationRecipients         StringSet
                NewUsersAreActive                     bool
                UserNotifierEmailFrom                 string
+               UserNotifierEmailBcc                  StringSet
                UserProfileNotificationAddress        string
                PreferDomainForUsername               string
                UserSetupMailText                     string
        }
-       Volumes   map[string]Volume
-       Workbench struct {
+       StorageClasses map[string]StorageClassConfig
+       Volumes        map[string]Volume
+       Workbench      struct {
                ActivationContactLink            string
                APIClientConnectTimeout          Duration
                APIClientReceiveTimeout          Duration
@@ -280,6 +283,11 @@ type Cluster struct {
        }
 }
 
+type StorageClassConfig struct {
+       Default  bool
+       Priority int
+}
+
 type Volume struct {
        AccessViaHosts   map[URL]VolumeAccess
        ReadOnly         bool
@@ -329,6 +337,7 @@ type Services struct {
        Composer       Service
        Controller     Service
        DispatchCloud  Service
+       DispatchLSF    Service
        GitHTTP        Service
        GitSSH         Service
        Health         Service
@@ -461,6 +470,10 @@ type ContainersConfig struct {
                        AssignNodeHostname     string
                }
        }
+       LSF struct {
+               BsubSudoUser      string
+               BsubArgumentsList []string
+       }
 }
 
 type CloudVMsConfig struct {
@@ -597,6 +610,7 @@ const (
        ServiceNameRailsAPI      ServiceName = "arvados-api-server"
        ServiceNameController    ServiceName = "arvados-controller"
        ServiceNameDispatchCloud ServiceName = "arvados-dispatch-cloud"
+       ServiceNameDispatchLSF   ServiceName = "arvados-dispatch-lsf"
        ServiceNameHealth        ServiceName = "arvados-health"
        ServiceNameWorkbench1    ServiceName = "arvados-workbench1"
        ServiceNameWorkbench2    ServiceName = "arvados-workbench2"
@@ -614,6 +628,7 @@ func (svcs Services) Map() map[ServiceName]Service {
                ServiceNameRailsAPI:      svcs.RailsAPI,
                ServiceNameController:    svcs.Controller,
                ServiceNameDispatchCloud: svcs.DispatchCloud,
+               ServiceNameDispatchLSF:   svcs.DispatchLSF,
                ServiceNameHealth:        svcs.Health,
                ServiceNameWorkbench1:    svcs.Workbench1,
                ServiceNameWorkbench2:    svcs.Workbench2,
index b57dc849442f4934f10611acd0248539af3a827e..384bebb5997ee86b1b1be2396498f1554ee32ecc 100644 (file)
@@ -33,6 +33,9 @@ type Container struct {
        GatewayAddress            string                 `json:"gateway_address"`
        InteractiveSessionStarted bool                   `json:"interactive_session_started"`
        OutputStorageClasses      []string               `json:"output_storage_classes"`
+       RuntimeUserUUID           string                 `json:"runtime_user_uuid"`
+       RuntimeAuthScopes         []string               `json:"runtime_auth_scopes"`
+       RuntimeToken              string                 `json:"runtime_token"`
 }
 
 // ContainerRequest is an arvados#container_request resource.
index 4dd8b53e1dc86b633c5c6a0f457ea97903440dbf..5f2747ac9ada9225455331daee0f10640202907f 100644 (file)
@@ -680,8 +680,8 @@ func rlookup(start inode, path string) (node inode, err error) {
                        }
                }
                node, err = func() (inode, error) {
-                       node.RLock()
-                       defer node.RUnlock()
+                       node.Lock()
+                       defer node.Unlock()
                        return node.Child(name, nil)
                }()
                if node == nil || err != nil {
index 4b7ad6dd59fa426e8b1e71c546ee43a851d99c54..9281f51d0cf0ee2b46ca97c2e59fde9f68051d4d 100644 (file)
@@ -31,7 +31,10 @@ const (
        UserAgreementPDH        = "b519d9cb706a29fc7ea24dbea2f05851+93"
        HelloWorldPdh           = "55713e6a34081eb03609e7ad5fcad129+62"
 
-       MultilevelCollection1 = "zzzzz-4zz18-pyw8yp9g3pr7irn"
+       MultilevelCollection1                        = "zzzzz-4zz18-pyw8yp9g3pr7irn"
+       StorageClassesDesiredDefaultConfirmedDefault = "zzzzz-4zz18-3t236wr12769tga"
+       StorageClassesDesiredArchiveConfirmedDefault = "zzzzz-4zz18-3t236wr12769qqa"
+       EmptyCollectionUUID                          = "zzzzz-4zz18-gs9ooj1h9sd5mde"
 
        AProjectUUID    = "zzzzz-j7d0g-v955i6s2oi1cbso"
        ASubprojectUUID = "zzzzz-j7d0g-axqo7eu9pwvna1x"
@@ -45,6 +48,8 @@ const (
        QueuedContainerRequestUUID = "zzzzz-xvhdp-cr4queuedcontnr"
        QueuedContainerUUID        = "zzzzz-dz642-queuedcontainer"
 
+       LockedContainerUUID = "zzzzz-dz642-lockedcontainer"
+
        RunningContainerUUID = "zzzzz-dz642-runningcontainr"
 
        CompletedContainerUUID         = "zzzzz-dz642-compltcontainer"
index df43c2b10d9778d7c62befc8a3f7e71babacb168..00c75154f656a70e0b42deed7ef0e34fa7a01d7d 100644 (file)
@@ -7,11 +7,13 @@
 package dispatch
 
 import (
+       "bytes"
        "context"
        "fmt"
        "sync"
        "time"
 
+       "git.arvados.org/arvados.git/lib/dispatchcloud"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/arvadosclient"
        "github.com/sirupsen/logrus"
@@ -66,7 +68,7 @@ type Dispatcher struct {
 // running, and return.
 //
 // The DispatchFunc should not return until the container is finished.
-type DispatchFunc func(*Dispatcher, arvados.Container, <-chan arvados.Container)
+type DispatchFunc func(*Dispatcher, arvados.Container, <-chan arvados.Container) error
 
 // Run watches the API server's queue for containers that are either
 // ready to run and available to lock, or are already locked by this
@@ -170,9 +172,34 @@ func (d *Dispatcher) start(c arvados.Container) *runTracker {
        }
        tracker.updates <- c
        go func() {
-               d.RunContainer(d, c, tracker.updates)
-               // RunContainer blocks for the lifetime of the container.  When
-               // it returns, the tracker should delete itself.
+               err := d.RunContainer(d, c, tracker.updates)
+               if err != nil {
+                       text := fmt.Sprintf("Error running container %s: %s", c.UUID, err)
+                       if err, ok := err.(dispatchcloud.ConstraintsNotSatisfiableError); ok {
+                               var logBuf bytes.Buffer
+                               fmt.Fprintf(&logBuf, "cannot run container %s: %s\n", c.UUID, err)
+                               if len(err.AvailableTypes) == 0 {
+                                       fmt.Fprint(&logBuf, "No instance types are configured.\n")
+                               } else {
+                                       fmt.Fprint(&logBuf, "Available instance types:\n")
+                                       for _, t := range err.AvailableTypes {
+                                               fmt.Fprintf(&logBuf,
+                                                       "Type %q: %d VCPUs, %d RAM, %d Scratch, %f Price\n",
+                                                       t.Name, t.VCPUs, t.RAM, t.Scratch, t.Price)
+                                       }
+                               }
+                               text = logBuf.String()
+                               d.UpdateState(c.UUID, Cancelled)
+                       }
+                       d.Logger.Printf("%s", text)
+                       lr := arvadosclient.Dict{"log": arvadosclient.Dict{
+                               "object_uuid": c.UUID,
+                               "event_type":  "dispatch",
+                               "properties":  map[string]string{"text": text}}}
+                       d.Arv.Create("logs", lr, nil)
+                       d.Unlock(c.UUID)
+               }
+
                d.mtx.Lock()
                delete(d.trackers, c.UUID)
                d.mtx.Unlock()
index 25a4d2b87902531b913cc2203c8439a748469753..4b115229b403bc69aadd275a0177b00d4d171f79 100644 (file)
@@ -35,11 +35,12 @@ func (s *suite) TestTrackContainer(c *C) {
        time.AfterFunc(10*time.Second, func() { done <- false })
        d := &Dispatcher{
                Arv: arv,
-               RunContainer: func(dsp *Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
+               RunContainer: func(dsp *Dispatcher, ctr arvados.Container, status <-chan arvados.Container) error {
                        for ctr := range status {
                                c.Logf("%#v", ctr)
                        }
                        done <- true
+                       return nil
                },
        }
        d.TrackContainer(arvadostest.QueuedContainerUUID)
index 2acf3e59ab81ae10ff816577f5f33fdaea8b9922..04106caa442cfb52fbc098a516112e11b55643bb 100644 (file)
@@ -153,6 +153,7 @@ func (s *AggregatorSuite) setAllServiceURLs(listen string) {
        for _, svc := range []*arvados.Service{
                &svcs.Controller,
                &svcs.DispatchCloud,
+               &svcs.DispatchLSF,
                &svcs.Keepbalance,
                &svcs.Keepproxy,
                &svcs.Keepstore,
index 14d89873b60f7d902a39a6b337eea78e8040d0c3..5b661ae9f71ac4984c3a22a6dfe8a58bbffbdb45 100644 (file)
@@ -54,6 +54,7 @@ func AddRequestIDs(h http.Handler) http.Handler {
                        }
                        req.Header.Set(HeaderRequestID, gen.Next())
                }
+               w.Header().Set("X-Request-Id", req.Header.Get("X-Request-Id"))
                h.ServeHTTP(w, req)
        })
 }
index 2cd6bb4d43ec5e675f5eca46963e2f9d620de0f6..3bc6f4afcddf7cea0e5b77b140ac1b7c6b3a7a62 100644 (file)
@@ -69,11 +69,11 @@ type ErrNotFound struct {
        multipleResponseError
 }
 
-type InsufficientReplicasError error
+type InsufficientReplicasError struct{ error }
 
-type OversizeBlockError error
+type OversizeBlockError struct{ error }
 
-var ErrOversizeBlock = OversizeBlockError(errors.New("Exceeded maximum block size (" + strconv.Itoa(BLOCKSIZE) + ")"))
+var ErrOversizeBlock = OversizeBlockError{error: errors.New("Exceeded maximum block size (" + strconv.Itoa(BLOCKSIZE) + ")")}
 var MissingArvadosApiHost = errors.New("Missing required environment variable ARVADOS_API_HOST")
 var MissingArvadosApiToken = errors.New("Missing required environment variable ARVADOS_API_TOKEN")
 var InvalidLocatorError = errors.New("Invalid locator")
index c52e07b8f6ea3b6ea417f9843049e3d99b986fa5..62268fa463e6dee07083a815e9b97536e83a5c40 100644 (file)
@@ -8,7 +8,6 @@ import (
        "bytes"
        "context"
        "crypto/md5"
-       "errors"
        "fmt"
        "io"
        "io/ioutil"
@@ -586,7 +585,7 @@ func (s *StandaloneSuite) TestPutWithTooManyFail(c *C) {
 
        _, replicas, err := kc.PutB([]byte("foo"))
 
-       c.Check(err, FitsTypeOf, InsufficientReplicasError(errors.New("")))
+       c.Check(err, FitsTypeOf, InsufficientReplicasError{})
        c.Check(replicas, Equals, 1)
        c.Check(<-st.handled, Equals, ks1[0].url)
 }
@@ -1109,7 +1108,7 @@ func (s *StandaloneSuite) TestPutProxyInsufficientReplicas(c *C) {
        _, replicas, err := kc.PutB([]byte("foo"))
        <-st.handled
 
-       c.Check(err, FitsTypeOf, InsufficientReplicasError(errors.New("")))
+       c.Check(err, FitsTypeOf, InsufficientReplicasError{})
        c.Check(replicas, Equals, 2)
 }
 
@@ -1187,7 +1186,7 @@ func (s *StandaloneSuite) TestPutBWant2ReplicasWithOnlyOneWritableLocalRoot(c *C
 
        _, replicas, err := kc.PutB([]byte("foo"))
 
-       c.Check(err, FitsTypeOf, InsufficientReplicasError(errors.New("")))
+       c.Check(err, FitsTypeOf, InsufficientReplicasError{})
        c.Check(replicas, Equals, 1)
 
        c.Check(<-st.handled, Equals, localRoots[fmt.Sprintf("zzzzz-bi6l4-fakefakefake%03d", 0)])
@@ -1225,7 +1224,7 @@ func (s *StandaloneSuite) TestPutBWithNoWritableLocalRoots(c *C) {
 
        _, replicas, err := kc.PutB([]byte("foo"))
 
-       c.Check(err, FitsTypeOf, InsufficientReplicasError(errors.New("")))
+       c.Check(err, FitsTypeOf, InsufficientReplicasError{})
        c.Check(replicas, Equals, 0)
 }
 
index a8c82aac0e70370dced92b3dd3f5bae249cb100c..633ec1896858bd484d6740b8e9dea074c12d82c9 100644 (file)
@@ -255,7 +255,7 @@ func (kc *KeepClient) BlockWrite(ctx context.Context, req arvados.BlockWriteOpti
                                                        msg += resp + "; "
                                                }
                                                msg = msg[:len(msg)-2]
-                                               return resp, InsufficientReplicasError(errors.New(msg))
+                                               return resp, InsufficientReplicasError{error: errors.New(msg)}
                                        }
                                        break
                                }
index 93fd6b598aefab0448e450391beecccbb2419f42..79dabd38b2d847072aa7d8eccb119c0224469244 100755 (executable)
@@ -102,6 +102,9 @@ def main():
     copy_opts.add_argument(
         '--project-uuid', dest='project_uuid',
         help='The UUID of the project at the destination to which the collection or workflow should be copied.')
+    copy_opts.add_argument(
+        '--storage-classes', dest='storage_classes',
+        help='Comma-separated list of storage classes to be used when saving data to the destination Arvados instance.')
 
     copy_opts.add_argument(
         'object_uuid',
@@ -114,6 +117,9 @@ def main():
         parents=[copy_opts, arv_cmd.retry_opt])
     args = parser.parse_args()
 
+    if args.storage_classes:
+        args.storage_classes = [x for x in args.storage_classes.strip().replace(' ', '').split(',') if x]
+
     if args.verbose:
         logger.setLevel(logging.DEBUG)
     else:
@@ -410,6 +416,9 @@ def create_collection_from(c, src, dst, args):
     if not body["name"]:
         body['name'] = "copied from " + collection_uuid
 
+    if args.storage_classes:
+        body['storage_classes_desired'] = args.storage_classes
+
     body['owner_uuid'] = args.project_uuid
 
     dst_collection = dst.collections().create(body=body, ensure_unique_name=True).execute(num_retries=args.retries)
@@ -563,7 +572,7 @@ def copy_collection(obj_uuid, src, dst, args):
                 if progress_writer:
                     progress_writer.report(obj_uuid, bytes_written, bytes_expected)
                 data = src_keep.get(word)
-                dst_locator = dst_keep.put(data)
+                dst_locator = dst_keep.put(data, classes=(args.storage_classes or []))
                 dst_locators[blockhash] = dst_locator
                 bytes_written += loc.size
             dst_manifest.write(' ')
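
With the changes above, arv-copy --storage-classes foo,bar <uuid> records the classes as storage_classes_desired on the destination collection and passes them to every block write. A condensed sketch of the inner copy loop, assuming KeepClient-style src_keep/dst_keep objects whose get()/put(classes=...) signatures match the hunk:

    def copy_blocks(blocks, src_keep, dst_keep, storage_classes=None):
        # blocks: iterable of (blockhash, signed locator) pairs -- a
        # hypothetical stand-in for the manifest walk in copy_collection().
        dst_locators = {}
        for blockhash, word in blocks:
            data = src_keep.get(word)
            # classes=[] means "use the cluster's default storage classes".
            dst_locators[blockhash] = dst_keep.put(
                data, classes=(storage_classes or []))
        return dst_locators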
index 8eb9dd75a3ee783200831bc0d9da816863cd1568..f3df4e3f393bc15090dab3de80def1b50c595cc3 100644 (file)
@@ -498,6 +498,9 @@ def main(arguments=None, stdout=sys.stdout, install_sig_handlers=True, api=None)
         arguments = [i for i in arguments if i not in (args.image, args.tag, image_repo_tag)]
         put_args = keepdocker_parser.parse_known_args(arguments)[1]
 
+        # Don't fail when the cached manifest is invalid; just ignore the cache.
+        put_args += ['--batch']
+
         if args.name is None:
             put_args += ['--name', collection_name]
 
index ad04807712dd00d7e89e478be1a7eb2c01d0b057..f6f85ba69619ba930cca9efd20d3b4f134f28527 100644 (file)
@@ -215,6 +215,12 @@ Do not print any debug messages to console. (Any error messages will
 still be displayed.)
 """)
 
+run_opts.add_argument('--batch', action='store_true', default=False,
+                      help="""
+Retries with '--no-resume --no-cache' if cached state contains invalid/expired
+block signatures.
+""")
+
 _group = run_opts.add_mutually_exclusive_group()
 _group.add_argument('--resume', action='store_true', default=True,
                     help="""
@@ -438,7 +444,7 @@ class ArvPutUploadJob(object):
     }
 
     def __init__(self, paths, resume=True, use_cache=True, reporter=None,
-                 name=None, owner_uuid=None, api_client=None,
+                 name=None, owner_uuid=None, api_client=None, batch_mode=False,
                  ensure_unique_name=False, num_retries=None,
                  put_threads=None, replication_desired=None, filename=None,
                  update_time=60.0, update_collection=None, storage_classes=None,
@@ -448,6 +454,7 @@ class ArvPutUploadJob(object):
         self.paths = paths
         self.resume = resume
         self.use_cache = use_cache
+        self.batch_mode = batch_mode
         self.update = False
         self.reporter = reporter
         # This will set to 0 before start counting, if no special files are going
@@ -908,12 +915,17 @@ class ArvPutUploadJob(object):
                 # No cache file, set empty state
                 self._state = copy.deepcopy(self.EMPTY_STATE)
             if not self._cached_manifest_valid():
-                raise ResumeCacheInvalidError()
+                if not self.batch_mode:
+                    raise ResumeCacheInvalidError()
+                else:
+                    self.logger.info("Invalid signatures on cache file '{}' while being run in 'batch mode' -- continuing anyways.".format(self._cache_file.name))
+                    self.use_cache = False # Don't overwrite preexisting cache file.
+                    self._state = copy.deepcopy(self.EMPTY_STATE)
             # Load the previous manifest so we can check if files were modified remotely.
             self._local_collection = arvados.collection.Collection(
                 self._state['manifest'],
                 replication_desired=self.replication_desired,
-                storage_classes_desired=(self.storage_classes or ['default']),
+                storage_classes_desired=self.storage_classes,
                 put_threads=self.put_threads,
                 api_client=self._api_client,
                 num_retries=self.num_retries)
@@ -1250,6 +1262,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr,
         writer = ArvPutUploadJob(paths = args.paths,
                                  resume = args.resume,
                                  use_cache = args.use_cache,
+                                 batch_mode= args.batch,
                                  filename = args.filename,
                                  reporter = reporter,
                                  api_client = api_client,
@@ -1278,7 +1291,8 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr,
             "         or been created with another Arvados user's credentials.",
             "         Switch user or use one of the following options to restart upload:",
             "         --no-resume to start a new resume cache.",
-            "         --no-cache to disable resume cache."]))
+            "         --no-cache to disable resume cache.",
+            "         --batch to ignore the resume cache if invalid."]))
         sys.exit(1)
     except (CollectionUpdateError, PathDoesNotExistError) as error:
         logger.error("\n".join([
index 452c2beba2b0639abfdf637e3f503f2f25526f7a..b560018d385bfd3f62f366333527f54a3ec272c2 100644 (file)
@@ -42,6 +42,8 @@ class ArvCopyVersionTestCase(run_test_server.TestCaseWithServers, tutil.VersionC
         with c.open('foo', 'wt') as f:
             f.write('foo')
         c.save_new("arv-copy foo collection", owner_uuid=src_proj)
+        coll_record = api.collections().get(uuid=c.manifest_locator()).execute()
+        assert coll_record['storage_classes_desired'] == ['default']
 
         dest_proj = api.groups().create(body={"group": {"name": "arv-copy dest project", "group_class": "project"}}).execute()["uuid"]
 
@@ -60,7 +62,7 @@ class ArvCopyVersionTestCase(run_test_server.TestCaseWithServers, tutil.VersionC
             assert len(contents["items"]) == 0
 
             try:
-                self.run_copy(["--project-uuid", dest_proj, src_proj])
+                self.run_copy(["--project-uuid", dest_proj, "--storage-classes", "foo", src_proj])
             except SystemExit as e:
                 assert e.code == 0
 
@@ -76,6 +78,7 @@ class ArvCopyVersionTestCase(run_test_server.TestCaseWithServers, tutil.VersionC
             assert contents["items"][0]["uuid"] != c.manifest_locator()
             assert contents["items"][0]["name"] == "arv-copy foo collection"
             assert contents["items"][0]["portable_data_hash"] == c.portable_data_hash()
+            assert contents["items"][0]["storage_classes_desired"] == ["foo"]
 
         finally:
             os.environ['HOME'] = home_was
index fac970c95a6fdb13ecbef709ff850798c67840a5..2a71f3671a4f956cc7006acf97ee57a7e89bb47c 100644 (file)
@@ -1055,43 +1055,53 @@ class ArvPutIntegrationTest(run_test_server.TestCaseWithServers,
             r'INFO: Cache expired, starting from scratch.*')
         self.assertEqual(p.returncode, 0)
 
-    def test_invalid_signature_invalidates_cache(self):
-        self.authorize_with('active')
-        tmpdir = self.make_tmpdir()
-        with open(os.path.join(tmpdir, 'somefile.txt'), 'w') as f:
-            f.write('foo')
-        # Upload a directory and get the cache file name
-        p = subprocess.Popen([sys.executable, arv_put.__file__, tmpdir],
-                             stdout=subprocess.PIPE,
-                             stderr=subprocess.PIPE,
-                             env=self.ENVIRON)
-        (_, err) = p.communicate()
-        self.assertRegex(err.decode(), r'INFO: Creating new cache file at ')
-        self.assertEqual(p.returncode, 0)
-        cache_filepath = re.search(r'INFO: Creating new cache file at (.*)',
-                                   err.decode()).groups()[0]
-        self.assertTrue(os.path.isfile(cache_filepath))
-        # Load the cache file contents and modify the manifest to simulate
-        # an invalid access token
-        with open(cache_filepath, 'r') as c:
-            cache = json.load(c)
-        self.assertRegex(cache['manifest'], r'\+A\S+\@')
-        cache['manifest'] = re.sub(
-            r'\+A.*\@',
-            "+Aabcdef0123456789abcdef0123456789abcdef01@",
-            cache['manifest'])
-        with open(cache_filepath, 'w') as c:
-            c.write(json.dumps(cache))
-        # Re-run the upload and expect to get an invalid cache message
-        p = subprocess.Popen([sys.executable, arv_put.__file__, tmpdir],
-                             stdout=subprocess.PIPE,
-                             stderr=subprocess.PIPE,
-                             env=self.ENVIRON)
-        (_, err) = p.communicate()
-        self.assertRegex(
-            err.decode(),
-            r'ERROR: arv-put: Resume cache contains invalid signature.*')
-        self.assertEqual(p.returncode, 1)
+    def test_invalid_signature_in_cache(self):
+        for batch_mode in [False, True]:
+            self.authorize_with('active')
+            tmpdir = self.make_tmpdir()
+            with open(os.path.join(tmpdir, 'somefile.txt'), 'w') as f:
+                f.write('foo')
+            # Upload a directory and get the cache file name
+            arv_put_args = [tmpdir]
+            if batch_mode:
+                arv_put_args = ['--batch'] + arv_put_args
+            p = subprocess.Popen([sys.executable, arv_put.__file__] + arv_put_args,
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE,
+                                env=self.ENVIRON)
+            (_, err) = p.communicate()
+            self.assertRegex(err.decode(), r'INFO: Creating new cache file at ')
+            self.assertEqual(p.returncode, 0)
+            cache_filepath = re.search(r'INFO: Creating new cache file at (.*)',
+                                    err.decode()).groups()[0]
+            self.assertTrue(os.path.isfile(cache_filepath))
+            # Load the cache file contents and modify the manifest to simulate
+            # an invalid access token
+            with open(cache_filepath, 'r') as c:
+                cache = json.load(c)
+            self.assertRegex(cache['manifest'], r'\+A\S+\@')
+            cache['manifest'] = re.sub(
+                r'\+A.*\@',
+                "+Aabcdef0123456789abcdef0123456789abcdef01@",
+                cache['manifest'])
+            with open(cache_filepath, 'w') as c:
+                c.write(json.dumps(cache))
+            # Re-run the upload and expect to get an invalid cache message
+            p = subprocess.Popen([sys.executable, arv_put.__file__] + arv_put_args,
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE,
+                                env=self.ENVIRON)
+            (_, err) = p.communicate()
+            if not batch_mode:
+                self.assertRegex(
+                    err.decode(),
+                    r'ERROR: arv-put: Resume cache contains invalid signature.*')
+                self.assertEqual(p.returncode, 1)
+            else:
+                self.assertRegex(
+                    err.decode(),
+                    r'Invalid signatures on cache file \'.*\' while running in \'batch mode\' -- continuing anyway.*')
+                self.assertEqual(p.returncode, 0)
 
     def test_single_expired_signature_reuploads_file(self):
         self.authorize_with('active')
index fc33dde4477b45d059db2cbd7a63f919eb67e167..c39bdde4b878a26446404979b08e8c3bd08e2b75 100644 (file)
@@ -196,7 +196,7 @@ class ApplicationController < ActionController::Base
     end
     err[:errors] ||= args
     err[:errors].map! do |err|
-      err += " (" + Thread.current[:request_id] + ")"
+      err += " (#{request.request_id})"
     end
     err[:error_token] = [Time.now.utc.to_i, "%08x" % rand(16 ** 8)].join("+")
     status = err.delete(:status) || 422
@@ -419,17 +419,9 @@ class ApplicationController < ActionController::Base
   end
 
   def set_current_request_id
-    req_id = request.headers['X-Request-Id']
-    if !req_id || req_id.length < 1 || req_id.length > 1024
-      # Client-supplied ID is either missing or too long to be
-      # considered friendly.
-      req_id = "req-" + Random::DEFAULT.rand(2**128).to_s(36)[0..19]
-    end
-    response.headers['X-Request-Id'] = Thread.current[:request_id] = req_id
-    Rails.logger.tagged(req_id) do
+    Rails.logger.tagged(request.request_id) do
       yield
     end
-    Thread.current[:request_id] = nil
   end
 
   def append_info_to_payload(payload)
index ad887d035a339ff0039884237d394e820a077023..03000a37308ad287a2b5f75538dd25ab4054a3be 100644 (file)
@@ -9,7 +9,12 @@ class UserNotifier < ActionMailer::Base
 
   def account_is_setup(user)
     @user = user
-    mail(to: user.email, subject: 'Welcome to Arvados - account enabled')
+    if not Rails.configuration.Users.UserNotifierEmailBcc.empty? then
+      @bcc = Rails.configuration.Users.UserNotifierEmailBcc.keys
+      mail(to: user.email, subject: 'Welcome to Arvados - account enabled', bcc: @bcc)
+    else
+      mail(to: user.email, subject: 'Welcome to Arvados - account enabled')
+    end
   end
 
 end
index 4e7b64cf5374fb38003d70790aebd8caee0931fb..1bbe8cc6614b2becdfeac1afef32cd8fcfb4beea 100644 (file)
@@ -17,7 +17,7 @@ class Collection < ArvadosModel
   # Posgresql JSONB columns should NOT be declared as serialized, Rails 5
   # already know how to properly treat them.
   attribute :properties, :jsonbHash, default: {}
-  attribute :storage_classes_desired, :jsonbArray, default: ["default"]
+  attribute :storage_classes_desired, :jsonbArray, default: lambda { Rails.configuration.DefaultStorageClasses }
   attribute :storage_classes_confirmed, :jsonbArray, default: []
 
   before_validation :default_empty_manifest
@@ -150,7 +150,9 @@ class Collection < ArvadosModel
   def strip_signatures_and_update_replication_confirmed
     if self.manifest_text_changed?
       in_old_manifest = {}
-      if not self.replication_confirmed.nil?
+      # manifest_text_was could be nil when dealing with a freshly created snapshot,
+      # so we skip this case because there was no real manifest change. (Bug #18005)
+      if (not self.replication_confirmed.nil?) and (not self.manifest_text_was.nil?)
         self.class.each_manifest_locator(manifest_text_was) do |match|
           in_old_manifest[match[1]] = true
         end
@@ -630,7 +632,7 @@ class Collection < ArvadosModel
   # validation on empty desired storage classes return an error.
   def default_storage_classes
     if self.storage_classes_desired.nil? || self.storage_classes_desired.empty?
-      self.storage_classes_desired = ["default"]
+      self.storage_classes_desired = Rails.configuration.DefaultStorageClasses
     end
     self.storage_classes_confirmed ||= []
   end
index ddae4581892dd8f1bbe727ff0b67b04addb4c0a0..d6a44c80239d46c05d9d7ead49e14f538e19846d 100644 (file)
@@ -21,8 +21,8 @@ class Container < ArvadosModel
   # already know how to properly treat them.
   attribute :secret_mounts, :jsonbHash, default: {}
   attribute :runtime_status, :jsonbHash, default: {}
-  attribute :runtime_auth_scopes, :jsonbHash, default: {}
-  attribute :output_storage_classes, :jsonbArray, default: ["default"]
+  attribute :runtime_auth_scopes, :jsonbArray, default: []
+  attribute :output_storage_classes, :jsonbArray, default: lambda { Rails.configuration.DefaultStorageClasses }
 
   serialize :environment, Hash
   serialize :mounts, Hash
index 1de71102c61befff8e0aabef5885e7396405a237..4a580816cd737674966781d63dfee9a1b4a1dbee 100644 (file)
@@ -23,7 +23,7 @@ class ContainerRequest < ArvadosModel
   # already know how to properly treat them.
   attribute :properties, :jsonbHash, default: {}
   attribute :secret_mounts, :jsonbHash, default: {}
-  attribute :output_storage_classes, :jsonbArray, default: ["default"]
+  attribute :output_storage_classes, :jsonbArray, default: lambda { Rails.configuration.DefaultStorageClasses }
 
   serialize :environment, Hash
   serialize :mounts, Hash
index a6f1730e86190fc878c4c1ec3fd236af30e7cea3..ea421a289b98cc628d6bfa9b0473a105a2986b0c 100644 (file)
@@ -96,6 +96,7 @@ arvcfg.declare_config "Users.UserProfileNotificationAddress", String, :user_prof
 arvcfg.declare_config "Users.AdminNotifierEmailFrom", String, :admin_notifier_email_from
 arvcfg.declare_config "Users.EmailSubjectPrefix", String, :email_subject_prefix
 arvcfg.declare_config "Users.UserNotifierEmailFrom", String, :user_notifier_email_from
+arvcfg.declare_config "Users.UserNotifierEmailBcc", Hash
 arvcfg.declare_config "Users.NewUserNotificationRecipients", Hash, :new_user_notification_recipients, ->(cfg, k, v) { arrayToHash cfg, "Users.NewUserNotificationRecipients", v }
 arvcfg.declare_config "Users.NewInactiveUserNotificationRecipients", Hash, :new_inactive_user_notification_recipients, method(:arrayToHash)
 arvcfg.declare_config "Login.LoginCluster", String
@@ -170,6 +171,7 @@ arvcfg.declare_config "RemoteClusters", Hash, :remote_hosts, ->(cfg, k, v) {
   ConfigLoader.set_cfg cfg, "RemoteClusters", h
 }
 arvcfg.declare_config "RemoteClusters.*.Proxy", Boolean, :remote_hosts_via_dns
+arvcfg.declare_config "StorageClasses", Hash
 
 dbcfg = ConfigLoader.new
 
@@ -237,6 +239,17 @@ if $arvados_config["Collections"]["DefaultTrashLifetime"] < 86400.seconds then
   raise "default_trash_lifetime is %d, must be at least 86400" % Rails.configuration.Collections.DefaultTrashLifetime
 end
 
+default_storage_classes = []
+$arvados_config["StorageClasses"].each do |cls, cfg|
+  if cfg["Default"]
+    default_storage_classes << cls
+  end
+end
+if default_storage_classes.length == 0
+  default_storage_classes = ["default"]
+end
+$arvados_config["DefaultStorageClasses"] = default_storage_classes.sort
+
 #
 # Special case for test database where there's no database.yml,
 # because the Arvados config.yml doesn't have a concept of multiple
diff --git a/services/api/config/initializers/request_id_middleware.rb b/services/api/config/initializers/request_id_middleware.rb
new file mode 100644 (file)
index 0000000..e215880
--- /dev/null
@@ -0,0 +1,25 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+module CustomRequestId
+  def make_request_id(req_id)
+    if !req_id || req_id.length < 1 || req_id.length > 1024
+      # Client-supplied ID is either missing or too long to be
+      # considered friendly.
+      internal_request_id
+    else
+      req_id
+    end
+  end
+
+  def internal_request_id
+    "req-" + Random::DEFAULT.rand(2**128).to_s(36)[0..19]
+  end
+end
+
+class ActionDispatch::RequestId
+  # Instead of using the default UUID-like format for X-Request-Id headers,
+  # use our own.
+  prepend CustomRequestId
+end
\ No newline at end of file
index 7bcf315b0443a4244da40817c54a26015bcc66b6..d83c2b6030e1e7cc12f6ad4902bda1d13624e756 100644 (file)
@@ -11,30 +11,54 @@ require 'current_api_client'
 namespace :db do
   desc "Apply expiration policy on long lived tokens"
   task fix_long_lived_tokens: :environment do
-    if Rails.configuration.Login.TokenLifetime == 0
-      puts("No expiration policy set on Login.TokenLifetime.")
-    else
-      exp_date = Time.now + Rails.configuration.Login.TokenLifetime
-      puts("Setting token expiration to: #{exp_date}")
-      token_count = 0
-      ll_tokens.each do |auth|
-        if (auth.user.uuid =~ /-tpzed-000000000000000/).nil?
-          CurrentApiClientHelper.act_as_system_user do
-            auth.update_attributes!(expires_at: exp_date)
-          end
-          token_count += 1
+    lifetime = Rails.configuration.API.MaxTokenLifetime
+    if lifetime.nil? or lifetime == 0
+      lifetime = Rails.configuration.Login.TokenLifetime
+    end
+    if lifetime.nil? or lifetime == 0
+      puts("No expiration policy set (API.MaxTokenLifetime nor Login.TokenLifetime is set), nothing to do.")
+      # abort the rake task
+      next
+    end
+    exp_date = Time.now + lifetime
+    puts("Setting token expiration to: #{exp_date}")
+    token_count = 0
+    ll_tokens(lifetime).each do |auth|
+      if auth.user.nil?
+        printf("*** WARNING, found ApiClientAuthorization with invalid user: auth id: %d, user id: %d\n", auth.id, auth.user_id)
+        # skip this token
+        next
+      end
+      if (auth.user.uuid =~ /-tpzed-000000000000000/).nil?
+        CurrentApiClientHelper.act_as_system_user do
+          auth.update_attributes!(expires_at: exp_date)
         end
+        token_count += 1
       end
-      puts("#{token_count} tokens updated.")
     end
+    puts("#{token_count} tokens updated.")
   end
 
   desc "Show users with long lived tokens"
   task check_long_lived_tokens: :environment do
+    lifetime = Rails.configuration.API.MaxTokenLifetime
+    if lifetime.nil? or lifetime == 0
+      lifetime = Rails.configuration.Login.TokenLifetime
+    end
+    if lifetime.nil? or lifetime == 0
+      puts("No expiration policy set (API.MaxTokenLifetime nor Login.TokenLifetime is set), nothing to do.")
+      # abort the rake task
+      next
+    end
     user_ids = Set.new()
     token_count = 0
-    ll_tokens.each do |auth|
-      if (auth.user.uuid =~ /-tpzed-000000000000000/).nil?
+    ll_tokens(lifetime).each do |auth|
+      if auth.user.nil?
+        printf("*** WARNING, found ApiClientAuthorization with invalid user: auth id: %d, user id: %d\n", auth.id, auth.user_id)
+        # skip this token
+        next
+      end
+      if not auth.user.nil? and (auth.user.uuid =~ /-tpzed-000000000000000/).nil?
         user_ids.add(auth.user_id)
         token_count += 1
       end
@@ -51,11 +75,9 @@ namespace :db do
     end
   end
 
-  def ll_tokens
+  def ll_tokens(lifetime)
     query = ApiClientAuthorization.where(expires_at: nil)
-    if Rails.configuration.Login.TokenLifetime > 0
-      query = query.or(ApiClientAuthorization.where("expires_at > ?", Time.now + Rails.configuration.Login.TokenLifetime))
-    end
+    query = query.or(ApiClientAuthorization.where("expires_at > ?", Time.now + lifetime))
     query
   end
 end
index 2cfa054448c29fcbbe3beb0b80edc37af514eb2e..af7882141e31973c7e28d0f42da16abb088ed88c 100644 (file)
@@ -24,9 +24,6 @@ class ApplicationControllerTest < ActionController::TestCase
     token_time = token.split('+', 2).first.to_i
     assert_operator(token_time, :>=, @start_stamp, "error token too old")
     assert_operator(token_time, :<=, now_timestamp, "error token too new")
-    json_response['errors'].each do |err|
-      assert_match(/req-[a-z0-9]{20}/, err, "X-Request-Id value missing on error message")
-    end
   end
 
   def check_404(errmsg="Path not found")
@@ -56,28 +53,6 @@ class ApplicationControllerTest < ActionController::TestCase
     check_error_token
   end
 
-  test "X-Request-Id header" do
-    authorize_with :spectator
-    get(:index)
-    assert_match /^req-[0-9a-zA-Z]{20}$/, response.headers['X-Request-Id']
-  end
-
-  # The response header is the one that gets logged, so this test also
-  # ensures we log the ID supplied in the request, if any.
-  test "X-Request-Id given by client" do
-    authorize_with :spectator
-    @request.headers['X-Request-Id'] = 'abcdefG'
-    get(:index)
-    assert_equal 'abcdefG', response.headers['X-Request-Id']
-  end
-
-  test "X-Request-Id given by client is ignored if too long" do
-    authorize_with :spectator
-    @request.headers['X-Request-Id'] = 'abcdefG' * 1000
-    get(:index)
-    assert_match /^req-[0-9a-zA-Z]{20}$/, response.headers['X-Request-Id']
-  end
-
   ['foo', '', 'FALSE', 'TRUE', nil, [true], {a:true}, '"true"'].each do |bogus|
     test "bogus boolean parameter #{bogus.inspect} returns error" do
       @controller = Arvados::V1::GroupsController.new
index d04e3838318dd8d67ed86c560cd907e032be4bb8..e3224f49127e83bf9b76f8887b83b65bf1733bc0 100644 (file)
@@ -14,6 +14,7 @@ class ErrorsTest < ActionDispatch::IntegrationTest
       assert_nil assigns(:object)
       assert_not_nil json_response['errors']
       assert_response 404
+      assert_match /^req-[0-9a-zA-Z]{20}$/, response.headers['X-Request-Id']
     end
   end
 
@@ -28,4 +29,30 @@ class ErrorsTest < ActionDispatch::IntegrationTest
                    "Unexpected new route: #{route.path.spec}")
     end
   end
+
+  test "X-Request-Id header" do
+    get "/", headers: auth(:spectator)
+    assert_match /^req-[0-9a-zA-Z]{20}$/, response.headers['X-Request-Id']
+  end
+
+  test "X-Request-Id header on non-existant object URL" do
+    get "/arvados/v1/container_requests/invalid",
+      params: {:format => :json}, headers: auth(:active)
+    assert_response 404
+    assert_match /^req-[0-9a-zA-Z]{20}$/, response.headers['X-Request-Id']
+  end
+
+  # The response header is the one that gets logged, so this test also
+  # ensures we log the ID supplied in the request, if any.
+  test "X-Request-Id given by client" do
+    get "/", headers: auth(:spectator).merge({'X-Request-Id': 'abcdefG'})
+    assert_equal 'abcdefG', response.headers['X-Request-Id']
+  end
+
+  test "X-Request-Id given by client is ignored if too long" do
+    authorize_with :spectator
+    long_reqId = 'abcdefG' * 1000
+    get "/", headers: auth(:spectator).merge({'X-Request-Id': long_reqId})
+    assert_match /^req-[0-9a-zA-Z]{20}$/, response.headers['X-Request-Id']
+  end
 end
index 916ca095872db7a3a80d59799654dc32504e1f2b..8b8edbc15319fe7cc9f9aee0c21d46aa3cb23506 100644 (file)
@@ -185,6 +185,23 @@ class CollectionTest < ActiveSupport::TestCase
       c.reload
       assert_equal 'foobar', c.name
       assert_equal 2, c.version
+      # Simulate a keep-balance run and trigger a new versionable update
+      # This tests bug #18005
+      assert_nil c.replication_confirmed
+      assert_nil c.replication_confirmed_at
+      # Updates without validations/callbacks
+      c.update_column('modified_at', fifteen_min_ago)
+      c.update_column('replication_confirmed_at', Time.now)
+      c.update_column('replication_confirmed', 2)
+      c.reload
+      assert_equal fifteen_min_ago.to_i, c.modified_at.to_i
+      assert_not_nil c.replication_confirmed_at
+      assert_not_nil c.replication_confirmed
+      # Make the versionable update
+      c.update_attributes!({'name' => 'foobarbaz'})
+      c.reload
+      assert_equal 'foobarbaz', c.name
+      assert_equal 3, c.version
     end
   end
 
@@ -693,6 +710,19 @@ class CollectionTest < ActiveSupport::TestCase
     end
   end
 
+  test "storage_classes_desired default respects config" do
+    saved = Rails.configuration.DefaultStorageClasses
+    Rails.configuration.DefaultStorageClasses = ["foo"]
+    begin
+      act_as_user users(:active) do
+        c = Collection.create!
+        assert_equal ["foo"], c.storage_classes_desired
+      end
+    ensure
+      Rails.configuration.DefaultStorageClasses = saved
+    end
+  end
+
   test "storage_classes_desired cannot be empty" do
     act_as_user users(:active) do
       c = collections(:collection_owned_by_active)
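
The update_column calls in the version test above simulate what keep-balance
now does for real (see the services/keep-balance hunks later in this diff): it
writes the replication_confirmed fields straight to the database, bypassing
Rails validations and callbacks, so the versioning logic must tolerate those
columns changing without a version bump (bug #18005). A standalone sketch of
such a direct update in Go, assuming a reachable Postgres DSN and a made-up
collection UUID:

package main

import (
	"database/sql"
	"log"
	"time"

	_ "github.com/lib/pq" // Postgres driver; keep-balance uses one via sqlx
)

func main() {
	// Placeholder DSN; the real one comes from the cluster config.
	db, err := sql.Open("postgres", "postgres://arvados@localhost/arvados_test?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	// Direct column update: no Rails validations or callbacks run.
	_, err = db.Exec(`update collections set
		replication_confirmed=$1,
		replication_confirmed_at=$2
		where uuid=$3`,
		2, time.Now(), "zzzzz-4zz18-xxxxxxxxxxxxxxx")
	if err != nil {
		log.Fatal(err)
	}
}
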
index 7686e1a140618588fc15dcd8f71cb733d5af04c5..9f412c7bb09ad57a74dae753d83816c325cdefaa 100644 (file)
@@ -1292,14 +1292,20 @@ class ContainerRequestTest < ActiveSupport::TestCase
   end
 
   test "default output_storage_classes" do
-    act_as_user users(:active) do
-      cr = create_minimal_req!(priority: 1,
-                               state: ContainerRequest::Committed,
-                               output_name: 'foo')
-      run_container(cr)
-      cr.reload
-      output = Collection.find_by_uuid(cr.output_uuid)
-      assert_equal ["default"], output.storage_classes_desired
+    saved = Rails.configuration.DefaultStorageClasses
+    Rails.configuration.DefaultStorageClasses = ["foo"]
+    begin
+      act_as_user users(:active) do
+        cr = create_minimal_req!(priority: 1,
+                                 state: ContainerRequest::Committed,
+                                 output_name: 'foo')
+        run_container(cr)
+        cr.reload
+        output = Collection.find_by_uuid(cr.output_uuid)
+        assert_equal ["foo"], output.storage_classes_desired
+      end
+    ensure
+      Rails.configuration.DefaultStorageClasses = saved
     end
   end
 
index c288786c1323246c589af91aa53fc3d0aa37c557..e58c273a6d85adebff1dc1d3d113fa01f6b3a893 100644 (file)
@@ -10,6 +10,7 @@ class UserNotifierTest < ActionMailer::TestCase
   test "account is setup" do
     user = users :active
 
+    Rails.configuration.Users.UserNotifierEmailBcc = ConfigLoader.to_OrderedOptions({"bcc-notify@example.com"=>{},"bcc-notify2@example.com"=>{}})
     Rails.configuration.Users.UserSetupMailText = %{
 <% if not @user.full_name.empty? -%>
 <%= @user.full_name %>,
@@ -33,6 +34,7 @@ The Arvados team.
 
     # Test the body of the sent email contains what we expect it to
     assert_equal Rails.configuration.Users.UserNotifierEmailFrom, email.from.first
+    assert_equal Rails.configuration.Users.UserNotifierEmailBcc.stringify_keys.keys, email.bcc
     assert_equal user.email, email.to.first
     assert_equal 'Welcome to Arvados - account enabled', email.subject
     assert (email.body.to_s.include? 'Your Arvados shell account has been set up'),
index 7ac160eea33da990b976910de4693f87d97bdbda..6c2b3bc58d8efc12e2f93787800332196478cb32 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados git server
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index c202e683f2810e85ab6fddda40793f021b3e0eff..a3cb1341a4677e7ecdc7c03976da7483e47c1aa5 100644 (file)
@@ -169,7 +169,7 @@ type LocalRun struct {
 // crunch-run terminates, mark the container as Cancelled.
 func (lr *LocalRun) run(dispatcher *dispatch.Dispatcher,
        container arvados.Container,
-       status <-chan arvados.Container) {
+       status <-chan arvados.Container) error {
 
        uuid := container.UUID
 
@@ -179,7 +179,7 @@ func (lr *LocalRun) run(dispatcher *dispatch.Dispatcher,
                case lr.concurrencyLimit <- true:
                        break
                case <-lr.ctx.Done():
-                       return
+                       return lr.ctx.Err()
                }
 
                defer func() { <-lr.concurrencyLimit }()
@@ -270,4 +270,5 @@ Finish:
        }
 
        dispatcher.Logger.Printf("finalized container %v", uuid)
+       return nil
 }
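
The new error return starts paying off at the semaphore acquisition above:
block on the concurrency-limit channel, but give up with ctx.Err() if the
dispatcher shuts down first. That select is a standard Go pattern; a
self-contained sketch:

package main

import (
	"context"
	"fmt"
	"time"
)

// acquire takes a slot from the semaphore, or gives up when ctx is
// cancelled, mirroring the select in LocalRun.run above.
func acquire(ctx context.Context, sem chan bool) error {
	select {
	case sem <- true:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	sem := make(chan bool, 1)
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
	defer cancel()
	fmt.Println(acquire(ctx, sem)) // <nil>: a slot was free
	fmt.Println(acquire(ctx, sem)) // context deadline exceeded: semaphore full
}
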
index 692d81e5701b349330009d5b359da6acb830e3c8..e3dd113c710cd8cf5b1cede361d794ad7fd67839 100644 (file)
@@ -6,9 +6,6 @@ Description=Arvados Crunch Dispatcher for LOCAL service
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index d976bf0812950488b796cb063def8c960d128849..92b8d2adcd6fe22e20c66afc1d4f803521ccd545 100644 (file)
@@ -83,9 +83,9 @@ func (s *TestSuite) TestIntegration(c *C) {
 
        cl := arvados.Cluster{Containers: arvados.ContainersConfig{RuntimeEngine: "docker"}}
 
-       dispatcher.RunContainer = func(d *dispatch.Dispatcher, c arvados.Container, s <-chan arvados.Container) {
-               (&LocalRun{startCmd, make(chan bool, 8), ctx, &cl}).run(d, c, s)
-               cancel()
+       dispatcher.RunContainer = func(d *dispatch.Dispatcher, c arvados.Container, s <-chan arvados.Container) error {
+               defer cancel()
+               return (&LocalRun{startCmd, make(chan bool, 8), ctx, &cl}).run(d, c, s)
        }
 
        err = dispatcher.Run(ctx)
@@ -188,9 +188,9 @@ func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubRespon
 
        cl := arvados.Cluster{Containers: arvados.ContainersConfig{RuntimeEngine: "docker"}}
 
-       dispatcher.RunContainer = func(d *dispatch.Dispatcher, c arvados.Container, s <-chan arvados.Container) {
-               (&LocalRun{startCmd, make(chan bool, 8), ctx, &cl}).run(d, c, s)
-               cancel()
+       dispatcher.RunContainer = func(d *dispatch.Dispatcher, c arvados.Container, s <-chan arvados.Container) error {
+               defer cancel()
+               return (&LocalRun{startCmd, make(chan bool, 8), ctx, &cl}).run(d, c, s)
        }
 
        re := regexp.MustCompile(`(?ms).*` + expected + `.*`)
index 2f2f013c714a0be6bf863cbf8329efae62e616b6..584db38edf7e93ac57ad8929ca31e04de907b78d 100644 (file)
@@ -7,7 +7,6 @@ package main
 // Dispatcher service for Crunch that submits containers to the slurm queue.
 
 import (
-       "bytes"
        "context"
        "flag"
        "fmt"
@@ -271,7 +270,7 @@ func (disp *Dispatcher) submit(container arvados.Container, crunchRunCommand []s
 // already in the queue).  Cancel the slurm job if the container's
 // priority changes to zero or its state indicates it's no longer
 // running.
-func (disp *Dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
+func (disp *Dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) error {
        ctx, cancel := context.WithCancel(context.Background())
        defer cancel()
 
@@ -279,38 +278,9 @@ func (disp *Dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Contain
                log.Printf("Submitting container %s to slurm", ctr.UUID)
                cmd := []string{disp.cluster.Containers.CrunchRunCommand}
                cmd = append(cmd, disp.cluster.Containers.CrunchRunArgumentsList...)
-               if err := disp.submit(ctr, cmd); err != nil {
-                       var text string
-                       switch err := err.(type) {
-                       case dispatchcloud.ConstraintsNotSatisfiableError:
-                               var logBuf bytes.Buffer
-                               fmt.Fprintf(&logBuf, "cannot run container %s: %s\n", ctr.UUID, err)
-                               if len(err.AvailableTypes) == 0 {
-                                       fmt.Fprint(&logBuf, "No instance types are configured.\n")
-                               } else {
-                                       fmt.Fprint(&logBuf, "Available instance types:\n")
-                                       for _, t := range err.AvailableTypes {
-                                               fmt.Fprintf(&logBuf,
-                                                       "Type %q: %d VCPUs, %d RAM, %d Scratch, %f Price\n",
-                                                       t.Name, t.VCPUs, t.RAM, t.Scratch, t.Price,
-                                               )
-                                       }
-                               }
-                               text = logBuf.String()
-                               disp.UpdateState(ctr.UUID, dispatch.Cancelled)
-                       default:
-                               text = fmt.Sprintf("Error submitting container %s to slurm: %s", ctr.UUID, err)
-                       }
-                       log.Print(text)
-
-                       lr := arvadosclient.Dict{"log": arvadosclient.Dict{
-                               "object_uuid": ctr.UUID,
-                               "event_type":  "dispatch",
-                               "properties":  map[string]string{"text": text}}}
-                       disp.Arv.Create("logs", lr, nil)
-
-                       disp.Unlock(ctr.UUID)
-                       return
+               err := disp.submit(ctr, cmd)
+               if err != nil {
+                       return err
                }
        }
 
@@ -339,7 +309,7 @@ func (disp *Dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Contain
                        case dispatch.Locked:
                                disp.Unlock(ctr.UUID)
                        }
-                       return
+                       return nil
                case updated, ok := <-status:
                        if !ok {
                                log.Printf("container %s is done: cancel slurm job", ctr.UUID)
index 2af56c8d0c1e62b3c2a1d3eb5f0a5c1a65be7b4a..86830f3a7f67364d0a3dd783e598458984bff3a0 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados Crunch Dispatcher for SLURM
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index 480434de65d291fad8ac2ac8ff8fbb6092ece327..e7a89db23c8743b3e2934a5c26f12a446a5bd6a9 100644 (file)
@@ -104,7 +104,7 @@ func (sf *slurmFake) Cancel(name string) error {
 
 func (s *IntegrationSuite) integrationTest(c *C,
        expectBatch [][]string,
-       runContainer func(*dispatch.Dispatcher, arvados.Container)) arvados.Container {
+       runContainer func(*dispatch.Dispatcher, arvados.Container)) (arvados.Container, error) {
        arvadostest.ResetEnv()
 
        arv, err := arvadosclient.MakeArvadosClient()
@@ -123,18 +123,21 @@ func (s *IntegrationSuite) integrationTest(c *C,
 
        ctx, cancel := context.WithCancel(context.Background())
        doneRun := make(chan struct{})
+       doneDispatch := make(chan error)
 
        s.disp.Dispatcher = &dispatch.Dispatcher{
                Arv:        arv,
                PollPeriod: time.Second,
-               RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
+               RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) error {
                        go func() {
                                runContainer(disp, ctr)
                                s.slurm.queue = ""
                                doneRun <- struct{}{}
                        }()
-                       s.disp.runContainer(disp, ctr, status)
+                       err := s.disp.runContainer(disp, ctr, status)
                        cancel()
+                       doneDispatch <- err
+                       return nil
                },
        }
 
@@ -148,6 +151,7 @@ func (s *IntegrationSuite) integrationTest(c *C,
        err = s.disp.Dispatcher.Run(ctx)
        <-doneRun
        c.Assert(err, Equals, context.Canceled)
+       errDispatch := <-doneDispatch
 
        s.disp.sqCheck.Stop()
 
@@ -162,12 +166,12 @@ func (s *IntegrationSuite) integrationTest(c *C,
        var container arvados.Container
        err = arv.Get("containers", "zzzzz-dz642-queuedcontainer", nil, &container)
        c.Check(err, IsNil)
-       return container
+       return container, errDispatch
 }
 
 func (s *IntegrationSuite) TestNormal(c *C) {
        s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
-       container := s.integrationTest(c,
+       container, _ := s.integrationTest(c,
                nil,
                func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
                        dispatcher.UpdateState(container.UUID, dispatch.Running)
@@ -181,7 +185,7 @@ func (s *IntegrationSuite) TestCancel(c *C) {
        s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
        readyToCancel := make(chan bool)
        s.slurm.onCancel = func() { <-readyToCancel }
-       container := s.integrationTest(c,
+       container, _ := s.integrationTest(c,
                nil,
                func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
                        dispatcher.UpdateState(container.UUID, dispatch.Running)
@@ -199,7 +203,7 @@ func (s *IntegrationSuite) TestCancel(c *C) {
 }
 
 func (s *IntegrationSuite) TestMissingFromSqueue(c *C) {
-       container := s.integrationTest(c,
+       container, _ := s.integrationTest(c,
                [][]string{{
                        fmt.Sprintf("--job-name=%s", "zzzzz-dz642-queuedcontainer"),
                        fmt.Sprintf("--nice=%d", 10000),
@@ -218,24 +222,14 @@ func (s *IntegrationSuite) TestMissingFromSqueue(c *C) {
 
 func (s *IntegrationSuite) TestSbatchFail(c *C) {
        s.slurm = slurmFake{errBatch: errors.New("something terrible happened")}
-       container := s.integrationTest(c,
+       container, err := s.integrationTest(c,
                [][]string{{"--job-name=zzzzz-dz642-queuedcontainer", "--nice=10000", "--no-requeue", "--mem=11445", "--cpus-per-task=4", "--tmp=45777"}},
                func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
                        dispatcher.UpdateState(container.UUID, dispatch.Running)
                        dispatcher.UpdateState(container.UUID, dispatch.Complete)
                })
        c.Check(container.State, Equals, arvados.ContainerStateComplete)
-
-       arv, err := arvadosclient.MakeArvadosClient()
-       c.Assert(err, IsNil)
-
-       var ll arvados.LogList
-       err = arv.List("logs", arvadosclient.Dict{"filters": [][]string{
-               {"object_uuid", "=", container.UUID},
-               {"event_type", "=", "dispatch"},
-       }}, &ll)
-       c.Assert(err, IsNil)
-       c.Assert(len(ll.Items), Equals, 1)
+       c.Check(err, ErrorMatches, `something terrible happened`)
 }
 
 type StubbedSuite struct {
@@ -280,7 +274,7 @@ func (s *StubbedSuite) testWithServerStub(c *C, apiStubResponses map[string]arva
        dispatcher := dispatch.Dispatcher{
                Arv:        arv,
                PollPeriod: time.Second,
-               RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
+               RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) error {
                        go func() {
                                time.Sleep(time.Second)
                                disp.UpdateState(ctr.UUID, dispatch.Running)
@@ -288,6 +282,7 @@ func (s *StubbedSuite) testWithServerStub(c *C, apiStubResponses map[string]arva
                        }()
                        s.disp.runContainer(disp, ctr, status)
                        cancel()
+                       return nil
                },
        }
 
index 7e049144ae1ef49a700ad71580d50a48b0df144f..2aab42b2a37c7c4be9a6ff6907a6b6c38c3373cf 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados Docker Image Cleaner
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index ccb7a467af45906f60710c94e0dfe98bf84bf034..31580433035692ee1d4110f06aec56e036f8d961 100644 (file)
@@ -3,9 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 case "$TARGET" in
-    ubuntu1604)
-        fpm_depends+=()
-        ;;
     debian* | ubuntu*)
         fpm_depends+=(python3-distutils)
         ;;
index 78cbd0d8cfd06f1c638549151c56e74a32025237..2b963d9a68659c342de818af52789f0d96031ef3 100644 (file)
@@ -298,20 +298,52 @@ class CollectionDirectoryBase(Directory):
     def on_event(self, event, collection, name, item):
         if collection == self.collection:
             name = self.sanitize_filename(name)
-            _logger.debug("collection notify %s %s %s %s", event, collection, name, item)
-            with llfuse.lock:
-                if event == arvados.collection.ADD:
-                    self.new_entry(name, item, self.mtime())
-                elif event == arvados.collection.DEL:
-                    ent = self._entries[name]
-                    del self._entries[name]
-                    self.inodes.invalidate_entry(self, name)
-                    self.inodes.del_entry(ent)
-                elif event == arvados.collection.MOD:
-                    if hasattr(item, "fuse_entry") and item.fuse_entry is not None:
-                        self.inodes.invalidate_inode(item.fuse_entry)
-                    elif name in self._entries:
-                        self.inodes.invalidate_inode(self._entries[name])
+
+            #
+            # It's possible for another thread to have llfuse.lock and
+            # be waiting on collection.lock.  Meanwhile, we released
+            # llfuse.lock earlier in the stack, but are still holding
+            # on to the collection lock, and now we need to re-acquire
+            # llfuse.lock.  If we don't release the collection lock,
+            # we'll deadlock where we're holding the collection lock
+            # waiting for llfuse.lock and the other thread is holding
+            # llfuse.lock and waiting for the collection lock.
+            #
+            # The correct locking order here is to take llfuse.lock
+            # first, then the collection lock.
+            #
+            # Since collection.lock is an RLock, it might be locked
+            # multiple times, so we need to release it multiple times,
+            # keep a count, then re-lock it the correct number of
+            # times.
+            #
+            lockcount = 0
+            try:
+                while True:
+                    self.collection.lock.release()
+                    lockcount += 1
+            except RuntimeError:
+                pass
+
+            try:
+                with llfuse.lock:
+                    with self.collection.lock:
+                        if event == arvados.collection.ADD:
+                            self.new_entry(name, item, self.mtime())
+                        elif event == arvados.collection.DEL:
+                            ent = self._entries[name]
+                            del self._entries[name]
+                            self.inodes.invalidate_entry(self, name)
+                            self.inodes.del_entry(ent)
+                        elif event == arvados.collection.MOD:
+                            if hasattr(item, "fuse_entry") and item.fuse_entry is not None:
+                                self.inodes.invalidate_inode(item.fuse_entry)
+                            elif name in self._entries:
+                                self.inodes.invalidate_inode(self._entries[name])
+            finally:
+                while lockcount > 0:
+                    self.collection.lock.acquire()
+                    lockcount -= 1
 
     def populate(self, mtime):
         self._mtime = mtime
@@ -587,10 +619,26 @@ class TmpCollectionDirectory(CollectionDirectoryBase):
     def on_event(self, *args, **kwargs):
         super(TmpCollectionDirectory, self).on_event(*args, **kwargs)
         if self.collection_record_file:
-            with llfuse.lock:
-                self.collection_record_file.invalidate()
-            self.inodes.invalidate_inode(self.collection_record_file)
-            _logger.debug("%s invalidated collection record", self)
+
+            # See discussion in CollectionDirectoryBase.on_event
+            lockcount = 0
+            try:
+                while True:
+                    self.collection.lock.release()
+                    lockcount += 1
+            except RuntimeError:
+                pass
+
+            try:
+                with llfuse.lock:
+                    with self.collection.lock:
+                        self.collection_record_file.invalidate()
+                        self.inodes.invalidate_inode(self.collection_record_file)
+                        _logger.debug("%s invalidated collection record", self)
+            finally:
+                while lockcount > 0:
+                    self.collection.lock.acquire()
+                    lockcount -= 1
 
     def collection_record(self):
         with llfuse.lock_released:
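
The comment block above is the heart of this fix: collection.lock is a
reentrant lock held an unknown number of times, so it is drained (counting the
releases), and then llfuse.lock and collection.lock are re-acquired in the one
canonical order. Go has no reentrant mutex, but the underlying rule (every
thread takes the locks in the same order) is identical. A minimal sketch:

package main

import (
	"fmt"
	"sync"
)

// Canonical order: always lock fuse before coll. If one goroutine took
// coll first while another held fuse and wanted coll, the two would
// deadlock, exactly the scenario described above for llfuse.lock and
// collection.lock.
var fuse, coll sync.Mutex

func invalidate(name string) {
	fuse.Lock()
	defer fuse.Unlock()
	coll.Lock()
	defer coll.Unlock()
	fmt.Println("invalidated", name)
}

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			invalidate(fmt.Sprintf("entry%d", i))
		}(i)
	}
	wg.Wait()
}
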
index ca3744c28db265832229ac5a0635114396164b35..4b8745d1549c6950e14cf74cdbc09edc12f1650d 100644 (file)
@@ -8,9 +8,6 @@ Documentation=https://doc.arvados.org/
 After=network.target
 AssertPathExists=/etc/arvados/config.yml
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index 86423a2976b1e0909470bd563c486aee894743af..bb590e13b33f0535d5a7d2610d8902ddab577300 100644 (file)
@@ -18,11 +18,13 @@ import (
        "sort"
        "strings"
        "sync"
+       "sync/atomic"
        "syscall"
        "time"
 
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/keepclient"
+       "github.com/jmoiron/sqlx"
        "github.com/sirupsen/logrus"
 )
 
@@ -36,6 +38,7 @@ import (
 // BlobSignatureTTL; and all N existing replicas of a given data block
 // are in the N best positions in rendezvous probe order.
 type Balancer struct {
+       DB      *sqlx.DB
        Logger  logrus.FieldLogger
        Dumper  logrus.FieldLogger
        Metrics *metrics
@@ -50,7 +53,7 @@ type Balancer struct {
        classes       []string
        mounts        int
        mountsByClass map[string]map[*KeepMount]bool
-       collScanned   int
+       collScanned   int64
        serviceRoots  map[string]string
        errors        []error
        stats         balancerStats
@@ -167,6 +170,15 @@ func (bal *Balancer) Run(client *arvados.Client, cluster *arvados.Cluster, runOp
        }
        if runOptions.CommitTrash {
                err = bal.CommitTrash(ctx, client)
+               if err != nil {
+                       return
+               }
+       }
+       if runOptions.CommitConfirmedFields {
+               err = bal.updateCollections(ctx, client, cluster)
+               if err != nil {
+                       return
+               }
        }
        return
 }
@@ -388,39 +400,14 @@ func (bal *Balancer) GetCurrentState(ctx context.Context, c *arvados.Client, pag
                }(mounts)
        }
 
-       // collQ buffers incoming collections so we can start fetching
-       // the next page without waiting for the current page to
-       // finish processing.
        collQ := make(chan arvados.Collection, bufs)
 
-       // Start a goroutine to process collections. (We could use a
-       // worker pool here, but even with a single worker we already
-       // process collections much faster than we can retrieve them.)
+       // Retrieve all collections from the database and send them to
+       // collQ.
        wg.Add(1)
        go func() {
                defer wg.Done()
-               for coll := range collQ {
-                       err := bal.addCollection(coll)
-                       if err != nil || len(errs) > 0 {
-                               select {
-                               case errs <- err:
-                               default:
-                               }
-                               for range collQ {
-                               }
-                               cancel()
-                               return
-                       }
-                       bal.collScanned++
-               }
-       }()
-
-       // Start a goroutine to retrieve all collections from the
-       // Arvados database and send them to collQ for processing.
-       wg.Add(1)
-       go func() {
-               defer wg.Done()
-               err = EachCollection(ctx, c, pageSize,
+               err = EachCollection(ctx, bal.DB, c,
                        func(coll arvados.Collection) error {
                                collQ <- coll
                                if len(errs) > 0 {
@@ -444,6 +431,27 @@ func (bal *Balancer) GetCurrentState(ctx context.Context, c *arvados.Client, pag
                }
        }()
 
+       // Parse manifests from collQ and pass the block hashes to
+       // BlockStateMap to track desired replication.
+       for i := 0; i < runtime.NumCPU(); i++ {
+               wg.Add(1)
+               go func() {
+                       defer wg.Done()
+                       for coll := range collQ {
+                               err := bal.addCollection(coll)
+                               if err != nil || len(errs) > 0 {
+                                       select {
+                                       case errs <- err:
+                                       default:
+                                       }
+                                       cancel()
+                                       continue
+                               }
+                               atomic.AddInt64(&bal.collScanned, 1)
+                       }
+               }()
+       }
+
        wg.Wait()
        if len(errs) > 0 {
                return <-errs
@@ -460,7 +468,7 @@ func (bal *Balancer) addCollection(coll arvados.Collection) error {
        if coll.ReplicationDesired != nil {
                repl = *coll.ReplicationDesired
        }
-       bal.Logger.Debugf("%v: %d block x%d", coll.UUID, len(blkids), repl)
+       bal.Logger.Debugf("%v: %d blocks x%d", coll.UUID, len(blkids), repl)
        // Pass pdh to IncreaseDesired only if LostBlocksFile is being
        // written -- otherwise it's just a waste of memory.
        pdh := ""
@@ -530,10 +538,6 @@ func (bal *Balancer) setupLookupTables() {
                        // effectively read-only.
                        mnt.ReadOnly = mnt.ReadOnly || srv.ReadOnly
 
-                       if len(mnt.StorageClasses) == 0 {
-                               bal.mountsByClass["default"][mnt] = true
-                               continue
-                       }
                        for class := range mnt.StorageClasses {
                                if mbc := bal.mountsByClass[class]; mbc == nil {
                                        bal.classes = append(bal.classes, class)
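
Two related changes in this file: collScanned becomes an int64 updated via
sync/atomic, because manifest parsing now fans out over one goroutine per CPU,
and the implicit "default" storage-class fallback is dropped from
setupLookupTables (the test hunks below give every stub mount an explicit
StorageClasses entry instead). The fan-out-plus-atomic-counter shape as a
self-contained sketch, with made-up UUIDs:

package main

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
)

func main() {
	collQ := make(chan string, 4)
	var scanned int64
	var wg sync.WaitGroup
	// One worker per CPU, as in GetCurrentState above.
	for i := 0; i < runtime.NumCPU(); i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for range collQ {
				// addCollection would parse the manifest here.
				atomic.AddInt64(&scanned, 1)
			}
		}()
	}
	for i := 0; i < 10; i++ {
		collQ <- fmt.Sprintf("zzzzz-4zz18-%015d", i)
	}
	close(collQ)
	wg.Wait()
	fmt.Println("collections scanned:", atomic.LoadInt64(&scanned))
}
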
index cbdde595e8b4b797c70422ae71a5dc7affb33dcf..4e2c6803ca81cc593798d8285945aa9f90a1446c 100644 (file)
@@ -21,6 +21,7 @@ import (
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/arvadostest"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "github.com/jmoiron/sqlx"
        "github.com/prometheus/client_golang/prometheus"
        "github.com/prometheus/common/expfmt"
        check "gopkg.in/check.v1"
@@ -86,20 +87,24 @@ var stubServices = []arvados.KeepService{
 
 var stubMounts = map[string][]arvados.KeepMount{
        "keep0.zzzzz.arvadosapi.com:25107": {{
-               UUID:     "zzzzz-ivpuk-000000000000000",
-               DeviceID: "keep0-vol0",
+               UUID:           "zzzzz-ivpuk-000000000000000",
+               DeviceID:       "keep0-vol0",
+               StorageClasses: map[string]bool{"default": true},
        }},
        "keep1.zzzzz.arvadosapi.com:25107": {{
-               UUID:     "zzzzz-ivpuk-100000000000000",
-               DeviceID: "keep1-vol0",
+               UUID:           "zzzzz-ivpuk-100000000000000",
+               DeviceID:       "keep1-vol0",
+               StorageClasses: map[string]bool{"default": true},
        }},
        "keep2.zzzzz.arvadosapi.com:25107": {{
-               UUID:     "zzzzz-ivpuk-200000000000000",
-               DeviceID: "keep2-vol0",
+               UUID:           "zzzzz-ivpuk-200000000000000",
+               DeviceID:       "keep2-vol0",
+               StorageClasses: map[string]bool{"default": true},
        }},
        "keep3.zzzzz.arvadosapi.com:25107": {{
-               UUID:     "zzzzz-ivpuk-300000000000000",
-               DeviceID: "keep3-vol0",
+               UUID:           "zzzzz-ivpuk-300000000000000",
+               DeviceID:       "keep3-vol0",
+               StorageClasses: map[string]bool{"default": true},
        }},
 }
 
@@ -309,6 +314,7 @@ func (s *stubServer) serveKeepstorePull() *reqTracker {
 type runSuite struct {
        stub   stubServer
        config *arvados.Cluster
+       db     *sqlx.DB
        client *arvados.Client
 }
 
@@ -320,6 +326,7 @@ func (s *runSuite) newServer(options *RunOptions) *Server {
                Metrics:    newMetrics(prometheus.NewRegistry()),
                Logger:     options.Logger,
                Dumper:     options.Dumper,
+               DB:         s.db,
        }
        return srv
 }
@@ -329,6 +336,8 @@ func (s *runSuite) SetUpTest(c *check.C) {
        c.Assert(err, check.Equals, nil)
        s.config, err = cfg.GetCluster("")
        c.Assert(err, check.Equals, nil)
+       s.db, err = sqlx.Open("postgres", s.config.PostgreSQL.Connection.String())
+       c.Assert(err, check.IsNil)
 
        s.config.Collections.BalancePeriod = arvados.Duration(time.Second)
        arvadostest.SetServiceURL(&s.config.Services.Keepbalance, "http://localhost:/")
@@ -347,6 +356,9 @@ func (s *runSuite) TearDownTest(c *check.C) {
 }
 
 func (s *runSuite) TestRefuseZeroCollections(c *check.C) {
+       defer arvados.NewClientFromEnv().RequestAndDecode(nil, "POST", "database/reset", nil, nil)
+       _, err := s.db.Exec(`delete from collections`)
+       c.Assert(err, check.IsNil)
        opts := RunOptions{
                CommitPulls: true,
                CommitTrash: true,
@@ -360,7 +372,7 @@ func (s *runSuite) TestRefuseZeroCollections(c *check.C) {
        trashReqs := s.stub.serveKeepstoreTrash()
        pullReqs := s.stub.serveKeepstorePull()
        srv := s.newServer(&opts)
-       _, err := srv.runOnce()
+       _, err = srv.runOnce()
        c.Check(err, check.ErrorMatches, "received zero collections")
        c.Check(trashReqs.Count(), check.Equals, 4)
        c.Check(pullReqs.Count(), check.Equals, 0)
@@ -385,26 +397,6 @@ func (s *runSuite) TestRefuseNonAdmin(c *check.C) {
        c.Check(pullReqs.Count(), check.Equals, 0)
 }
 
-func (s *runSuite) TestDetectSkippedCollections(c *check.C) {
-       opts := RunOptions{
-               CommitPulls: true,
-               CommitTrash: true,
-               Logger:      ctxlog.TestLogger(c),
-       }
-       s.stub.serveCurrentUserAdmin()
-       s.stub.serveCollectionsButSkipOne()
-       s.stub.serveKeepServices(stubServices)
-       s.stub.serveKeepstoreMounts()
-       s.stub.serveKeepstoreIndexFoo4Bar1()
-       trashReqs := s.stub.serveKeepstoreTrash()
-       pullReqs := s.stub.serveKeepstorePull()
-       srv := s.newServer(&opts)
-       _, err := srv.runOnce()
-       c.Check(err, check.ErrorMatches, `Retrieved 2 collections with modtime <= .* but server now reports there are 3 collections.*`)
-       c.Check(trashReqs.Count(), check.Equals, 4)
-       c.Check(pullReqs.Count(), check.Equals, 0)
-}
-
 func (s *runSuite) TestWriteLostBlocks(c *check.C) {
        lostf, err := ioutil.TempFile("", "keep-balance-lost-blocks-test-")
        c.Assert(err, check.IsNil)
@@ -428,7 +420,7 @@ func (s *runSuite) TestWriteLostBlocks(c *check.C) {
        c.Check(err, check.IsNil)
        lost, err := ioutil.ReadFile(lostf.Name())
        c.Assert(err, check.IsNil)
-       c.Check(string(lost), check.Equals, "37b51d194a7513e45b56f6524f2d51f2 fa7aeb5140e2848d39b416daeef4ffc5+45\n")
+       c.Check(string(lost), check.Matches, `(?ms).*37b51d194a7513e45b56f6524f2d51f2.* fa7aeb5140e2848d39b416daeef4ffc5\+45.*`)
 }
 
 func (s *runSuite) TestDryRun(c *check.C) {
@@ -459,11 +451,7 @@ func (s *runSuite) TestDryRun(c *check.C) {
 }
 
 func (s *runSuite) TestCommit(c *check.C) {
-       lostf, err := ioutil.TempFile("", "keep-balance-lost-blocks-test-")
-       c.Assert(err, check.IsNil)
-       s.config.Collections.BlobMissingReport = lostf.Name()
-       defer os.Remove(lostf.Name())
-
+       s.config.Collections.BlobMissingReport = c.MkDir() + "/keep-balance-lost-blocks-test-"
        s.config.ManagementToken = "xyzzy"
        opts := RunOptions{
                CommitPulls: true,
@@ -489,17 +477,18 @@ func (s *runSuite) TestCommit(c *check.C) {
        // in a poor rendezvous position
        c.Check(bal.stats.pulls, check.Equals, 2)
 
-       lost, err := ioutil.ReadFile(lostf.Name())
+       lost, err := ioutil.ReadFile(s.config.Collections.BlobMissingReport)
        c.Assert(err, check.IsNil)
-       c.Check(string(lost), check.Equals, "")
+       c.Check(string(lost), check.Not(check.Matches), `(?ms).*acbd18db4cc2f85cedef654fccc4a4d8.*`)
 
        buf, err := s.getMetrics(c, srv)
        c.Check(err, check.IsNil)
-       c.Check(buf, check.Matches, `(?ms).*\narvados_keep_total_bytes 15\n.*`)
-       c.Check(buf, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_sum [0-9\.]+\n.*`)
-       c.Check(buf, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`)
-       c.Check(buf, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio 1\.5\n.*`)
-       c.Check(buf, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio 1\.5\n.*`)
+       bufstr := buf.String()
+       c.Check(bufstr, check.Matches, `(?ms).*\narvados_keep_total_bytes 15\n.*`)
+       c.Check(bufstr, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_sum [0-9\.]+\n.*`)
+       c.Check(bufstr, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`)
+       c.Check(bufstr, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio [1-9].*`)
+       c.Check(bufstr, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio [1-9].*`)
 }
 
 func (s *runSuite) TestRunForever(c *check.C) {
index 5bc66dbf3fa7b2f23828205b13452b5839d99510..c529ac150e092c37c9f5510d0463be6a0eebe553 100644 (file)
@@ -85,7 +85,8 @@ func (bal *balancerSuite) SetUpTest(c *check.C) {
                }
                srv.mounts = []*KeepMount{{
                        KeepMount: arvados.KeepMount{
-                               UUID: fmt.Sprintf("zzzzz-mount-%015x", i),
+                               UUID:           fmt.Sprintf("zzzzz-mount-%015x", i),
+                               StorageClasses: map[string]bool{"default": true},
                        },
                        KeepService: srv,
                }}
@@ -166,10 +167,11 @@ func (bal *balancerSuite) testMultipleViews(c *check.C, readonly bool) {
                srv.mounts[0].KeepMount.DeviceID = fmt.Sprintf("writable-by-srv-%x", i)
                srv.mounts = append(srv.mounts, &KeepMount{
                        KeepMount: arvados.KeepMount{
-                               DeviceID:    fmt.Sprintf("writable-by-srv-%x", (i+1)%len(bal.srvs)),
-                               UUID:        fmt.Sprintf("zzzzz-mount-%015x", i<<16),
-                               ReadOnly:    readonly,
-                               Replication: 1,
+                               DeviceID:       fmt.Sprintf("writable-by-srv-%x", (i+1)%len(bal.srvs)),
+                               UUID:           fmt.Sprintf("zzzzz-mount-%015x", i<<16),
+                               ReadOnly:       readonly,
+                               Replication:    1,
+                               StorageClasses: map[string]bool{"default": true},
                        },
                        KeepService: srv,
                })
index 029f8c6c0790337f561d679c8c9d21cf33ff502b..e30b4ff7943d4c4a041ec71924000a48a856c4d4 100644 (file)
@@ -133,3 +133,53 @@ func (bsm *BlockStateMap) IncreaseDesired(pdh string, classes []string, n int, b
                bsm.get(blkid).increaseDesired(pdh, classes, n)
        }
 }
+
+// GetConfirmedReplication returns the replication level of the given
+// blocks, considering only the specified storage classes.
+//
+// If len(classes)==0, returns the replication level without regard to
+// storage classes.
+//
+// Safe to call concurrently with other calls to
+// GetConfirmedReplication, but not with other BlockStateMap methods.
+func (bsm *BlockStateMap) GetConfirmedReplication(blkids []arvados.SizedDigest, classes []string) int {
+       defaultClasses := map[string]bool{"default": true}
+       min := 0
+       for _, blkid := range blkids {
+               total := 0
+               perclass := make(map[string]int, len(classes))
+               for _, c := range classes {
+                       perclass[c] = 0
+               }
+               for _, r := range bsm.get(blkid).Replicas {
+                       total += r.KeepMount.Replication
+                       mntclasses := r.KeepMount.StorageClasses
+                       if len(mntclasses) == 0 {
+                               mntclasses = defaultClasses
+                       }
+                       for c := range mntclasses {
+                               n, ok := perclass[c]
+                               if !ok {
+                                       // Don't care about this storage class
+                                       continue
+                               }
+                               perclass[c] = n + r.KeepMount.Replication
+                       }
+               }
+               if total == 0 {
+                       return 0
+               }
+               for _, n := range perclass {
+                       if n == 0 {
+                               return 0
+                       }
+                       if n < min || min == 0 {
+                               min = n
+                       }
+               }
+               if len(perclass) == 0 && (min == 0 || min > total) {
+                       min = total
+               }
+       }
+       return min
+}
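
In words: for each block, replication is summed per storage class over the
mounts holding it; the result for a set of blocks is the minimum over all
blocks and requested classes, and a block absent from any requested class
forces the answer to 0. A tiny worked example of that minimum computation,
with made-up per-class counts mirroring the test file that follows:

package main

import "fmt"

func main() {
	// perBlock[i]["class"] = summed Replication of the mounts holding
	// block i in that class. Block A sits on a repl-1 "default" mount,
	// block B on a repl-2 one, so confirmed replication is min(1,2)=1.
	perBlock := []map[string]int{
		{"default": 1}, // block A
		{"default": 2}, // block B
	}
	min := 0
	for _, counts := range perBlock {
		n, ok := counts["default"]
		if !ok || n == 0 {
			min = 0
			break
		}
		if min == 0 || n < min {
			min = n
		}
	}
	fmt.Println("confirmed replication:", min) // 1
}
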
diff --git a/services/keep-balance/block_state_test.go b/services/keep-balance/block_state_test.go
new file mode 100644 (file)
index 0000000..aaf2c18
--- /dev/null
@@ -0,0 +1,94 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package main
+
+import (
+       "time"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&confirmedReplicationSuite{})
+
+type confirmedReplicationSuite struct {
+       blockStateMap *BlockStateMap
+       mtime         int64
+}
+
+func (s *confirmedReplicationSuite) SetUpTest(c *check.C) {
+       t, _ := time.Parse(time.RFC3339Nano, time.RFC3339Nano)
+       s.mtime = t.UnixNano()
+       s.blockStateMap = NewBlockStateMap()
+       s.blockStateMap.AddReplicas(&KeepMount{KeepMount: arvados.KeepMount{
+               Replication:    1,
+               StorageClasses: map[string]bool{"default": true},
+       }}, []arvados.KeepServiceIndexEntry{
+               {SizedDigest: knownBlkid(10), Mtime: s.mtime},
+       })
+       s.blockStateMap.AddReplicas(&KeepMount{KeepMount: arvados.KeepMount{
+               Replication:    2,
+               StorageClasses: map[string]bool{"default": true},
+       }}, []arvados.KeepServiceIndexEntry{
+               {SizedDigest: knownBlkid(20), Mtime: s.mtime},
+       })
+}
+
+func (s *confirmedReplicationSuite) TestZeroReplication(c *check.C) {
+       n := s.blockStateMap.GetConfirmedReplication([]arvados.SizedDigest{knownBlkid(404), knownBlkid(409)}, []string{"default"})
+       c.Check(n, check.Equals, 0)
+       n = s.blockStateMap.GetConfirmedReplication([]arvados.SizedDigest{knownBlkid(10), knownBlkid(404)}, []string{"default"})
+       c.Check(n, check.Equals, 0)
+       n = s.blockStateMap.GetConfirmedReplication([]arvados.SizedDigest{knownBlkid(10), knownBlkid(404)}, nil)
+       c.Check(n, check.Equals, 0)
+}
+
+func (s *confirmedReplicationSuite) TestBlocksWithDifferentReplication(c *check.C) {
+       n := s.blockStateMap.GetConfirmedReplication([]arvados.SizedDigest{knownBlkid(10), knownBlkid(20)}, []string{"default"})
+       c.Check(n, check.Equals, 1)
+}
+
+func (s *confirmedReplicationSuite) TestBlocksInDifferentClasses(c *check.C) {
+       s.blockStateMap.AddReplicas(&KeepMount{KeepMount: arvados.KeepMount{
+               Replication:    3,
+               StorageClasses: map[string]bool{"three": true},
+       }}, []arvados.KeepServiceIndexEntry{
+               {SizedDigest: knownBlkid(30), Mtime: s.mtime},
+       })
+
+       n := s.blockStateMap.GetConfirmedReplication([]arvados.SizedDigest{knownBlkid(30)}, []string{"three"})
+       c.Check(n, check.Equals, 3)
+       n = s.blockStateMap.GetConfirmedReplication([]arvados.SizedDigest{knownBlkid(20), knownBlkid(30)}, []string{"default"})
+       c.Check(n, check.Equals, 0) // block 30 has repl 0 @ "default"
+       n = s.blockStateMap.GetConfirmedReplication([]arvados.SizedDigest{knownBlkid(20), knownBlkid(30)}, []string{"three"})
+       c.Check(n, check.Equals, 0) // block 20 has repl 0 @ "three"
+       n = s.blockStateMap.GetConfirmedReplication([]arvados.SizedDigest{knownBlkid(20), knownBlkid(30)}, nil)
+       c.Check(n, check.Equals, 2)
+}
+
+func (s *confirmedReplicationSuite) TestBlocksOnMultipleMounts(c *check.C) {
+       s.blockStateMap.AddReplicas(&KeepMount{KeepMount: arvados.KeepMount{
+               Replication:    2,
+               StorageClasses: map[string]bool{"default": true, "four": true},
+       }}, []arvados.KeepServiceIndexEntry{
+               {SizedDigest: knownBlkid(40), Mtime: s.mtime},
+               {SizedDigest: knownBlkid(41), Mtime: s.mtime},
+       })
+       s.blockStateMap.AddReplicas(&KeepMount{KeepMount: arvados.KeepMount{
+               Replication:    2,
+               StorageClasses: map[string]bool{"four": true},
+       }}, []arvados.KeepServiceIndexEntry{
+               {SizedDigest: knownBlkid(40), Mtime: s.mtime},
+               {SizedDigest: knownBlkid(41), Mtime: s.mtime},
+       })
+       n := s.blockStateMap.GetConfirmedReplication([]arvados.SizedDigest{knownBlkid(40), knownBlkid(41)}, []string{"default"})
+       c.Check(n, check.Equals, 2)
+       n = s.blockStateMap.GetConfirmedReplication([]arvados.SizedDigest{knownBlkid(40), knownBlkid(41)}, []string{"four"})
+       c.Check(n, check.Equals, 4)
+       n = s.blockStateMap.GetConfirmedReplication([]arvados.SizedDigest{knownBlkid(40), knownBlkid(41)}, []string{"default", "four"})
+       c.Check(n, check.Equals, 2)
+       n = s.blockStateMap.GetConfirmedReplication([]arvados.SizedDigest{knownBlkid(40), knownBlkid(41)}, nil)
+       c.Check(n, check.Equals, 4)
+}
index 1659918cafe20c62162abfb1e841a40f80170c0a..1e1e51abe7ebcd55e5d0ed7ca55eae02a8b56504 100644 (file)
@@ -6,10 +6,15 @@ package main
 
 import (
        "context"
+       "encoding/json"
        "fmt"
+       "runtime"
+       "sync"
+       "sync/atomic"
        "time"
 
        "git.arvados.org/arvados.git/sdk/go/arvados"
+       "github.com/jmoiron/sqlx"
 )
 
 func countCollections(c *arvados.Client, params arvados.ResourceListParams) (int, error) {
@@ -28,10 +33,7 @@ func countCollections(c *arvados.Client, params arvados.ResourceListParams) (int
 // The progress function is called periodically with done (number of
 // times f has been called) and total (number of times f is expected
 // to be called).
-//
-// If pageSize > 0 it is used as the maximum page size in each API
-// call; otherwise the maximum allowed page size is requested.
-func EachCollection(ctx context.Context, c *arvados.Client, pageSize int, f func(arvados.Collection) error, progress func(done, total int)) error {
+func EachCollection(ctx context.Context, db *sqlx.DB, c *arvados.Client, f func(arvados.Collection) error, progress func(done, total int)) error {
        if progress == nil {
                progress = func(_, _ int) {}
        }
@@ -43,125 +45,222 @@ func EachCollection(ctx context.Context, c *arvados.Client, pageSize int, f func
        if err != nil {
                return err
        }
+       var newestModifiedAt time.Time
 
-       // Note the obvious way to get all collections (sorting by
-       // UUID) would be much easier, but would lose data: If a
-       // client were to move files from collection with uuid="zzz"
-       // to a collection with uuid="aaa" around the time when we
-       // were fetching the "mmm" page, we would never see those
-       // files' block IDs at all -- even if the client is careful to
-       // save "aaa" before saving "zzz".
-       //
-       // Instead, we get pages in modified_at order. Collections
-       // that are modified during the run will be re-fetched in a
-       // subsequent page.
-
-       limit := pageSize
-       if limit <= 0 {
-               // Use the maximum page size the server allows
-               limit = 1<<31 - 1
-       }
-       params := arvados.ResourceListParams{
-               Limit:              &limit,
-               Order:              "modified_at, uuid",
-               Count:              "none",
-               Select:             []string{"uuid", "unsigned_manifest_text", "modified_at", "portable_data_hash", "replication_desired"},
-               IncludeTrash:       true,
-               IncludeOldVersions: true,
+       rows, err := db.QueryxContext(ctx, `SELECT
+               uuid, manifest_text, modified_at, portable_data_hash,
+               replication_desired, replication_confirmed, replication_confirmed_at,
+               storage_classes_desired, storage_classes_confirmed, storage_classes_confirmed_at,
+               is_trashed
+               FROM collections`)
+       if err != nil {
+               return err
        }
-       var last arvados.Collection
-       var filterTime time.Time
+       defer rows.Close()
+       progressTicker := time.NewTicker(10 * time.Second)
+       defer progressTicker.Stop()
        callCount := 0
-       gettingExactTimestamp := false
-       for {
-               progress(callCount, expectCount)
-               var page arvados.CollectionList
-               err := c.RequestAndDecodeContext(ctx, &page, "GET", "arvados/v1/collections", nil, params)
+       for rows.Next() {
+               var coll arvados.Collection
+               var classesDesired, classesConfirmed []byte
+               err = rows.Scan(&coll.UUID, &coll.ManifestText, &coll.ModifiedAt, &coll.PortableDataHash,
+                       &coll.ReplicationDesired, &coll.ReplicationConfirmed, &coll.ReplicationConfirmedAt,
+                       &classesDesired, &classesConfirmed, &coll.StorageClassesConfirmedAt,
+                       &coll.IsTrashed)
                if err != nil {
                        return err
                }
-               for _, coll := range page.Items {
-                       if last.ModifiedAt == coll.ModifiedAt && last.UUID >= coll.UUID {
-                               continue
-                       }
-                       callCount++
-                       err = f(coll)
-                       if err != nil {
-                               return err
-                       }
-                       last = coll
+
+               err = json.Unmarshal(classesDesired, &coll.StorageClassesDesired)
+               if err != nil && len(classesDesired) > 0 {
+                       return err
                }
-               if len(page.Items) == 0 && !gettingExactTimestamp {
-                       break
-               } else if last.ModifiedAt.IsZero() {
-                       return fmt.Errorf("BUG: Last collection on the page (%s) has no modified_at timestamp; cannot make progress", last.UUID)
-               } else if len(page.Items) > 0 && last.ModifiedAt == filterTime {
-                       // If we requested time>=X and never got a
-                       // time>X then we might not have received all
-                       // items with time==X yet. Switch to
-                       // gettingExactTimestamp mode (if we're not
-                       // there already), advancing our UUID
-                       // threshold with each request, until we get
-                       // an empty page.
-                       gettingExactTimestamp = true
-                       params.Filters = []arvados.Filter{{
-                               Attr:     "modified_at",
-                               Operator: "=",
-                               Operand:  filterTime,
-                       }, {
-                               Attr:     "uuid",
-                               Operator: ">",
-                               Operand:  last.UUID,
-                       }}
-               } else if gettingExactTimestamp {
-                       // This must be an empty page (in this mode,
-                       // an unequal timestamp is impossible) so we
-                       // can start getting pages of newer
-                       // collections.
-                       gettingExactTimestamp = false
-                       params.Filters = []arvados.Filter{{
-                               Attr:     "modified_at",
-                               Operator: ">",
-                               Operand:  filterTime,
-                       }}
-               } else {
-                       // In the normal case, we know we have seen
-                       // all collections with modtime<filterTime,
-                       // but we might not have seen all that have
-                       // modtime=filterTime. Hence we use >= instead
-                       // of > and skip the obvious overlapping item,
-                       // i.e., the last item on the previous
-                       // page. In some edge cases this can return
-                       // collections we have already seen, but
-                       // avoiding that would add overhead in the
-                       // overwhelmingly common cases, so we don't
-                       // bother.
-                       filterTime = last.ModifiedAt
-                       params.Filters = []arvados.Filter{{
-                               Attr:     "modified_at",
-                               Operator: ">=",
-                               Operand:  filterTime,
-                       }, {
-                               Attr:     "uuid",
-                               Operator: "!=",
-                               Operand:  last.UUID,
-                       }}
+               err = json.Unmarshal(classesConfirmed, &coll.StorageClassesConfirmed)
+               if err != nil && len(classesConfirmed) > 0 {
+                       return err
+               }
+               if newestModifiedAt.IsZero() || newestModifiedAt.Before(coll.ModifiedAt) {
+                       newestModifiedAt = coll.ModifiedAt
+               }
+               callCount++
+               err = f(coll)
+               if err != nil {
+                       return err
+               }
+               select {
+               case <-progressTicker.C:
+                       progress(callCount, expectCount)
+               default:
                }
        }
        progress(callCount, expectCount)
-
+       err = rows.Close()
+       if err != nil {
+               return err
+       }
        if checkCount, err := countCollections(c, arvados.ResourceListParams{
                Filters: []arvados.Filter{{
                        Attr:     "modified_at",
                        Operator: "<=",
-                       Operand:  filterTime}},
+                       Operand:  newestModifiedAt}},
                IncludeTrash:       true,
                IncludeOldVersions: true,
        }); err != nil {
                return err
        } else if callCount < checkCount {
-               return fmt.Errorf("Retrieved %d collections with modtime <= T=%q, but server now reports there are %d collections with modtime <= T", callCount, filterTime, checkCount)
+               return fmt.Errorf("Retrieved %d collections with modtime <= T=%q, but server now reports there are %d collections with modtime <= T", callCount, newestModifiedAt, checkCount)
        }
 
        return nil
 }
+
+func (bal *Balancer) updateCollections(ctx context.Context, c *arvados.Client, cluster *arvados.Cluster) error {
+       ctx, cancel := context.WithCancel(ctx)
+       defer cancel()
+
+       defer bal.time("update_collections", "wall clock time to update collections")()
+       threshold := time.Now()
+       thresholdStr := threshold.Format(time.RFC3339Nano)
+
+       updated := int64(0)
+
+       errs := make(chan error, 1)
+       collQ := make(chan arvados.Collection, cluster.Collections.BalanceCollectionBuffers)
+       go func() {
+               defer close(collQ)
+               err := EachCollection(ctx, bal.DB, c, func(coll arvados.Collection) error {
+                       if atomic.LoadInt64(&updated) >= int64(cluster.Collections.BalanceUpdateLimit) {
+                               bal.logf("reached BalanceUpdateLimit (%d)", cluster.Collections.BalanceUpdateLimit)
+                               cancel()
+                               return context.Canceled
+                       }
+                       collQ <- coll
+                       return nil
+               }, func(done, total int) {
+                       bal.logf("update collections: %d/%d (%d updated @ %.01f updates/s)", done, total, atomic.LoadInt64(&updated), float64(atomic.LoadInt64(&updated))/time.Since(threshold).Seconds())
+               })
+               if err != nil && err != context.Canceled {
+                       select {
+                       case errs <- err:
+                       default:
+                       }
+               }
+       }()
+
+       var wg sync.WaitGroup
+
+       // Use about 1 goroutine per 2 CPUs. Based on experiments with
+       // a 2-core host, using more concurrent database
+       // calls/transactions makes this process slower, not faster.
+       for i := 0; i < (runtime.NumCPU()+1)/2; i++ {
+               wg.Add(1)
+               goSendErr(errs, func() error {
+                       defer wg.Done()
+                       tx, err := bal.DB.Beginx()
+                       if err != nil {
+                               return err
+                       }
+                       txPending := 0
+                       flush := func(final bool) error {
+                               err := tx.Commit()
+                               if err != nil && ctx.Err() == nil {
+                                       tx.Rollback()
+                                       return err
+                               }
+                               txPending = 0
+                               if final {
+                                       return nil
+                               }
+                               tx, err = bal.DB.Beginx()
+                               return err
+                       }
+                       txBatch := 100
+                       for coll := range collQ {
+                               if ctx.Err() != nil || len(errs) > 0 {
+                                       continue
+                               }
+                               blkids, err := coll.SizedDigests()
+                               if err != nil {
+                                       bal.logf("%s: %s", coll.UUID, err)
+                                       continue
+                               }
+                               repl := bal.BlockStateMap.GetConfirmedReplication(blkids, coll.StorageClassesDesired)
+
+                               desired := bal.DefaultReplication
+                               if coll.ReplicationDesired != nil {
+                                       desired = *coll.ReplicationDesired
+                               }
+                               if repl > desired {
+                                       // If actual>desired, confirm
+                                       // the desired number rather
+                                       // than actual to avoid
+                                       // flapping updates when
+                                       // replication increases
+                                       // temporarily.
+                                       repl = desired
+                               }
+                               classes := emptyJSONArray
+                               if repl > 0 {
+                                       classes, err = json.Marshal(coll.StorageClassesDesired)
+                                       if err != nil {
+                                               bal.logf("BUG? json.Marshal(%v) failed: %s", coll.StorageClassesDesired, err)
+                                               continue
+                                       }
+                               }
+                               needUpdate := coll.ReplicationConfirmed == nil || *coll.ReplicationConfirmed != repl || len(coll.StorageClassesConfirmed) != len(coll.StorageClassesDesired)
+                               for i := range coll.StorageClassesDesired {
+                                       if !needUpdate && coll.StorageClassesDesired[i] != coll.StorageClassesConfirmed[i] {
+                                               needUpdate = true
+                                       }
+                               }
+                               if !needUpdate {
+                                       continue
+                               }
+                               _, err = tx.ExecContext(ctx, `update collections set
+                                       replication_confirmed=$1,
+                                       replication_confirmed_at=$2,
+                                       storage_classes_confirmed=$3,
+                                       storage_classes_confirmed_at=$2
+                                       where uuid=$4`,
+                                       repl, thresholdStr, classes, coll.UUID)
+                               if err != nil {
+                                       if ctx.Err() == nil {
+                                               bal.logf("%s: update failed: %s", coll.UUID, err)
+                                       }
+                                       continue
+                               }
+                               atomic.AddInt64(&updated, 1)
+                               if txPending++; txPending >= txBatch {
+                                       err = flush(false)
+                                       if err != nil {
+                                               return err
+                                       }
+                               }
+                       }
+                       return flush(true)
+               })
+       }
+       wg.Wait()
+       bal.logf("updated %d collections", updated)
+       if len(errs) > 0 {
+               return fmt.Errorf("error updating collections: %s", <-errs)
+       }
+       return nil
+}
+
+// Call f in a new goroutine. If it returns a non-nil error, send the
+// error to the errs channel (unless the channel is already full with
+// another error).
+func goSendErr(errs chan<- error, f func() error) {
+       go func() {
+               err := f()
+               if err != nil {
+                       select {
+                       case errs <- err:
+                       default:
+                       }
+               }
+       }()
+}
+
+var emptyJSONArray = []byte("[]")
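
The collection-update worker above combines a bounded work queue, a small pool of database workers that commit in batches of txBatch, and goSendErr's non-blocking send so the first failure is recorded without any worker blocking on the error channel. A minimal, self-contained sketch of that shape — no real database; the batch size, worker count, and doUpdate stand-in are illustrative, not Arvados API:

    package main

    import (
    	"fmt"
    	"sync"
    )

    func main() {
    	const txBatch = 100
    	work := make(chan int)
    	errs := make(chan error, 1) // capacity 1: only the first error is kept

    	// producer: queue the work items, then close the channel
    	go func() {
    		defer close(work)
    		for i := 0; i < 1000; i++ {
    			work <- i
    		}
    	}()

    	var wg sync.WaitGroup
    	for w := 0; w < 2; w++ {
    		wg.Add(1)
    		go func() {
    			defer wg.Done()
    			pending := 0
    			for item := range work {
    				if len(errs) > 0 {
    					continue // keep draining so the producer can finish
    				}
    				if err := doUpdate(item); err != nil {
    					// non-blocking send, as goSendErr does
    					select {
    					case errs <- err:
    					default:
    					}
    					continue
    				}
    				if pending++; pending >= txBatch {
    					pending = 0 // a real worker would commit here and begin a new tx
    				}
    			}
    		}()
    	}
    	wg.Wait()
    	select {
    	case err := <-errs:
    		fmt.Println("failed:", err)
    	default:
    		fmt.Println("all updates applied")
    	}
    }

    // doUpdate is a stand-in for the per-collection transaction work.
    func doUpdate(int) error { return nil }
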
index 3ab9d07b2e2ed6bcc7220ae17aad4e6e7a665855..f749bad6ad1865a30670d0fe2978dfe8ebd2764c 100644 (file)
@@ -6,57 +6,34 @@ package main
 
 import (
        "context"
-       "sync"
-       "time"
 
+       "git.arvados.org/arvados.git/lib/config"
        "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "github.com/jmoiron/sqlx"
        check "gopkg.in/check.v1"
 )
 
-//  TestIdenticalTimestamps ensures EachCollection returns the same
-//  set of collections for various page sizes -- even page sizes so
-//  small that we get entire pages full of collections with identical
-//  timestamps and exercise our gettingExactTimestamp cases.
-func (s *integrationSuite) TestIdenticalTimestamps(c *check.C) {
-       // pageSize==0 uses the default (large) page size.
-       pageSizes := []int{0, 2, 3, 4, 5}
-       got := make([][]string, len(pageSizes))
-       var wg sync.WaitGroup
-       for trial, pageSize := range pageSizes {
-               wg.Add(1)
-               go func(trial, pageSize int) {
-                       defer wg.Done()
-                       streak := 0
-                       longestStreak := 0
-                       var lastMod time.Time
-                       sawUUID := make(map[string]bool)
-                       err := EachCollection(context.Background(), s.client, pageSize, func(c arvados.Collection) error {
-                               if c.ModifiedAt.IsZero() {
-                                       return nil
-                               }
-                               if sawUUID[c.UUID] {
-                                       // dup
-                                       return nil
-                               }
-                               got[trial] = append(got[trial], c.UUID)
-                               sawUUID[c.UUID] = true
-                               if lastMod == c.ModifiedAt {
-                                       streak++
-                                       if streak > longestStreak {
-                                               longestStreak = streak
-                                       }
-                               } else {
-                                       streak = 0
-                                       lastMod = c.ModifiedAt
-                               }
-                               return nil
-                       }, nil)
-                       c.Check(err, check.IsNil)
-                       c.Check(longestStreak > 25, check.Equals, true)
-               }(trial, pageSize)
-       }
-       wg.Wait()
-       for trial := 1; trial < len(pageSizes); trial++ {
-               c.Check(got[trial], check.DeepEquals, got[0])
-       }
+// TestMissedCollections exercises EachCollection's sanity check: it
+// must visit at least as many collections as the database still holds
+// from before the scan started, otherwise it returns an error.
+func (s *integrationSuite) TestMissedCollections(c *check.C) {
+       cfg, err := config.NewLoader(nil, ctxlog.TestLogger(c)).Load()
+       c.Assert(err, check.IsNil)
+       cluster, err := cfg.GetCluster("")
+       c.Assert(err, check.IsNil)
+       db, err := sqlx.Open("postgres", cluster.PostgreSQL.Connection.String())
+       c.Assert(err, check.IsNil)
+
+       defer db.Exec(`delete from collections where uuid = 'zzzzz-4zz18-404040404040404'`)
+       insertedOld := false
+       err = EachCollection(context.Background(), db, s.client, func(coll arvados.Collection) error {
+               if !insertedOld {
+                       insertedOld = true
+                       _, err := db.Exec(`insert into collections (uuid, created_at, updated_at, modified_at) values ('zzzzz-4zz18-404040404040404', '2002-02-02T02:02:02Z', '2002-02-02T02:02:02Z', '2002-02-02T02:02:02Z')`)
+                       return err
+               }
+               return nil
+       }, nil)
+       c.Check(err, check.ErrorMatches, `Retrieved .* collections .* but server now reports .* collections.*`)
 }
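
The replacement test leans on EachCollection's completeness check: roughly, it counts the collections it visits while paging in modification-time order, then compares that against how many collections predating the scan the server still reports, so inserting a backdated row mid-scan guarantees a shortfall. A sketch of the final comparison only — visited and oldCount are hypothetical stand-ins for EachCollection's internal bookkeeping, not its real variables:

    // shape of the post-scan sanity check, not the actual implementation
    func checkScanComplete(visited, oldCount int) error {
    	if visited < oldCount {
    		return fmt.Errorf("Retrieved %d collections but server now reports %d collections",
    			visited, oldCount)
    	}
    	return nil
    }
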
index defabd9a109f27b348b61aeccb2ed822d2fa862e..52e6149158253bdff24705dcb8b6fb7a00f8cb02 100644 (file)
@@ -6,6 +6,7 @@ package main
 
 import (
        "bytes"
+       "io"
        "os"
        "strings"
        "testing"
@@ -17,6 +18,7 @@ import (
        "git.arvados.org/arvados.git/sdk/go/arvadostest"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "git.arvados.org/arvados.git/sdk/go/keepclient"
+       "github.com/jmoiron/sqlx"
        "github.com/prometheus/client_golang/prometheus"
        "github.com/sirupsen/logrus"
        check "gopkg.in/check.v1"
@@ -26,6 +28,7 @@ var _ = check.Suite(&integrationSuite{})
 
 type integrationSuite struct {
        config     *arvados.Cluster
+       db         *sqlx.DB
        client     *arvados.Client
        keepClient *keepclient.KeepClient
 }
@@ -67,6 +70,8 @@ func (s *integrationSuite) SetUpTest(c *check.C) {
        c.Assert(err, check.Equals, nil)
        s.config, err = cfg.GetCluster("")
        c.Assert(err, check.Equals, nil)
+       s.db, err = sqlx.Open("postgres", s.config.PostgreSQL.Connection.String())
+       c.Assert(err, check.IsNil)
        s.config.Collections.BalancePeriod = arvados.Duration(time.Second)
 
        s.client = &arvados.Client{
@@ -81,14 +86,16 @@ func (s *integrationSuite) TestBalanceAPIFixtures(c *check.C) {
        for iter := 0; iter < 20; iter++ {
                logBuf.Reset()
                logger := logrus.New()
-               logger.Out = &logBuf
+               logger.Out = io.MultiWriter(&logBuf, os.Stderr)
                opts := RunOptions{
-                       CommitPulls: true,
-                       CommitTrash: true,
-                       Logger:      logger,
+                       CommitPulls:           true,
+                       CommitTrash:           true,
+                       CommitConfirmedFields: true,
+                       Logger:                logger,
                }
 
                bal := &Balancer{
+                       DB:      s.db,
                        Logger:  logger,
                        Metrics: newMetrics(prometheus.NewRegistry()),
                }
@@ -105,4 +112,23 @@ func (s *integrationSuite) TestBalanceAPIFixtures(c *check.C) {
                time.Sleep(200 * time.Millisecond)
        }
        c.Check(logBuf.String(), check.Not(check.Matches), `(?ms).*0 replicas (0 blocks, 0 bytes) underreplicated.*`)
+
+       for _, trial := range []struct {
+               uuid    string
+               repl    int
+               classes []string
+       }{
+               {arvadostest.EmptyCollectionUUID, 0, []string{}},
+               {arvadostest.FooCollection, 2, []string{"default"}},                                // "foo" blk
+               {arvadostest.StorageClassesDesiredDefaultConfirmedDefault, 2, []string{"default"}}, // "bar" blk
+               {arvadostest.StorageClassesDesiredArchiveConfirmedDefault, 0, []string{}},          // "bar" blk
+       } {
+               c.Logf("%#v", trial)
+               var coll arvados.Collection
+               s.client.RequestAndDecode(&coll, "GET", "arvados/v1/collections/"+trial.uuid, nil, nil)
+               if c.Check(coll.ReplicationConfirmed, check.NotNil) {
+                       c.Check(*coll.ReplicationConfirmed, check.Equals, trial.repl)
+               }
+               c.Check(coll.StorageClassesConfirmed, check.DeepEquals, trial.classes)
+       }
 }
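
Why the archive-desired fixture confirms 0 replicas with an empty class list: its "bar" block has replicas only on default-class volumes, so no replication can be confirmed in the requested archive class, and the balancer writes an empty classes array whenever confirmed replication is zero. Any Go client can read the committed fields back the same way the test does; a fragment assuming client and uuid come from surrounding context:

    var coll arvados.Collection
    err := client.RequestAndDecode(&coll, "GET", "arvados/v1/collections/"+uuid, nil, nil)
    if err == nil && coll.ReplicationConfirmed != nil {
    	fmt.Printf("%s: %d replicas confirmed in classes %v\n",
    		coll.UUID, *coll.ReplicationConfirmed, coll.StorageClassesConfirmed)
    }
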
index 0a38597e6caf28a3f1a4ba45a9b568c2920d56c8..859d70724106f815ab9a990e0a9615c09a4a5189 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados Keep Balance
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index 8b4ee84c716e4596987b1371b38035610f9ffa2f..e1573e7f733935028d164d6d5dd69d383fdf338f 100644 (file)
@@ -9,6 +9,8 @@ import (
        "flag"
        "fmt"
        "io"
+       "net/http"
+       _ "net/http/pprof"
        "os"
 
        "git.arvados.org/arvados.git/lib/config"
@@ -16,6 +18,8 @@ import (
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "git.arvados.org/arvados.git/sdk/go/health"
+       "github.com/jmoiron/sqlx"
+       _ "github.com/lib/pq"
        "github.com/prometheus/client_golang/prometheus"
        "github.com/sirupsen/logrus"
 )
@@ -35,8 +39,17 @@ func runCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.W
                "send pull requests (make more replicas of blocks that are underreplicated or are not in optimal rendezvous probe order)")
        flags.BoolVar(&options.CommitTrash, "commit-trash", false,
                "send trash requests (delete unreferenced old blocks, and excess replicas of overreplicated blocks)")
+       flags.BoolVar(&options.CommitConfirmedFields, "commit-confirmed-fields", true,
+               "update collection fields (replication_confirmed, storage_classes_confirmed, etc.)")
        flags.Bool("version", false, "Write version information to stdout and exit 0")
        dumpFlag := flags.Bool("dump", false, "dump details for each block to stdout")
+       pprofAddr := flags.String("pprof", "", "serve Go profile data at `[addr]:port`")
+
+       if *pprofAddr != "" {
+               go func() {
+                       logrus.Println(http.ListenAndServe(*pprofAddr, nil))
+               }()
+       }
 
        loader := config.NewLoader(os.Stdin, logger)
        loader.SetupFlags(flags)
@@ -55,10 +68,11 @@ func runCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.W
        // service.Command
        args = nil
        dropFlag := map[string]bool{
-               "once":         true,
-               "commit-pulls": true,
-               "commit-trash": true,
-               "dump":         true,
+               "once":                    true,
+               "commit-pulls":            true,
+               "commit-trash":            true,
+               "commit-confirmed-fields": true,
+               "dump":                    true,
        }
        flags.Visit(func(f *flag.Flag) {
                if !dropFlag[f.Name] {
@@ -78,6 +92,18 @@ func runCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.W
                                return service.ErrorHandler(ctx, cluster, fmt.Errorf("error initializing client from cluster config: %s", err))
                        }
 
+                       db, err := sqlx.Open("postgres", cluster.PostgreSQL.Connection.String())
+                       if err != nil {
+                               return service.ErrorHandler(ctx, cluster, fmt.Errorf("postgresql connection failed: %s", err))
+                       }
+                       if p := cluster.PostgreSQL.ConnectionPool; p > 0 {
+                               db.SetMaxOpenConns(p)
+                       }
+                       err = db.Ping()
+                       if err != nil {
+                               return service.ErrorHandler(ctx, cluster, fmt.Errorf("postgresql connection succeeded but ping failed: %s", err))
+                       }
+
                        if options.Logger == nil {
                                options.Logger = ctxlog.FromContext(ctx)
                        }
@@ -89,6 +115,7 @@ func runCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.W
                                Metrics:    newMetrics(registry),
                                Logger:     options.Logger,
                                Dumper:     options.Dumper,
+                               DB:         db,
                        }
                        srv.Handler = &health.Handler{
                                Token:  cluster.ManagementToken,
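
Taken together, these hunks give keep-balance a database handle of its own: open the cluster's PostgreSQL DSN via sqlx, cap the pool at PostgreSQL.ConnectionPool, and fail fast with a ping before the Balancer runs. A condensed sketch of the same wiring as a standalone program — the DSN and pool size are placeholders, and since a -pprof style listener is only effective once the flag value has actually been parsed, the sketch hard-codes an address:

    package main

    import (
    	"log"
    	"net/http"
    	_ "net/http/pprof" // registers /debug/pprof handlers on the default mux

    	"github.com/jmoiron/sqlx"
    	_ "github.com/lib/pq"
    )

    func main() {
    	// placeholder DSN; keep-balance uses cluster.PostgreSQL.Connection.String()
    	db, err := sqlx.Open("postgres", "dbname=example sslmode=disable")
    	if err != nil {
    		log.Fatalf("postgresql connection failed: %s", err)
    	}
    	db.SetMaxOpenConns(4) // cluster.PostgreSQL.ConnectionPool in the real service
    	if err := db.Ping(); err != nil {
    		log.Fatalf("postgresql connection succeeded but ping failed: %s", err)
    	}
    	go func() { log.Println(http.ListenAndServe("localhost:6060", nil)) }()
    	select {} // stand-in for the service's main loop
    }
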
index b154f6e99848a3623b167726412ce5b48a59c715..65a2d5567a86505e7d6e4866aa8ffa75e3bf2deb 100644 (file)
@@ -11,6 +11,11 @@ import (
        "net/http"
        "time"
 
+       "git.arvados.org/arvados.git/lib/config"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/arvadostest"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "github.com/ghodss/yaml"
        check "gopkg.in/check.v1"
 )
 
@@ -26,6 +31,8 @@ func (s *mainSuite) TestVersionFlag(c *check.C) {
 }
 
 func (s *mainSuite) TestHTTPServer(c *check.C) {
+       arvadostest.StartKeep(2, true)
+
        ln, err := net.Listen("tcp", ":0")
        if err != nil {
                c.Fatal(err)
@@ -33,10 +40,17 @@ func (s *mainSuite) TestHTTPServer(c *check.C) {
        _, p, err := net.SplitHostPort(ln.Addr().String())
        c.Check(err, check.IsNil)
        ln.Close()
-       config := "Clusters:\n zzzzz:\n  ManagementToken: abcdefg\n  Services: {Keepbalance: {InternalURLs: {'http://localhost:" + p + "/': {}}}}\n"
+       cfg, err := config.NewLoader(nil, ctxlog.TestLogger(c)).Load()
+       c.Assert(err, check.IsNil)
+       cluster, err := cfg.GetCluster("")
+       c.Assert(err, check.IsNil)
+       cluster.Services.Keepbalance.InternalURLs[arvados.URL{Host: "localhost:" + p, Path: "/"}] = arvados.ServiceInstance{}
+       cfg.Clusters[cluster.ClusterID] = *cluster
+       config, err := yaml.Marshal(cfg)
+       c.Assert(err, check.IsNil)
 
        var stdout bytes.Buffer
-       go runCommand("keep-balance", []string{"-config", "-"}, bytes.NewBufferString(config), &stdout, &stdout)
+       go runCommand("keep-balance", []string{"-config", "-"}, bytes.NewBuffer(config), &stdout, &stdout)
        done := make(chan struct{})
        go func() {
                defer close(done)
@@ -47,7 +61,7 @@ func (s *mainSuite) TestHTTPServer(c *check.C) {
                                c.Fatal(err)
                                return
                        }
-                       req.Header.Set("Authorization", "Bearer abcdefg")
+                       req.Header.Set("Authorization", "Bearer "+cluster.ManagementToken)
                        resp, err := http.DefaultClient.Do(req)
                        if err != nil {
                                c.Logf("error %s", err)
@@ -73,6 +87,7 @@ func (s *mainSuite) TestHTTPServer(c *check.C) {
                c.Log(stdout.String())
                c.Fatal("timeout")
        }
+       c.Log(stdout.String())
 
        // Check non-metrics URL that gets passed through to us from
        // service.Command
index 9801a3fd45d5d13ec40bf661c59b4de5156cfeed..5299b96c1caf2ac3aaa28c639e71d501ddbbd637 100644 (file)
@@ -12,6 +12,7 @@ import (
        "time"
 
        "git.arvados.org/arvados.git/sdk/go/arvados"
+       "github.com/jmoiron/sqlx"
        "github.com/sirupsen/logrus"
 )
 
@@ -23,11 +24,12 @@ import (
 //
 // RunOptions fields are controlled by command line flags.
 type RunOptions struct {
-       Once        bool
-       CommitPulls bool
-       CommitTrash bool
-       Logger      logrus.FieldLogger
-       Dumper      logrus.FieldLogger
+       Once                  bool
+       CommitPulls           bool
+       CommitTrash           bool
+       CommitConfirmedFields bool
+       Logger                logrus.FieldLogger
+       Dumper                logrus.FieldLogger
 
        // SafeRendezvousState from the most recent balance operation,
        // or "" if unknown. If this changes from one run to the next,
@@ -46,11 +48,13 @@ type Server struct {
 
        Logger logrus.FieldLogger
        Dumper logrus.FieldLogger
+
+       DB *sqlx.DB
 }
 
 // CheckHealth implements service.Handler.
 func (srv *Server) CheckHealth() error {
-       return nil
+       return srv.DB.Ping()
 }
 
 // Done implements service.Handler.
@@ -75,6 +79,7 @@ func (srv *Server) run() {
 
 func (srv *Server) runOnce() (*Balancer, error) {
        bal := &Balancer{
+               DB:             srv.DB,
                Logger:         srv.Logger,
                Dumper:         srv.Dumper,
                Metrics:        srv.Metrics,
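
Wiring the DB into Server makes CheckHealth meaningful: the health endpoint now reflects database reachability, since a balancer that cannot reach PostgreSQL cannot commit anything. A toy demonstration of the same pattern (placeholder DSN; server is a stand-in type, not the real Server):

    package main

    import (
    	"fmt"

    	"github.com/jmoiron/sqlx"
    	_ "github.com/lib/pq"
    )

    type server struct{ db *sqlx.DB }

    // healthy iff the database answers a ping
    func (s *server) CheckHealth() error { return s.db.Ping() }

    func main() {
    	db, err := sqlx.Open("postgres", "dbname=example sslmode=disable")
    	if err != nil {
    		fmt.Println(err)
    		return
    	}
    	srv := &server{db: db}
    	fmt.Println("health:", srv.CheckHealth())
    	db.Close()
    	fmt.Println("health after close:", srv.CheckHealth()) // non-nil: pool is closed
    }
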
index 80978227770998fa2ac1b69d5149cfd4e2ca9530..cb5fdf84fc5f3362d34f3b2d56e729f2e74a6911 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados Keep web gateway
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index 4c63161a0d7a260ce7dd349f62214d6e7d162f36..9548cb219b1f2bd7efa2490590b30bcb52b85fdc 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados Keep Proxy
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index c49fbe0bb368b90733f985990acadb651323f7fe..4bdb4202026e3aeb2705fa79faa0ef3e7a296307 100644 (file)
@@ -7,7 +7,6 @@ package main
 import (
        "bytes"
        "crypto/md5"
-       "errors"
        "fmt"
        "io/ioutil"
        "math/rand"
@@ -265,12 +264,16 @@ func (s *ServerRequiredSuite) TestDesiredReplicas(c *C) {
        content := []byte("TestDesiredReplicas")
        hash := fmt.Sprintf("%x", md5.Sum(content))
 
-       for _, kc.Want_replicas = range []int{0, 1, 2} {
+       for _, kc.Want_replicas = range []int{0, 1, 2, 3} {
                locator, rep, err := kc.PutB(content)
-               c.Check(err, Equals, nil)
-               c.Check(rep, Equals, kc.Want_replicas)
-               if rep > 0 {
-                       c.Check(locator, Matches, fmt.Sprintf(`^%s\+%d(\+.+)?$`, hash, len(content)))
+               if kc.Want_replicas < 3 {
+                       c.Check(err, Equals, nil)
+                       c.Check(rep, Equals, kc.Want_replicas)
+                       if rep > 0 {
+                               c.Check(locator, Matches, fmt.Sprintf(`^%s\+%d(\+.+)?$`, hash, len(content)))
+                       }
+               } else {
+                       c.Check(err, ErrorMatches, ".*503.*")
                }
        }
 }
@@ -438,7 +441,7 @@ func (s *ServerRequiredSuite) TestPutAskGetForbidden(c *C) {
        hash2, rep, err := kc.PutB([]byte("bar"))
        c.Check(hash2, Equals, "")
        c.Check(rep, Equals, 0)
-       c.Check(err, FitsTypeOf, keepclient.InsufficientReplicasError(errors.New("")))
+       c.Check(err, FitsTypeOf, keepclient.InsufficientReplicasError{})
 
        blocklen, _, err := kc.Ask(hash)
        c.Check(err, FitsTypeOf, &keepclient.ErrNotFound{})
@@ -491,7 +494,7 @@ func testPermission(c *C, admin bool, perm arvados.UploadDownloadPermission) {
                } else {
                        c.Check(hash2, Equals, "")
                        c.Check(rep, Equals, 0)
-                       c.Check(err, FitsTypeOf, keepclient.InsufficientReplicasError(errors.New("")))
+                       c.Check(err, FitsTypeOf, keepclient.InsufficientReplicasError{})
                }
                logbuf.Reset()
        }
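
These keepproxy test updates track a Go SDK change: keepclient.InsufficientReplicasError is now a struct type rather than a value wrapping errors.New(""), so tests construct it as a zero value. A struct error type also plays well with errors.As; a hedged fragment (assuming the errors and keepclient imports, and that the type implements error, as the FitsTypeOf checks imply):

    // sketch: detect the struct-typed error anywhere in an error chain
    func isInsufficientReplicas(err error) bool {
    	var target keepclient.InsufficientReplicasError
    	return errors.As(err, &target)
    }
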
index db64449e48bed2234ae32a97fbc122b360a77576..897447dd11c7a95a5b113d867fb0de28cbed6844 100644 (file)
@@ -21,7 +21,6 @@ import (
        "net/http"
        "net/http/httptest"
        "os"
-       "regexp"
        "sort"
        "strings"
        "time"
@@ -320,6 +319,54 @@ func (s *HandlerSuite) TestPutAndDeleteSkipReadonlyVolumes(c *check.C) {
        }
 }
 
+func (s *HandlerSuite) TestReadsOrderedByStorageClassPriority(c *check.C) {
+       s.cluster.Volumes = map[string]arvados.Volume{
+               "zzzzz-nyw5e-111111111111111": {
+                       Driver:         "mock",
+                       Replication:    1,
+                       StorageClasses: map[string]bool{"class1": true}},
+               "zzzzz-nyw5e-222222222222222": {
+                       Driver:         "mock",
+                       Replication:    1,
+                       StorageClasses: map[string]bool{"class2": true, "class3": true}},
+       }
+
+       for _, trial := range []struct {
+               priority1 int // priority of class1, thus vol1
+               priority2 int // priority of class2
+               priority3 int // priority of class3 (vol2 priority will be max(priority2, priority3))
+               get1      int // expected number of "get" ops on vol1
+               get2      int // expected number of "get" ops on vol2
+       }{
+               {100, 50, 50, 1, 0},   // class1 has higher priority => try vol1 first, no need to try vol2
+               {100, 100, 100, 1, 0}, // same priority, vol1 is first lexicographically => try vol1 first and succeed
+               {66, 99, 33, 1, 1},    // class2 has higher priority => try vol2 first, then try vol1
+               {66, 33, 99, 1, 1},    // class3 has highest priority => vol2 has highest => try vol2 first, then try vol1
+       } {
+               c.Logf("%+v", trial)
+               s.cluster.StorageClasses = map[string]arvados.StorageClassConfig{
+                       "class1": {Priority: trial.priority1},
+                       "class2": {Priority: trial.priority2},
+                       "class3": {Priority: trial.priority3},
+               }
+               c.Assert(s.handler.setup(context.Background(), s.cluster, "", prometheus.NewRegistry(), testServiceURL), check.IsNil)
+               IssueRequest(s.handler,
+                       &RequestTester{
+                               method:         "PUT",
+                               uri:            "/" + TestHash,
+                               requestBody:    TestBlock,
+                               storageClasses: "class1",
+                       })
+               IssueRequest(s.handler,
+                       &RequestTester{
+                               method: "GET",
+                               uri:    "/" + TestHash,
+                       })
+               c.Check(s.handler.volmgr.mountMap["zzzzz-nyw5e-111111111111111"].Volume.(*MockVolume).CallCount("Get"), check.Equals, trial.get1)
+               c.Check(s.handler.volmgr.mountMap["zzzzz-nyw5e-222222222222222"].Volume.(*MockVolume).CallCount("Get"), check.Equals, trial.get2)
+       }
+}
+
 // Test TOUCH requests.
 func (s *HandlerSuite) TestTouchHandler(c *check.C) {
        c.Assert(s.handler.setup(context.Background(), s.cluster, "", prometheus.NewRegistry(), testServiceURL), check.IsNil)
@@ -497,12 +544,8 @@ func (s *HandlerSuite) TestIndexHandler(c *check.C) {
 
        expected := `^` + TestHash + `\+\d+ \d+\n` +
                TestHash2 + `\+\d+ \d+\n\n$`
-       match, _ := regexp.MatchString(expected, response.Body.String())
-       if !match {
-               c.Errorf(
-                       "permissions on, superuser request: expected %s, got:\n%s",
-                       expected, response.Body.String())
-       }
+       c.Check(response.Body.String(), check.Matches, expected, check.Commentf(
+               "permissions on, superuser request"))
 
        // superuser /index/prefix request
        // => OK
@@ -513,12 +556,8 @@ func (s *HandlerSuite) TestIndexHandler(c *check.C) {
                response)
 
        expected = `^` + TestHash + `\+\d+ \d+\n\n$`
-       match, _ = regexp.MatchString(expected, response.Body.String())
-       if !match {
-               c.Errorf(
-                       "permissions on, superuser /index/prefix request: expected %s, got:\n%s",
-                       expected, response.Body.String())
-       }
+       c.Check(response.Body.String(), check.Matches, expected, check.Commentf(
+               "permissions on, superuser /index/prefix request"))
 
        // superuser /index/{no-such-prefix} request
        // => OK
index a60d17d576ae047b8a3198cd8c755c3828ef9875..2b469a13eb993e0827bac8ae1ebe4db46bc8c4df 100644 (file)
@@ -252,6 +252,13 @@ func (rtr *router) handlePUT(resp http.ResponseWriter, req *http.Request) {
                for i, sc := range wantStorageClasses {
                        wantStorageClasses[i] = strings.TrimSpace(sc)
                }
+       } else {
+               // none specified -- use configured default
+               for class, cfg := range rtr.cluster.StorageClasses {
+                       if cfg.Default {
+                               wantStorageClasses = append(wantStorageClasses, class)
+                       }
+               }
        }
 
        buf, err := getBufferWithContext(ctx, bufs, int(req.ContentLength))
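
The effect of the new else branch: a PUT that names no storage class is treated as targeting every class marked Default: true in the cluster configuration, rather than being classless. A toy illustration of the fallback, with a map literal standing in for rtr.cluster.StorageClasses:

    package main

    import "fmt"

    func main() {
    	// stand-in for cluster.StorageClasses: class name -> Default flag
    	defaults := map[string]bool{"default": true, "archive": false}
    	var want []string // empty: the client sent no storage-classes header
    	if len(want) == 0 {
    		for class, isDefault := range defaults {
    			if isDefault {
    				want = append(want, class)
    			}
    		}
    	}
    	fmt.Println(want) // [default]
    }
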
index 7047f0e6b970a0c0a9598d6e680d5589425c08c1..1f14c3f464c4b3a701a1f800e99534865a058908 100644 (file)
@@ -7,9 +7,6 @@ Description=Arvados Keep Storage Daemon
 Documentation=https://doc.arvados.org/
 After=network.target
 
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
 # systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
 StartLimitIntervalSec=0
 
index 26e6b731828f9be0861044cb6a7c4e10d097d05f..9bfc6ca3e5191d2953ceac75f915a07cab19c69f 100644 (file)
@@ -10,6 +10,7 @@ import (
        "fmt"
        "io"
        "math/big"
+       "sort"
        "sync/atomic"
        "time"
 
@@ -343,6 +344,27 @@ func makeRRVolumeManager(logger logrus.FieldLogger, cluster *arvados.Cluster, my
                        vm.writables = append(vm.writables, mnt)
                }
        }
+       // pri(i): return highest priority of any storage class
+       // offered by vm.readables[i]
+       pri := func(i int) int {
+               any, best := false, 0
+               for class := range vm.readables[i].KeepMount.StorageClasses {
+                       if p := cluster.StorageClasses[class].Priority; !any || best < p {
+                               best = p
+                               any = true
+                       }
+               }
+               return best
+       }
+       // sort vm.readables, first by highest priority of any offered
+       // storage class (highest->lowest), then by volume UUID
+       sort.Slice(vm.readables, func(i, j int) bool {
+               if pi, pj := pri(i), pri(j); pi != pj {
+                       return pi > pj
+               } else {
+                       return vm.readables[i].KeepMount.UUID < vm.readables[j].KeepMount.UUID
+               }
+       })
        return vm, nil
 }
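
To make the ordering concrete, take the third trial from the handler test above: class1=66, class2=99, class3=33. vol2's best class (class2, priority 99) beats vol1's (class1, 66), so reads try vol2 first and fall back to vol1. A runnable distillation of the comparator — two volumes hard-coded, and priorities assumed non-negative for brevity, unlike the real code's any/best guard:

    package main

    import (
    	"fmt"
    	"sort"
    )

    func main() {
    	priority := map[string]int{"class1": 66, "class2": 99, "class3": 33}
    	vols := []struct {
    		uuid    string
    		classes []string
    	}{
    		{"zzzzz-nyw5e-111111111111111", []string{"class1"}},
    		{"zzzzz-nyw5e-222222222222222", []string{"class2", "class3"}},
    	}
    	// highest priority of any class offered by vols[i]
    	pri := func(i int) int {
    		best := 0
    		for _, class := range vols[i].classes {
    			if p := priority[class]; p > best {
    				best = p
    			}
    		}
    		return best
    	}
    	sort.Slice(vols, func(i, j int) bool {
    		if pi, pj := pri(i), pri(j); pi != pj {
    			return pi > pj // higher priority first
    		}
    		return vols[i].uuid < vols[j].uuid // tie-break: UUID order
    	})
    	fmt.Println(vols[0].uuid, "is tried first") // the class2/class3 volume
    }
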
 
index d8836f19b32704258a77a6a3b1446f7fda2fc416..8e5c6deb5dc8ca47a08dd169157116141aeb0518 100755 (executable)
@@ -144,7 +144,7 @@ begin
       if existing_groups.index(addgroup).nil?
         # User should be in group, but isn't, so add them.
         STDERR.puts "Add user #{username} to #{addgroup} group"
-        system("adduser", username, addgroup)
+        system("usermod", "-aG", addgroup, username)
       end
     end
 
@@ -152,7 +152,7 @@ begin
       if groups.index(removegroup).nil?
         # User is in a group, but shouldn't be, so remove them.
         STDERR.puts "Remove user #{username} from #{removegroup} group"
-        system("deluser", username, removegroup)
+        system("gpasswd", "-d", username, removegroup)
       end
     end
 
index f7052efc105abcce54b1e50aa6b294debacf13b8..a7784fd7beced037199a5216f58ed9782235223c 100644 (file)
@@ -196,69 +196,59 @@ arvados:
         ProviderType: t3.small
         VCPUs: 2
         RAM: 2GiB
-        IncludedScratch: 50GB
         AddedScratch: 50GB
         Price: 0.0208
       c5large:
         ProviderType: c5.large
         VCPUs: 2
         RAM: 4GiB
-        IncludedScratch: 50GB
         AddedScratch: 50GB
         Price: 0.085
       m5large:
         ProviderType: m5.large
         VCPUs: 2
         RAM: 8GiB
-        IncludedScratch: 50GB
         AddedScratch: 50GB
         Price: 0.096
       c5xlarge:
         ProviderType: c5.xlarge
         VCPUs: 4
         RAM: 8GiB
-        IncludedScratch: 100GB
         AddedScratch: 100GB
         Price: 0.17
       m5xlarge:
         ProviderType: m5.xlarge
         VCPUs: 4
         RAM: 16GiB
-        IncludedScratch: 100GB
         AddedScratch: 100GB
         Price: 0.192
       m5xlarge_extradisk:
         ProviderType: m5.xlarge
         VCPUs: 4
         RAM: 16GiB
-        IncludedScratch: 400GB
         AddedScratch: 400GB
         Price: 0.193
       c52xlarge:
         ProviderType: c5.2xlarge
         VCPUs: 8
         RAM: 16GiB
-        IncludedScratch: 200GB
         AddedScratch: 200GB
         Price: 0.34
       m52xlarge:
         ProviderType: m5.2xlarge
         VCPUs: 8
         RAM: 32GiB
-        IncludedScratch: 200GB
         AddedScratch: 200GB
         Price: 0.384
       c54xlarge:
         ProviderType: c5.4xlarge
         VCPUs: 16
         RAM: 32GiB
-        IncludedScratch: 400GB
         AddedScratch: 400GB
         Price: 0.68
       m54xlarge:
         ProviderType: m5.4xlarge
         VCPUs: 16
         RAM: 64GiB
-        IncludedScratch: 400GB
         AddedScratch: 400GB
         Price: 0.768