16535: Merge branch 'master'
author    Tom Clegg <tom@tomclegg.ca>
Tue, 18 Aug 2020 20:02:56 +0000 (16:02 -0400)
committer Tom Clegg <tom@tomclegg.ca>
Tue, 18 Aug 2020 20:02:56 +0000 (16:02 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

252 files changed:
.licenseignore
apps/workbench/app/views/application/_show_sharing.html.erb
build/check-copyright-notices
build/package-build-dockerfiles/centos7/Dockerfile
build/package-build-dockerfiles/debian10/Dockerfile
build/package-build-dockerfiles/debian9/Dockerfile
build/package-build-dockerfiles/ubuntu1604/Dockerfile
build/package-build-dockerfiles/ubuntu1804/Dockerfile
build/package-testing/test-package-arvados-docker-cleaner.sh [new file with mode: 0755]
build/package-testing/test-package-arvados-node-manager.sh [deleted file]
build/package-testing/test-package-python-arvados-cwl-runner.sh [deleted symlink]
build/package-testing/test-package-python-arvados-fuse.sh [deleted symlink]
build/package-testing/test-package-python-arvados-python-client.sh [deleted symlink]
build/package-testing/test-package-python-cwltest.sh [deleted symlink]
build/package-testing/test-package-python27-python-arvados-python-client.sh [deleted file]
build/package-testing/test-package-python3-arvados-cwl-runner.sh
build/package-testing/test-package-python3-arvados-python-client.sh
build/package-testing/test-package-python3-crunchstat-summary.sh [moved from services/nodemanager/arvnodeman/test/__init__.py with 66% similarity, mode: 0755]
build/package-testing/test-package-python3-cwltest.sh [moved from build/package-testing/test-package-python27-python-arvados-cwl-runner.sh with 79% similarity]
build/package-testing/test-package-python3-python-arvados-fuse.sh [changed from symlink to file mode: 0755]
build/package-testing/test-package-rh-python36-python-arvados-cwl-runner.sh [new file with mode: 0755]
build/package-testing/test-package-rh-python36-python-arvados-fuse.sh [moved from build/package-testing/test-package-python27-python-arvados-fuse.sh with 100% similarity]
build/package-testing/test-package-rh-python36-python-crunchstat-summary.sh [new file with mode: 0755]
build/package-testing/test-package-rh-python36-python-cwltest.sh [moved from build/package-testing/test-package-python27-python-cwltest.sh with 74% similarity]
build/run-build-packages-one-target.sh
build/run-build-packages-python-and-ruby.sh
build/run-build-packages.sh
build/run-library.sh
build/run-tests.sh
cmd/arvados-client/cmd.go
doc/_config.yml
doc/admin/cloudtest.html.textile.liquid
doc/admin/config-migration.html.textile.liquid
doc/admin/management-token.html.textile.liquid
doc/admin/metrics.html.textile.liquid
doc/admin/spot-instances.html.textile.liquid
doc/admin/upgrading.html.textile.liquid
doc/architecture/index.html.textile.liquid
doc/install/configure-s3-object-storage.html.textile.liquid
doc/install/crunch2-cloud/install-compute-node.html.textile.liquid [new file with mode: 0644]
doc/install/crunch2-cloud/install-dispatch-cloud.html.textile.liquid [moved from doc/install/install-dispatch-cloud.html.textile.liquid with 75% similarity]
doc/install/crunch2-slurm/install-compute-node.html.textile.liquid
doc/install/crunch2-slurm/install-dispatch.html.textile.liquid
doc/install/crunch2-slurm/install-prerequisites.html.textile.liquid
doc/install/crunch2-slurm/install-slurm.html.textile.liquid
doc/install/crunch2-slurm/install-test.html.textile.liquid
doc/install/install-keep-web.html.textile.liquid
doc/install/install-keepproxy.html.textile.liquid
doc/install/install-keepstore.html.textile.liquid
doc/install/install-manual-prerequisites.html.textile.liquid
doc/install/install-nodemanager.html.textile.liquid [deleted file]
doc/user/cwl/cwl-versions.html.textile.liquid
go.mod
go.sum
lib/cloud/azure/azure.go
lib/config/config.default.yml
lib/config/deprecated.go
lib/config/deprecated_test.go
lib/config/generated_config.go
lib/config/load_test.go
lib/controller/api/routable.go [new file with mode: 0644]
lib/controller/federation/generated.go
lib/controller/federation/list.go
lib/controller/handler.go
lib/controller/integration_test.go
lib/controller/localdb/conn.go
lib/controller/localdb/login.go
lib/controller/localdb/login_ldap_test.go
lib/controller/router/router.go
lib/controller/router/router_test.go
lib/controller/rpc/conn_test.go
lib/ctrlctx/db.go [moved from lib/controller/localdb/db.go with 62% similarity]
lib/ctrlctx/db_test.go [moved from lib/controller/localdb/db_test.go with 62% similarity]
lib/deduplicationreport/command.go [new file with mode: 0644]
lib/deduplicationreport/report.go [new file with mode: 0644]
lib/deduplicationreport/report_test.go [new file with mode: 0644]
lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/scheduler/run_queue_test.go
lib/dispatchcloud/scheduler/sync.go
lib/dispatchcloud/scheduler/sync_test.go
lib/dispatchcloud/worker/pool.go
lib/dispatchcloud/worker/pool_test.go
lib/dispatchcloud/worker/worker.go
lib/pam/pam_arvados.go
lib/recovercollection/cmd_test.go
lib/service/cmd_test.go
sdk/cli/arvados-cli.gemspec
sdk/cwl/arvados_cwl/__init__.py
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml [new file with mode: 0644]
sdk/cwl/arvados_cwl/arvcontainer.py
sdk/cwl/arvados_cwl/arvtool.py
sdk/cwl/arvados_cwl/arvworkflow.py
sdk/cwl/arvados_cwl/executor.py
sdk/cwl/arvados_cwl/fsaccess.py
sdk/cwl/arvados_cwl/http.py
sdk/cwl/arvados_cwl/pathmapper.py
sdk/cwl/arvados_cwl/runner.py
sdk/cwl/setup.py
sdk/cwl/tests/13976-keepref-wf.cwl
sdk/cwl/tests/16377-missing-default.cwl [new file with mode: 0644]
sdk/cwl/tests/arvados-tests.yml
sdk/cwl/tests/hello.yml [new file with mode: 0644]
sdk/cwl/tests/test_http.py
sdk/cwl/tests/test_submit.py
sdk/go/arvados/api.go
sdk/go/arvados/config.go
sdk/go/arvados/duration_test.go
sdk/go/arvados/fs_project_test.go
sdk/go/arvados/fs_site_test.go
sdk/go/arvadosclient/arvadosclient_test.go
sdk/go/arvadostest/db.go [new file with mode: 0644]
sdk/go/health/aggregator_test.go
sdk/go/httpserver/logger_test.go
sdk/go/keepclient/collectionreader_test.go
sdk/go/keepclient/keepclient_test.go
sdk/pam/.gitignore [deleted symlink]
sdk/pam/Dockerfile [deleted file]
sdk/pam/LICENSE-2.0.txt [deleted file]
sdk/pam/MANIFEST.in [deleted file]
sdk/pam/README.rst [deleted file]
sdk/pam/arvados_pam/__init__.py [deleted file]
sdk/pam/arvados_pam/auth_event.py [deleted file]
sdk/pam/arvados_version.py [deleted file]
sdk/pam/examples/shellinabox [deleted file]
sdk/pam/fpm-info.sh [deleted file]
sdk/pam/gittaggers.py [deleted symlink]
sdk/pam/integration_tests/__init__.py [deleted file]
sdk/pam/integration_tests/test_pam.py [deleted file]
sdk/pam/lib/libpam_arvados.py [deleted file]
sdk/pam/pam-configs/arvados [deleted file]
sdk/pam/setup.py [deleted file]
sdk/pam/tests/__init__.py [deleted file]
sdk/pam/tests/integration_test.pl [deleted file]
sdk/pam/tests/mocker.py [deleted file]
sdk/pam/tests/test_auth_event.py [deleted file]
sdk/pam/tests/test_pam_sm.py [deleted file]
sdk/python/arvados/commands/federation_migrate.py
sdk/python/tests/fed-migrate/check.py
sdk/python/tests/fed-migrate/create_users.py
services/api/Gemfile
services/api/Gemfile.lock
services/api/app/controllers/application_controller.rb
services/api/app/models/api_client_authorization.rb
services/api/app/models/arvados_model.rb
services/api/app/models/collection.rb
services/api/app/models/container.rb
services/api/app/models/container_request.rb
services/api/app/models/group.rb
services/api/app/models/link.rb
services/api/app/models/node.rb
services/api/app/models/user.rb
services/api/bin/bundle
services/api/bin/setup
services/api/bin/update
services/api/bin/yarn [new file with mode: 0755]
services/api/config/application.rb
services/api/config/arvados_config.rb
services/api/config/boot.rb
services/api/config/environments/development.rb.example
services/api/config/environments/production.rb.example
services/api/config/environments/test.rb.example
services/api/config/initializers/content_security_policy.rb [new file with mode: 0644]
services/api/config/initializers/legacy_jobs_api.rb
services/api/config/initializers/new_framework_defaults_5_2.rb [new file with mode: 0644]
services/api/config/initializers/preload_all_models.rb [deleted file]
services/api/config/initializers/time_zone.rb
services/api/config/initializers/wrap_parameters.rb
services/api/config/routes.rb
services/api/config/secrets.yml [new file with mode: 0644]
services/api/lib/audit_logs.rb
services/api/lib/sweep_trashed_objects.rb
services/api/lib/update_priority.rb
services/api/test/functional/arvados/v1/keep_services_controller_test.rb
services/api/test/unit/arvados_model_test.rb
services/api/test/unit/link_test.rb
services/api/test/unit/log_test.rb
services/api/test/unit/node_test.rb
services/dockercleaner/fpm-info.sh [new file with mode: 0644]
services/keep-balance/main_test.go
services/keep/tools/traffic_test.py [deleted file]
services/keepstore/s3_volume.go
services/keepstore/s3aws_volume.go [new file with mode: 0644]
services/keepstore/s3aws_volume_test.go [new file with mode: 0644]
services/nodemanager/.gitignore [deleted symlink]
services/nodemanager/MANIFEST.in [deleted file]
services/nodemanager/README.rst [deleted file]
services/nodemanager/agpl-3.0.txt [deleted file]
services/nodemanager/arvados-node-manager.service [deleted file]
services/nodemanager/arvados_version.py [deleted file]
services/nodemanager/arvnodeman/__init__.py [deleted file]
services/nodemanager/arvnodeman/baseactor.py [deleted file]
services/nodemanager/arvnodeman/clientactor.py [deleted file]
services/nodemanager/arvnodeman/computenode/__init__.py [deleted file]
services/nodemanager/arvnodeman/computenode/dispatch/__init__.py [deleted file]
services/nodemanager/arvnodeman/computenode/dispatch/slurm.py [deleted file]
services/nodemanager/arvnodeman/computenode/dispatch/transitions.py [deleted file]
services/nodemanager/arvnodeman/computenode/driver/__init__.py [deleted file]
services/nodemanager/arvnodeman/computenode/driver/azure.py [deleted file]
services/nodemanager/arvnodeman/computenode/driver/dummy.py [deleted file]
services/nodemanager/arvnodeman/computenode/driver/ec2.py [deleted file]
services/nodemanager/arvnodeman/computenode/driver/gce.py [deleted file]
services/nodemanager/arvnodeman/config.py [deleted file]
services/nodemanager/arvnodeman/daemon.py [deleted file]
services/nodemanager/arvnodeman/jobqueue.py [deleted file]
services/nodemanager/arvnodeman/launcher.py [deleted file]
services/nodemanager/arvnodeman/nodelist.py [deleted file]
services/nodemanager/arvnodeman/status.py [deleted file]
services/nodemanager/arvnodeman/test/fake_driver.py [deleted file]
services/nodemanager/arvnodeman/timedcallback.py [deleted file]
services/nodemanager/bin/arvados-node-manager [deleted file]
services/nodemanager/doc/azure.example.cfg [deleted file]
services/nodemanager/doc/ec2.example.cfg [deleted file]
services/nodemanager/doc/gce.example.cfg [deleted file]
services/nodemanager/doc/local.example.cfg [deleted file]
services/nodemanager/fpm-info.sh [deleted file]
services/nodemanager/gittaggers.py [deleted symlink]
services/nodemanager/setup.py [deleted file]
services/nodemanager/tests/__init__.py [deleted file]
services/nodemanager/tests/fake_azure.cfg.template [deleted file]
services/nodemanager/tests/fake_ec2.cfg.template [deleted file]
services/nodemanager/tests/fake_gce.cfg.template [deleted file]
services/nodemanager/tests/integration_test.py [deleted file]
services/nodemanager/tests/stress_test.cwl [deleted file]
services/nodemanager/tests/test_arguments.py [deleted file]
services/nodemanager/tests/test_clientactor.py [deleted file]
services/nodemanager/tests/test_computenode.py [deleted file]
services/nodemanager/tests/test_computenode_dispatch.py [deleted file]
services/nodemanager/tests/test_computenode_dispatch_slurm.py [deleted file]
services/nodemanager/tests/test_computenode_driver.py [deleted file]
services/nodemanager/tests/test_computenode_driver_azure.py [deleted file]
services/nodemanager/tests/test_computenode_driver_ec2.py [deleted file]
services/nodemanager/tests/test_computenode_driver_gce.py [deleted file]
services/nodemanager/tests/test_config.py [deleted file]
services/nodemanager/tests/test_daemon.py [deleted file]
services/nodemanager/tests/test_failure.py [deleted file]
services/nodemanager/tests/test_jobqueue.py [deleted file]
services/nodemanager/tests/test_nodelist.py [deleted file]
services/nodemanager/tests/test_status.py [deleted file]
services/nodemanager/tests/test_timedcallback.py [deleted file]
services/nodemanager/tests/testutil.py [deleted file]
tools/arvbox/bin/arvbox
tools/compute-images/.gitignore [new file with mode: 0644]
tools/compute-images/.licenseignore [moved from sdk/pam/.dockerignore with 72% similarity]
tools/compute-images/1078ECD7.asc [new file with mode: 0644]
tools/compute-images/arvados-images-aws.json [new file with mode: 0644]
tools/compute-images/arvados-images-azure.json [new file with mode: 0644]
tools/compute-images/build.sh [new file with mode: 0755]
tools/compute-images/scripts/base.sh [new file with mode: 0644]
tools/compute-images/scripts/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg [new file with mode: 0644]
tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions.sh [new file with mode: 0644]
tools/crunchstat-summary/crunchstat_summary/summarizer.py
tools/keep-exercise/keep-exercise.go

index ad80dc3f4b671cc165db40fe6b215359933a0315..81f6b7181d2083ff2b84b3b5ec0e88168d58ca4b 100644 (file)
@@ -79,4 +79,6 @@ lib/dispatchcloud/test/sshkey_*
 *.asc
 sdk/java-v2/build.gradle
 sdk/java-v2/settings.gradle
-sdk/cwl/tests/wf/feddemo
\ No newline at end of file
+sdk/cwl/tests/wf/feddemo
+go.mod
+go.sum
index 7877e60d3018096bdc3ad6b9ef4c4bd892631925..75773ab90082ba0a79a64ba49b497216e1087094 100644 (file)
@@ -8,6 +8,8 @@ SPDX-License-Identifier: AGPL-3.0 %>
      [User, Group].each do |type|
        type
          .filter([['uuid','in',@share_links.collect(&:tail_uuid)]])
+         .with_count("none")
+         .fetch_multiple_pages(false)
          .each do |o|
          uuid_map[o.uuid] = o
        end
index ba08f34bcd46ebffd64adb7d387714f4b97d189b..857a9c8ebca0787801da8260d90bd0f0bcfaada2 100755 (executable)
@@ -86,15 +86,12 @@ do
             | *.py \
             | sdk/python/bin/arv-* \
             | sdk/cwl/bin/* \
-            | services/nodemanager/bin/* \
             | services/fuse/bin/* \
             | tools/crunchstat-summary/bin/* \
             | crunch_scripts/* \
             | *.yaml | *.yml | *.yml.example | *.cwl \
             | *.sh | *.service \
             | */run | */run-service | */restart-dns-server \
-            | */nodemanager/doc/*.cfg \
-            | */nodemanager/tests/fake*.cfg.template \
             | */nginx.conf \
             | build/build.list | *.R)
             fixer=fixer
index 8ccab49e1e7d3d9e7c557c48758b8b146386db35..5d204464cff89c27b0e21158fb42bbb77adc12cc 100644 (file)
@@ -6,7 +6,7 @@ FROM centos:7
 MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 
 # Install dependencies.
-RUN yum -q -y install make automake gcc gcc-c++ libyaml-devel patch readline-devel zlib-devel libffi-devel openssl-devel bzip2 libtool bison sqlite-devel rpm-build git perl-ExtUtils-MakeMaker libattr-devel nss-devel libcurl-devel which tar unzip scl-utils centos-release-scl postgresql-devel python-devel python-setuptools fuse-devel xz-libs git python-virtualenv wget pam-devel
+RUN yum -q -y install make automake gcc gcc-c++ libyaml-devel patch readline-devel zlib-devel libffi-devel openssl-devel bzip2 libtool bison sqlite-devel rpm-build git perl-ExtUtils-MakeMaker libattr-devel nss-devel libcurl-devel which tar unzip scl-utils centos-release-scl postgresql-devel fuse-devel xz-libs git wget pam-devel
 
 # Install RVM
 ADD generated/mpapis.asc /tmp/
@@ -41,16 +41,16 @@ RUN ln -s /usr/local/node-v6.11.2-linux-x64/bin/* /usr/local/bin/
 # Need to "touch" RPM database to workaround bug in interaction between
 # overlayfs and yum (https://bugzilla.redhat.com/show_bug.cgi?id=1213602)
 RUN touch /var/lib/rpm/* && yum -q -y install rh-python36
-RUN scl enable rh-python36 "easy_install-3.6 pip" && easy_install-2.7 pip
+RUN scl enable rh-python36 "easy_install-3.6 pip"
 
 # Add epel, we need it for the python-pam dependency
-RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
-RUN rpm -ivh epel-release-latest-7.noarch.rpm
+#RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+#RUN rpm -ivh epel-release-latest-7.noarch.rpm
 
 RUN git clone --depth 1 git://git.arvados.org/arvados.git /tmp/arvados && cd /tmp/arvados/services/api && /usr/local/rvm/bin/rvm-exec default bundle && cd /tmp/arvados/apps/workbench && /usr/local/rvm/bin/rvm-exec default bundle
 
 # The version of setuptools that comes with CentOS is way too old
-RUN pip install --upgrade 'setuptools<45'
+RUN scl enable rh-python36 "easy_install-3.6 pip install 'setuptools<45'"
 
 ENV WORKSPACE /arvados
 CMD ["scl", "enable", "rh-python36", "/usr/local/rvm/bin/rvm-exec default bash /jenkins/run-build-packages.sh --target centos7"]
index 90dfd36b52f66afb6f49c946df761fcd1651ac53..4f306c6aa4e8ca4241e39f87fcbf403b401ab431 100644 (file)
@@ -4,15 +4,15 @@
 
 ## dont use debian:10 here since the word 'buster' is used for rvm precompiled binaries
 FROM debian:buster
-MAINTAINER Ward Vandewege <wvandewege@veritasgenetics.com>
+MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 
 ENV DEBIAN_FRONTEND noninteractive
 
 # Install dependencies.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools python3-pip libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev python-pip unzip python3-venv python3-dev libpam-dev
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python3 python3-setuptools python3-pip libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev unzip python3-venv python3-dev libpam-dev
 
 # Install virtualenv
-RUN /usr/bin/pip install 'virtualenv<20'
+RUN /usr/bin/pip3 install 'virtualenv<20'
 
 # Install RVM
 ADD generated/mpapis.asc /tmp/
index 1a84da280898d3010ea6c8bf5978bc0da648f891..5294997f054658d5f3fb5b7366af0d69eab663a8 100644 (file)
@@ -4,15 +4,15 @@
 
 ## dont use debian:9 here since the word 'stretch' is used for rvm precompiled binaries
 FROM debian:stretch
-MAINTAINER Nico Cesar <nico@curoverse.com>
+MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 
 ENV DEBIAN_FRONTEND noninteractive
 
 # Install dependencies.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools python3-pip libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev python-pip unzip python3-venv python3-dev libpam-dev
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python3 python3-setuptools python3-pip libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev unzip python3-venv python3-dev libpam-dev
 
 # Install virtualenv
-RUN /usr/bin/pip install 'virtualenv<20'
+RUN /usr/bin/pip3 install 'virtualenv<20'
 
 # Install RVM
 ADD generated/mpapis.asc /tmp/
index 87f7712d50be68aceb65612b33154bc267b0a10c..202bab651322dd9d91cd8ea415a7146b5931f9ce 100644 (file)
@@ -8,10 +8,10 @@ MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 ENV DEBIAN_FRONTEND noninteractive
 
 # Install dependencies.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools python3-pip libcurl4-gnutls-dev libgnutls-dev curl git libattr1-dev libfuse-dev libpq-dev python-pip unzip tzdata python3-venv python3-dev libpam-dev
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python3 python-setuptools python3-setuptools python3-pip libcurl4-gnutls-dev libgnutls-dev curl git libattr1-dev libfuse-dev libpq-dev unzip tzdata python3-venv python3-dev libpam-dev
 
 # Install virtualenv
-RUN /usr/bin/pip install 'virtualenv<20'
+RUN /usr/bin/pip3 install 'virtualenv<20'
 
 # Install RVM
 ADD generated/mpapis.asc /tmp/
index a2ec29da1cf3932134b3f524608fbcb0c0b72691..05023aa09af50e5384e69db80ed5b253c91d72bb 100644 (file)
@@ -8,10 +8,10 @@ MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 ENV DEBIAN_FRONTEND noninteractive
 
 # Install dependencies.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-pip libcurl4-gnutls-dev libgnutls28-dev curl git libattr1-dev libfuse-dev libpq-dev python-pip unzip tzdata python3-venv python3-dev libpam-dev
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python3 python3-pip libcurl4-gnutls-dev libgnutls28-dev curl git libattr1-dev libfuse-dev libpq-dev unzip tzdata python3-venv python3-dev libpam-dev
 
 # Install virtualenv
-RUN /usr/bin/pip install 'virtualenv<20'
+RUN /usr/bin/pip3 install 'virtualenv<20'
 
 # Install RVM
 ADD generated/mpapis.asc /tmp/
diff --git a/build/package-testing/test-package-arvados-docker-cleaner.sh b/build/package-testing/test-package-arvados-docker-cleaner.sh
new file mode 100755 (executable)
index 0000000..6b344de
--- /dev/null
@@ -0,0 +1,8 @@
+#!/bin/sh
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+set -e
+
+arvados-docker-cleaner -h >/dev/null
diff --git a/build/package-testing/test-package-arvados-node-manager.sh b/build/package-testing/test-package-arvados-node-manager.sh
deleted file mode 100755 (executable)
index 9300f4c..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/sh
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-set -e
-
-arvados-node-manager --version
-
-exec /usr/share/python2.7/dist/arvados-node-manager/bin/python2.7 <<EOF
-import libcloud.compute.types
-import libcloud.compute.providers
-libcloud.compute.providers.get_driver(libcloud.compute.types.Provider.AZURE_ARM)
-print "Successfully imported compatible libcloud library"
-EOF
diff --git a/build/package-testing/test-package-python-arvados-cwl-runner.sh b/build/package-testing/test-package-python-arvados-cwl-runner.sh
deleted file mode 120000 (symlink)
index 61e61b1..0000000
+++ /dev/null
@@ -1 +0,0 @@
-test-package-python27-python-arvados-cwl-runner.sh
\ No newline at end of file
diff --git a/build/package-testing/test-package-python-arvados-fuse.sh b/build/package-testing/test-package-python-arvados-fuse.sh
deleted file mode 120000 (symlink)
index 3b9232c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-test-package-python27-python-arvados-fuse.sh
\ No newline at end of file
diff --git a/build/package-testing/test-package-python-arvados-python-client.sh b/build/package-testing/test-package-python-arvados-python-client.sh
deleted file mode 120000 (symlink)
index 8a4d0ea..0000000
+++ /dev/null
@@ -1 +0,0 @@
-test-package-python27-python-arvados-python-client.sh
\ No newline at end of file
diff --git a/build/package-testing/test-package-python-cwltest.sh b/build/package-testing/test-package-python-cwltest.sh
deleted file mode 120000 (symlink)
index 9b6545b..0000000
+++ /dev/null
@@ -1 +0,0 @@
-test-package-python27-python-cwltest.sh
\ No newline at end of file
diff --git a/build/package-testing/test-package-python27-python-arvados-python-client.sh b/build/package-testing/test-package-python27-python-arvados-python-client.sh
deleted file mode 100755 (executable)
index 2c92a3e..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-set -e
-
-arv-put --version
-
-/usr/share/python2.7/dist/python-arvados-python-client/bin/python2.7 << EOF
-import arvados
-print "Successfully imported arvados"
-EOF
index 99327c016ad618dbf69971a0960e19def60469e9..ebf7b5becda9d94e371c462ce7bc5de278c82c1b 100755 (executable)
@@ -5,4 +5,4 @@
 
 set -e
 
-arvados-cwl-runner --version
+arvados-cwl-runner --version >/dev/null
index d4e66a27b9510ca06b50c2704c8e2bdee70a17d9..69f728c10e5c335967fac801c9f131726bce18a6 100755 (executable)
@@ -5,7 +5,7 @@
 
 set -e
 
-arv-put --version
+arv-put --version >/dev/null
 
 /usr/share/python3/dist/python3-arvados-python-client/bin/python3 << EOF
 import arvados
old mode 100644 (file)
new mode 100755 (executable)
similarity index 66%
rename from services/nodemanager/arvnodeman/test/__init__.py
rename to build/package-testing/test-package-python3-crunchstat-summary.sh
index d3ac1c2..02b6e0d
@@ -1,5 +1,8 @@
+#!/bin/sh
 # Copyright (C) The Arvados Authors. All rights reserved.
 #
 # SPDX-License-Identifier: AGPL-3.0
 
+set -e
 
+crunchstat-summary -h >/dev/null
similarity index 79%
rename from build/package-testing/test-package-python27-python-arvados-cwl-runner.sh
rename to build/package-testing/test-package-python3-cwltest.sh
index 99327c016ad618dbf69971a0960e19def60469e9..77f1f44016d80bfe3e19c33cde150e8da65e1778 100755 (executable)
@@ -5,4 +5,4 @@
 
 set -e
 
-arvados-cwl-runner --version
+cwltest -h >/dev/null
deleted file mode 120000 (symlink)
index 3b9232c5fa6ccac4a9f1fdaf3e8b1703934959ed..0000000000000000000000000000000000000000
+++ /dev/null
@@ -1 +0,0 @@
-test-package-python27-python-arvados-fuse.sh
\ No newline at end of file
new file mode 100755 (executable)
index 0000000000000000000000000000000000000000..81929857b8eaa6791a3e47e196f578de6f17b9a0
--- /dev/null
@@ -0,0 +1,8 @@
+#!/bin/sh
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+set -e
+
+arv-mount --version
diff --git a/build/package-testing/test-package-rh-python36-python-arvados-cwl-runner.sh b/build/package-testing/test-package-rh-python36-python-arvados-cwl-runner.sh
new file mode 100755 (executable)
index 0000000..ebf7b5b
--- /dev/null
@@ -0,0 +1,8 @@
+#!/bin/sh
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+set -e
+
+arvados-cwl-runner --version >/dev/null
diff --git a/build/package-testing/test-package-rh-python36-python-crunchstat-summary.sh b/build/package-testing/test-package-rh-python36-python-crunchstat-summary.sh
new file mode 100755 (executable)
index 0000000..02b6e0d
--- /dev/null
@@ -0,0 +1,8 @@
+#!/bin/sh
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+set -e
+
+crunchstat-summary -h >/dev/null
similarity index 74%
rename from build/package-testing/test-package-python27-python-cwltest.sh
rename to build/package-testing/test-package-rh-python36-python-cwltest.sh
index 395cefc5138ceba7647ad35995c1c8860466e424..77f1f44016d80bfe3e19c33cde150e8da65e1778 100755 (executable)
@@ -3,6 +3,6 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-exec python <<EOF
-import cwltest
-EOF
+set -e
+
+cwltest -h >/dev/null
index f8816dbe4873c3fad3773d47590393d1e62b5550..d0a79ad3dfa2fdf04cab380f321602fac66df618 100755 (executable)
@@ -192,27 +192,47 @@ popd
 if test -z "$packages" ; then
     packages="arvados-api-server
         arvados-client
+        arvados-controller
+        arvados-dispatch-cloud
         arvados-docker-cleaner
         arvados-git-httpd
-        arvados-node-manager
+        arvados-health
+        arvados-server
         arvados-src
+        arvados-sync-groups
         arvados-workbench
+        arvados-workbench2
+        arvados-ws
         crunch-dispatch-local
         crunch-dispatch-slurm
         crunch-run
         crunchstat
+        keepproxy
+        keepstore
         keep-balance
         keep-block-check
-        keepproxy
         keep-rsync
-        keepstore
+        keep-exercise
+        keep-rsync
+        keep-block-check
         keep-web
         libarvados-perl
-        libpam-arvados
-        libpam-arvados-go
-        python-arvados-fuse
-        python-arvados-python-client
-        python-arvados-cwl-runner"
+        libpam-arvados-go"
+    if [[ "$TARGET" =~ "centos" ]]; then
+      packages="$packages
+        rh-python36-python-cwltest
+        rh-python36-python-arvados-fuse
+        rh-python36-python-arvados-python-client
+        rh-python36-python-arvados-cwl-runner
+        rh-python36-python-crunchstat-summary"
+    else
+      packages="$packages
+        python3-cwltest
+        python3-arvados-fuse
+        python3-arvados-python-client
+        python3-arvados-cwl-runner
+        python3-crunchstat-summary"
+    fi
 fi
 
 FINAL_EXITCODE=0
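Editor's note: the package list above is the default set exercised when no explicit packages are given; on CentOS targets the rh-python36-prefixed Python packages are used in place of the python3- ones. A hedged invocation sketch (the --target flag name is assumed from the script's $TARGET handling):

<pre><code># Build and test the default package set for a single target distribution.
./build/run-build-packages-one-target.sh --target centos7
</code></pre>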
index ba44218c4e8f076a8ab7d0a8917b5cd40cecb547..66201b3b4d0b577b66b956730c67b5b2e20d3913 100755 (executable)
@@ -195,7 +195,6 @@ if [ $PYTHON -eq 1 ]; then
   python_wrapper arvados-pam "$WORKSPACE/sdk/pam"
   python_wrapper arvados-cwl-runner "$WORKSPACE/sdk/cwl"
   python_wrapper arvados_fuse "$WORKSPACE/services/fuse"
-  python_wrapper arvados-node-manager "$WORKSPACE/services/nodemanager"
 
   if [ $((${#failures[@]} - $GEM_BUILD_FAILURES)) -ne 0 ]; then
     PYTHON_BUILD_FAILURES=$((${#failures[@]} - $GEM_BUILD_FAILURES))
index 5aa0b7e6f8e363642cf3aebfa6bff44d28926d2d..0e74ac6f2570761d34cfc91d58b36d16c1fa812d 100755 (executable)
@@ -102,18 +102,12 @@ if [[ "$DEBUG" != 0 ]]; then
     DASHQ_UNLESS_DEBUG=
 fi
 
-declare -a PYTHON_BACKPORTS PYTHON3_BACKPORTS
+declare -a PYTHON3_BACKPORTS
 
-PYTHON2_VERSION=2.7
 PYTHON3_VERSION=$(python3 -c 'import sys; print("{v.major}.{v.minor}".format(v=sys.version_info))')
 
 ## These defaults are suitable for any Debian-based distribution.
 # You can customize them as needed in distro sections below.
-PYTHON2_PACKAGE=python$PYTHON2_VERSION
-PYTHON2_PKG_PREFIX=python
-PYTHON2_PREFIX=/usr
-PYTHON2_INSTALL_LIB=lib/python$PYTHON2_VERSION/dist-packages
-
 PYTHON3_PACKAGE=python$PYTHON3_VERSION
 PYTHON3_PKG_PREFIX=python3
 PYTHON3_PREFIX=/usr
@@ -129,9 +123,6 @@ case "$TARGET" in
         ;;
     centos*)
         FORMAT=rpm
-        PYTHON2_PACKAGE=$(rpm -qf "$(which python$PYTHON2_VERSION)" --queryformat '%{NAME}\n')
-        PYTHON2_PKG_PREFIX=$PYTHON2_PACKAGE
-        PYTHON2_INSTALL_LIB=lib/python$PYTHON2_VERSION/site-packages
         PYTHON3_PACKAGE=$(rpm -qf "$(which python$PYTHON3_VERSION)" --queryformat '%{NAME}\n')
         PYTHON3_PKG_PREFIX=$PYTHON3_PACKAGE
         PYTHON3_PREFIX=/opt/rh/rh-python36/root/usr
@@ -321,29 +312,17 @@ package_go_binary tools/keep-exercise keep-exercise \
 package_go_so lib/pam pam_arvados.so libpam-arvados-go \
     "Arvados PAM authentication module"
 
-# The Python SDK - Should be built first because it's needed by others
-fpm_build_virtualenv "arvados-python-client" "sdk/python"
-
 # The Python SDK - Python3 package
 fpm_build_virtualenv "arvados-python-client" "sdk/python" "python3"
 
-# Arvados cwl runner - Only supports Python3 now
+# Arvados cwl runner - Python3 package
 fpm_build_virtualenv "arvados-cwl-runner" "sdk/cwl" "python3"
 
-# The PAM module
-fpm_build_virtualenv "libpam-arvados" "sdk/pam"
-
-# The FUSE driver
-fpm_build_virtualenv "arvados-fuse" "services/fuse"
-
 # The FUSE driver - Python3 package
 fpm_build_virtualenv "arvados-fuse" "services/fuse" "python3"
 
-# The node manager
-fpm_build_virtualenv "arvados-node-manager" "services/nodemanager"
-
 # The Arvados crunchstat-summary tool
-fpm_build_virtualenv "crunchstat-summary" "tools/crunchstat-summary"
+fpm_build_virtualenv "crunchstat-summary" "tools/crunchstat-summary" "python3"
 
 # The Docker image cleaner
 fpm_build_virtualenv "arvados-docker-cleaner" "services/dockercleaner" "python3"
@@ -354,11 +333,9 @@ if [[ -e "$WORKSPACE/cwltest" ]]; then
        rm -rf "$WORKSPACE/cwltest"
 fi
 git clone https://github.com/common-workflow-language/cwltest.git
-# last release to support python 2.7
-(cd cwltest && git checkout 1.0.20190906212748)
 # signal to our build script that we want a cwltest executable installed in /usr/bin/
 mkdir cwltest/bin && touch cwltest/bin/cwltest
-fpm_build_virtualenv "cwltest" "cwltest"
+fpm_build_virtualenv "cwltest" "cwltest" "python3"
 rm -rf "$WORKSPACE/cwltest"
 
 calculate_go_package_version arvados_server_version cmd/arvados-server
index 3e6c9f85841d55be0e7d9794c4e86a693e5500c3..528d69d9982eac69e561a3ab7078488a94093d61 100755 (executable)
@@ -231,10 +231,6 @@ default_iteration() {
            [[ ${BASH_REMATCH[1]} -le $LICENSE_PACKAGE_TS ]]; then
         iteration=2
     fi
-    if [[ $package_type =~ ^python ]]; then
-      # Fix --iteration for #9242.
-      iteration=2
-    fi
     echo $iteration
 }
 
@@ -487,18 +483,9 @@ fpm_build_virtualenv () {
         fi
         PACKAGE_PREFIX=$PYTHON3_PKG_PREFIX
         ;;
-    python)
-        # All Arvados Python2 packages depend on Python 2.7.
-        # Make sure we build with that for consistency.
-        python=python2.7
-        pip=pip
-        PACKAGE_PREFIX=$PYTHON2_PKG_PREFIX
-        ;;
   esac
 
-  if [[ "$PKG" != "libpam-arvados" ]] &&
-     [[ "$PKG" != "arvados-node-manager" ]] &&
-     [[ "$PKG" != "arvados-docker-cleaner" ]]; then
+  if [[ "$PKG" != "arvados-docker-cleaner" ]]; then
     PYTHON_PKG=$PACKAGE_PREFIX-$PKG
   else
     # Exception to our package naming convention
@@ -651,25 +638,6 @@ fpm_build_virtualenv () {
   LICENSE_STRING=`grep license $WORKSPACE/$PKG_DIR/setup.py|cut -f2 -d=|sed -e "s/[',\\"]//g"`
   COMMAND_ARR+=('--license' "$LICENSE_STRING")
 
-  # 12271 - As FPM-generated packages don't include scripts by default, the
-  # packages cleanup on upgrade depends on files being listed on the %files
-  # section in the generated SPEC files. To remove DIRECTORIES, they need to
-  # be listed in that section too, so we need to add this parameter to properly
-  # remove lingering dirs. But this only works for python2: if used on
-  # python33, it includes dirs like /opt/rh/python33 that belong to
-  # other packages.
-  if [[ "$FORMAT" == "rpm" ]] && [[ "$python" == "python2.7" ]]; then
-    COMMAND_ARR+=('--rpm-auto-add-directories')
-  fi
-
-  if [[ "$PKG" == "arvados-python-client" ]] || [[ "$PKG" == "arvados-fuse" ]]; then
-    if [[ "$python" == "python2.7" ]]; then
-      COMMAND_ARR+=('--conflicts' "$PYTHON3_PKG_PREFIX-$PKG")
-    else
-      COMMAND_ARR+=('--conflicts' "$PYTHON2_PKG_PREFIX-$PKG")
-    fi
-  fi
-
   if [[ "$DEBUG" != "0" ]]; then
     COMMAND_ARR+=('--verbose' '--log' 'info')
   fi
@@ -685,11 +653,7 @@ fpm_build_virtualenv () {
     COMMAND_ARR+=('--before-remove' "${WORKSPACE}/build/go-python-package-scripts/prerm")
   fi
 
-  if [[ "$python" == "python2.7" ]]; then
-    COMMAND_ARR+=('--depends' "$PYTHON2_PACKAGE")
-  else
-    COMMAND_ARR+=('--depends' "$PYTHON3_PACKAGE")
-  fi
+  COMMAND_ARR+=('--depends' "$PYTHON3_PACKAGE")
 
   # avoid warning
   COMMAND_ARR+=('--deb-no-default-config-files')
@@ -714,7 +678,7 @@ fpm_build_virtualenv () {
   done
 
   # make sure the systemd service file ends up in the right place
-  # used by arvados-docker-cleaner and arvados-node-manager
+  # used by arvados-docker-cleaner
   if [[ -e "${systemd_unit}" ]]; then
     COMMAND_ARR+=("usr/share/$python/dist/$PKG/share/doc/$PKG/$PKG.service=/lib/systemd/system/$PKG.service")
   fi
@@ -733,15 +697,6 @@ fpm_build_virtualenv () {
     done
   fi
 
-  # the libpam module should place a few files in the correct place for the pam
-  # subsystem
-  if [[ -e "$WORKSPACE/$PKG_DIR/dist/build/usr/share/$python/dist/$PYTHON_PKG/lib/security/libpam_arvados.py" ]]; then
-    COMMAND_ARR+=("usr/share/$python/dist/$PYTHON_PKG/lib/security/libpam_arvados.py=/usr/lib/security/")
-  fi
-  if [[ -e "$WORKSPACE/$PKG_DIR/dist/build/usr/share/$python/dist/$PYTHON_PKG/share/pam-configs/arvados" ]]; then
-    COMMAND_ARR+=("usr/share/$python/dist/$PYTHON_PKG/share/pam-configs/arvados=/usr/share/pam-configs/")
-  fi
-
   # the python-arvados-cwl-runner package comes with cwltool, expose that version
   if [[ -e "$WORKSPACE/$PKG_DIR/dist/build/usr/share/python2.7/dist/python-arvados-cwl-runner/bin/cwltool" ]]; then
     COMMAND_ARR+=("usr/share/python2.7/dist/python-arvados-cwl-runner/bin/cwltool=/usr/bin/")
@@ -802,17 +757,6 @@ fpm_build () {
       COMMAND_ARR+=(--deb-ignore-iteration-in-dependencies)
   fi
 
-  # 12271 - As FPM-generated packages don't include scripts by default, the
-  # packages cleanup on upgrade depends on files being listed on the %files
-  # section in the generated SPEC files. To remove DIRECTORIES, they need to
-  # be listed in that section too, so we need to add this parameter to properly
-  # remove lingering dirs. But this only works for python2: if used on
-  # python33, it includes dirs like /opt/rh/python33 that belong to
-  # other packages.
-  if [[ "$FORMAT" = rpm ]] && [[ "$python" = python2.7 ]]; then
-    COMMAND_ARR+=('--rpm-auto-add-directories')
-  fi
-
   if [[ "$DEBUG" != "0" ]]; then
     COMMAND_ARR+=('--verbose' '--log' 'info')
   fi
index ff6ead0facc26bbb0e1141d118b4cd81a70ec4c0..2742540b16b44efe57fa113d23d3e967915e5c2f 100755 (executable)
@@ -91,6 +91,7 @@ lib/dispatchcloud/scheduler
 lib/dispatchcloud/ssh_executor
 lib/dispatchcloud/worker
 lib/mount
+lib/pam
 lib/service
 services/api
 services/arv-git-httpd
@@ -104,14 +105,10 @@ services/keepproxy
 services/keepstore
 services/keep-balance
 services/login-sync
-services/nodemanager
-services/nodemanager_integration
 services/crunch-dispatch-local
 services/crunch-dispatch-slurm
 services/ws
 sdk/cli
-sdk/pam
-sdk/pam:py3
 sdk/python
 sdk/python:py3
 sdk/ruby
@@ -262,7 +259,7 @@ sanity_checks() {
         || fatal "No libpq libpq-fe.h. Try: apt-get install libpq-dev"
     echo -n 'libpam pam_appl.h: '
     find /usr/include -path '*/security/pam_appl.h' | egrep --max-count=1 . \
-        || fatal "No libpam pam_appl.h. Try: apt-get install libpam-dev"
+        || fatal "No libpam pam_appl.h. Try: apt-get install libpam0g-dev"
     echo -n 'postgresql: '
     psql --version || fatal "No postgresql. Try: apt-get install postgresql postgresql-client-common"
     echo -n 'phantomjs: '
@@ -306,8 +303,6 @@ declare -A skip
 declare -A only
 declare -A testargs
 skip[apps/workbench_profile]=1
-# nodemanager_integration tests are not reliable, see #12061.
-skip[services/nodemanager_integration]=1
 
 while [[ -n "$1" ]]
 do
@@ -668,14 +663,6 @@ install_env() {
         python setup.py install
     ) || fatal "installing PyYAML and sdk/python failed"
 
-    # Preinstall libcloud if using a fork; otherwise nodemanager "pip
-    # install" won't pick it up by default.
-    if [[ -n "$LIBCLOUD_PIN_SRC" ]]; then
-        pip freeze 2>/dev/null | egrep ^apache-libcloud==$LIBCLOUD_PIN \
-            || pip install --pre --ignore-installed --no-cache-dir "$LIBCLOUD_PIN_SRC" >/dev/null \
-            || fatal "pip install apache-libcloud failed"
-    fi
-
     # Deactivate Python 2 virtualenv
     deactivate
 
@@ -722,9 +709,6 @@ do_test() {
         apps/workbench_units | apps/workbench_functionals | apps/workbench_integration)
             suite=apps/workbench
             ;;
-        services/nodemanager | services/nodemanager_integration)
-            suite=services/nodemanager_suite
-            ;;
         *)
             suite="${1}"
             ;;
@@ -1004,14 +988,12 @@ install_services/api() {
 
 declare -a pythonstuff
 pythonstuff=(
-    sdk/pam
     sdk/python
     sdk/python:py3
     sdk/cwl:py3
     services/dockercleaner:py3
     services/fuse
     services/fuse:py3
-    services/nodemanager
     tools/crunchstat-summary
     tools/crunchstat-summary:py3
 )
@@ -1076,11 +1058,6 @@ test_services/login-sync() {
         && "$bundle" exec rake test TESTOPTS=-v ${testargs[services/login-sync]}
 }
 
-test_services/nodemanager_integration() {
-    cd "$WORKSPACE/services/nodemanager" \
-        && tests/integration_test.py ${testargs[services/nodemanager_integration]}
-}
-
 test_apps/workbench_units() {
     local TASK="test:units"
     cd "$WORKSPACE/apps/workbench" \
@@ -1175,7 +1152,6 @@ test_all() {
     do_test sdk/cli
     do_test services/login-sync
     do_test sdk/java-v2
-    do_test services/nodemanager_integration
     for p in "${pythonstuff[@]}"
     do
         dir=${p%:py3}
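Editor's note: with the nodemanager and Python 2 suites removed, the Python tests are driven entirely by the pythonstuff array above, with the :py3 suffix selecting the Python 3 virtualenv. A sketch of running one suite on its own (the --only flag is assumed from the script's only/skip handling shown earlier):

<pre><code># Run a single test suite instead of the full test_all sequence.
./build/run-tests.sh --only sdk/python:py3
</code></pre>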
index 887bc62bb322a7e5df7f41ab74efd9c74d82b655..bcc3dda09ac91559d4a35227ef81c95bf3e979cd 100644 (file)
@@ -9,6 +9,7 @@ import (
 
        "git.arvados.org/arvados.git/lib/cli"
        "git.arvados.org/arvados.git/lib/cmd"
+       "git.arvados.org/arvados.git/lib/deduplicationreport"
        "git.arvados.org/arvados.git/lib/mount"
 )
 
@@ -52,7 +53,8 @@ var (
                "virtual_machine":          cli.APICall,
                "workflow":                 cli.APICall,
 
-               "mount": mount.Command,
+               "mount":                mount.Command,
+               "deduplication-report": deduplicationreport.Command,
        })
 )
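Editor's note: the new lib/deduplicationreport package is registered in the arvados-client command table above. A usage sketch (the subcommand name comes from the table; the collection UUID arguments are assumed placeholders):

<pre><code># Report how much storage a set of collections shares through deduplicated blocks.
arvados-client deduplication-report zzzzz-4zz18-aaaaaaaaaaaaaaa zzzzz-4zz18-bbbbbbbbbbbbbbb
</code></pre>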
 
index be52a204c02d4e9548eeaa1139ff8126cff4f400..bbab3f307ed88e5eb784c483e692e7173298547a 100644 (file)
@@ -220,9 +220,10 @@ navbar:
       - install/install-shell-server.html.textile.liquid
       - install/install-webshell.html.textile.liquid
     - Containers API:
-      - install/crunch2-slurm/install-compute-node.html.textile.liquid
       - install/install-jobs-image.html.textile.liquid
-      - install/install-dispatch-cloud.html.textile.liquid
+      - install/crunch2-cloud/install-compute-node.html.textile.liquid
+      - install/crunch2-cloud/install-dispatch-cloud.html.textile.liquid
+      - install/crunch2-slurm/install-compute-node.html.textile.liquid
       - install/crunch2-slurm/install-dispatch.html.textile.liquid
       - install/crunch2-slurm/install-test.html.textile.liquid
     - External dependencies:
index 2adce90383902f653f3b7ba99cea3f5b859e3229..cd8f11d0a30b88a9b6f0721a518b370bee17f78c 100644 (file)
@@ -10,7 +10,7 @@ Copyright (C) The Arvados Authors. All rights reserved.
 SPDX-License-Identifier: CC-BY-SA-3.0
 {% endcomment %}
 
-The @arvados-server@ package includes a @cloudtest@ tool that checks compatibility between your Arvados configuration, your cloud driver, your cloud provider's API, your cloud provider's VM instances, and the worker image you use with the *experimental* "cloud dispatcher":../install/install-dispatch-cloud.html.
+The @arvados-server@ package includes a @cloudtest@ tool that checks compatibility between your Arvados configuration, your cloud driver, your cloud provider's API, your cloud provider's VM instances, and the worker image you use with the "cloud dispatcher":../install/crunch2-cloud/install-dispatch-cloud.html.
 
 @arvados-server cloudtest@ performs the following steps:
 # Create a new instance
index 4c8e856693ba43a639675ee76d03b3969c5e61a3..875ee4618a2b766358179750e54fed18bf83a211 100644 (file)
@@ -75,7 +75,7 @@ After migrating and removing all legacy config files, make sure the @/etc/arvado
 
 h2. Cloud installations only: node manager
 
-Node manager is deprecated and replaced by @arvados-dispatch-cloud@.  No automated config migration is available.  Follow the instructions to "install the cloud dispatcher":../install/install-dispatch-cloud.html
+Node manager is deprecated and replaced by @arvados-dispatch-cloud@.  No automated config migration is available.  Follow the instructions to "install the cloud dispatcher":../install/crunch2-cloud/install-dispatch-cloud.html
 
 *Only one dispatch process should be running at a time.* If you are migrating a system that currently runs Node manager and @crunch-dispatch-slurm@, it is safest to remove the @crunch-dispatch-slurm@ service entirely before installing @arvados-dispatch-cloud@.
 
index 881227b3fa9a84ce084f107de771aa862c1949c5..abdd8db734e7522f61acbcfbf0610ace401d38fe 100644 (file)
@@ -16,24 +16,6 @@ Services must have ManagementToken configured.  This is used to authorize access
 
 To access a monitoring endpoint, the requester must provide the HTTP header @Authorization: Bearer (ManagementToken)@.
 
-h2. Node Manager
-
-Set @port@ (the listen port) and @ManagementToken@ in the @Manage@ section of @node-manager.ini@.
-
-<pre>
-[Manage]
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-
-# Management server listening address (default 127.0.0.1)
-#address = 0.0.0.0
-
-# Management server port number (default -1, server is disabled)
-#port = 8989
-
-ManagementToken = xxx
-</pre>
-
 h2. API server and other services
 
 The following services also support monitoring.
@@ -45,7 +27,7 @@ The following services also support monitoring.
 * keepproxy
 * keepstore
 * keep-web
-* websockets
+* arvados-ws 
 
 Set @ManagementToken@ in the appropriate section of @/etc/arvados/config.yml@.
 
index 1d6b87da62116027a96788c8fe7b73c44a269133..0cfa0a2e604cc0ee40bcbe3cc1a44836b3247b72 100644 (file)
@@ -35,7 +35,6 @@ table(table table-bordered table-condensed table-hover).
 |arvados-controller|✓|
 |arvados-dispatch-cloud|✓|
 |arvados-git-httpd||
-|arvados-node-manager||
 |arvados-ws|✓|
 |composer||
 |keepproxy||
@@ -44,48 +43,3 @@ table(table table-bordered table-condensed table-hover).
 |keep-web|✓|
 |workbench1||
 |workbench2||
-
-h2. Node manager
-
-The node manager does not export prometheus-style metrics, but its @/status.json@ endpoint provides a snapshot of internal status at the time of the most recent wishlist update.
-
-<pre>curl -sfH "Authorization: Bearer your_management_token_goes_here" "http://0.0.0.0:8989/status.json"
-</pre>
-
-table(table table-bordered table-condensed).
-|_. Attribute|_. Type|_. Description|
-|nodes_booting|int|Number of nodes in booting state|
-|nodes_unpaired|int|Number of nodes in unpaired state|
-|nodes_busy|int|Number of nodes in busy state|
-|nodes_idle|int|Number of nodes in idle state|
-|nodes_fail|int|Number of nodes in fail state|
-|nodes_down|int|Number of nodes in down state|
-|nodes_shutdown|int|Number of nodes in shutdown state|
-|nodes_wish|int|Number of nodes in the current wishlist|
-|node_quota|int|Current node count ceiling due to cloud quota limits|
-|config_max_nodes|int|Configured max node count|
-
-h3. Example
-
-<pre>
-{
-  "actor_exceptions": 0,
-  "idle_times": {
-    "compute1": 0,
-    "compute3": 0,
-    "compute2": 0,
-    "compute4": 0
-  },
-  "create_node_errors": 0,
-  "destroy_node_errors": 0,
-  "nodes_idle": 0,
-  "config_max_nodes": 8,
-  "list_nodes_errors": 0,
-  "node_quota": 8,
-  "Version": "1.1.4.20180719160944",
-  "nodes_wish": 0,
-  "nodes_unpaired": 0,
-  "nodes_busy": 4,
-  "boot_failures": 0
-}
-</pre>
index 7195a37b4842812589d68e8cd088884b47c2d190..7f49d6961292f7371436cb04cbe3892a1a0efadb 100644 (file)
@@ -25,14 +25,14 @@ Clusters:
       UsePreemptibleInstances: true
     InstanceTypes:
       m4.large:
-       Preemptible: false
+        Preemptible: false
         ProviderType: m4.large
         VCPUs: 2
         RAM: 8GiB
         AddedScratch: 32GB
         Price: 0.1
       m4.large.spot:
-       Preemptible: true
+        Preemptible: true
         ProviderType: m4.large
         VCPUs: 2
         RAM: 8GiB
@@ -42,9 +42,7 @@ Clusters:
 
 When @UsePreemptibleInstances@ is enabled, child containers (workflow steps) will automatically be made preemptible.  Note that because preempting the workflow runner would cancel the entire workflow, the workflow runner runs in a reserved (non-preemptible) instance.
 
-If you are using "arvados-dispatch-cloud":{{site.baseurl}}/install/install-dispatch-cloud.html no additional configuration is required.
-
-If you are using the legacy Nodemanager, "see below":#nodemanager .
+If you are using "arvados-dispatch-cloud":{{site.baseurl}}/install/crunch2-cloud/install-dispatch-cloud.html no additional configuration is required.
 
 h2. Preemptible instances on AWS
 
@@ -62,22 +60,7 @@ The account needs to have a service linked role created. This can be done by log
 
 h3. Cost Tracking
 
-Amazon's Spot instances prices are declared at instance request time and defined by the maximum price that the user is willing to pay per hour. By default, this price is the same amount as the on-demand version of each instance type, and this setting is the one that nodemanager uses for now, as it doesn't include any pricing data to the spot instance request.
+Amazon's Spot instances prices are declared at instance request time and defined by the maximum price that the user is willing to pay per hour. By default, this price is the same amount as the on-demand version of each instance type, and this setting is the one that @arvados-dispatch-cloud@ uses for now, as it doesn't include any pricing data to the spot instance request.
 
 The real price that a spot instance has at any point in time is discovered at the end of each usage hour, depending on instance demand. For this reason, AWS provides a data feed subscription to get hourly logs, as described on "Amazon's User Guide":https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-data-feeds.html.
 
-h2(#nodemanager). Nodemanager
-
-If you are using the legacy Nodemanager, its config file must also declare preemptible instance sizes, which must match the API server's @InstanceTypes@:
-
-<pre>
-[Size m4.large]
-cores = 2
-scratch = 32000
-
-[Size m4.large.spot]
-cores = 2
-instance_type = m4.large
-preemptible = true
-scratch = 32000
-</pre>
index 9cddce5fe647656a3ef0134aa1ab2642db1b5fcd..061b68fa5d27b766e7d45bd0c08750fed210f5dd 100644 (file)
@@ -10,14 +10,14 @@ Copyright (C) The Arvados Authors. All rights reserved.
 SPDX-License-Identifier: CC-BY-SA-3.0
 {% endcomment %}
 
-What you need to know and do in order to upgrade your Arvados installation.
+For Arvados administrators, this page will cover what you need to know and do in order to ensure a smooth upgrade of your Arvados installation.  For general release notes covering features added and bugs fixed, see "Arvados releases":https://arvados.org/releases .
 
 h2. General process
 
 # Consult upgrade notes below to see if any manual configuration updates are necessary.
 # Wait for the cluster to be idle and stop Arvados services.
 # Install new packages using @apt-get upgrade@ or @yum upgrade@.
-# Package installation scripts will perform any necessary data migrations.
+# Wait for package installation scripts as they perform any necessary data migrations.
 # Restart Arvados services.
 
 h2. Upgrade notes
@@ -38,6 +38,10 @@ h2(#master). development master (as of 2020-06-17)
 
 "Upgrading from 2.0.0":#v2_0_0
 
+h3. Removing libpam-arvados, replaced with libpam-arvados-go
+
+The Python-based PAM package has been replaced with a version written in Go. See "using PAM for authentication":{{site.baseurl}}/install/setup-login.html#pam for details.
+
 h3. Removing sso-provider
 
 The SSO (single sign-on) component is deprecated and will not be supported in future releases. Existing configurations will continue to work in this release, but you should switch to one of the built-in authentication mechanisms as soon as possible. See "setting up web based login":{{site.baseurl}}/install/setup-login.html for details.
@@ -179,7 +183,7 @@ h3. New property vocabulary format for Workbench2
 
 h3. Cloud installations only: node manager replaced by arvados-dispatch-cloud
 
-Node manager is deprecated and replaced by @arvados-dispatch-cloud@.  No automated config migration is available.  Follow the instructions to "install the cloud dispatcher":../install/install-dispatch-cloud.html
+Node manager is deprecated and replaced by @arvados-dispatch-cloud@.  No automated config migration is available.  Follow the instructions to "install the cloud dispatcher":../install/crunch2-cloud/install-dispatch-cloud.html
 
 *Only one dispatch process should be running at a time.* If you are migrating a system that currently runs Node manager and @crunch-dispatch-slurm@, it is safest to remove the @crunch-dispatch-slurm@ service entirely before installing @arvados-dispatch-cloud@.
 
@@ -569,7 +573,7 @@ As part of story "#11349":https://dev.arvados.org/issues/11349, commit "2c094e2"
 
 * To enable it, add to your configuration file: <pre>[Manage]
   address = 127.0.0.1
-  port = 8989</pre> (see example configuration files in source:services/nodemanager/doc or https://doc.arvados.org/install/install-nodemanager.html for more info)
+  port = 8989</pre>
 * The server responds to @http://{address}:{port}/status.json@ with a summary of how many nodes are in each state (booting, busy, shutdown, etc.)
 
 h3. New websockets component (2017-03-23)
index 705048cd620cf566ad5ece5722e311262642d623..dddcd050731eaf712d1485458ba25791262a5615 100644 (file)
@@ -20,6 +20,7 @@ table(table table-bordered table-condensed).
 |_. Component|_. Description|
 |api|The API server is the core of Arvados.  It is backed by a Postgres database and manages information such as metadata for storage, a record of submitted compute jobs, users, groups, and associated permissions.|
 |arv-git-httpd|Provides a git+http interface to Arvados-managed git repositories, with permissions and authentication based on an Arvados API token.|
+|arvados-dispatch-cloud|Provide elastic computing by creating and destroying cloud based virtual machines on compute demand.|
 |crunch-dispatch-local|Get compute requests submitted to the API server and execute them locally.|
 |crunch-dispatch-slurm|Get compute requests submitted to the API server and submit them to slurm.|
 |crunch-run|Dispatched by crunch-dispatch, executes a single compute run: setting up a Docker container, running it, and collecting the output.|
@@ -31,8 +32,7 @@ table(table table-bordered table-condensed).
 |keepstore|Provides access to underlying storage (filesystem or object storage such as Amazon S3 or Azure Blob) with Arvados permissions.|
 |keep-web|Provides high-level WebDAV access to collections (file-level data access).|
 |login-sync|Synchronize virtual machine users with Arvados users and permissions.|
-|nodemanager|Provide elastic computing by creating and destroying cloud based virtual machines on compute demand.|
-|ws|Publishes API server change events over websockets.|
+|arvados-ws|Publishes API server change events over websockets.|
 |workbench|Web application providing user interface to Arvados services.|
 
 h3. Tools
index b960ac1fda0c2ab1fbaae77e4ae3c875b8dec0bc..76a2f3ab5723121cb2d0ae9d7e4724c5b2c14d06 100644 (file)
@@ -64,6 +64,9 @@ Volumes are configured in the @Volumes@ section of the cluster configuration fil
           # might be needed for other S3-compatible services.
           V2Signature: false
 
+          # Use the AWS S3 v2 Go driver instead of the goamz driver.
+          UseAWSS3v2Driver: false
+
           # Requested page size for "list bucket contents" requests.
           IndexPageSize: 1000
 
@@ -94,3 +97,9 @@ Volumes are configured in the @Volumes@ section of the cluster configuration fil
         # classes" in the "Admin" section of doc.arvados.org.
         StorageClasses: null
 </code></pre></notextile>
+
+Two S3 drivers are available. Historically, Arvados has used the @goamz@ driver to talk to S3-compatible services. More recently, support for the @aws-sdk-go-v2@ driver was added. This driver can be activated by setting the @UseAWSS3v2Driver@ flag to @true@.
+
+The @aws-sdk-go-v2@ does not support the old S3 v2 signing algorithm. This will not affect interacting with AWS S3, but it might be an issue when Keep is backed by a very old version of a third party S3-compatible service.
+
+The @aws-sdk-go-v2@ driver can improve read performance by 50-100% over the @goamz@ driver, but it has not had as much production use. See the "wiki":https://dev.arvados.org/projects/arvados/wiki/Keep_real_world_performance_numbers for details.
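Editor's note: one way to confirm the new UseAWSS3v2Driver flag is picked up after editing the cluster configuration is to dump the loaded config on the keepstore host. A sketch, assuming arvados-server config-dump is available there and keepstore runs as a systemd service:

<pre><code># Verify the volume driver flag in the loaded configuration, then restart keepstore.
arvados-server config-dump | grep UseAWSS3v2Driver
systemctl restart keepstore
</code></pre>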
diff --git a/doc/install/crunch2-cloud/install-compute-node.html.textile.liquid b/doc/install/crunch2-cloud/install-compute-node.html.textile.liquid
new file mode 100644 (file)
index 0000000..23da428
--- /dev/null
@@ -0,0 +1,186 @@
+---
+layout: default
+navsection: installguide
+title: Build a cloud compute node image
+...
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
+
+{% include 'notebox_begin_warning' %}
+arvados-dispatch-cloud is only relevant for cloud installations. Skip this section if you are installing an on premises cluster that will spool jobs to Slurm.
+{% include 'notebox_end' %}
+
+# "Introduction":#introduction
+# "Create an SSH keypair":#sshkeypair
+# "The build script":#building
+# "Build an Azure image":#azure
+# "Build an AWS image":#aws
+
+h2(#introduction). Introduction
+
+This page describes how to build a compute node image that can be used to run containers dispatched by Arvados in the cloud.
+
+Packer templates for AWS and Azure are provided with Arvados. To use them, the following are needed:
+
+* "Packer":https://www.packer.io/
+* credentials for your cloud account
+* configuration details for your cloud account
+
+
+h2(#sshkeypair). Create a SSH keypair
+
+@arvados-dispatch-cloud@ communicates with the compute nodes via SSH. To do this securely, a SSH keypair is needed.
+
+Generate a SSH keypair with no passphrase. The private key needs to be stored in the cluster configuration file (see @Containers/DispatchPrivateKey@) for use by @arvados-dispatch-cloud@, as described in the "next section":install-dispatch-cloud.html#update-config. The public key will be baked into the compute node images, see the cloud-specific documentation below.
+
+<notextile>
+<pre><code>~$ <span class="userinput">ssh-keygen -N '' -f ~/.ssh/id_dispatcher</span>
+Generating public/private rsa key pair.
+Your identification has been saved in /home/user/.ssh/id_dispatcher.
+Your public key has been saved in /home/user/.ssh/id_dispatcher.pub.
+The key fingerprint is:
+[...]
+~$ <span class="userinput">cat ~/.ssh/id_dispatcher</span>
+-----BEGIN RSA PRIVATE KEY-----
+MIIEpQIBAAKCAQEAqXoCzcOBkFQ7w4dvXf9B++1ctgZRqEbgRYL3SstuMV4oawks
+ttUuxJycDdsPmeYcHsKo8vsEZpN6iYsX6ZZzhkO5nEayUTU8sBjmg1ZCTo4QqKXr
+...
+oFyAjVoexx0RBcH6BveTfQtJKbktP1qBO4mXo2dP0cacuZEtlAqW9Eb06Pvaw/D9
+foktmqOY8MyctzFgXBpGTxPliGjqo8OkrOyQP2g+FL7v+Km31Xs61P8=
+-----END RSA PRIVATE KEY-----
+</code></pre>
+</notextile>
+
+h2(#building). The build script
+
+The necessary files are located in the @arvados/tools/compute-images@ directory in the source tree. A build script is provided to generate the image. The @--help@ argument lists all available options:
+
+<notextile><pre><code>~$ <span class="userinput">./build.sh --help</span>
+build.sh: Build cloud images for arvados-dispatch-cloud
+
+Syntax:
+        build.sh [options]
+
+Options:
+
+  --json-file (required)
+      Path to the packer json file
+  --arvados-cluster-id (required)
+      The ID of the Arvados cluster, e.g. zzzzz
+  --aws-profile (default: false)
+      AWS profile to use (valid profile from ~/.aws/config)
+  --aws-secrets-file (default: false, required if building for AWS)
+      AWS secrets file which will be sourced from this script
+  --aws-source-ami (default: false, required if building for AWS)
+      The AMI to use as base for building the images
+  --aws-region (default: us-east-1)
+      The AWS region to use for building the images
+  --aws-vpc-id (optional)
+      VPC id for AWS, otherwise packer will pick the default one
+  --aws-subnet-id
+      Subnet id for AWS, otherwise packer will pick the default one for the VPC
+  --gcp-project-id (default: false, required if building for GCP)
+      GCP project id
+  --gcp-account-file (default: false, required if building for GCP)
+      GCP account file
+  --gcp-zone (default: us-central1-f)
+      GCP zone
+  --azure-secrets-file (default: false, required if building for Azure)
+      Azure secrets file which will be sourced from this script
+  --azure-resource-group (default: false, required if building for Azure)
+      Azure resource group
+  --azure-storage-account (default: false, required if building for Azure)
+      Azure storage account
+  --azure-location (default: false, required if building for Azure)
+      Azure location, e.g. centralus, eastus, westeurope
+  --azure-sku (default: unset, required if building for Azure, e.g. 16.04-LTS)
+      Azure SKU image to use
+  --ssh_user  (default: packer)
+      The user packer will use to log into the image
+  --domain  (default: arvadosapi.com)
+      The domain part of the FQDN for the cluster
+  --resolver (default: 8.8.8.8)
+      The dns resolver for the machine
+  --reposuffix (default: unset)
+      Set this to "-dev" to track the unstable/dev Arvados repositories
+  --public-key-file (required)
+      Path to the public key file that a-d-c will use to log into the compute node
+  --debug
+      Output debug information (default: false)
+</code></pre></notextile>
+
+h2(#azure). Build an Azure image
+
+<notextile><pre><code>~$ <span class="userinput">./build.sh --json-file arvados-images-azure.json \
+           --arvados-cluster-id ClusterID \
+           --azure-resource-group ResourceGroup \
+           --azure-storage-account StorageAccount \
+           --azure-location AzureRegion \
+           --azure-sku AzureSKU \
+           --azure-secrets-file AzureSecretsFilePath \
+           --resolver ResolverIP \
+           --public-key-file ArvadosDispatchCloudPublicKeyPath
+</span>
+</code></pre></notextile>
+
+For @ClusterID@, fill in your cluster ID. The @ResourceGroup@, @StorageAccount@ and @AzureRegion@ (e.g. 'eastus2') should be configured for where you want the compute image to be generated and stored. The @AzureSKU@ is the SKU of the base image to be used, e.g. '18.04-LTS' for Ubuntu 18.04.
+
+@AzureSecretsFilePath@ should be replaced with the path to a shell script that loads the Azure secrets with sufficient permissions to create the image. The file would look like this:
+
+<notextile><pre><code>export ARM_CLIENT_ID=...
+export ARM_CLIENT_SECRET=...
+export ARM_SUBSCRIPTION_ID=...
+export ARM_TENANT_ID=...
+</code></pre></notextile>
+
+These secrets can be generated from the Azure portal, or with the Azure CLI using a command like this:
+
+<notextile><pre><code>~$ <span class="userinput">az ad sp create-for-rbac --name Packer --password ...</span>
+</code></pre></notextile>
+
+@ArvadosDispatchCloudPublicKeyPath@ should be replaced with the path to the ssh *public* key file generated in "Create an SSH keypair":#sshkeypair, above.
+
+Compute nodes must be able to resolve the hostnames of the API server and any keepstore servers to your internal IP addresses. You can do this by running an internal DNS resolver. The IP address of the resolver should replace the string @ResolverIP@ in the command above.
+
+Alternatively, the services could be hardcoded into an @/etc/hosts@ file. For example:
+
+<notextile><pre><code>10.20.30.40     <span class="userinput">ClusterID.example.com</span>
+10.20.30.41     <span class="userinput">keep1.ClusterID.example.com</span>
+10.20.30.42     <span class="userinput">keep2.ClusterID.example.com</span>
+</code></pre></notextile>
+
+Adding these lines to the @/etc/hosts@ file in the compute node image could be done with a small change to the Packer template and the @scripts/base.sh@ script, which will be left as an exercise for the reader.
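+
+As a starting point for that exercise, one hypothetical approach would be to append the entries in @scripts/base.sh@. This is purely illustrative; the addresses and hostnames below are the placeholders from the example above:
+
+<notextile><pre><code># Illustrative only: hardcode the API server and keepstore addresses
+echo "10.20.30.40     ClusterID.example.com"       >> /etc/hosts
+echo "10.20.30.41     keep1.ClusterID.example.com" >> /etc/hosts
+echo "10.20.30.42     keep2.ClusterID.example.com" >> /etc/hosts
+</code></pre></notextile>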
+
+h2(#aws). Build an AWS image
+
+<notextile><pre><code>~$ <span class="userinput">./build.sh --json-file arvados-images-aws.json \
+           --arvados-cluster-id ClusterID \
+           --aws-profile AWSProfile \
+           --aws-source-ami AMI \
+           --aws-vpc-id VPC \
+           --aws-subnet-id Subnet \
+           --ssh_user admin \
+           --resolver ResolverIP \
+           --public-key-file ArvadosDispatchCloudPublicKeyPath
+</span>
+</code></pre></notextile>
+
+For @ClusterID@, fill in your cluster ID. The @VPC@ and @Subnet@ should be configured for where you want the compute image to be generated and stored. The @AMI@ is the identifier for the base image to be used. Current AMIs are maintained by "Debian":https://wiki.debian.org/Cloud/AmazonEC2Image/Buster and "Ubuntu":https://cloud-images.ubuntu.com/locator/ec2/.
+
+@AWSProfile@ should be replaced with the name of an AWS profile with sufficient permissions to create the image.
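+
+For example, a matching profile defined in the standard @~/.aws/config@ and @~/.aws/credentials@ files might look like this; the profile name and region are placeholders:
+
+<notextile><pre><code># ~/.aws/config
+[profile compute-image-builder]
+region = us-east-1
+
+# ~/.aws/credentials
+[compute-image-builder]
+aws_access_key_id = ...
+aws_secret_access_key = ...
+</code></pre></notextile>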
+
+@ArvadosDispatchCloudPublicKeyPath@ should be replaced with the path to the ssh *public* key file generated in "Create an SSH keypair":#sshkeypair, above.
+
+Compute nodes must be able to resolve the hostnames of the API server and any keepstore servers to your internal IP addresses. You can do this by running an internal DNS resolver. The IP address of the resolver should replace the string @ResolverIP@ in the command above.
+
+Alternatively, the services could be hardcoded into an @/etc/hosts@ file. For example:
+
+<notextile><pre><code>10.20.30.40     <span class="userinput">ClusterID.example.com</span>
+10.20.30.41     <span class="userinput">keep1.ClusterID.example.com</span>
+10.20.30.42     <span class="userinput">keep2.ClusterID.example.com</span>
+</code></pre></notextile>
+
+Adding these lines to the @/etc/hosts@ file in the compute node image could be done with a small change to the Packer template and the @scripts/base.sh@ script, which will be left as an exercise for the reader.
similarity index 75%
rename from doc/install/install-dispatch-cloud.html.textile.liquid
rename to doc/install/crunch2-cloud/install-dispatch-cloud.html.textile.liquid
index 7bff6a4a246511a15e332284095f435219aca378..faa7c5b953fcf6febf3b32080914c392d27a5a7e 100644 (file)
@@ -10,7 +10,7 @@ SPDX-License-Identifier: CC-BY-SA-3.0
 {% endcomment %}
 
 {% include 'notebox_begin_warning' %}
-arvados-dispatch-cloud is only relevant for cloud installations. Skip this section if you are installing a on premise cluster that will spool jobs to Slurm.
+arvados-dispatch-cloud is only relevant for cloud installations. Skip this section if you are installing an on premises cluster that will spool jobs to Slurm.
 {% include 'notebox_end' %}
 
 # "Introduction":#introduction
@@ -27,53 +27,11 @@ The cloud dispatch service is for running containers on cloud VMs. It works with
 
 The cloud dispatch service can run on any node that can connect to the Arvados API service, the cloud provider's API, and the SSH service on cloud VMs.  It is not resource-intensive, so you can run it on the API server node.
 
-h2(#create-image). Create compute node VM image and configure resolver
-
-Set up a VM following the steps "to set up a compute node":crunch2-slurm/install-compute-node.html
-
-Compute nodes must be able to resolve the hostnames of the API server and any keepstore servers to your internal IP addresses.  You can do this by running an internal DNS resolver and configuring the compute VMs to use that resolver, or by hardcoding the services in the @/etc/hosts@ file.  For example:
-
-<notextile><pre><code>10.20.30.40     <span class="userinput">ClusterID.example.com</span>
-10.20.30.41     <span class="userinput">keep1.ClusterID.example.com</span>
-10.20.30.42     <span class="userinput">keep2.ClusterID.example.com</span>
-</code></pre></notextile>
-
-Once the VM is fully configured, create a reusable VM image from it and make note of the image id.
-
 h2(#update-config). Update config.yml
 
-h3. Create a private key
-
-Generate an SSH private key with no passphrase. Save it in the cluster configuration file (see @PrivateKey@ in the example below).
-
-<notextile>
-<pre><code>~$ <span class="userinput">ssh-keygen -N '' -f ~/.ssh/id_dispatcher</span>
-Generating public/private rsa key pair.
-Your identification has been saved in /home/user/.ssh/id_dispatcher.
-Your public key has been saved in /home/user/.ssh/id_dispatcher.pub.
-The key fingerprint is:
-[...]
-~$ <span class="userinput">cat ~/.ssh/id_dispatcher</span>
------BEGIN RSA PRIVATE KEY-----
-MIIEpQIBAAKCAQEAqXoCzcOBkFQ7w4dvXf9B++1ctgZRqEbgRYL3SstuMV4oawks
-ttUuxJycDdsPmeYcHsKo8vsEZpN6iYsX6ZZzhkO5nEayUTU8sBjmg1ZCTo4QqKXr
-...
-oFyAjVoexx0RBcH6BveTfQtJKbktP1qBO4mXo2dP0cacuZEtlAqW9Eb06Pvaw/D9
-foktmqOY8MyctzFgXBpGTxPliGjqo8OkrOyQP2g+FL7v+Km31Xs61P8=
------END RSA PRIVATE KEY-----
-</code></pre>
-</notextile>
-
-You can delete the key files after you have copied the private key to your configuration file.
-
-<notextile>
-<pre><code>~$ <span class="userinput">rm ~/.ssh/id_dispatcher ~/.ssh/id_dispatcher.pub</span>
-</code></pre>
-</notextile>
-
 h3. Configure CloudVMs
 
-Add or update the following portions of your cluster configuration file, @config.yml@. Refer to "config.defaults.yml":{{site.baseurl}}/admin/config.html for information about additional configuration options.
+Add or update the following portions of your cluster configuration file, @config.yml@. Refer to "config.defaults.yml":{{site.baseurl}}/admin/config.html for information about additional configuration options. The @DispatchPrivateKey@ should be the *private* key generated in "the previous section":install-compute-node.html#sshkeypair.
 
 <notextile>
 <pre><code>    Services:
@@ -141,19 +99,26 @@ h4. Minimal configuration example for Azure
         ImageID: "https://zzzzzzzz.blob.core.windows.net/system/Microsoft.Compute/Images/images/zzzzz-compute-osDisk.55555555-5555-5555-5555-555555555555.vhd"
         Driver: azure
         DriverParameters:
+          # Credentials.
           SubscriptionID: XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
           ClientID: XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
           ClientSecret: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
           TenantID: XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
-          CloudEnvironment: AzurePublicCloud
-          ResourceGroup: zzzzz
+
+          # Data center where VMs will be allocated
           Location: centralus
-          Network: zzzzz
-          Subnet: zzzzz-subnet-private
+
+          # The resource group where the VM and virtual NIC will be
+          # created.
+          ResourceGroup: zzzzz
+          NetworkResourceGroup: yyyyy   # only if different from ResourceGroup
+          Network: xxxxx
+          Subnet: xxxxx-subnet-private
+
+          # Where to store the VM VHD blobs
           StorageAccount: example
           BlobContainer: vhds
-          DeleteDanglingResourcesAfter: 20s
-          AdminUsername: arvados
+
 </code></pre>
 </notextile>
 
@@ -201,7 +166,7 @@ Run the @cloudtest@ tool to verify that your configuration works. This creates a
 </code></pre>
 </notextile>
 
-Refer to the "cloudtest tool documentation":../admin/cloudtest.html for more information.
+Refer to the "cloudtest tool documentation":../../admin/cloudtest.html for more information.
 
 {% assign arvados_component = 'arvados-dispatch-cloud' %}
 
@@ -220,7 +185,7 @@ On the dispatch node, start monitoring the arvados-dispatch-cloud logs:
 </code></pre>
 </notextile>
 
-"Make sure to install the arvados/jobs image.":install-jobs-image.html
+"Make sure to install the arvados/jobs image.":../install-jobs-image.html
 
 Submit a simple container request:
 
@@ -248,13 +213,46 @@ Submit a simple container request:
 
 This command should return a record with a @container_uuid@ field.  Once @arvados-dispatch-cloud@ polls the API server for new containers to run, you should see it dispatch that same container.
 
-The @arvados-dispatch-cloud@ API a list of queued and running jobs.  For example:
+The @arvados-dispatch-cloud@ API provides a list of queued and running jobs and cloud instances. Use your @ManagementToken@ to test the dispatcher's endpoint. For example, when one container is running:
 
 <notextile>
-<pre><code>~$ <span class="userinput">curl ...</span>
+<pre><code>~$ <span class="userinput">curl -sH "Authorization: Bearer $token" http://localhost:9006/arvados/v1/dispatch/containers</span>
+{
+  "items": [
+    {
+      "container": {
+        "uuid": "zzzzz-dz642-hdp2vpu9nq14tx0",
+        ...
+        "state": "Running",
+        "scheduling_parameters": {
+          "partitions": null,
+          "preemptible": false,
+          "max_run_time": 0
+        },
+        "exit_code": 0,
+        "runtime_status": null,
+        "started_at": null,
+        "finished_at": null
+      },
+      "instance_type": {
+        "Name": "Standard_D2s_v3",
+        "ProviderType": "Standard_D2s_v3",
+        "VCPUs": 2,
+        "RAM": 8589934592,
+        "Scratch": 16000000000,
+        "IncludedScratch": 16000000000,
+        "AddedScratch": 0,
+        "Price": 0.11,
+        "Preemptible": false
+      }
+    }
+  ]
+}
 </code></pre>
 </notextile>
 
+A similar request can be made to the @http://localhost:9006/arvados/v1/dispatch/instances@ endpoint.
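+
+Using the same management token, for example:
+
+<notextile>
+<pre><code>~$ <span class="userinput">curl -sH "Authorization: Bearer $token" http://localhost:9006/arvados/v1/dispatch/instances</span>
+</code></pre>
+</notextile>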
+
 When the container finishes, the dispatcher will log it.
 
 After the container finishes, you can get the container record by UUID *from a shell server* to see its results:
index e93332c92cb5b0081d58720898d93ac113defc84..8c01c44ed3491b71b5401aab8976dab3a7e4e7af 100644 (file)
@@ -1,7 +1,7 @@
 ---
 layout: default
 navsection: installguide
-title: Set up a compute node
+title: Set up a Slurm compute node
 ...
 {% comment %}
 Copyright (C) The Arvados Authors. All rights reserved.
@@ -9,6 +9,10 @@ Copyright (C) The Arvados Authors. All rights reserved.
 SPDX-License-Identifier: CC-BY-SA-3.0
 {% endcomment %}
 
+{% include 'notebox_begin_warning' %}
+crunch-dispatch-slurm is only relevant for on premises clusters that will spool jobs to Slurm. Skip this section if you are installing a cloud cluster.
+{% include 'notebox_end' %}
+
 # "Introduction":#introduction
 # "Set up Docker":#docker
 # "Update fuse.conf":#fuse
@@ -20,10 +24,7 @@ SPDX-License-Identifier: CC-BY-SA-3.0
 
 h2(#introduction). Introduction
 
-This page describes how to configure a compute node so that it can be used to run containers dispatched by Arvados.
-
-* If you are using the cloud dispatcher, apply these step and then save a compute node virtual machine image.  The virtual machine image id will go in @config.yml@.
-* If you are using SLURM on a static custer, these steps must be duplicated on every compute node, preferrably using a devops tool such as Puppet.
+This page describes how to configure a compute node so that it can be used to run containers dispatched by Arvados, with Slurm on a static cluster. These steps must be performed on every compute node.
 
 h2(#docker). Set up Docker
 
index 300871763879472a1f57310584855bae2986bdbd..a9689e9ac357842f3cca6dd69d0f8b9f43062a93 100644 (file)
@@ -1,7 +1,7 @@
 ---
 layout: default
 navsection: installguide
-title: Install the SLURM dispatcher
+title: Install the Slurm dispatcher
 
 ...
 {% comment %}
@@ -11,7 +11,7 @@ SPDX-License-Identifier: CC-BY-SA-3.0
 {% endcomment %}
 
 {% include 'notebox_begin_warning' %}
-crunch-dispatch-slurm is only relevant for on premise clusters that will spool jobs to Slurm. Skip this section if you are installing a cloud cluster.
+crunch-dispatch-slurm is only relevant for on premises clusters that will spool jobs to Slurm. Skip this section if you are installing a cloud cluster.
 {% include 'notebox_end' %}
 
 # "Introduction":#introduction
@@ -22,9 +22,9 @@ crunch-dispatch-slurm is only relevant for on premise clusters that will spool j
 
 h2(#introduction). Introduction
 
-This assumes you already have a SLURM cluster, and have "set up all of your compute nodes":install-compute-node.html .  For information on installing SLURM, see "this install guide":https://slurm.schedmd.com/quickstart_admin.html
+This assumes you already have a Slurm cluster, and have "set up all of your compute nodes":install-compute-node.html .  For information on installing Slurm, see "this install guide":https://slurm.schedmd.com/quickstart_admin.html
 
-The Arvados SLURM dispatcher can run on any node that can submit requests to both the Arvados API server and the SLURM controller (via @sbatch@).  It is not resource-intensive, so you can run it on the API server node.
+The Arvados Slurm dispatcher can run on any node that can submit requests to both the Arvados API server and the Slurm controller (via @sbatch@).  It is not resource-intensive, so you can run it on the API server node.
 
 h2(#update-config). Update config.yml (optional)
 
@@ -44,7 +44,7 @@ crunch-dispatch-slurm polls the API server periodically for new containers to ru
 
 h3(#ReserveExtraRAM). Containers.ReserveExtraRAM: Extra RAM for jobs
 
-Extra RAM to reserve (in bytes) on each SLURM job submitted by Arvados, which is added to the amount specified in the container's @runtime_constraints@.  If not provided, the default value is zero.  Helpful when using @-cgroup-parent-subsystem@, where @crunch-run@ and @arv-mount@ share the control group memory limit with the user process.  In this situation, at least 256MiB is recommended to accomodate each container's @crunch-run@ and @arv-mount@ processes.
+Extra RAM to reserve (in bytes) on each Slurm job submitted by Arvados, which is added to the amount specified in the container's @runtime_constraints@.  If not provided, the default value is zero.  Helpful when using @-cgroup-parent-subsystem@, where @crunch-run@ and @arv-mount@ share the control group memory limit with the user process.  In this situation, at least 256MiB is recommended to accommodate each container's @crunch-run@ and @arv-mount@ processes.
 
 Supports suffixes @KB@, @KiB@, @MB@, @MiB@, @GB@, @GiB@, @TB@, @TiB@, @PB@, @PiB@, @EB@, @EiB@ (where @KB@ is 10[^3^], @KiB@ is 2[^10^], @MB@ is 10[^6^], @MiB@ is 2[^20^] and so forth).
 
@@ -56,7 +56,7 @@ Supports suffixes @KB@, @KiB@, @MB@, @MiB@, @GB@, @GiB@, @TB@, @TiB@, @PB@, @PiB
 
 h3(#MinRetryPeriod). Containers.MinRetryPeriod: Rate-limit repeated attempts to start containers
 
-If SLURM is unable to run a container, the dispatcher will submit it again after the next PollPeriod. If PollPeriod is very short, this can be excessive. If MinRetryPeriod is set, the dispatcher will avoid submitting the same container to SLURM more than once in the given time span.
+If Slurm is unable to run a container, the dispatcher will submit it again after the next PollPeriod. If PollPeriod is very short, this can be excessive. If MinRetryPeriod is set, the dispatcher will avoid submitting the same container to Slurm more than once in the given time span.
 
 <notextile>
 <pre>    Containers:
@@ -64,7 +64,7 @@ If SLURM is unable to run a container, the dispatcher will submit it again after
 </pre>
 </notextile>
 
-h3(#KeepServiceURIs). Containers.SLURM.SbatchEnvironmentVariables
+h3(#KeepServiceURIs). Containers.Slurm.SbatchEnvironmentVariables
 
 Some Arvados installations run a local keepstore on each compute node to handle all Keep traffic.  To override Keep service discovery and access the local keep server instead of the global servers, set ARVADOS_KEEP_SERVICES in SbatchEnvironmentVariables:
 
@@ -76,11 +76,11 @@ Some Arvados installations run a local keepstore on each compute node to handle
 </code></pre>
 </notextile>
 
-h3(#PrioritySpread). Containers.SLURM.PrioritySpread
+h3(#PrioritySpread). Containers.Slurm.PrioritySpread
 
-crunch-dispatch-slurm adjusts the "nice" values of its SLURM jobs to ensure containers are prioritized correctly relative to one another. This option tunes the adjustment mechanism.
-* If non-Arvados jobs run on your SLURM cluster, and your Arvados containers are waiting too long in the SLURM queue because their "nice" values are too high for them to compete with other SLURM jobs, you should use a smaller PrioritySpread value.
-* If you have an older SLURM system that limits nice values to 10000, a smaller @PrioritySpread@ can help avoid reaching that limit.
+crunch-dispatch-slurm adjusts the "nice" values of its Slurm jobs to ensure containers are prioritized correctly relative to one another. This option tunes the adjustment mechanism.
+* If non-Arvados jobs run on your Slurm cluster, and your Arvados containers are waiting too long in the Slurm queue because their "nice" values are too high for them to compete with other Slurm jobs, you should use a smaller PrioritySpread value.
+* If you have an older Slurm system that limits nice values to 10000, a smaller @PrioritySpread@ can help avoid reaching that limit.
 * In other cases, a larger value is beneficial because it reduces the total number of adjustments made by executing @scontrol@.
 
 The smallest usable value is @1@. The default value of @10@ is used if this option is zero or negative. Example:
@@ -91,7 +91,7 @@ The smallest usable value is @1@. The default value of @10@ is used if this opti
         <code class="userinput">PrioritySpread: <b>1000</b></code></pre>
 </notextile>
 
-h3(#SbatchArguments). Containers.SLURM.SbatchArgumentsList
+h3(#SbatchArguments). Containers.Slurm.SbatchArgumentsList
 
 When crunch-dispatch-slurm invokes @sbatch@, you can add arguments to the command by specifying @SbatchArguments@.  You can use this to send the jobs to specific cluster partitions or add resource requests.  Set @SbatchArguments@ to an array of strings.  For example:
 
@@ -105,9 +105,9 @@ When crunch-dispatch-slurm invokes @sbatch@, you can add arguments to the comman
 
 Note: If an argument is supplied multiple times, @slurm@ uses the value of the last occurrence of the argument on the command line.  Arguments specified through Arvados are added after the arguments listed in SbatchArguments.  This means, for example, an Arvados container with that specifies @partitions@ in @scheduling_parameter@ will override an occurrence of @--partition@ in SbatchArguments.  As a result, for container parameters that can be specified through Arvados, SbatchArguments can be used to specify defaults but not enforce specific policy.
 
-h3(#CrunchRunCommand-cgroups). Containers.CrunchRunArgumentList: Dispatch to SLURM cgroups
+h3(#CrunchRunCommand-cgroups). Containers.CrunchRunArgumentList: Dispatch to Slurm cgroups
 
-If your SLURM cluster uses the @task/cgroup@ TaskPlugin, you can configure Crunch's Docker containers to be dispatched inside SLURM's cgroups.  This provides consistent enforcement of resource constraints.  To do this, use a crunch-dispatch-slurm configuration like the following:
+If your Slurm cluster uses the @task/cgroup@ TaskPlugin, you can configure Crunch's Docker containers to be dispatched inside Slurm's cgroups.  This provides consistent enforcement of resource constraints.  To do this, use a crunch-dispatch-slurm configuration like the following:
 
 <notextile>
 <pre>    Containers:
@@ -116,13 +116,13 @@ If your SLURM cluster uses the @task/cgroup@ TaskPlugin, you can configure Crunc
 </pre>
 </notextile>
 
-The choice of subsystem ("memory" in this example) must correspond to one of the resource types enabled in SLURM's @cgroup.conf@. Limits for other resource types will also be respected.  The specified subsystem is singled out only to let Crunch determine the name of the cgroup provided by SLURM.  When doing this, you should also set "ReserveExtraRAM":#ReserveExtraRAM .
+The choice of subsystem ("memory" in this example) must correspond to one of the resource types enabled in Slurm's @cgroup.conf@. Limits for other resource types will also be respected.  The specified subsystem is singled out only to let Crunch determine the name of the cgroup provided by Slurm.  When doing this, you should also set "ReserveExtraRAM":#ReserveExtraRAM .
 
 {% include 'notebox_begin' %}
 
-Some versions of Docker (at least 1.9), when run under systemd, require the cgroup parent to be specified as a systemd slice.  This causes an error when specifying a cgroup parent created outside systemd, such as those created by SLURM.
+Some versions of Docker (at least 1.9), when run under systemd, require the cgroup parent to be specified as a systemd slice.  This causes an error when specifying a cgroup parent created outside systemd, such as those created by Slurm.
 
-You can work around this issue by disabling the Docker daemon's systemd integration.  This makes it more difficult to manage Docker services with systemd, but Crunch does not require that functionality, and it will be able to use SLURM's cgroups as container parents.  To do this, "configure the Docker daemon on all compute nodes":install-compute-node.html#configure_docker_daemon to run with the option @--exec-opt native.cgroupdriver=cgroupfs@.
+You can work around this issue by disabling the Docker daemon's systemd integration.  This makes it more difficult to manage Docker services with systemd, but Crunch does not require that functionality, and it will be able to use Slurm's cgroups as container parents.  To do this, "configure the Docker daemon on all compute nodes":install-compute-node.html#configure_docker_daemon to run with the option @--exec-opt native.cgroupdriver=cgroupfs@.
 
 {% include 'notebox_end' %}
 
index 39f1b725865169130adfb009999efcb76c8b02be..23bdd3b264e9a6aafb6b94cac699c4c12a538885 100644 (file)
@@ -1,7 +1,7 @@
 ---
 layout: default
 navsection: installguide
-title: Containers API SLURM prerequisites
+title: Containers API Slurm prerequisites
 ...
 {% comment %}
 Copyright (C) The Arvados Authors. All rights reserved.
index 7f4488fb36d3a49d7dfa2ea22449976221c5f533..061edf96c02cffc42d0cf9f3daf716a3149171e1 100644 (file)
@@ -1,7 +1,7 @@
 ---
 layout: default
 navsection: installguide
-title: Set up SLURM
+title: Set up Slurm
 ...
 {% comment %}
 Copyright (C) The Arvados Authors. All rights reserved.
@@ -9,12 +9,12 @@ Copyright (C) The Arvados Authors. All rights reserved.
 SPDX-License-Identifier: CC-BY-SA-3.0
 {% endcomment %}
 
-Containers can be dispatched to a SLURM cluster.  The dispatcher sends work to the cluster using SLURM's @sbatch@ command, so it works in a variety of SLURM configurations.
+Containers can be dispatched to a Slurm cluster.  The dispatcher sends work to the cluster using Slurm's @sbatch@ command, so it works in a variety of Slurm configurations.
 
 In order to run containers, you must run the dispatcher as a user that has permission to set up FUSE mounts and run Docker containers on each compute node.  This install guide refers to this user as the @crunch@ user.  We recommend you create this user on each compute node with the same UID and GID, and add it to the @fuse@ and @docker@ system groups to grant it the necessary permissions.  However, you can run the dispatcher under any account with sufficient permissions across the cluster.
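+
+For example, a minimal sketch of creating the @crunch@ user on a compute node; the UID/GID value 1100 is only an illustration, so pick values that are unused across your cluster:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo groupadd --gid 1100 crunch</span>
+~$ <span class="userinput">sudo useradd --uid 1100 --gid crunch --groups fuse,docker --create-home crunch</span>
+</code></pre>
+</notextile>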
 
 
-On the API server, install SLURM and munge, and generate a munge key.
+On the API server, install Slurm and munge, and generate a munge key.
 
 On Debian-based systems:
 
@@ -31,7 +31,7 @@ On Red Hat-based systems:
 </code></pre>
 </notextile>
 
-Now we need to give SLURM a configuration file.  On Debian-based systems, this is installed at @/etc/slurm-llnl/slurm.conf@.  On Red Hat-based systems, this is installed at @/etc/slurm/slurm.conf@.  Here's an example @slurm.conf@:
+Now we need to give Slurm a configuration file.  On Debian-based systems, this is installed at @/etc/slurm-llnl/slurm.conf@.  On Red Hat-based systems, this is installed at @/etc/slurm/slurm.conf@.  Here's an example @slurm.conf@:
 
 <notextile>
 <pre><code>
@@ -82,19 +82,19 @@ PartitionName=compute Nodes=compute[0-255] Default=YES Shared=YES
 </code></pre>
 </notextile>
 
-h3. SLURM configuration essentials
+h3. Slurm configuration essentials
 
 Whenever you change this file, you will need to update the copy _on every compute node_ as well as the controller node, and then run @sudo scontrol reconfigure@.
 
-*@ControlMachine@* should be a DNS name that resolves to the SLURM controller (dispatch/API server). This must resolve correctly on all SLURM worker nodes as well as the controller itself. In general SLURM is very sensitive about all of the nodes being able to communicate with the controller _and one another_, all using the same DNS names.
+*@ControlMachine@* should be a DNS name that resolves to the Slurm controller (dispatch/API server). This must resolve correctly on all Slurm worker nodes as well as the controller itself. In general Slurm is very sensitive about all of the nodes being able to communicate with the controller _and one another_, all using the same DNS names.
 
 *@SelectType=select/linear@* is needed on cloud-based installations that update node sizes dynamically, but it can only schedule one container at a time on each node. On a static or homogeneous cluster, use @SelectType=select/cons_res@ with @SelectTypeParameters=CR_CPU_Memory@ instead to enable node sharing.
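+
+For example, on a static or homogeneous cluster the corresponding @slurm.conf@ lines would be:
+
+<notextile>
+<pre><code>SelectType=select/cons_res
+SelectTypeParameters=CR_CPU_Memory
+</code></pre>
+</notextile>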
 
 *@NodeName=compute[0-255]@* establishes that the hostnames of the worker nodes will be compute0, compute1, etc. through compute255.
 * There are several ways to compress sequences of names, like @compute[0-9,80,100-110]@. See the "hostlist" discussion in the @slurm.conf(5)@ and @scontrol(1)@ man pages for more information.
-* It is not necessary for all of the nodes listed here to be alive in order for SLURM to work, although you should make sure the DNS entries exist. It is easiest to define lots of hostnames up front, assigning them to real nodes and updating your DNS records as the nodes appear. This minimizes the frequency of @slurm.conf@ updates and use of @scontrol reconfigure@.
+* It is not necessary for all of the nodes listed here to be alive in order for Slurm to work, although you should make sure the DNS entries exist. It is easiest to define lots of hostnames up front, assigning them to real nodes and updating your DNS records as the nodes appear. This minimizes the frequency of @slurm.conf@ updates and use of @scontrol reconfigure@.
 
-Each hostname in @slurm.conf@ must also resolve correctly on all SLURM worker nodes as well as the controller itself. Furthermore, the hostnames used in the configuration file must match the hostnames reported by @hostname@ or @hostname -s@ on the nodes themselves. This applies to the ControlMachine as well as the worker nodes.
+Each hostname in @slurm.conf@ must also resolve correctly on all Slurm worker nodes as well as the controller itself. Furthermore, the hostnames used in the configuration file must match the hostnames reported by @hostname@ or @hostname -s@ on the nodes themselves. This applies to the ControlMachine as well as the worker nodes.
 
 For example:
 * In @slurm.conf@ on control and worker nodes: @ControlMachine=ClusterID.example.com@
index 6d4ca930590e23606855b4f37c16027ae9d5f579..647995a8ca641c8e135f8187ed6d9ce927646c01 100644 (file)
@@ -1,7 +1,7 @@
 ---
 layout: default
 navsection: installguide
-title: Test SLURM dispatch
+title: Test Slurm dispatch
 ...
 {% comment %}
 Copyright (C) The Arvados Authors. All rights reserved.
@@ -10,19 +10,19 @@ SPDX-License-Identifier: CC-BY-SA-3.0
 {% endcomment %}
 
 {% include 'notebox_begin_warning' %}
-crunch-dispatch-slurm is only relevant for on premise clusters that will spool jobs to Slurm. Skip this section if you are installing a cloud cluster.
+crunch-dispatch-slurm is only relevant for on premises clusters that will spool jobs to Slurm. Skip this section if you are installing a cloud cluster.
 {% include 'notebox_end' %}
 
 h2. Test compute node setup
 
-You should now be able to submit SLURM jobs that run in Docker containers.  On the node where you're running the dispatcher, you can test this by running:
+You should now be able to submit Slurm jobs that run in Docker containers.  On the node where you're running the dispatcher, you can test this by running:
 
 <notextile>
 <pre><code>~$ <span class="userinput">sudo -u <b>crunch</b> srun -N1 docker run busybox echo OK
 </code></pre>
 </notextile>
 
-If it works, this command should print @OK@ (it may also show some status messages from SLURM and/or Docker).  If it does not print @OK@, double-check your compute node setup, and that the @crunch@ user can submit SLURM jobs.
+If it works, this command should print @OK@ (it may also show some status messages from Slurm and/or Docker).  If it does not print @OK@, double-check your compute node setup, and that the @crunch@ user can submit Slurm jobs.
 
 h2. Test the dispatcher
 
@@ -66,7 +66,7 @@ This command should return a record with a @container_uuid@ field.  Once @crunch
 </code></pre>
 </notextile>
 
-Before the container finishes, SLURM's @squeue@ command will show the new job in the list of queued and running jobs.  For example, you might see:
+Before the container finishes, Slurm's @squeue@ command will show the new job in the list of queued and running jobs.  For example, you might see:
 
 <notextile>
 <pre><code>~$ <span class="userinput">squeue --long</span>
index b31827bf70ed6c0a062cef1321cce68c393caaba..24f37bfb4f8ee25b3b32b691624e06586f9b42d1 100644 (file)
@@ -142,6 +142,7 @@ server {
     client_max_body_size    0;
     proxy_http_version      1.1;
     proxy_request_buffering off;
+    proxy_max_temp_file_size 0;
   }
 }
 </pre></notextile>
index ae6bd3989c340cbc64bb67932d9c1c3d8a8121e9..b4edd4f57b6e682560f8da16759b6042921dd0c3 100644 (file)
@@ -66,6 +66,7 @@ server {
   proxy_set_header        X-Real-IP $remote_addr;
   proxy_http_version      1.1;
   proxy_request_buffering off;
+  proxy_max_temp_file_size 0;
 
   ssl_certificate     <span class="userinput">/YOUR/PATH/TO/cert.pem</span>;
   ssl_certificate_key <span class="userinput">/YOUR/PATH/TO/cert.key</span>;
index 869ca15d9eb65c4e0feb22a0d29916bee3b354f5..3cb922642ec7c23448c74738a41facd9f83ac338 100644 (file)
@@ -52,6 +52,8 @@ Fill in the @Volumes@ section of @config.yml@ for each storage volume.  Availabl
 * If you are using S3-compatible object storage (including Amazon S3, Google Cloud Storage, and Ceph RADOS), follow the setup instructions on "S3 Object Storage":configure-s3-object-storage.html
 * If you are using Azure Blob Storage, follow the setup instructions on "Azure Blob Storage":configure-azure-blob-storage.html
 
+There are a number of general configuration parameters for Keepstore. They are described in the "configuration reference":{{site.baseurl}}/admin/config.html. In particular, you probably want to change @API/MaxKeepBlobBuffers@ to align Keepstore's memory usage with the available memory on the machine that hosts it.
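+
+For example, a sketch of setting this limit in @config.yml@; each buffer holds one Keep data block (up to 64 MiB), so the illustrative value below corresponds to roughly 8 GiB of buffer memory, and you should adjust it to the host's available RAM:
+
+<notextile>
+<pre><code>    API:
+      MaxKeepBlobBuffers: 128
+</code></pre>
+</notextile>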
+
 h3. List services
 
 Add each keepstore server to the @Services.Keepstore@ section of @/etc/arvados/config.yml@ .
index 2ce6e36a612b701ec7b8de0494a6f71b19f4b175..55095b1f20f05cb21e203a9ba6a39fa3f069a2dd 100644 (file)
@@ -9,7 +9,7 @@ Copyright (C) The Arvados Authors. All rights reserved.
 SPDX-License-Identifier: CC-BY-SA-3.0
 {% endcomment %}
 
-Before attempting installation, you should begin by reviewing supported platforms, choosing backends for identity, storage, and scheduling, and decide how you will distribute Arvados services onto machines.  You should also choose an Arvados Cluster ID, choose your hostnames, and aquire TLS certificates.  It may be helpful to make notes as you go along using one of these worksheets:  "New cluster checklist for AWS":new_cluster_checklist_AWS.xlsx - "New cluster checklist for Azure":new_cluster_checklist_Azure.xlsx - "New cluster checklist for on premise SLURM":new_cluster_checklist_slurm.xlsx
+Before attempting installation, you should begin by reviewing supported platforms, choosing backends for identity, storage, and scheduling, and deciding how you will distribute Arvados services onto machines.  You should also choose an Arvados Cluster ID, choose your hostnames, and acquire TLS certificates.  It may be helpful to make notes as you go along using one of these worksheets:  "New cluster checklist for AWS":new_cluster_checklist_AWS.xlsx - "New cluster checklist for Azure":new_cluster_checklist_Azure.xlsx - "New cluster checklist for on premises Slurm":new_cluster_checklist_slurm.xlsx
 
 The Arvados storage subsystem is called "keep".  The compute subsystem is called "crunch".
 
@@ -60,8 +60,8 @@ table(table table-bordered table-condensed).
 |"Shell server":install-shell-server.html |Synchronize (create/delete/configure) Unix shell accounts with Arvados users.|Optional.|
 |"Git server":install-arv-git-httpd.html |Arvados-hosted git repositories, with Arvados-token based authentication.|Optional, but required by Workflow Composer.|
 |\3=. *Crunch (running containers)*|
-|"crunch-dispatch-slurm":crunch2-slurm/install-prerequisites.html |Run analysis workflows using Docker containers distributed across a SLURM cluster.|Optional if you wish to use Arvados for data management only.|
-|"Node Manager":install-nodemanager.html, "arvados-dispatch-cloud":install-dispatch-cloud.html |Allocate and free cloud VM instances on demand based on workload.|Optional, not needed for a static SLURM cluster (such as on-premise HPC).|
+|"arvados-dispatch-cloud":crunch2-cloud/install-dispatch-cloud.html |Allocate and free cloud VM instances on demand based on workload.|Optional, not needed for a static Slurm cluster such as on-premises HPC.|
+|"crunch-dispatch-slurm":crunch2-slurm/install-prerequisites.html |Run analysis workflows using Docker containers distributed across a Slurm cluster.|Optional, not needed for a Cloud installation, or if you wish to use Arvados for data management only.|
 
 h2(#identity). Identity provider
 
diff --git a/doc/install/install-nodemanager.html.textile.liquid b/doc/install/install-nodemanager.html.textile.liquid
deleted file mode 100644 (file)
index 431fc10..0000000
+++ /dev/null
@@ -1,629 +0,0 @@
----
-layout: default
-navsection: installguide
-title: Install Node Manager
-...
-{% comment %}
-Copyright (C) The Arvados Authors. All rights reserved.
-
-SPDX-License-Identifier: CC-BY-SA-3.0
-{% endcomment %}
-
-Arvados Node Manager provides elastic computing for Arvados and SLURM by creating and destroying virtual machines on demand.  Node Manager currently supports Amazon Web Services (AWS), Google Cloud Platform (GCP) and Microsoft Azure.
-
-Note: node manager is only required for elastic computing cloud environments.  Fixed size clusters (such as on-premise HPC) do not require node manager.
-
-h2. Install
-
-Node manager may run anywhere, however it must be able to communicate with the cloud provider's APIs, and use the command line tools @sinfo@, @squeue@ and @scontrol@ to communicate with the cluster's SLURM controller.
-
-On Debian-based systems:
-
-<notextile>
-<pre><code>~$ <span class="userinput">sudo apt-get install arvados-node-manager</span>
-</code></pre>
-</notextile>
-
-On Red Hat-based systems:
-
-<notextile>
-<pre><code>~$ <span class="userinput">sudo yum install arvados-node-manager</span>
-</code></pre>
-</notextile>
-
-h2. Create compute image
-
-Configure a virtual machine following the "instructions to set up a compute node.":{{site.baseurl}}/install/crunch2-slurm/install-compute-node.html and set it up to run a "ping script":{{site.baseurl}}/install/install-compute-ping.html at boot.
-
-Create a virtual machine image using the commands provided by your cloud provider.  We recommend using a tool such as "Packer":https://www.packer.io/ to automate this process.
-
-Configure node manager to use the image with the @image@ or @image_id@ parameter.
-
-h2. Configure node manager
-
-The configuration file at @/etc/arvados-node-manager/config.ini@ .  Some configuration details are specific to the cloud provider you are using:
-
-* "Amazon Web Services":#aws
-* "Google Cloud Platform":#gcp
-* "Microsoft Azure":#azure
-
-h3(#aws). Amazon Web Services
-
-<pre>
-# EC2 configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-
-[Manage]
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-
-# Management server listening address (default 127.0.0.1)
-#address = 0.0.0.0
-
-# Management server port number (default -1, server is disabled)
-#port = 8989
-
-[Daemon]
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the SLURM dispatcher drains nodes
-# through SLURM before shutting them down.
-dispatcher = slurm
-
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-
-# Upper limit on rate of spending (in $/hr), will not boot additional nodes
-# if total price of already running nodes meets or exceeds this threshold.
-# default 0 means no limit.
-max_total_price = 0
-
-# Poll EC2 nodes and Arvados for new information every N seconds.
-poll_time = 60
-
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 300
-
-# If Node Manager can't succesfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 600
-
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 1800
-
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-
-# Number of consecutive times a node must report as "idle" before it
-# will be considered eligible for shutdown.  Node status is checked
-# each poll period, and node can go idle at any point during a poll
-# period (meaning a node could be reported as idle that has only been
-# idle for 1 second).  With a 60 second poll period, three consecutive
-# status updates of "idle" suggests the node has been idle at least
-# 121 seconds.
-consecutive_idle_count = 3
-
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-
-[Logging]
-# Log file path
-file = /var/log/arvados/node-manager.log
-
-# Log level for most Node Manager messages.
-# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = INFO
-
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-
-[Arvados]
-host = zyxwv.arvadosapi.com
-token = ARVADOS_TOKEN
-timeout = 15
-
-# Accept an untrusted SSL certificate from the API server?
-insecure = no
-
-[Cloud]
-provider = ec2
-
-# It's usually most cost-effective to shut down compute nodes during narrow
-# windows of time.  For example, EC2 bills each node by the hour, so the best
-# time to shut down a node is right before a new hour of uptime starts.
-# Shutdown windows define these periods of time.  These are windows in
-# full minutes, separated by commas.  Counting from the time the node is
-# booted, the node WILL NOT shut down for N1 minutes; then it MAY shut down
-# for N2 minutes; then it WILL NOT shut down for N3 minutes; and so on.
-# For example, "54, 5, 1" means the node may shut down from the 54th to the
-# 59th minute of each hour of uptime.
-# Specify at least two windows.  You can add as many as you need beyond that.
-shutdown_windows = 54, 5, 1
-
-[Cloud Credentials]
-key = KEY
-secret = SECRET_KEY
-region = us-east-1
-timeout = 60
-
-[Cloud List]
-# This section defines filters that find compute nodes.
-# Tags that you specify here will automatically be added to nodes you create.
-# Replace colons in Amazon filters with underscores
-# (e.g., write "tag:mytag" as "tag_mytag").
-instance-state-name = running
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-
-[Cloud Create]
-# New compute nodes will send pings to Arvados at this host.
-# You may specify a port, and use brackets to disambiguate IPv6 addresses.
-ping_host = hostname:port
-
-# Give the name of an SSH key on AWS...
-ex_keyname = string
-
-# ... or a file path for an SSH key that can log in to the compute node.
-# (One or the other, not both.)
-# ssh_key = path
-
-# The EC2 IDs of the image and subnet compute nodes should use.
-image_id = idstring
-subnet_id = idstring
-
-# Comma-separated EC2 IDs for the security group(s) assigned to each
-# compute node.
-security_groups = idstring1, idstring2
-
-# Apply an Instance Profile ARN to the newly created compute nodes
-# For more info, see:
-# https://aws.amazon.com/premiumsupport/knowledge-center/iam-policy-restrict-vpc/
-# ex_iamprofile = arn:aws:iam::ACCOUNTNUMBER:instance-profile/ROLENAME
-
-
-# You can define any number of Size sections to list EC2 sizes you're
-# willing to use.  The Node Manager should boot the cheapest size(s) that
-# can run jobs in the queue.
-#
-# Each size section MUST define the number of cores are available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.  You can also override Amazon's provided
-# data fields (such as price per hour) by setting them here.
-
-[Size m4.large]
-cores = 2
-price = 0.126
-scratch = 100
-
-[Size m4.xlarge]
-cores = 4
-price = 0.252
-scratch = 100
-</pre>
-
-h3(#gcp). Google Cloud Platform
-
-<pre>
-# Google Compute Engine configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-
-[Manage]
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-
-# Management server listening address (default 127.0.0.1)
-#address = 0.0.0.0
-
-# Management server port number (default -1, server is disabled)
-#port = 8989
-
-[Daemon]
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-
-# Node Manager will not start any compute nodes when at least this
-# running at all times.  By default, these will be the cheapest node size.
-max_nodes = 8
-
-# Poll compute nodes and Arvados for new information every N seconds.
-poll_time = 60
-
-# Upper limit on rate of spending (in $/hr), will not boot additional nodes
-# if total price of already running nodes meets or exceeds this threshold.
-# default 0 means no limit.
-max_total_price = 0
-
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 300
-
-# If Node Manager can't succesfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 600
-
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-
-# Number of consecutive times a node must report as "idle" before it
-# will be considered eligible for shutdown.  Node status is checked
-# each poll period, and node can go idle at any point during a poll
-# period (meaning a node could be reported as idle that has only been
-# idle for 1 second).  With a 60 second poll period, three consecutive
-# status updates of "idle" suggests the node has been idle at least
-# 121 seconds.
-consecutive_idle_count = 3
-
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-
-[Logging]
-# Log file path
-file = /var/log/arvados/node-manager.log
-
-# Log level for most Node Manager messages.
-# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = INFO
-
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-
-[Arvados]
-host = zyxwv.arvadosapi.com
-token = ARVADOS_TOKEN
-timeout = 15
-
-# Accept an untrusted SSL certificate from the API server?
-insecure = no
-
-[Cloud]
-provider = gce
-
-# Shutdown windows define periods of time when a node may and may not
-# be shut down.  These are windows in full minutes, separated by
-# commas.  Counting from the time the node is booted, the node WILL
-# NOT shut down for N1 minutes; then it MAY shut down for N2 minutes;
-# then it WILL NOT shut down for N3 minutes; and so on.  For example,
-# "54, 5, 1" means the node may shut down from the 54th to the 59th
-# minute of each hour of uptime.
-# GCE bills by the minute, and does not provide information about when
-# a node booted.  Node Manager will store this information in metadata
-# when it boots a node; if that information is not available, it will
-# assume the node booted at the epoch.  These shutdown settings are
-# very aggressive.  You may want to adjust this if you want more
-# continuity of service from a single node.
-shutdown_windows = 20, 999999
-
-[Cloud Credentials]
-user_id = client_email_address@developer.gserviceaccount.com
-key = path_to_certificate.pem
-project = project-id-from-google-cloud-dashboard
-timeout = 60
-
-# Valid location (zone) names: https://cloud.google.com/compute/docs/zones
-datacenter = us-central1-a
-
-# Optional settings. For full documentation see
-# http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#libcloud.compute.drivers.gce.GCENodeDriver
-#
-# auth_type = SA               # SA, IA or GCE
-# scopes = https://www.googleapis.com/auth/compute
-# credential_file =
-
-[Cloud List]
-# A comma-separated list of tags that must be applied to a node for it to
-# be considered a compute node.
-# The driver will automatically apply these tags to nodes it creates.
-tags = zyxwv, compute
-
-[Cloud Create]
-# New compute nodes will send pings to Arvados at this host.
-# You may specify a port, and use brackets to disambiguate IPv6 addresses.
-ping_host = hostname:port
-
-# A file path for an SSH key that can log in to the compute node.
-# ssh_key = path
-
-# The GCE image name and network zone name to use when creating new nodes.
-image = debian
-# network = your_network_name
-
-# JSON string of service account authorizations for this cluster.
-# See http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#specifying-service-account-scopes
-# service_accounts = [{'email':'account@example.com', 'scopes':['storage-ro']}]
-
-
-# You can define any number of Size sections to list node sizes you're
-# willing to use.  The Node Manager should boot the cheapest size(s) that
-# can run jobs in the queue.
-#
-# The Size fields are interpreted the same way as with a libcloud NodeSize:
-# http://libcloud.readthedocs.org/en/latest/compute/api.html#libcloud.compute.base.NodeSize
-#
-# See https://cloud.google.com/compute/docs/machine-types for a list
-# of known machine types that may be used as a Size parameter.
-#
-# Each size section MUST define the number of cores are available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.
-# You can also override Google's provided data fields (such as price per hour)
-# by setting them here.
-
-[Size n1-standard-2]
-cores = 2
-price = 0.076
-scratch = 100
-
-[Size n1-standard-4]
-cores = 4
-price = 0.152
-scratch = 200
-</pre>
-
-h3(#azure). Microsoft Azure
-
-<pre>
-# Azure configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-
-[Manage]
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-
-# Management server listening address (default 127.0.0.1)
-#address = 0.0.0.0
-
-# Management server port number (default -1, server is disabled)
-#port = 8989
-
-[Daemon]
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the SLURM dispatcher drains nodes
-# through SLURM before shutting them down.
-dispatcher = slurm
-
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-
-# Upper limit on rate of spending (in $/hr), will not boot additional nodes
-# if total price of already running nodes meets or exceeds this threshold.
-# default 0 means no limit.
-max_total_price = 0
-
-# Poll Azure nodes and Arvados for new information every N seconds.
-poll_time = 60
-
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 300
-
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 600
-
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 1800
-
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-
-# Number of consecutive times a node must report as "idle" before it
-# will be considered eligible for shutdown.  Node status is checked
-# each poll period, and node can go idle at any point during a poll
-# period (meaning a node could be reported as idle that has only been
-# idle for 1 second).  With a 60 second poll period, three consecutive
-# status updates of "idle" suggest the node has been idle at least
-# 121 seconds.
-consecutive_idle_count = 3
-
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-
-[Logging]
-# Log file path
-file = /var/log/arvados/node-manager.log
-
-# Log level for most Node Manager messages.
-# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = INFO
-
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-
-[Arvados]
-host = zyxwv.arvadosapi.com
-token = ARVADOS_TOKEN
-timeout = 15
-
-# Accept an untrusted SSL certificate from the API server?
-insecure = no
-
-[Cloud]
-provider = azure
-
-# Shutdown windows define periods of time when a node may and may not be shut
-# down.  These are windows in full minutes, separated by commas.  Counting from
-# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
-# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
-# and so on.  For example, "20, 999999" means the node may shut down between
-# the 20th and 999999th minutes of uptime.
-# Azure bills by the minute, so it makes sense to aggressively shut down idle
-# nodes.  Specify at least two windows.  You can add as many as you need beyond
-# that.
-shutdown_windows = 20, 999999
-
-[Cloud Credentials]
-# Use "azure account list" with the azure CLI to get these values.
-tenant_id = 00000000-0000-0000-0000-000000000000
-subscription_id = 00000000-0000-0000-0000-000000000000
-
-# The following directions are based on
-# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
-# and updated for v2 of the Azure cli tool.
-#
-# az ad app create --display-name "Node Manager" --homepage "https://arvados.org" --identifier-uris "https://<Your_Application_Uri>" --password <Your_Password> --end-date <Desired_credential_expiry_date>
-# az ad sp create "<Application_Id>"
-# az role assignment create --assignee "<Application_Id>" --role Owner --resource-group "<Your_Azure_Arvados_Resource_Group>"
-#
-# Use <Application_Id> for "key" and the <Your_Password> for "secret"
-#
-key = 00000000-0000-0000-0000-000000000000
-secret = PASSWORD
-timeout = 60
-region = East US
-
-[Cloud List]
-# The resource group in which the compute node virtual machines will be created
-# and listed.
-ex_resource_group = ArvadosResourceGroup
-
-[Cloud Create]
-# The compute node image, as a link to a VHD in Azure blob store.
-image = https://example.blob.core.windows.net/system/Microsoft.Compute/Images/images/zyxwv-compute-osDisk.vhd
-
-# Path to a local ssh key file that will be used to provision new nodes.
-ssh_key = /home/arvadosuser/.ssh/id_rsa.pub
-
-# The account name for the admin user that will be provisioned on new nodes.
-ex_user_name = arvadosuser
-
-# The Azure storage account that will be used to store the node OS disk images.
-ex_storage_account = arvadosstorage
-
-# The virtual network the VMs will be associated with.
-ex_network = ArvadosNetwork
-
-# Optional subnet of the virtual network.
-#ex_subnet = default
-
-# Node tags
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-
-# the API server to ping
-ping_host = hostname:port
-
-# You can define any number of Size sections to list Azure sizes you're willing
-# to use.  The Node Manager should boot the cheapest size(s) that can run jobs
-# in the queue.  You must also provide price per hour as the Azure driver
-# compute currently does not report prices.
-#
-# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
-# for a list of known machine types that may be used as a Size parameter.
-#
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.  You can also override Microsoft's provided
-# data fields by setting them here.
-
-[Size Standard_D3]
-cores = 4
-price = 0.56
-
-[Size Standard_D4]
-cores = 8
-price = 1.12
-</pre>
-
-h2. Running
-
-<pre>
-$ arvados-node-manager --config /etc/arvados-node-manager/config.ini
-</pre>
index 7bb9fdcbe602c5009242798295b043cfb2508204..5fcfcbe3bc3e8c00b3e4f20467ad224271e47c97 100644 (file)
@@ -9,7 +9,13 @@ Copyright (C) The Arvados Authors. All rights reserved.
 SPDX-License-Identifier: CC-BY-SA-3.0
 {% endcomment %}
 
-h2. Upgrading to CWL v1.1
+h2(#v12). Upgrading your workflows to CWL v1.2
+
+If you are starting from a CWL v1.0 document, see "Upgrading your workflows to CWL v1.1":#v11 below.
+
+If you are starting from a CWL v1.1 document, you should be able to simply change @cwlVersion: v1.1@ to @cwlVersion: v1.2@ and take advantage of the new features of v1.2, such as conditional workflow steps.
+
+h2(#v11). Upgrading your workflows to CWL v1.1
 
 CWL v1.1 introduces several features to the standard that were previously available as Arvados extensions.  CWL v1.1 syntax is backwards compatible with v1.0, so you can just change @cwlVersion: v1.0@ to @cwlVersion: v1.1@ and update your script to use the standard features.  On Arvados, there is only one behavior change between CWL v1.0 and v1.1 to be aware of: for performance reasons, Directory listings are no longer loaded by default.  To control loading Directory listings, use "loadListing":https://www.commonwl.org/v1.1/CommandLineTool.html#CommandInputParameter or "LoadListingRequirement":https://www.commonwl.org/v1.1/CommandLineTool.html#LoadListingRequirement (the extension @cwltool:LoadListingRequirement@ is deprecated.)
 
diff --git a/go.mod b/go.mod
index cc5457975f54da4d6e00702a955451f104fe39d1..71052882adbeff703ae81a21900561afe15c8743 100644 (file)
--- a/go.mod
+++ b/go.mod
@@ -11,6 +11,7 @@ require (
        github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 // indirect
        github.com/arvados/cgofuse v1.2.0-arvados1
        github.com/aws/aws-sdk-go v1.25.30
+       github.com/aws/aws-sdk-go-v2 v0.23.0
        github.com/bgentry/speakeasy v0.1.0 // indirect
        github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092
        github.com/coreos/go-oidc v2.1.0+incompatible
@@ -22,6 +23,7 @@ require (
        github.com/docker/docker v1.4.2-0.20180109013817-94b8a116fbf1
        github.com/docker/go-connections v0.3.0 // indirect
        github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d // indirect
+       github.com/dustin/go-humanize v1.0.0
        github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 // indirect
        github.com/fsnotify/fsnotify v1.4.9
        github.com/ghodss/yaml v1.0.0
@@ -35,6 +37,8 @@ require (
        github.com/imdario/mergo v0.3.8-0.20190415133143-5ef87b449ca7
        github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect
        github.com/jmcvetta/randutil v0.0.0-20150817122601-2bb1b664bcff
+       github.com/jmoiron/sqlx v1.2.0
+       github.com/johannesboyne/gofakes3 v0.0.0-20200716060623-6b2b4cb092cc
        github.com/julienschmidt/httprouter v1.2.0
        github.com/karalabe/xgo v0.0.0-20191115072854-c5ccff8648a7 // indirect
        github.com/kevinburke/ssh_config v0.0.0-20171013211458-802051befeb5 // indirect
@@ -56,7 +60,7 @@ require (
        github.com/stretchr/testify v1.4.0 // indirect
        github.com/xanzy/ssh-agent v0.1.0 // indirect
        golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550
-       golang.org/x/net v0.0.0-20190620200207-3b0461eec859
+       golang.org/x/net v0.0.0-20200202094626-16171245cfb2
        golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45
        golang.org/x/sys v0.0.0-20191105231009-c1f44814a5cd
        google.golang.org/api v0.13.0
diff --git a/go.sum b/go.sum
index 38153ce3eaa08844dd2abfb944b9318145fbeed0..2565964e7d45121d76e59e0c7b5e21743beaaa28 100644 (file)
--- a/go.sum
+++ b/go.sum
@@ -21,14 +21,18 @@ github.com/arvados/cgofuse v1.2.0-arvados1 h1:4Q4vRJ4hbTCcI4gGEaa6hqwj3rqlUuzeFQ
 github.com/arvados/cgofuse v1.2.0-arvados1/go.mod h1:79WFV98hrkRHK9XPhh2IGGOwpFSjocsWubgxAs2KhRc=
 github.com/arvados/goamz v0.0.0-20190905141525-1bba09f407ef h1:cl7DIRbiAYNqaVxg3CZY8qfZoBOKrj06H/x9SPGaxas=
 github.com/arvados/goamz v0.0.0-20190905141525-1bba09f407ef/go.mod h1:rCtgyMmBGEbjTm37fCuBYbNL0IhztiALzo3OB9HyiOM=
+github.com/aws/aws-sdk-go v1.17.4/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo=
 github.com/aws/aws-sdk-go v1.25.30 h1:I9qj6zW3mMfsg91e+GMSN/INcaX9tTFvr/l/BAHKaIY=
 github.com/aws/aws-sdk-go v1.25.30/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo=
+github.com/aws/aws-sdk-go-v2 v0.23.0 h1:+E1q1LLSfHSDn/DzOtdJOX+pLZE2HiNV2yO5AjZINwM=
+github.com/aws/aws-sdk-go-v2 v0.23.0/go.mod h1:2LhT7UgHOXK3UXONKI5OMgIyoQL6zTAw/jwIeX6yqzw=
 github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
 github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/bgentry/speakeasy v0.1.0 h1:ByYyxL9InA1OWqxJqqp2A5pYHUrCiAL6K3J+LKSsQkY=
 github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
+github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps=
 github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092 h1:0Di2onNnlN5PAyWPbqlPyN45eOQ+QW/J9eqLynt4IV4=
 github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092/go.mod h1:8IzBjZCRSnsvM6MJMG8HNNtnzMl48H22rbJL2kRUJ0Y=
 github.com/cespare/xxhash/v2 v2.1.0 h1:yTUvW7Vhb89inJ+8irsUqiWjh8iT6sQPZiQzI6ReGkA=
@@ -56,6 +60,8 @@ github.com/docker/go-connections v0.3.0 h1:3lOnM9cSzgGwx8VfK/NGOW5fLQ0GjIlCkaktF
 github.com/docker/go-connections v0.3.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec=
 github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d h1:dVaNRYvaGV23AdNdsm+4y1mPN0tj3/1v6taqKMmM6Ko=
 github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
+github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
+github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
 github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 h1:BHsljHzVlRcyQhjrss6TZTdY2VfCqZPbv5k3iBFa2ZQ=
 github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc=
 github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4=
@@ -72,6 +78,7 @@ github.com/go-ldap/ldap v3.0.3+incompatible h1:HTeSZO8hWMS1Rgb2Ziku6b8a7qRIZZMHj
 github.com/go-ldap/ldap v3.0.3+incompatible/go.mod h1:qfd9rJvER9Q0/D/Sqn1DfHRoBp40uXYvFoEVrNEPqRc=
 github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE=
 github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk=
+github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg=
 github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
 github.com/gogo/protobuf v1.1.1 h1:72R+M5VuhED/KujmZVcIquuo8mBgX4oVda//DQb3PXo=
 github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
@@ -88,6 +95,7 @@ github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Z
 github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
 github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY=
 github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
+github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs=
 github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
@@ -109,6 +117,10 @@ github.com/jmcvetta/randutil v0.0.0-20150817122601-2bb1b664bcff h1:6NvhExg4omUC9
 github.com/jmcvetta/randutil v0.0.0-20150817122601-2bb1b664bcff/go.mod h1:ddfPX8Z28YMjiqoaJhNBzWHapTHXejnB5cDCUWDwriw=
 github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af h1:pmfjZENx5imkbgOkpRUYLnmbU7UEFbjtDA2hxJ1ichM=
 github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k=
+github.com/jmoiron/sqlx v1.2.0 h1:41Ip0zITnmWNR/vHV+S4m+VoUivnWY5E4OJfLZjCJMA=
+github.com/jmoiron/sqlx v1.2.0/go.mod h1:1FEQNm3xlJgrMD+FBdI9+xvCksHtbpVBBw5dYhBSsks=
+github.com/johannesboyne/gofakes3 v0.0.0-20200716060623-6b2b4cb092cc h1:JJPhSHowepOF2+ElJVyb9jgt5ZyBkPMkPuhS0uODSFs=
+github.com/johannesboyne/gofakes3 v0.0.0-20200716060623-6b2b4cb092cc/go.mod h1:fNiSoOiEI5KlkWXn26OwKnNe58ilTIkpBlgOrt7Olu8=
 github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
 github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
 github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
@@ -121,10 +133,12 @@ github.com/kevinburke/ssh_config v0.0.0-20171013211458-802051befeb5/go.mod h1:CT
 github.com/konsorten/go-windows-terminal-sequences v1.0.1 h1:mweAR1A6xJ3oS2pRaGiHgQ4OO8tzTaLawm8vnODuwDk=
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
+github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
 github.com/lib/pq v1.3.0 h1:/qkRGz8zljWiDcFvgpwUpwIAPu3r07TDvs3Rws+o/pU=
 github.com/lib/pq v1.3.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
 github.com/marstr/guid v1.1.1-0.20170427235115-8bdf7d1a087c h1:ouxemItv3B/Zh008HJkEXDYCN3BIRyNHxtUN7ThJ5Js=
 github.com/marstr/guid v1.1.1-0.20170427235115-8bdf7d1a087c/go.mod h1:74gB1z2wpxxInTG6yaqA7KrtM0NZ+RbrcqDvYHefzho=
+github.com/mattn/go-sqlite3 v1.9.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=
 github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU=
 github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
 github.com/mitchellh/go-homedir v0.0.0-20161203194507-b8bc1bf76747 h1:eQox4Rh4ewJF+mqYPxCkmBAirRnPaHEB26UkNuPyjlk=
@@ -164,13 +178,17 @@ github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R
 github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
 github.com/prometheus/procfs v0.0.5 h1:3+auTFlqw+ZaQYJARz6ArODtkaIwtvBTx3N2NehQlL8=
 github.com/prometheus/procfs v0.0.5/go.mod h1:4A/X28fw3Fc593LaREMrKMqOKvUAntwMDaekg4FpcdQ=
+github.com/ryszard/goskiplist v0.0.0-20150312221310-2dfbae5fcf46 h1:GHRpF1pTW19a8tTFrMLUcfWwyC0pnifVo2ClaLq+hP8=
+github.com/ryszard/goskiplist v0.0.0-20150312221310-2dfbae5fcf46/go.mod h1:uAQ5PCi+MFsC7HjREoAz1BU+Mq60+05gifQSsHSDG/8=
 github.com/satori/go.uuid v1.2.1-0.20180103174451-36e9d2ebbde5 h1:Jw7W4WMfQDxsXvfeFSaS2cHlY7bAF4MGrgnbd0+Uo78=
 github.com/satori/go.uuid v1.2.1-0.20180103174451-36e9d2ebbde5/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0=
 github.com/sergi/go-diff v1.0.0 h1:Kpca3qRNrduNnOQeazBd0ysaKrUJiIuISHxogkT9RPQ=
 github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
+github.com/shabbyrobe/gocovmerge v0.0.0-20180507124511-f6ea450bfb63/go.mod h1:n+VKSARF5y/tS9XFSP7vWDfS+GUC5vs/YT7M5XDTUEM=
 github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
 github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4=
 github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
+github.com/spf13/afero v1.2.1/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk=
 github.com/src-d/gcfg v1.3.0 h1:2BEDr8r0I0b8h/fOqwtxCEiq2HJu8n2JGZJQFGXWLjg=
 github.com/src-d/gcfg v1.3.0/go.mod h1:p/UMsR43ujA89BJY9duynAwIpvqEujIH/jFlfL7jWoI=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
@@ -199,6 +217,7 @@ golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73r
 golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190310074541-c10a0554eabf/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c h1:uOCk1iQW6Vc18bnC13MfzScl+wdKBmM9Y9kU7Z83/lw=
@@ -207,6 +226,7 @@ golang.org/x/net v0.0.0-20190613194153-d28f0bde5980 h1:dfGZHvZk057jK2MCeWus/TowK
 golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI=
 golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
 golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 h1:SVwTIAaPC2U/AvvLNZ2a7OVsmBpC8L5BlwK1whH3hm0=
@@ -220,6 +240,7 @@ golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5h
 golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190310054646-10058d7d4faa/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -235,10 +256,12 @@ golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxb
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
+golang.org/x/tools v0.0.0-20190308174544-00c44ba9c14f/go.mod h1:25r3+/G6/xytQM8iWZKq3Hn0kr0rgFKPUNVEL/dr3z4=
 golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
 golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
 golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c h1:97SnQk1GYRXJgvwZ8fadnxDOWfKvkNQHH3CtZntPSrM=
 golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
 google.golang.org/api v0.13.0 h1:Q3Ui3V3/CVinFWFiW39Iw0kMuVrRzYX0wN6OPFp0lTA=
 google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI=
@@ -260,6 +283,7 @@ gopkg.in/asn1-ber.v1 v1.0.0-20181015200546-f715ec2f112d/go.mod h1:cuepJuh7vyXfUy
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20161208181325-20d25e280405 h1:829vOVxxusYHC+IqBtkX5mbKtsY9fheQiQn0MZRVLfQ=
 gopkg.in/check.v1 v1.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/mgo.v2 v2.0.0-20180705113604-9856a29383ce/go.mod h1:yeKp02qBN3iKW1OzL3MGk2IdtZzaj7SFntXj72NppTA=
 gopkg.in/square/go-jose.v2 v2.3.1 h1:SK5KegNXmKmqE342YYN2qPHEnUYeoMiXXl1poUlI+o4=
 gopkg.in/square/go-jose.v2 v2.3.1/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76mk0e1AI=
 gopkg.in/src-d/go-billy.v4 v4.0.1 h1:iMxwQPj2cuKRyaIZ985zxClkcdTtT5VpXYf4PTJc0Ek=
index 752de152da96a2dd7895a770ebd142b3139f5ffe..6de367aa251c4c034b77331befde540782dc89d5 100644 (file)
@@ -43,6 +43,7 @@ type azureInstanceSetConfig struct {
        ResourceGroup                string
        Location                     string
        Network                      string
+       NetworkResourceGroup         string
        Subnet                       string
        StorageAccount               string
        BlobContainer                string
@@ -356,6 +357,11 @@ func (az *azureInstanceSet) Create(
        }
        tags["created-at"] = to.StringPtr(time.Now().Format(time.RFC3339Nano))
 
+       networkResourceGroup := az.azconfig.NetworkResourceGroup
+       if networkResourceGroup == "" {
+               networkResourceGroup = az.azconfig.ResourceGroup
+       }
+
        nicParameters := network.Interface{
                Location: &az.azconfig.Location,
                Tags:     tags,
@@ -368,7 +374,7 @@ func (az *azureInstanceSet) Create(
                                                        ID: to.StringPtr(fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers"+
                                                                "/Microsoft.Network/virtualnetworks/%s/subnets/%s",
                                                                az.azconfig.SubscriptionID,
-                                                               az.azconfig.ResourceGroup,
+                                                               networkResourceGroup,
                                                                az.azconfig.Network,
                                                                az.azconfig.Subnet)),
                                                },
index e0c1d75765b5195a402d4b638ba089bd1dc13753..cc871087c31f0f13405430683553c7256e77e819 100644 (file)
@@ -139,9 +139,6 @@ Clusters:
       Workbench2:
         InternalURLs: {}
         ExternalURL: ""
-      Nodemanager:
-        InternalURLs: {}
-        ExternalURL: "-"
       Health:
         InternalURLs: {}
         ExternalURL: "-"
@@ -1011,13 +1008,29 @@ Clusters:
 
           # (azure) Instance configuration.
           CloudEnvironment: AzurePublicCloud
-          ResourceGroup: ""
           Location: centralus
+
+          # (azure) The resource group where the VM and virtual NIC will be
+          # created.
+          ResourceGroup: ""
+
+          # (azure) The resource group of the Network to use for the virtual
+          # NIC (if different from ResourceGroup)
+          NetworkResourceGroup: ""
           Network: ""
           Subnet: ""
+
+          # (azure) Where to store the VM VHD blobs
           StorageAccount: ""
           BlobContainer: ""
+
+          # (azure) How long to wait before deleting VHD and NIC
+          # objects that are no longer being used.
           DeleteDanglingResourcesAfter: 20s
+
+          # Account (that already exists in the VM image) that will be
+          # set up with an ssh authorized key to allow the compute
+          # dispatcher to connect.
           AdminUsername: arvados
 
     InstanceTypes:
@@ -1075,6 +1088,8 @@ Clusters:
           ConnectTimeout: 1m
           ReadTimeout: 10m
           RaceWindow: 24h
+          # Use the aws-sdk-go-v2 driver instead of goamz
+          UseAWSS3v2Driver: false
 
           # For S3 driver, potentially unsafe tuning parameter,
           # intentionally excluded from main documentation.
index 1be7208ee38facce00e71f2cfdf07885ccffde08..0552b66adb80bed162ee3f2518a1c649c0c89ec2 100644 (file)
@@ -43,7 +43,6 @@ type nodeProfile struct {
        Keepproxy     systemServiceInstance `json:"keepproxy"`
        Keepstore     systemServiceInstance `json:"keepstore"`
        Keepweb       systemServiceInstance `json:"keep-web"`
-       Nodemanager   systemServiceInstance `json:"arvados-node-manager"`
        DispatchCloud systemServiceInstance `json:"arvados-dispatch-cloud"`
        RailsAPI      systemServiceInstance `json:"arvados-api-server"`
        Websocket     systemServiceInstance `json:"arvados-ws"`
index 87e26fd09672805aa5bd840b757df71778477057..ca376ba0bb233f56f6606adb8404d1e39bffa4aa 100644 (file)
@@ -205,6 +205,7 @@ func (s *LoadSuite) TestLegacyKeepproxyConfig(c *check.C) {
 
        content = []byte(fmtKeepproxyConfig("", false))
        cluster, err = testLoadLegacyConfig(content, f, c)
+       c.Check(err, check.IsNil)
        c.Check(cluster.SystemLogs.LogLevel, check.Equals, "info")
 
        content = []byte(fmtKeepproxyConfig(`"DisableGet": true,`, true))
index e02ed6a418a5aa2f1623f15e7d03664110b82c26..0374ff7c7b14dd9188b58ae149e98f8fdafb4d94 100644 (file)
@@ -145,9 +145,6 @@ Clusters:
       Workbench2:
         InternalURLs: {}
         ExternalURL: ""
-      Nodemanager:
-        InternalURLs: {}
-        ExternalURL: "-"
       Health:
         InternalURLs: {}
         ExternalURL: "-"
@@ -1017,13 +1014,29 @@ Clusters:
 
           # (azure) Instance configuration.
           CloudEnvironment: AzurePublicCloud
-          ResourceGroup: ""
           Location: centralus
+
+          # (azure) The resource group where the VM and virtual NIC will be
+          # created.
+          ResourceGroup: ""
+
+          # (azure) The resource group of the Network to use for the virtual
+          # NIC (if different from ResourceGroup)
+          NetworkResourceGroup: ""
           Network: ""
           Subnet: ""
+
+          # (azure) Where to store the VM VHD blobs
           StorageAccount: ""
           BlobContainer: ""
+
+          # (azure) How long to wait before deleting VHD and NIC
+          # objects that are no longer being used.
           DeleteDanglingResourcesAfter: 20s
+
+          # Account (that already exists in the VM image) that will be
+          # set up with an ssh authorized key to allow the compute
+          # dispatcher to connect.
           AdminUsername: arvados
 
     InstanceTypes:
@@ -1081,6 +1094,8 @@ Clusters:
           ConnectTimeout: 1m
           ReadTimeout: 10m
           RaceWindow: 24h
+          # Use the aws-sdk-go-v2 driver instead of goamz
+          UseAWSS3v2Driver: false
 
           # For S3 driver, potentially unsafe tuning parameter,
           # intentionally excluded from main documentation.
index 5e912f91aa835e51cec5210b1d6b52c1a9016e39..58ddf950efdd82d1f1a2f26b4d3b4a248f116764 100644 (file)
@@ -164,6 +164,7 @@ func (s *LoadSuite) TestSampleKeys(c *check.C) {
                cfg, err := testLoader(c, yaml, nil).Load()
                c.Assert(err, check.IsNil)
                cc, err := cfg.GetCluster("z1111")
+               c.Assert(err, check.IsNil)
                _, hasSample := cc.InstanceTypes["SAMPLE"]
                c.Check(hasSample, check.Equals, false)
                if strings.Contains(yaml, "Foo") {
diff --git a/lib/controller/api/routable.go b/lib/controller/api/routable.go
new file mode 100644 (file)
index 0000000..6049cba
--- /dev/null
@@ -0,0 +1,17 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+// Package api provides types used by controller/server-component
+// packages.
+package api
+
+import "context"
+
+// A RoutableFunc calls an API method (sometimes via a wrapped
+// RoutableFunc) that has real argument types.
+//
+// (It is used by ctrlctx to manage database transactions, so moving
+// it to the router package would cause a circular dependency
+// router->arvadostest->ctrlctx->router.)
+type RoutableFunc func(ctx context.Context, opts interface{}) (interface{}, error)
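
For context, the router's wrapCalls hooks (see the router changes further below) are functions that take and return an api.RoutableFunc. The following sketch is illustrative only and is not part of this commit; the wrapper name and log field are invented, and it assumes the api and ctxlog packages shown elsewhere in this diff.

<pre>
// Illustrative only: a call wrapper with the api.RoutableFunc shape.
package example

import (
	"context"
	"time"

	"git.arvados.org/arvados.git/lib/controller/api"
	"git.arvados.org/arvados.git/sdk/go/ctxlog"
)

// logDuration returns a RoutableFunc that logs how long the wrapped
// API call took before returning its result unchanged. A wrapper like
// this can be passed to router.New as the wrapCalls argument, or
// composed with ctrlctx.WrapCallsInTransactions.
func logDuration(orig api.RoutableFunc) api.RoutableFunc {
	return func(ctx context.Context, opts interface{}) (interface{}, error) {
		t0 := time.Now()
		resp, err := orig(ctx, opts)
		ctxlog.FromContext(ctx).WithField("elapsed", time.Since(t0)).Debug("API call finished")
		return resp, err
	}
}
</pre>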
index 20edd90b95dad70791b21cf4f19bfce6b496bff4..8745f3b9730b068faa2cc9e8a02d4c5638c7164a 100755 (executable)
@@ -23,6 +23,7 @@ func (conn *Conn) generated_ContainerList(ctx context.Context, options arvados.L
        var needSort atomic.Value
        needSort.Store(false)
        err := conn.splitListRequest(ctx, options, func(ctx context.Context, _ string, backend arvados.API, options arvados.ListOptions) ([]string, error) {
+               options.ForwardedFor = conn.cluster.ClusterID + "-" + options.ForwardedFor
                cl, err := backend.ContainerList(ctx, options)
                if err != nil {
                        return nil, err
@@ -63,6 +64,7 @@ func (conn *Conn) generated_SpecimenList(ctx context.Context, options arvados.Li
        var needSort atomic.Value
        needSort.Store(false)
        err := conn.splitListRequest(ctx, options, func(ctx context.Context, _ string, backend arvados.API, options arvados.ListOptions) ([]string, error) {
+               options.ForwardedFor = conn.cluster.ClusterID + "-" + options.ForwardedFor
                cl, err := backend.SpecimenList(ctx, options)
                if err != nil {
                        return nil, err
@@ -103,6 +105,7 @@ func (conn *Conn) generated_UserList(ctx context.Context, options arvados.ListOp
        var needSort atomic.Value
        needSort.Store(false)
        err := conn.splitListRequest(ctx, options, func(ctx context.Context, _ string, backend arvados.API, options arvados.ListOptions) ([]string, error) {
+               options.ForwardedFor = conn.cluster.ClusterID + "-" + options.ForwardedFor
                cl, err := backend.UserList(ctx, options)
                if err != nil {
                        return nil, err
index 0a596eb9cb6ac6aac690dc19a2e43ca3dc723340..bc6d3e00a493361b4d9897aeab77e3abf5cced27 100644 (file)
@@ -27,6 +27,7 @@ func (conn *Conn) generated_CollectionList(ctx context.Context, options arvados.
        var needSort atomic.Value
        needSort.Store(false)
        err := conn.splitListRequest(ctx, options, func(ctx context.Context, _ string, backend arvados.API, options arvados.ListOptions) ([]string, error) {
+               options.ForwardedFor = conn.cluster.ClusterID + "-" + options.ForwardedFor
                cl, err := backend.CollectionList(ctx, options)
                if err != nil {
                        return nil, err
@@ -107,7 +108,7 @@ func (conn *Conn) generated_CollectionList(ctx context.Context, options arvados.
 // backend.
 func (conn *Conn) splitListRequest(ctx context.Context, opts arvados.ListOptions, fn func(context.Context, string, arvados.API, arvados.ListOptions) ([]string, error)) error {
 
-       if opts.BypassFederation {
+       if opts.BypassFederation || opts.ForwardedFor != "" {
                // Client requested no federation.  Pass through.
                _, err := fn(ctx, conn.cluster.ClusterID, conn.local, opts)
                return err
@@ -249,7 +250,7 @@ func (conn *Conn) splitListRequest(ctx context.Context, opts arvados.ListOptions
 
                                done, err := fn(ctx, clusterID, backend, remoteOpts)
                                if err != nil {
-                                       errs <- httpErrorf(http.StatusBadGateway, err.Error())
+                                       errs <- httpErrorf(http.StatusBadGateway, "%s", err.Error())
                                        return
                                }
                                progress := false
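
To make the intent of the two changes above concrete: a controller that fans a list request out to a remote cluster now stamps ForwardedFor with its own cluster ID, and a controller that receives a request with a non-empty ForwardedFor answers from its local backend instead of federating again, so a chain of federated clusters cannot loop. A minimal sketch follows (illustrative only, not part of this commit; the function name is invented and the SDK types are assumed from the hunks above).

<pre>
// Illustrative only. Assumes the arvados SDK types used above.
package example

import (
	"context"

	"git.arvados.org/arvados.git/sdk/go/arvados"
)

// listOnRemote marks the request as already-forwarded before sending
// it to a remote cluster, so the remote controller serves it locally.
func listOnRemote(ctx context.Context, localClusterID string, backend arvados.API, opts arvados.ListOptions) (arvados.CollectionList, error) {
	opts.ForwardedFor = localClusterID + "-" + opts.ForwardedFor
	return backend.CollectionList(ctx, opts)
}
</pre>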
index cc06246420559479203e24843164cee281e07633..2dd1d816e060a752fb8e71d4eeaacc5d0b3cfb9b 100644 (file)
@@ -6,7 +6,6 @@ package controller
 
 import (
        "context"
-       "database/sql"
        "errors"
        "fmt"
        "net/http"
@@ -16,13 +15,14 @@ import (
        "time"
 
        "git.arvados.org/arvados.git/lib/controller/federation"
-       "git.arvados.org/arvados.git/lib/controller/localdb"
        "git.arvados.org/arvados.git/lib/controller/railsproxy"
        "git.arvados.org/arvados.git/lib/controller/router"
+       "git.arvados.org/arvados.git/lib/ctrlctx"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "git.arvados.org/arvados.git/sdk/go/health"
        "git.arvados.org/arvados.git/sdk/go/httpserver"
+       "github.com/jmoiron/sqlx"
        _ "github.com/lib/pq"
 )
 
@@ -34,7 +34,7 @@ type Handler struct {
        proxy          *proxy
        secureClient   *http.Client
        insecureClient *http.Client
-       pgdb           *sql.DB
+       pgdb           *sqlx.DB
        pgdbMtx        sync.Mutex
 }
 
@@ -87,7 +87,7 @@ func (h *Handler) setup() {
                Routes: health.Routes{"ping": func() error { _, err := h.db(context.TODO()); return err }},
        })
 
-       rtr := router.New(federation.New(h.Cluster), localdb.WrapCallsInTransactions(h.db))
+       rtr := router.New(federation.New(h.Cluster), ctrlctx.WrapCallsInTransactions(h.db))
        mux.Handle("/arvados/v1/config", rtr)
        mux.Handle("/"+arvados.EndpointUserAuthenticate.Path, rtr)
 
@@ -121,14 +121,14 @@ func (h *Handler) setup() {
 
 var errDBConnection = errors.New("database connection error")
 
-func (h *Handler) db(ctx context.Context) (*sql.DB, error) {
+func (h *Handler) db(ctx context.Context) (*sqlx.DB, error) {
        h.pgdbMtx.Lock()
        defer h.pgdbMtx.Unlock()
        if h.pgdb != nil {
                return h.pgdb, nil
        }
 
-       db, err := sql.Open("postgres", h.Cluster.PostgreSQL.Connection.String())
+       db, err := sqlx.Open("postgres", h.Cluster.PostgreSQL.Connection.String())
        if err != nil {
                ctxlog.FromContext(ctx).WithError(err).Error("postgresql connect failed")
                return nil, errDBConnection
@@ -137,7 +137,7 @@ func (h *Handler) db(ctx context.Context) (*sql.DB, error) {
                db.SetMaxOpenConns(p)
        }
        if err := db.Ping(); err != nil {
-               ctxlog.FromContext(ctx).WithError(err).Error("postgresql connect scuceeded but ping failed")
+               ctxlog.FromContext(ctx).WithError(err).Error("postgresql connect succeeded but ping failed")
                return nil, errDBConnection
        }
        h.pgdb = db
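
The switch from database/sql to sqlx in this handler is low-risk because sqlx.DB embeds *sql.DB, so existing callers keep working while new code can use the sqlx extensions (Beginx, QueryRowxContext) introduced elsewhere in this commit. A rough sketch, with a placeholder connection string and pool size rather than values from this commit:

<pre>
// Illustrative only: sqlx.DB keeps the database/sql surface.
package example

import (
	"github.com/jmoiron/sqlx"
	_ "github.com/lib/pq"
)

func openDB(dsn string) (*sqlx.DB, error) {
	db, err := sqlx.Open("postgres", dsn)
	if err != nil {
		return nil, err
	}
	db.SetMaxOpenConns(4) // same method as on *sql.DB
	return db, db.Ping()  // plain database/sql calls still apply
}
</pre>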
index 3bf64771d70b30d08d6c53312384071bef14a259..a73f5f9f828574b1c234932432a2a4b63c769087 100644 (file)
@@ -300,6 +300,7 @@ func (s *IntegrationSuite) TestCreateContainerRequestWithFedToken(c *check.C) {
        resp, err = arvados.InsecureHTTPClient.Do(req)
        if c.Check(err, check.IsNil) {
                err = json.NewDecoder(resp.Body).Decode(&cr)
+               c.Check(err, check.IsNil)
                c.Check(cr.UUID, check.Matches, "z2222-.*")
        }
 }
index 60263455bdb1d02c10a9164c7c235d22a0f90fb7..4f0035edf993ad525c4d82b8d5e880049432c6c2 100644 (file)
@@ -22,11 +22,13 @@ type Conn struct {
 
 func NewConn(cluster *arvados.Cluster) *Conn {
        railsProxy := railsproxy.NewConn(cluster)
-       return &Conn{
+       var conn Conn
+       conn = Conn{
                cluster:         cluster,
                railsProxy:      railsProxy,
                loginController: chooseLoginController(cluster, railsProxy),
        }
+       return &conn
 }
 
 func (conn *Conn) Logout(ctx context.Context, opts arvados.LogoutOptions) (arvados.LogoutResponse, error) {
index 1cd349a10eaa94d987899ac1315f811ffbf186e1..ee1ea56924c5700d25e43262347d1045d534ca5c 100644 (file)
@@ -15,6 +15,7 @@ import (
        "strings"
 
        "git.arvados.org/arvados.git/lib/controller/rpc"
+       "git.arvados.org/arvados.git/lib/ctrlctx"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/auth"
        "git.arvados.org/arvados.git/sdk/go/httpserver"
@@ -117,7 +118,7 @@ func createAPIClientAuthorization(ctx context.Context, conn *rpc.Conn, rootToken
                return
        }
        token := target.Query().Get("api_token")
-       tx, err := currenttx(ctx)
+       tx, err := ctrlctx.CurrentTx(ctx)
        if err != nil {
                return
        }
@@ -130,7 +131,7 @@ func createAPIClientAuthorization(ctx context.Context, conn *rpc.Conn, rootToken
        }
        var exp sql.NullString
        var scopes []byte
-       err = tx.QueryRowContext(ctx, "select uuid, api_token, expires_at, scopes from api_client_authorizations where api_token=$1", tokensecret).Scan(&resp.UUID, &resp.APIToken, &exp, &scopes)
+       err = tx.QueryRowxContext(ctx, "select uuid, api_token, expires_at, scopes from api_client_authorizations where api_token=$1", tokensecret).Scan(&resp.UUID, &resp.APIToken, &exp, &scopes)
        if err != nil {
                return
        }
index 64ae58bce2681f792020b1855c2465d5ad226ae1..700d757c274d707c703ad0c58dbac812440a45a6 100644 (file)
@@ -6,18 +6,19 @@ package localdb
 
 import (
        "context"
-       "database/sql"
        "encoding/json"
        "net"
        "net/http"
 
        "git.arvados.org/arvados.git/lib/config"
        "git.arvados.org/arvados.git/lib/controller/railsproxy"
+       "git.arvados.org/arvados.git/lib/ctrlctx"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/arvadostest"
        "git.arvados.org/arvados.git/sdk/go/auth"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "github.com/bradleypeabody/godap"
+       "github.com/jmoiron/sqlx"
        check "gopkg.in/check.v1"
 )
 
@@ -27,11 +28,11 @@ type LDAPSuite struct {
        cluster *arvados.Cluster
        ctrl    *ldapLoginController
        ldap    *godap.LDAPServer // fake ldap server that accepts auth goodusername/goodpassword
-       db      *sql.DB
+       db      *sqlx.DB
 
        // transaction context
        ctx      context.Context
-       rollback func()
+       rollback func() error
 }
 
 func (s *LDAPSuite) TearDownSuite(c *check.C) {
@@ -48,6 +49,7 @@ func (s *LDAPSuite) SetUpSuite(c *check.C) {
        c.Assert(err, check.IsNil)
 
        ln, err := net.Listen("tcp", "127.0.0.1:0")
+       c.Assert(err, check.IsNil)
        s.ldap = &godap.LDAPServer{
                Listener: ln,
                Handlers: []godap.LDAPRequestHandler{
@@ -91,15 +93,20 @@ func (s *LDAPSuite) SetUpSuite(c *check.C) {
                Cluster:    s.cluster,
                RailsProxy: railsproxy.NewConn(s.cluster),
        }
-       s.db = testdb(c, s.cluster)
+       s.db = arvadostest.DB(c, s.cluster)
 }
 
 func (s *LDAPSuite) SetUpTest(c *check.C) {
-       s.ctx, s.rollback = testctx(c, s.db)
+       tx, err := s.db.Beginx()
+       c.Assert(err, check.IsNil)
+       s.ctx = ctrlctx.NewWithTransaction(context.Background(), tx)
+       s.rollback = tx.Rollback
 }
 
 func (s *LDAPSuite) TearDownTest(c *check.C) {
-       s.rollback()
+       if s.rollback != nil {
+               s.rollback()
+       }
 }
 
 func (s *LDAPSuite) TestLoginSuccess(c *check.C) {
index 29c81ac5cae9ac63431e691852230a00c2335afe..2944524344e9028fa22cf0c9d18327cb39193733 100644 (file)
@@ -10,6 +10,7 @@ import (
        "net/http"
        "strings"
 
+       "git.arvados.org/arvados.git/lib/controller/api"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/auth"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
@@ -21,7 +22,7 @@ import (
 type router struct {
        mux       *mux.Router
        backend   arvados.API
-       wrapCalls func(RoutableFunc) RoutableFunc
+       wrapCalls func(api.RoutableFunc) api.RoutableFunc
 }
 
 // New returns a new router (which implements the http.Handler
@@ -32,7 +33,7 @@ type router struct {
 // the returned method is used in its place. This can be used to
 // install hooks before and after each API call and alter responses;
 // see localdb.WrapCallsInTransaction for an example.
-func New(backend arvados.API, wrapCalls func(RoutableFunc) RoutableFunc) *router {
+func New(backend arvados.API, wrapCalls func(api.RoutableFunc) api.RoutableFunc) *router {
        rtr := &router{
                mux:       mux.NewRouter(),
                backend:   backend,
@@ -42,13 +43,11 @@ func New(backend arvados.API, wrapCalls func(RoutableFunc) RoutableFunc) *router
        return rtr
 }
 
-type RoutableFunc func(ctx context.Context, opts interface{}) (interface{}, error)
-
 func (rtr *router) addRoutes() {
        for _, route := range []struct {
                endpoint    arvados.APIEndpoint
                defaultOpts func() interface{}
-               exec        RoutableFunc
+               exec        api.RoutableFunc
        }{
                {
                        arvados.EndpointConfigGet,
@@ -340,7 +339,7 @@ var altMethod = map[string]string{
        "GET":   "HEAD", // Accept HEAD at any GET route
 }
 
-func (rtr *router) addRoute(endpoint arvados.APIEndpoint, defaultOpts func() interface{}, exec RoutableFunc) {
+func (rtr *router) addRoute(endpoint arvados.APIEndpoint, defaultOpts func() interface{}, exec api.RoutableFunc) {
        methods := []string{endpoint.Method}
        if alt, ok := altMethod[endpoint.Method]; ok {
                methods = append(methods, alt)
index c73bc64915f12aff293f23c803e25771669fe8a9..18fff7c9cc4f5d4a10f347c3da55ab79ca1bf38d 100644 (file)
@@ -273,7 +273,7 @@ func (s *RouterIntegrationSuite) TestContainerLock(c *check.C) {
        c.Check(rr.Code, check.Equals, http.StatusOK)
        c.Check(jresp["uuid"], check.HasLen, 27)
        c.Check(jresp["state"], check.Equals, "Locked")
-       _, rr, jresp = doRequest(c, s.rtr, token, "POST", "/arvados/v1/containers/"+uuid+"/lock", nil, nil)
+       _, rr, _ = doRequest(c, s.rtr, token, "POST", "/arvados/v1/containers/"+uuid+"/lock", nil, nil)
        c.Check(rr.Code, check.Equals, http.StatusUnprocessableEntity)
        c.Check(rr.Body.String(), check.Not(check.Matches), `.*"uuid":.*`)
        _, rr, jresp = doRequest(c, s.rtr, token, "POST", "/arvados/v1/containers/"+uuid+"/unlock", nil, nil)
index b97c0f87b85f6e4f8c2c0ee798256bde9fced23c..f43cc1ddee295d506854fc97447c0cfe46d868ab 100644 (file)
@@ -81,10 +81,12 @@ func (s *RPCSuite) TestSpecimenCRUD(c *check.C) {
        c.Check(sp.Properties["foo"], check.Equals, "bar")
 
        spGet, err := s.conn.SpecimenGet(s.ctx, arvados.GetOptions{UUID: sp.UUID})
+       c.Check(err, check.IsNil)
        c.Check(spGet.UUID, check.Equals, sp.UUID)
        c.Check(spGet.Properties["foo"], check.Equals, "bar")
 
        spList, err := s.conn.SpecimenList(s.ctx, arvados.ListOptions{Limit: -1, Filters: []arvados.Filter{{"uuid", "=", sp.UUID}}})
+       c.Check(err, check.IsNil)
        c.Check(spList.ItemsAvailable, check.Equals, 1)
        c.Assert(spList.Items, check.HasLen, 1)
        c.Check(spList.Items[0].UUID, check.Equals, sp.UUID)
@@ -92,9 +94,11 @@ func (s *RPCSuite) TestSpecimenCRUD(c *check.C) {
 
        anonCtx := context.WithValue(context.Background(), contextKeyTestTokens, []string{arvadostest.AnonymousToken})
        spList, err = s.conn.SpecimenList(anonCtx, arvados.ListOptions{Limit: -1, Filters: []arvados.Filter{{"uuid", "=", sp.UUID}}})
+       c.Check(err, check.IsNil)
        c.Check(spList.ItemsAvailable, check.Equals, 0)
        c.Check(spList.Items, check.HasLen, 0)
 
        spDel, err := s.conn.SpecimenDelete(s.ctx, arvados.DeleteOptions{UUID: sp.UUID})
+       c.Check(err, check.IsNil)
        c.Check(spDel.UUID, check.Equals, sp.UUID)
 }
similarity index 62%
rename from lib/controller/localdb/db.go
rename to lib/ctrlctx/db.go
index 4f64e63524469cc9e9fb987a4570772eb445fd8b..127be489df3a27e553f6aa421a6f1c40cdbdcc55 100644 (file)
@@ -2,16 +2,22 @@
 //
 // SPDX-License-Identifier: AGPL-3.0
 
-package localdb
+package ctrlctx
 
 import (
        "context"
-       "database/sql"
        "errors"
        "sync"
 
-       "git.arvados.org/arvados.git/lib/controller/router"
+       "git.arvados.org/arvados.git/lib/controller/api"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "github.com/jmoiron/sqlx"
+       _ "github.com/lib/pq"
+)
+
+var (
+       ErrNoTransaction   = errors.New("bug: there is no transaction in this context")
+       ErrContextFinished = errors.New("refusing to start a transaction after wrapped function already returned")
 )
 
 // WrapCallsInTransactions returns a call wrapper (suitable for
@@ -20,20 +26,20 @@ import (
 //
 // The wrapper calls getdb() to get a database handle before each API
 // call.
-func WrapCallsInTransactions(getdb func(context.Context) (*sql.DB, error)) func(router.RoutableFunc) router.RoutableFunc {
-       return func(origFunc router.RoutableFunc) router.RoutableFunc {
+func WrapCallsInTransactions(getdb func(context.Context) (*sqlx.DB, error)) func(api.RoutableFunc) api.RoutableFunc {
+       return func(origFunc api.RoutableFunc) api.RoutableFunc {
                return func(ctx context.Context, opts interface{}) (_ interface{}, err error) {
-                       ctx, finishtx := starttx(ctx, getdb)
+                       ctx, finishtx := New(ctx, getdb)
                        defer finishtx(&err)
                        return origFunc(ctx, opts)
                }
        }
 }
 
-// ContextWithTransaction returns a child context in which the given
+// NewWithTransaction returns a child context in which the given
 // transaction will be used by any localdb API call that needs one.
 // The caller is responsible for calling Commit or Rollback on tx.
-func ContextWithTransaction(ctx context.Context, tx *sql.Tx) context.Context {
+func NewWithTransaction(ctx context.Context, tx *sqlx.Tx) context.Context {
        txn := &transaction{tx: tx}
        txn.setup.Do(func() {})
        return context.WithValue(ctx, contextKeyTransaction, txn)
@@ -44,26 +50,26 @@ type contextKeyT string
 var contextKeyTransaction = contextKeyT("transaction")
 
 type transaction struct {
-       tx    *sql.Tx
+       tx    *sqlx.Tx
        err   error
-       getdb func(context.Context) (*sql.DB, error)
+       getdb func(context.Context) (*sqlx.DB, error)
        setup sync.Once
 }
 
-type transactionFinishFunc func(*error)
+type finishFunc func(*error)
 
-// starttx returns a new child context that can be used with
-// currenttx(). It does not open a database transaction until the
-// first call to currenttx().
+// New returns a new child context that can be used with
+// CurrentTx(). It does not open a database transaction until the
+// first call to CurrentTx().
 //
 // The caller must eventually call the returned finishtx() func to
 // commit or rollback the transaction, if any.
 //
 //     func example(ctx context.Context) (err error) {
-//             ctx, finishtx := starttx(ctx, dber)
+//             ctx, finishtx := New(ctx, dber)
 //             defer finishtx(&err)
 //             // ...
-//             tx, err := currenttx(ctx)
+//             tx, err := CurrentTx(ctx)
 //             if err != nil {
 //                     return fmt.Errorf("example: %s", err)
 //             }
@@ -75,17 +81,17 @@ type transactionFinishFunc func(*error)
 //
 // If *err is non-nil, finishtx() rolls back the transaction, and
 // does not modify *err.
-func starttx(ctx context.Context, getdb func(context.Context) (*sql.DB, error)) (context.Context, transactionFinishFunc) {
+func New(ctx context.Context, getdb func(context.Context) (*sqlx.DB, error)) (context.Context, finishFunc) {
        txn := &transaction{getdb: getdb}
        return context.WithValue(ctx, contextKeyTransaction, txn), func(err *error) {
                txn.setup.Do(func() {
                        // Using (*sync.Once)Do() prevents a future
-                       // call to currenttx() from opening a
+                       // call to CurrentTx() from opening a
                        // transaction which would never get committed
-                       // or rolled back. If currenttx() hasn't been
+                       // or rolled back. If CurrentTx() hasn't been
                        // called before now, future calls will return
                        // this error.
-                       txn.err = errors.New("refusing to start a transaction after wrapped function already returned")
+                       txn.err = ErrContextFinished
                })
                if txn.tx == nil {
                        // we never [successfully] started a transaction
@@ -100,16 +106,16 @@ func starttx(ctx context.Context, getdb func(context.Context) (*sql.DB, error))
        }
 }
 
-func currenttx(ctx context.Context) (*sql.Tx, error) {
+func CurrentTx(ctx context.Context) (*sqlx.Tx, error) {
        txn, ok := ctx.Value(contextKeyTransaction).(*transaction)
        if !ok {
-               return nil, errors.New("bug: there is no transaction in this context")
+               return nil, ErrNoTransaction
        }
        txn.setup.Do(func() {
                if db, err := txn.getdb(ctx); err != nil {
                        txn.err = err
                } else {
-                       txn.tx, txn.err = db.Begin()
+                       txn.tx, txn.err = db.Beginx()
                }
        })
        return txn.tx, txn.err
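
Taken together, the exported names above give callers the pattern sketched below (illustrative only, not part of this commit; the handler, query, and table name are made up): the transaction is opened lazily on the first CurrentTx call, committed by the wrapper when the handler returns a nil error, and rolled back otherwise.

<pre>
// Illustrative only. Assumes the ctrlctx API defined above.
package example

import (
	"context"

	"git.arvados.org/arvados.git/lib/controller/api"
	"git.arvados.org/arvados.git/lib/ctrlctx"
	"github.com/jmoiron/sqlx"
)

// countUsers runs inside the transaction managed by the wrapper.
func countUsers(ctx context.Context, opts interface{}) (interface{}, error) {
	tx, err := ctrlctx.CurrentTx(ctx) // opens the transaction on first use
	if err != nil {
		return nil, err
	}
	var n int
	err = tx.QueryRowxContext(ctx, "select count(*) from users").Scan(&n)
	return n, err
}

// wire wraps the handler so each call gets its own transaction.
func wire(getdb func(context.Context) (*sqlx.DB, error)) api.RoutableFunc {
	return ctrlctx.WrapCallsInTransactions(getdb)(countUsers)
}
</pre>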
similarity index 62%
rename from lib/controller/localdb/db_test.go
rename to lib/ctrlctx/db_test.go
index 5bab86c60289e688475efa98e6be9061936a800a..5361f13c68a4967168082b28f16ab562fce546ee 100644 (file)
@@ -2,37 +2,24 @@
 //
 // SPDX-License-Identifier: AGPL-3.0
 
-package localdb
+package ctrlctx
 
 import (
        "context"
-       "database/sql"
        "sync"
        "sync/atomic"
+       "testing"
 
        "git.arvados.org/arvados.git/lib/config"
-       "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "github.com/jmoiron/sqlx"
        _ "github.com/lib/pq"
        check "gopkg.in/check.v1"
 )
 
-// testdb returns a DB connection for the given cluster config.
-func testdb(c *check.C, cluster *arvados.Cluster) *sql.DB {
-       db, err := sql.Open("postgres", cluster.PostgreSQL.Connection.String())
-       c.Assert(err, check.IsNil)
-       return db
-}
-
-// testctx returns a context suitable for running a test case in a new
-// transaction, and a rollback func which the caller should call after
-// the test.
-func testctx(c *check.C, db *sql.DB) (ctx context.Context, rollback func()) {
-       tx, err := db.Begin()
-       c.Assert(err, check.IsNil)
-       return ContextWithTransaction(context.Background(), tx), func() {
-               c.Check(tx.Rollback(), check.IsNil)
-       }
+// Gocheck boilerplate
+func Test(t *testing.T) {
+       check.TestingT(t)
 }
 
 var _ = check.Suite(&DatabaseSuite{})
@@ -46,26 +33,28 @@ func (*DatabaseSuite) TestTransactionContext(c *check.C) {
        c.Assert(err, check.IsNil)
 
        var getterCalled int64
-       getter := func(context.Context) (*sql.DB, error) {
+       getter := func(context.Context) (*sqlx.DB, error) {
                atomic.AddInt64(&getterCalled, 1)
-               return testdb(c, cluster), nil
+               db, err := sqlx.Open("postgres", cluster.PostgreSQL.Connection.String())
+               c.Assert(err, check.IsNil)
+               return db, nil
        }
        wrapper := WrapCallsInTransactions(getter)
        wrappedFunc := wrapper(func(ctx context.Context, opts interface{}) (interface{}, error) {
-               txes := make([]*sql.Tx, 20)
+               txes := make([]*sqlx.Tx, 20)
                var wg sync.WaitGroup
                for i := range txes {
                        i := i
                        wg.Add(1)
                        go func() {
-                               // Concurrent calls to currenttx(),
+                               // Concurrent calls to CurrentTx(),
                                // with different children of the same
                                // parent context, will all return the
                                // same transaction.
                                defer wg.Done()
                                ctx, cancel := context.WithCancel(ctx)
                                defer cancel()
-                               tx, err := currenttx(ctx)
+                               tx, err := CurrentTx(ctx)
                                c.Check(err, check.IsNil)
                                txes[i] = tx
                        }()
@@ -82,8 +71,8 @@ func (*DatabaseSuite) TestTransactionContext(c *check.C) {
        c.Check(err, check.IsNil)
        c.Check(getterCalled, check.Equals, int64(1))
 
-       // When a wrapped func returns without calling currenttx(),
-       // calling currenttx() later shouldn't start a new
+       // When a wrapped func returns without calling CurrentTx(),
+       // calling CurrentTx() later shouldn't start a new
        // transaction.
        var savedctx context.Context
        ok, err = wrapper(func(ctx context.Context, opts interface{}) (interface{}, error) {
@@ -92,7 +81,7 @@ func (*DatabaseSuite) TestTransactionContext(c *check.C) {
        })(context.Background(), "blah")
        c.Check(ok, check.Equals, true)
        c.Check(err, check.IsNil)
-       tx, err := currenttx(savedctx)
+       tx, err := CurrentTx(savedctx)
        c.Check(tx, check.IsNil)
        c.Check(err, check.NotNil)
 }
diff --git a/lib/deduplicationreport/command.go b/lib/deduplicationreport/command.go
new file mode 100644 (file)
index 0000000..1199bc0
--- /dev/null
@@ -0,0 +1,43 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package deduplicationreport
+
+import (
+       "io"
+
+       "git.arvados.org/arvados.git/lib/config"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "github.com/sirupsen/logrus"
+)
+
+var Command command
+
+type command struct{}
+
+type NoPrefixFormatter struct{}
+
+func (f *NoPrefixFormatter) Format(entry *logrus.Entry) ([]byte, error) {
+       return []byte(entry.Message), nil
+}
+
+// RunCommand implements the subcommand "deduplication-report <collection> <collection> ..."
+func (command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+       var err error
+       logger := ctxlog.New(stderr, "text", "info")
+       defer func() {
+               if err != nil {
+                       logger.WithError(err).Error("fatal")
+               }
+       }()
+
+       logger.SetFormatter(new(NoPrefixFormatter))
+
+       loader := config.NewLoader(stdin, logger)
+       loader.SkipLegacy = true
+
+       exitcode := report(prog, args, loader, logger, stdout, stderr)
+
+       return exitcode
+}
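
For orientation, here is one way the Command value above could be wired into a multi-command CLI binary. The subcommand map, binary name, and use of lib/cmd's Multi helper are assumptions for illustration, not a copy of the real arvados-client wiring.

package main

import (
	"os"

	"git.arvados.org/arvados.git/lib/cmd"
	"git.arvados.org/arvados.git/lib/deduplicationreport"
)

// handler dispatches "deduplication-report ..." to the package's Command value.
var handler = cmd.Multi(map[string]cmd.Handler{
	"deduplication-report": deduplicationreport.Command, // hypothetical registration
})

func main() {
	os.Exit(handler.RunCommand(os.Args[0], os.Args[1:], os.Stdin, os.Stdout, os.Stderr))
}
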
diff --git a/lib/deduplicationreport/report.go b/lib/deduplicationreport/report.go
new file mode 100644 (file)
index 0000000..8bb3fc4
--- /dev/null
@@ -0,0 +1,216 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package deduplicationreport
+
+import (
+       "flag"
+       "fmt"
+       "io"
+       "strings"
+
+       "git.arvados.org/arvados.git/lib/config"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/arvadosclient"
+       "git.arvados.org/arvados.git/sdk/go/manifest"
+
+       "github.com/dustin/go-humanize"
+       "github.com/sirupsen/logrus"
+)
+
+func deDuplicate(inputs []string) (trimmed []string) {
+       seen := make(map[string]bool)
+       for _, uuid := range inputs {
+               if !seen[uuid] {
+                       seen[uuid] = true
+                       trimmed = append(trimmed, uuid)
+               }
+       }
+       return
+}
+
+func parseFlags(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stderr io.Writer) (exitcode int, inputs []string) {
+       flags := flag.NewFlagSet("", flag.ContinueOnError)
+       flags.SetOutput(stderr)
+       flags.Usage = func() {
+               fmt.Fprintf(flags.Output(), `
+Usage:
+  %s [options ...] <collection-uuid> <collection-uuid> ...
+
+  %s [options ...] <collection-pdh>,<collection-uuid> \
+     <collection-pdh>,<collection-uuid> ...
+
+  This program analyzes the overlap in blocks used by 2 or more collections. It
+  prints a deduplication report that shows the nominal space used by the
+  collections, as well as the actual size and the amount of space that is saved
+  by Keep's deduplication.
+
+  The list of collections may be provided in two ways. A list of collection
+  uuids is sufficient. Alternatively, the PDH for each collection may also be
+  provided. This will greatly speed up operation when the list contains
+  multiple collections with the same PDH.
+
+  Exit status will be zero if there were no errors generating the report.
+
+Example:
+
+  Use the 'arv' and 'jq' commands to get the list of the 100
+  largest collections and generate the deduplication report:
+
+  arv collection list --order 'file_size_total desc' --limit 100 | \
+    jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \
+    tail -n+2 |sed -e 's/"//g'|tr '\n' ' ' | \
+    xargs %s
+
+Options:
+`, prog, prog, prog)
+               flags.PrintDefaults()
+       }
+       loader.SetupFlags(flags)
+       loglevel := flags.String("log-level", "info", "logging level (debug, info, ...)")
+       err := flags.Parse(args)
+       if err == flag.ErrHelp {
+               return 0, inputs
+       } else if err != nil {
+               return 2, inputs
+       }
+
+       inputs = flags.Args()
+
+       inputs = deDuplicate(inputs)
+
+       if len(inputs) < 1 {
+               logger.Errorf("Error: no collections provided")
+               flags.Usage()
+               return 2, inputs
+       }
+
+       lvl, err := logrus.ParseLevel(*loglevel)
+       if err != nil {
+               return 2, inputs
+       }
+       logger.SetLevel(lvl)
+       return
+}
+
+func blockList(collection arvados.Collection) (blocks map[string]int) {
+       blocks = make(map[string]int)
+       m := manifest.Manifest{Text: collection.ManifestText}
+       blockChannel := m.BlockIterWithDuplicates()
+       for b := range blockChannel {
+               blocks[b.Digest.String()] = b.Size
+       }
+       return
+}
+
+func report(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stdout, stderr io.Writer) (exitcode int) {
+
+       var inputs []string
+       exitcode, inputs = parseFlags(prog, args, loader, logger, stderr)
+       if exitcode != 0 {
+               return
+       }
+
+       // Arvados Client setup
+       arv, err := arvadosclient.MakeArvadosClient()
+       if err != nil {
+               logger.Errorf("Error creating Arvados object: %s", err)
+               exitcode = 1
+               return
+       }
+
+       type Col struct {
+               FileSizeTotal int64
+               FileCount     int64
+       }
+
+       blocks := make(map[string]map[string]int)
+       pdhs := make(map[string]Col)
+       var nominalSize int64
+
+       for _, input := range inputs {
+               var uuid string
+               var pdh string
+               if strings.Contains(input, ",") {
+                       // The input is in the format pdh,uuid. The PDH lets us avoid refetching collections that share it.
+                       tmp := strings.Split(input, ",")
+                       pdh = tmp[0]
+                       uuid = tmp[1]
+               } else {
+                       // The input must be a plain uuid
+                       uuid = input
+               }
+               if !strings.Contains(uuid, "-4zz18-") {
+                       logger.Errorf("Error: uuid must refer to a collection object")
+                       exitcode = 1
+                       return
+               }
+               if _, ok := pdhs[pdh]; ok {
+                       // We've processed a collection with this pdh already. Simply add its
+                       // size to the totals and move on to the next one.
+                       // Note that we trust that the provided PDH matches the collection UUID;
+                       // in other words, we rely on the PDH instead of looking the collection up
+                       // by UUID. If they don't match, the report will be wrong.
+                       nominalSize += pdhs[pdh].FileSizeTotal
+               } else {
+                       var collection arvados.Collection
+                       err = arv.Get("collections", uuid, nil, &collection)
+                       if err != nil {
+                               logger.Errorf("Error: unable to retrieve collection: %s", err)
+                               exitcode = 1
+                               return
+                       }
+                       blocks[uuid] = blockList(collection)
+                       if pdh != "" && collection.PortableDataHash != pdh {
+                               logger.Errorf("Error: the collection with UUID %s has PDH %s, but a different PDH was provided in the arguments: %s", uuid, collection.PortableDataHash, pdh)
+                               exitcode = 1
+                               return
+                       }
+                       if pdh == "" {
+                               pdh = collection.PortableDataHash
+                       }
+
+                       col := Col{}
+                       if collection.FileSizeTotal != 0 || collection.FileCount != 0 {
+                               nominalSize += collection.FileSizeTotal
+                               col.FileSizeTotal = collection.FileSizeTotal
+                               col.FileCount = int64(collection.FileCount)
+                       } else {
+                               // Collections created with old Arvados versions do not always have the total file size and count cached in the collections object
+                               var collSize int64
+                               for _, size := range blocks[uuid] {
+                                       collSize += int64(size)
+                               }
+                               nominalSize += collSize
+                               col.FileSizeTotal = collSize
+                       }
+                       pdhs[pdh] = col
+               }
+
+               if pdhs[pdh].FileCount != 0 {
+                       fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s); file count %d\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)), pdhs[pdh].FileCount)
+               } else {
+                       fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s)\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)))
+               }
+       }
+
+       var totalSize int64
+       seen := make(map[string]bool)
+       for _, v := range blocks {
+               for digest, size := range v {
+                       if !seen[digest] {
+                               seen[digest] = true
+                               totalSize += int64(size)
+                       }
+               }
+       }
+       fmt.Fprintln(stdout)
+       fmt.Fprintf(stdout, "Collections:                 %15d\n", len(inputs))
+       fmt.Fprintf(stdout, "Nominal size of stored data: %15d bytes (%s)\n", nominalSize, humanize.IBytes(uint64(nominalSize)))
+       fmt.Fprintf(stdout, "Actual size of stored data:  %15d bytes (%s)\n", totalSize, humanize.IBytes(uint64(totalSize)))
+       fmt.Fprintf(stdout, "Saved by Keep deduplication: %15d bytes (%s)\n", nominalSize-totalSize, humanize.IBytes(uint64(nominalSize-totalSize)))
+
+       return exitcode
+}
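
The size accounting in report() reduces to two sums: nominal size adds up each collection's data (here approximated by its block list), while actual size counts each distinct block digest only once across all collections; the difference is what Keep's deduplication saves. A standalone sketch of that arithmetic, with made-up collection UUIDs and block digests:

package main

import "fmt"

func main() {
	// Two hypothetical collections sharing one 4-byte block.
	collections := map[string]map[string]int{
		"zzzzz-4zz18-aaaaaaaaaaaaaaa": {"d3b07384d113edec49eaa6238ad5ff00+4": 4},
		"zzzzz-4zz18-bbbbbbbbbbbbbbb": {
			"d3b07384d113edec49eaa6238ad5ff00+4": 4,
			"c157a79031e1c40f85931829bc5fc552+4": 4,
		},
	}
	var nominal, actual int64
	seen := map[string]bool{}
	for _, blocks := range collections {
		for digest, size := range blocks {
			nominal += int64(size) // every reference counts toward nominal size
			if !seen[digest] {
				seen[digest] = true
				actual += int64(size) // each distinct block counts once toward actual size
			}
		}
	}
	fmt.Printf("nominal %d, actual %d, saved %d\n", nominal, actual, nominal-actual)
	// Prints: nominal 12, actual 8, saved 4 -- the same figures as the overlapping-collections test below.
}
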
diff --git a/lib/deduplicationreport/report_test.go b/lib/deduplicationreport/report_test.go
new file mode 100644 (file)
index 0000000..a4ed466
--- /dev/null
@@ -0,0 +1,119 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package deduplicationreport
+
+import (
+       "bytes"
+       "testing"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/arvadostest"
+       "gopkg.in/check.v1"
+)
+
+func Test(t *testing.T) {
+       check.TestingT(t)
+}
+
+var _ = check.Suite(&Suite{})
+
+type Suite struct{}
+
+func (s *Suite) TearDownSuite(c *check.C) {
+       // Undo any changes/additions to the database so they don't affect subsequent tests.
+       arvadostest.ResetEnv()
+}
+
+func (*Suite) TestUsage(c *check.C) {
+       var stdout, stderr bytes.Buffer
+       exitcode := Command.RunCommand("deduplicationreport.test", []string{"-log-level=debug"}, &bytes.Buffer{}, &stdout, &stderr)
+       c.Check(exitcode, check.Equals, 2)
+       c.Check(stdout.String(), check.Equals, "")
+       c.Log(stderr.String())
+       c.Check(stderr.String(), check.Matches, `(?ms).*Usage:.*`)
+}
+
+func (*Suite) TestTwoIdenticalUUIDs(c *check.C) {
+       var stdout, stderr bytes.Buffer
+       // Run dedupreport with 2 identical uuids
+       exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooCollection, arvadostest.FooCollection}, &bytes.Buffer{}, &stdout, &stderr)
+       c.Check(exitcode, check.Equals, 0)
+       c.Check(stdout.String(), check.Matches, "(?ms).*Collections:[[:space:]]+1.*")
+       c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+3 bytes \\(3 B\\).*")
+       c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+3 bytes \\(3 B\\).*")
+       c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+0 bytes \\(0 B\\).*")
+       c.Log(stderr.String())
+}
+
+func (*Suite) TestTwoUUIDsInvalidPDH(c *check.C) {
+       var stdout, stderr bytes.Buffer
+       // Run dedupreport with pdh,uuid where pdh does not match
+       exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooAndBarFilesInDirPDH + "," + arvadostest.FooCollection, arvadostest.FooCollection}, &bytes.Buffer{}, &stdout, &stderr)
+       c.Check(exitcode, check.Equals, 1)
+       c.Check(stdout.String(), check.Equals, "")
+       c.Log(stderr.String())
+       c.Check(stderr.String(), check.Matches, `(?ms).*Error: the collection with UUID zzzzz-4zz18-fy296fx3hot09f7 has PDH 1f4b0bc7583c2a7f9102c395f4ffc5e3\+45, but a different PDH was provided in the arguments: 870369fc72738603c2fad16664e50e2d\+58.*`)
+}
+
+func (*Suite) TestNonExistentCollection(c *check.C) {
+       var stdout, stderr bytes.Buffer
+       // Run dedupreport with one valid and one nonexistent collection UUID
+       exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooCollection, arvadostest.NonexistentCollection}, &bytes.Buffer{}, &stdout, &stderr)
+       c.Check(exitcode, check.Equals, 1)
+       c.Check(stdout.String(), check.Equals, "Collection zzzzz-4zz18-fy296fx3hot09f7: pdh 1f4b0bc7583c2a7f9102c395f4ffc5e3+45; nominal size 3 (3 B)\n")
+       c.Log(stderr.String())
+       c.Check(stderr.String(), check.Matches, `(?ms).*Error: unable to retrieve collection:.*404 Not Found.*`)
+}
+
+func (*Suite) TestManyUUIDsNoOverlap(c *check.C) {
+       var stdout, stderr bytes.Buffer
+       // Run dedupreport with 5 UUIDs
+       exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooCollection, arvadostest.HelloWorldCollection, arvadostest.FooBarDirCollection, arvadostest.WazVersion1Collection, arvadostest.UserAgreementCollection}, &bytes.Buffer{}, &stdout, &stderr)
+       c.Check(exitcode, check.Equals, 0)
+       c.Check(stdout.String(), check.Matches, "(?ms).*Collections:[[:space:]]+5.*")
+       c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*")
+       c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*")
+       c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+0 bytes \\(0 B\\).*")
+       c.Log(stderr.String())
+       c.Check(stderr.String(), check.Equals, "")
+}
+
+func (*Suite) TestTwoOverlappingCollections(c *check.C) {
+       var stdout, stderr bytes.Buffer
+       // Create two collections
+       arv := arvados.NewClientFromEnv()
+
+       var c1 arvados.Collection
+       err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4 0:4:foo\n"}})
+       c.Assert(err, check.Equals, nil)
+
+       var c2 arvados.Collection
+       err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4 d3b07384d113edec49eaa6238ad5ff00+4 0:4:bar 4:4:foo\n"}})
+       c.Assert(err, check.Equals, nil)
+
+       for _, trial := range []struct {
+               field1 string
+               field2 string
+       }{
+               {
+                       // Run dedupreport with 2 arguments: uuid uuid
+                       field1: c1.UUID,
+                       field2: c2.UUID,
+               },
+               {
+                       // Run dedupreport with 2 arguments: pdh,uuid uuid
+                       field1: c1.PortableDataHash + "," + c1.UUID,
+                       field2: c2.UUID,
+               },
+       } {
+               exitcode := Command.RunCommand("deduplicationreport.test", []string{trial.field1, trial.field2}, &bytes.Buffer{}, &stdout, &stderr)
+               c.Check(exitcode, check.Equals, 0)
+               c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*")
+               c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*")
+               c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*")
+               c.Log(stderr.String())
+               c.Check(stderr.String(), check.Equals, "")
+       }
+}
index dd9d811818fae48704ace6f9bd0d808750e8b6f6..aa5f22a501331a2bdd108878987e25b133df720b 100644 (file)
@@ -207,6 +207,10 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
        c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="0",operation="Destroy"} [^0].*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="1",operation="Create"} [^0].*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="1",operation="List"} 0\n.*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*boot_outcomes{outcome="aborted"} 0.*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*boot_outcomes{outcome="disappeared"} [^0].*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*boot_outcomes{outcome="failure"} [^0].*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*boot_outcomes{outcome="success"} [^0].*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="shutdown"} [^0].*`)
        c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="unknown"} 0\n.*`)
 }
index 8ab1cd9ba7337a0a2c33d9628b33f257a46fa6e9..32c6b3b24d198b90adb5f2899580783beb2dd9cb 100644 (file)
@@ -36,6 +36,7 @@ type stubPool struct {
        notify    <-chan struct{}
        unalloc   map[arvados.InstanceType]int // idle+booting+unknown
        idle      map[arvados.InstanceType]int
+       unknown   map[arvados.InstanceType]int
        running   map[string]time.Time
        atQuota   bool
        canCreate int
@@ -62,7 +63,7 @@ func (p *stubPool) Unallocated() map[arvados.InstanceType]int {
        defer p.Unlock()
        r := map[arvados.InstanceType]int{}
        for it, n := range p.unalloc {
-               r[it] = n
+               r[it] = n - p.unknown[it]
        }
        return r
 }
@@ -96,6 +97,7 @@ func (p *stubPool) CountWorkers() map[worker.State]int {
                worker.StateBooting: len(p.unalloc) - len(p.idle),
                worker.StateIdle:    len(p.idle),
                worker.StateRunning: len(p.running),
+               worker.StateUnknown: len(p.unknown),
        }
 }
 func (p *stubPool) StartContainer(it arvados.InstanceType, ctr arvados.Container) bool {
index de69df98227e624fc29ef8e55884e8457db29592..116ca7643117d3f4df3b6e8d4e99864a44d6dfe6 100644 (file)
@@ -8,6 +8,7 @@ import (
        "fmt"
 
        "git.arvados.org/arvados.git/lib/dispatchcloud/container"
+       "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "github.com/sirupsen/logrus"
 )
@@ -23,6 +24,7 @@ import (
 // Running containers whose crunch-run processes have exited are
 // cancelled.
 func (sch *Scheduler) sync() {
+       anyUnknownWorkers := sch.pool.CountWorkers()[worker.StateUnknown] > 0
        running := sch.pool.Running()
        qEntries, qUpdated := sch.queue.Entries()
        for uuid, ent := range qEntries {
@@ -30,7 +32,9 @@ func (sch *Scheduler) sync() {
                switch ent.Container.State {
                case arvados.ContainerStateRunning:
                        if !running {
-                               go sch.cancel(uuid, "not running on any worker")
+                               if !anyUnknownWorkers {
+                                       go sch.cancel(uuid, "not running on any worker")
+                               }
                        } else if !exited.IsZero() && qUpdated.After(exited) {
                                go sch.cancel(uuid, "state=Running after crunch-run exited")
                        } else if ent.Container.Priority == 0 {
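
The guard added above can be read in isolation: a container reported as Running but present on no known worker is only cancelled when the pool has no workers in an unknown state, since such a worker might turn out to be running it. A condensed, self-contained sketch of that decision, with simplified stand-ins for the scheduler's types:

package main

import "fmt"

// shouldCancel mirrors the sync() decision above: cancel a Running container
// that is on no known worker only when no workers are still in an unknown state.
func shouldCancel(stateRunning, onKnownWorker bool, unknownWorkers int) bool {
	return stateRunning && !onKnownWorker && unknownWorkers == 0
}

func main() {
	fmt.Println(shouldCancel(true, false, 1)) // false: an unknown worker might be running it
	fmt.Println(shouldCancel(true, false, 0)) // true: safe to cancel as orphaned
}
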
index 305ab9e04eb379c82288853b3df9891bc639bf5b..538f5ea8cfd0b9e14edec62d629eaa104ff70514 100644 (file)
@@ -54,3 +54,65 @@ func (*SchedulerSuite) TestForgetIrrelevantContainers(c *check.C) {
        ents, _ = queue.Entries()
        c.Check(ents, check.HasLen, 0)
 }
+
+func (*SchedulerSuite) TestCancelOrphanedContainers(c *check.C) {
+       ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
+       pool := stubPool{
+               unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+               unknown: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+       }
+       queue := test.Queue{
+               ChooseType: chooseType,
+               Containers: []arvados.Container{
+                       {
+                               UUID:     test.ContainerUUID(1),
+                               Priority: 0,
+                               State:    arvados.ContainerStateRunning,
+                               RuntimeConstraints: arvados.RuntimeConstraints{
+                                       VCPUs: 1,
+                                       RAM:   1 << 30,
+                               },
+                       },
+               },
+       }
+       queue.Update()
+
+       ents, _ := queue.Entries()
+       c.Check(ents, check.HasLen, 1)
+
+       sch := New(ctx, &queue, &pool, time.Millisecond, time.Millisecond)
+
+       // Sync shouldn't cancel the container because it might be
+       // running on the VM with state=="unknown".
+       //
+       // (Cancel+forget happens asynchronously and requires multiple
+       // sync() calls, so even after 10x sync-and-sleep iterations,
+       // we aren't 100% confident that sync isn't trying to
+       // cancel. But in the test environment, the goroutines started
+       // by sync() access stubs and therefore run quickly, so it
+       // works fine in practice. We accept that if the code is
+       // broken, the test will still pass occasionally.)
+       for i := 0; i < 10; i++ {
+               sch.sync()
+               time.Sleep(time.Millisecond)
+       }
+       ents, _ = queue.Entries()
+       c.Check(ents, check.HasLen, 1)
+       c.Check(ents[test.ContainerUUID(1)].Container.State, check.Equals, arvados.ContainerStateRunning)
+
+       // Sync should cancel & forget the container when the
+       // "unknown" node goes away.
+       //
+       // (As above, cancel+forget is async and requires multiple
+       // sync() calls, but stubs are fast so in practice this takes
+       // much less than 1s to complete.)
+       pool.unknown = nil
+       for deadline := time.Now().Add(time.Second); ; time.Sleep(time.Millisecond) {
+               sch.sync()
+               ents, _ = queue.Entries()
+               if len(ents) == 0 || time.Now().After(deadline) {
+                       break
+               }
+       }
+       c.Check(ents, check.HasLen, 0)
+}
index 7f1e4bc4b2fddeeee1fbc93ba54119ad37fbb2e8..12bc1cdd71636263cebc0c8f21bd283d791aec04 100644 (file)
@@ -176,6 +176,7 @@ type Pool struct {
        mInstancesPrice    *prometheus.GaugeVec
        mVCPUs             *prometheus.GaugeVec
        mMemory            *prometheus.GaugeVec
+       mBootOutcomes      *prometheus.CounterVec
        mDisappearances    *prometheus.CounterVec
 }
 
@@ -436,6 +437,7 @@ func (wp *Pool) Shutdown(it arvados.InstanceType) bool {
                for _, wkr := range wp.workers {
                        if wkr.idleBehavior != IdleBehaviorHold && wkr.state == tryState && wkr.instType == it {
                                logger.WithField("Instance", wkr.instance.ID()).Info("shutting down")
+                               wkr.reportBootOutcome(BootOutcomeAborted)
                                wkr.shutdown()
                                return true
                        }
@@ -494,7 +496,7 @@ func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) b
        defer wp.mtx.Unlock()
        var wkr *worker
        for _, w := range wp.workers {
-               if w.instType == it && w.state == StateIdle {
+               if w.instType == it && w.state == StateIdle && w.idleBehavior == IdleBehaviorRun {
                        if wkr == nil || w.busy.After(wkr.busy) {
                                wkr = w
                        }
@@ -593,6 +595,16 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                Help:      "Total memory on all cloud VMs.",
        }, []string{"category"})
        reg.MustRegister(wp.mMemory)
+       wp.mBootOutcomes = prometheus.NewCounterVec(prometheus.CounterOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "boot_outcomes",
+               Help:      "Boot outcomes by type.",
+       }, []string{"outcome"})
+       for k := range validBootOutcomes {
+               wp.mBootOutcomes.WithLabelValues(string(k)).Add(0)
+       }
+       reg.MustRegister(wp.mBootOutcomes)
        wp.mDisappearances = prometheus.NewCounterVec(prometheus.CounterOpts{
                Namespace: "arvados",
                Subsystem: "dispatchcloud",
@@ -765,6 +777,7 @@ func (wp *Pool) KillInstance(id cloud.InstanceID, reason string) error {
                return errors.New("instance not found")
        }
        wkr.logger.WithField("Reason", reason).Info("shutting down")
+       wkr.reportBootOutcome(BootOutcomeAborted)
        wkr.shutdown()
        return nil
 }
@@ -867,6 +880,7 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
                        "WorkerState": wkr.state,
                })
                logger.Info("instance disappeared in cloud")
+               wkr.reportBootOutcome(BootOutcomeDisappeared)
                if wp.mDisappearances != nil {
                        wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
                }
index 1948c1e874859f2d8355115b3671f2c5ef0ae32d..0c173c107d4a248ec38ca635f5fa0ac219af6a4b 100644 (file)
@@ -72,8 +72,8 @@ func (suite *PoolSuite) TestResumeAfterRestart(c *check.C) {
        newExecutor := func(cloud.Instance) Executor {
                return &stubExecutor{
                        response: map[string]stubResp{
-                               "crunch-run --list": stubResp{},
-                               "true":              stubResp{},
+                               "crunch-run --list": {},
+                               "true":              {},
                        },
                }
        }
@@ -146,6 +146,59 @@ func (suite *PoolSuite) TestResumeAfterRestart(c *check.C) {
        pool2.Stop()
 }
 
+func (suite *PoolSuite) TestDrain(c *check.C) {
+       logger := ctxlog.TestLogger(c)
+       driver := test.StubDriver{}
+       instanceSet, err := driver.InstanceSet(nil, "test-instance-set-id", nil, logger)
+       c.Assert(err, check.IsNil)
+
+       ac := arvados.NewClientFromEnv()
+
+       type1 := test.InstanceType(1)
+       pool := &Pool{
+               arvClient:   ac,
+               logger:      logger,
+               newExecutor: func(cloud.Instance) Executor { return &stubExecutor{} },
+               instanceSet: &throttledInstanceSet{InstanceSet: instanceSet},
+               instanceTypes: arvados.InstanceTypeMap{
+                       type1.Name: type1,
+               },
+       }
+       notify := pool.Subscribe()
+       defer pool.Unsubscribe(notify)
+
+       pool.Create(type1)
+
+       // Wait for the instance to either return from its Create
+       // call, or show up in a poll.
+       suite.wait(c, pool, notify, func() bool {
+               pool.mtx.RLock()
+               defer pool.mtx.RUnlock()
+               return len(pool.workers) == 1
+       })
+
+       tests := []struct {
+               state        State
+               idleBehavior IdleBehavior
+               result       bool
+       }{
+               {StateIdle, IdleBehaviorHold, false},
+               {StateIdle, IdleBehaviorDrain, false},
+               {StateIdle, IdleBehaviorRun, true},
+       }
+
+       for _, test := range tests {
+               for _, wkr := range pool.workers {
+                       wkr.state = test.state
+                       wkr.idleBehavior = test.idleBehavior
+               }
+
+               // Try to start a container
+               started := pool.StartContainer(type1, arvados.Container{UUID: "testcontainer"})
+               c.Check(started, check.Equals, test.result)
+       }
+}
+
 func (suite *PoolSuite) TestCreateUnallocShutdown(c *check.C) {
        logger := ctxlog.TestLogger(c)
        driver := test.StubDriver{HoldCloudOps: true}
index 357ac20a038d56ca7a4778c6df38f05f2e2dae08..5d2360f3ccc64671b7193b281a7807d7b70de23b 100644 (file)
@@ -54,6 +54,23 @@ func (s State) MarshalText() ([]byte, error) {
        return []byte(stateString[s]), nil
 }
 
+// BootOutcome is the result of a worker boot. It is used as a label in a metric.
+type BootOutcome string
+
+const (
+       BootOutcomeFailed      BootOutcome = "failure"
+       BootOutcomeSucceeded   BootOutcome = "success"
+       BootOutcomeAborted     BootOutcome = "aborted"
+       BootOutcomeDisappeared BootOutcome = "disappeared"
+)
+
+var validBootOutcomes = map[BootOutcome]bool{
+       BootOutcomeFailed:      true,
+       BootOutcomeSucceeded:   true,
+       BootOutcomeAborted:     true,
+       BootOutcomeDisappeared: true,
+}
+
 // IdleBehavior indicates the behavior desired when a node becomes idle.
 type IdleBehavior string
 
@@ -74,22 +91,23 @@ type worker struct {
        executor Executor
        wp       *Pool
 
-       mtx          sync.Locker // must be wp's Locker.
-       state        State
-       idleBehavior IdleBehavior
-       instance     cloud.Instance
-       instType     arvados.InstanceType
-       vcpus        int64
-       memory       int64
-       appeared     time.Time
-       probed       time.Time
-       updated      time.Time
-       busy         time.Time
-       destroyed    time.Time
-       lastUUID     string
-       running      map[string]*remoteRunner // remember to update state idle<->running when this changes
-       starting     map[string]*remoteRunner // remember to update state idle<->running when this changes
-       probing      chan struct{}
+       mtx                 sync.Locker // must be wp's Locker.
+       state               State
+       idleBehavior        IdleBehavior
+       instance            cloud.Instance
+       instType            arvados.InstanceType
+       vcpus               int64
+       memory              int64
+       appeared            time.Time
+       probed              time.Time
+       updated             time.Time
+       busy                time.Time
+       destroyed           time.Time
+       lastUUID            string
+       running             map[string]*remoteRunner // remember to update state idle<->running when this changes
+       starting            map[string]*remoteRunner // remember to update state idle<->running when this changes
+       probing             chan struct{}
+       bootOutcomeReported bool
 }
 
 func (wkr *worker) onUnkillable(uuid string) {
@@ -111,6 +129,17 @@ func (wkr *worker) onKilled(uuid string) {
        go wkr.wp.notify()
 }
 
+// caller must have lock.
+func (wkr *worker) reportBootOutcome(outcome BootOutcome) {
+       if wkr.bootOutcomeReported {
+               return
+       }
+       if wkr.wp.mBootOutcomes != nil {
+               wkr.wp.mBootOutcomes.WithLabelValues(string(outcome)).Inc()
+       }
+       wkr.bootOutcomeReported = true
+}
+
 // caller must have lock.
 func (wkr *worker) setIdleBehavior(idleBehavior IdleBehavior) {
        wkr.logger.WithField("IdleBehavior", idleBehavior).Info("set idle behavior")
@@ -224,6 +253,7 @@ func (wkr *worker) probeAndUpdate() {
        defer wkr.mtx.Unlock()
        if reportedBroken && wkr.idleBehavior == IdleBehaviorRun {
                logger.Info("probe reported broken instance")
+               wkr.reportBootOutcome(BootOutcomeFailed)
                wkr.setIdleBehavior(IdleBehaviorDrain)
        }
        if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) {
@@ -247,6 +277,7 @@ func (wkr *worker) probeAndUpdate() {
                        // some evidence about why the node never
                        // booted, even in non-debug mode.
                        if !booted {
+                               wkr.reportBootOutcome(BootOutcomeFailed)
                                logger.WithFields(logrus.Fields{
                                        "Duration": dur,
                                        "stderr":   string(stderr),
@@ -311,6 +342,7 @@ func (wkr *worker) probeAndUpdate() {
        }
        wkr.updated = updateTime
        if booted && (initialState == StateUnknown || initialState == StateBooting) {
+               wkr.reportBootOutcome(BootOutcomeSucceeded)
                logger.WithFields(logrus.Fields{
                        "RunningContainers": len(wkr.running),
                        "State":             wkr.state,
@@ -468,6 +500,7 @@ func (wkr *worker) shutdownIfIdle() bool {
                "IdleDuration": stats.Duration(time.Since(wkr.busy)),
                "IdleBehavior": wkr.idleBehavior,
        }).Info("shutdown worker")
+       wkr.reportBootOutcome(BootOutcomeAborted)
        wkr.shutdown()
        return true
 }
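
reportBootOutcome above records each worker's boot outcome at most once, so a worker that is first flagged as failed and later shut down does not inflate two counters. The report-once pattern in isolation looks like the sketch below; the metric name and label values mirror the diff, but the worker type and registry handling are simplified stand-ins.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

var bootOutcomes = prometheus.NewCounterVec(prometheus.CounterOpts{
	Namespace: "arvados",
	Subsystem: "dispatchcloud",
	Name:      "boot_outcomes",
	Help:      "Boot outcomes by type.",
}, []string{"outcome"})

type stubWorker struct {
	bootOutcomeReported bool
}

// reportBootOutcome increments the counter at most once per worker.
func (w *stubWorker) reportBootOutcome(outcome string) {
	if w.bootOutcomeReported {
		return
	}
	bootOutcomes.WithLabelValues(outcome).Inc()
	w.bootOutcomeReported = true
}

func main() {
	reg := prometheus.NewRegistry()
	reg.MustRegister(bootOutcomes)
	// Pre-populate every label with zero, as registerMetrics does, so the
	// series exist before any worker boots.
	for _, o := range []string{"failure", "success", "aborted", "disappeared"} {
		bootOutcomes.WithLabelValues(o).Add(0)
	}
	w := &stubWorker{}
	w.reportBootOutcome("failure")
	w.reportBootOutcome("aborted") // no effect: this worker's outcome was already reported
	fmt.Println("boot outcome reported once for this worker")
}
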
index 34b90805363fc3230ef7f78f1bf3605ecebdfdea..ee967af6cc6f1f773d429b69a4999c051e22989e 100644 (file)
@@ -102,7 +102,7 @@ func authenticate(logger *logrus.Logger, username, token string, argv []string)
                } else if arg == "debug" {
                        logger.SetLevel(logrus.DebugLevel)
                } else {
-                       logger.Warnf("unkown option: %s\n", arg)
+                       logger.Warnf("unknown option: %s\n", arg)
                }
        }
        if hostname == "" || hostname == "-" {
index 57c2c64cdab01289911043c5767ed92edbcd8c36..7b3c8e1b4ed95b59f5e155fcb0392b7fe0d58a26 100644 (file)
@@ -90,6 +90,7 @@ func (*Suite) TestUntrashAndTouchBlock(c *check.C) {
                c.Assert(err, check.IsNil)
                t := time.Now().Add(-time.Hour * 24 * 365)
                err = os.Chtimes(trashfile, t, t)
+               c.Assert(err, check.IsNil)
        }
 
        var stdout, stderr bytes.Buffer
index ec7834972c2609aeb5e4cd14099d35367a7e3c09..4a984c9e780a9fba5bca0ff3d964e972ec2eb728 100644 (file)
@@ -107,6 +107,7 @@ Clusters:
                                continue
                        }
                        body, err := ioutil.ReadAll(resp.Body)
+                       c.Check(err, check.IsNil)
                        c.Logf("status %d, body %s", resp.StatusCode, string(body))
                        c.Check(resp.StatusCode, check.Equals, http.StatusOK)
                        break
index 88a5ceecee1dfc5f1cad8714845df9ffc3d8d5c5..f60adf5385ce7489ccaace30423847033698f579 100644 (file)
@@ -31,7 +31,7 @@ Gem::Specification.new do |s|
   s.summary     = "Arvados CLI tools"
   s.description = "Arvados command line tools, git commit #{git_hash}"
   s.authors     = ["Arvados Authors"]
-  s.email       = 'gem-dev@curoverse.com'
+  s.email       = 'gem-dev@arvados.org'
   #s.bindir      = '.'
   s.licenses    = ['Apache-2.0']
   s.files       = ["bin/arv", "bin/arv-tag", "LICENSE-2.0.txt"]
@@ -42,7 +42,7 @@ Gem::Specification.new do |s|
   # Our google-api-client dependency used to be < 0.9, but that could be
   # satisfied by the buggy 0.9.pre*.  https://dev.arvados.org/issues/9213
   s.add_runtime_dependency 'arvados-google-api-client', '~> 0.6', '>= 0.6.3', '<0.8.9'
-  s.add_runtime_dependency 'activesupport', '>= 3.2.13', '< 5.1'
+  s.add_runtime_dependency 'activesupport', '>= 3.2.13', '< 5.3'
   s.add_runtime_dependency 'json', '>= 1.7.7', '<3'
   s.add_runtime_dependency 'optimist', '~> 3.0'
   s.add_runtime_dependency 'andand', '~> 1.3', '>= 1.3.3'
index adbce90d8d4215329d46eebbe06be66d1f71de43..f3629b68972650e90f06e772404b8c5b29a46f94 100644 (file)
@@ -176,7 +176,7 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
 
     parser.add_argument("--enable-dev", action="store_true",
                         help="Enable loading and running development versions "
-                             "of CWL spec.", default=False)
+                             "of the CWL standards.", default=False)
     parser.add_argument('--storage-classes', default="default",
                         help="Specify comma separated list of storage classes to be used when saving workflow output to Keep.")
 
@@ -202,6 +202,14 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
     parser.add_argument("--http-timeout", type=int,
                         default=5*60, dest="http_timeout", help="API request timeout in seconds. Default is 300 seconds (5 minutes).")
 
+    parser.add_argument(
+        "--skip-schemas",
+        action="store_true",
+        help="Skip loading of schemas",
+        default=False,
+        dest="skip_schemas",
+    )
+
     exgroup = parser.add_mutually_exclusive_group()
     exgroup.add_argument("--trash-intermediate", action="store_true",
                         default=False, dest="trash_intermediate",
@@ -218,15 +226,12 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
 def add_arv_hints():
     cwltool.command_line_tool.ACCEPTLIST_EN_RELAXED_RE = re.compile(r".*")
     cwltool.command_line_tool.ACCEPTLIST_RE = cwltool.command_line_tool.ACCEPTLIST_EN_RELAXED_RE
-    res10 = pkg_resources.resource_stream(__name__, 'arv-cwl-schema-v1.0.yml')
-    res11 = pkg_resources.resource_stream(__name__, 'arv-cwl-schema-v1.1.yml')
-    customschema10 = res10.read().decode('utf-8')
-    customschema11 = res11.read().decode('utf-8')
-    use_custom_schema("v1.0", "http://arvados.org/cwl", customschema10)
-    use_custom_schema("v1.1.0-dev1", "http://arvados.org/cwl", customschema11)
-    use_custom_schema("v1.1", "http://arvados.org/cwl", customschema11)
-    res10.close()
-    res11.close()
+    supported_versions = ["v1.0", "v1.1", "v1.2"]
+    for s in supported_versions:
+        res = pkg_resources.resource_stream(__name__, 'arv-cwl-schema-%s.yml' % s)
+        customschema = res.read().decode('utf-8')
+        use_custom_schema(s, "http://arvados.org/cwl", customschema)
+        res.close()
     cwltool.process.supportedProcessRequirements.extend([
         "http://arvados.org/cwl#RunInSingleContainer",
         "http://arvados.org/cwl#OutputDirType",
diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
new file mode 100644 (file)
index 0000000..b9b9e61
--- /dev/null
@@ -0,0 +1,206 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+$base: "http://arvados.org/cwl#"
+$namespaces:
+  cwl: "https://w3id.org/cwl/cwl#"
+  cwltool: "http://commonwl.org/cwltool#"
+$graph:
+- $import: https://w3id.org/cwl/CommonWorkflowLanguage.yml
+
+- name: cwltool:Secrets
+  type: record
+  inVocab: false
+  extends: cwl:ProcessRequirement
+  fields:
+    class:
+      type: string
+      doc: "Always 'Secrets'"
+      jsonldPredicate:
+        "_id": "@type"
+        "_type": "@vocab"
+    secrets:
+      type: string[]
+      doc: |
+        List one or more input parameters that are sensitive (such as passwords)
+        and will be deliberately obscured from logging.
+      jsonldPredicate:
+        "_type": "@id"
+        refScope: 0
+
+- name: RunInSingleContainer
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Indicates that a subworkflow should run in a single container
+    and not be scheduled as separate steps.
+  fields:
+    - name: class
+      type: string
+      doc: "Always 'arv:RunInSingleContainer'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+
+- name: OutputDirType
+  type: enum
+  symbols:
+    - local_output_dir
+    - keep_output_dir
+  doc:
+    - |
+      local_output_dir: Use regular file system local to the compute node.
+      There must be sufficient local scratch space to store entire output;
+      specify this with `outdirMin` of `ResourceRequirement`.  Files are
+      batch uploaded to Keep when the process completes.  Most compatible, but
+      upload step can be time consuming for very large files.
+    - |
+      keep_output_dir: Use writable Keep mount.  Files are streamed to Keep as
+      they are written.  Does not consume local scratch space, but does consume
+      RAM for output buffers (up to 192 MiB per file simultaneously open for
+      writing.)  Best suited to processes which produce sequential output of
+      large files (non-sequential writes may produce fragmented file
+      manifests).  Supports regular files and directories; does not support
+      special files such as symlinks, hard links, named pipes, named sockets,
+      or device nodes.
+
+
+- name: RuntimeConstraints
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Set Arvados-specific runtime hints.
+  fields:
+    - name: class
+      type: string
+      doc: "Always 'arv:RuntimeConstraints'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: keep_cache
+      type: int?
+      doc: |
+        Size of file data buffer for Keep mount in MiB. Default is 256
+        MiB. Increase this to reduce cache thrashing in situations such as
+        accessing multiple large (64+ MiB) files at the same time, or
+        performing random access on a large file.
+    - name: outputDirType
+      type: OutputDirType?
+      doc: |
+        Preferred backing store for output staging.  If not specified, the
+        system may choose which one to use.
+
+- name: PartitionRequirement
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Select preferred compute partitions on which to run jobs.
+  fields:
+    - name: class
+      type: string
+      doc: "Always 'arv:PartitionRequirement'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    - name: partition
+      type:
+        - string
+        - string[]
+
+- name: APIRequirement
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Indicates that the process wants access to the Arvados API.  The process
+    will be granted limited network access and have ARVADOS_API_HOST and
+    ARVADOS_API_TOKEN set in the environment.
+  fields:
+    - name: class
+      type: string
+      doc: "Always 'arv:APIRequirement'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+
+- name: IntermediateOutput
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Specify desired handling of intermediate output collections.
+  fields:
+    class:
+      type: string
+      doc: "Always 'arv:IntermediateOutput'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    outputTTL:
+      type: int
+      doc: |
+        If the value is greater than zero, intermediate output collections
+        will be considered temporary and will be automatically trashed
+        `outputTTL` seconds after creation.  A value of zero means
+        intermediate output should be retained indefinitely (this is the
+        default behavior).
+
+        Note: arvados-cwl-runner currently does not take workflow dependencies
+        into account when setting the TTL on an intermediate output
+        collection. If the TTL is too short, it is possible for a collection to
+        be trashed before downstream steps that consume it are started.  The
+        recommended minimum value for TTL is the expected duration of the
+        entire workflow.
+
+- name: WorkflowRunnerResources
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Specify memory or cores resource request for the CWL runner process itself.
+  fields:
+    class:
+      type: string
+      doc: "Always 'arv:WorkflowRunnerResources'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    ramMin:
+      type: int?
+      doc: Minimum RAM, in mebibytes (2**20)
+      jsonldPredicate: "https://w3id.org/cwl/cwl#ResourceRequirement/ramMin"
+    coresMin:
+      type: int?
+      doc: Minimum cores allocated to cwl-runner
+      jsonldPredicate: "https://w3id.org/cwl/cwl#ResourceRequirement/coresMin"
+    keep_cache:
+      type: int?
+      doc: |
+        Size of collection metadata cache for the workflow runner, in
+        MiB.  Default 256 MiB.  Will be added on to the RAM request
+        when determining node size to request.
+      jsonldPredicate: "http://arvados.org/cwl#RuntimeConstraints/keep_cache"
+
+- name: ClusterTarget
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Specify where a workflow step should run
+  fields:
+    class:
+      type: string
+      doc: "Always 'arv:ClusterTarget'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    cluster_id:
+      type: string?
+      doc: The cluster on which to run the container
+    project_uuid:
+      type: string?
+      doc: The project that will own the container requests and intermediate collections
index 2b55ce9df5afa6b4a5e7d98ded954be50ae40aa0..fb23c2ccf73df514923f4fd0041814c6e8751833 100644 (file)
@@ -150,21 +150,28 @@ class ArvadosContainer(JobBase):
                 with Perf(metrics, "createfiles %s" % self.name):
                     for f, p in sorteditems:
                         if not p.target:
-                            pass
-                        elif p.type in ("File", "Directory", "WritableFile", "WritableDirectory"):
+                            continue
+
+                        if p.target.startswith("/"):
+                            dst = p.target[len(self.outdir)+1:] if p.target.startswith(self.outdir+"/") else p.target[1:]
+                        else:
+                            dst = p.target
+
+                        if p.type in ("File", "Directory", "WritableFile", "WritableDirectory"):
                             if p.resolved.startswith("_:"):
-                                vwd.mkdirs(p.target)
+                                vwd.mkdirs(dst)
                             else:
                                 source, path = self.arvrunner.fs_access.get_collection(p.resolved)
-                                vwd.copy(path or ".", p.target, source_collection=source)
+                                vwd.copy(path or ".", dst, source_collection=source)
                         elif p.type == "CreateFile":
                             if self.arvrunner.secret_store.has_secret(p.resolved):
-                                secret_mounts["%s/%s" % (self.outdir, p.target)] = {
+                                mountpoint = p.target if p.target.startswith("/") else os.path.join(self.outdir, p.target)
+                                secret_mounts[mountpoint] = {
                                     "kind": "text",
                                     "content": self.arvrunner.secret_store.retrieve(p.resolved)
                                 }
                             else:
-                                with vwd.open(p.target, "w") as n:
+                                with vwd.open(dst, "w") as n:
                                     n.write(p.resolved)
 
                 def keepemptydirs(p):
@@ -191,10 +198,14 @@ class ArvadosContainer(JobBase):
                     if (not p.target or self.arvrunner.secret_store.has_secret(p.resolved) or
                         (prev is not None and p.target.startswith(prev))):
                         continue
-                    mountpoint = "%s/%s" % (self.outdir, p.target)
+                    if p.target.startswith("/"):
+                        dst = p.target[len(self.outdir)+1:] if p.target.startswith(self.outdir+"/") else p.target[1:]
+                    else:
+                        dst = p.target
+                    mountpoint = p.target if p.target.startswith("/") else os.path.join(self.outdir, p.target)
                     mounts[mountpoint] = {"kind": "collection",
                                           "portable_data_hash": vwd.portable_data_hash(),
-                                          "path": p.target}
+                                          "path": dst}
                     if p.type.startswith("Writable"):
                         mounts[mountpoint]["writable"] = True
                     prev = p.target + "/"
@@ -316,6 +327,7 @@ class ArvadosContainer(JobBase):
                 logger.info("%s %s state is %s", self.arvrunner.label(self), response["uuid"], response["state"])
         except Exception:
             logger.exception("%s got an error", self.arvrunner.label(self))
+            logger.debug("Container request was %s", container_request)
             self.output_callback({}, "permanentFail")
 
     def done(self, record):
index 704edaccb903eb83f1e66c983eb007fe1c4f8711..a9361a85f9fe66a5260a64b6719fa74c28cdeee2 100644 (file)
@@ -57,7 +57,7 @@ class ArvadosCommandTool(CommandLineTool):
                                  "/keep/%s/%s")
 
     def job(self, joborder, output_callback, runtimeContext):
-        builder = make_builder(joborder, self.hints, self.requirements, runtimeContext)
+        builder = make_builder(joborder, self.hints, self.requirements, runtimeContext, self.metadata)
         runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)
 
         if runtimeContext.work_api == "containers":
index ddd3c00764c7b0fb42fe97adf50622bd1b23e9cb..97c5fafe792fc06ce099e6a9bc6934671ace580d 100644 (file)
@@ -141,7 +141,8 @@ class ArvadosWorkflowStep(WorkflowStep):
         runtimeContext = runtimeContext.copy()
         runtimeContext.toplevel = True  # Preserve behavior for #13365
 
-        builder = make_builder({shortname(k): v for k,v in viewitems(joborder)}, self.hints, self.requirements, runtimeContext)
+        builder = make_builder({shortname(k): v for k,v in viewitems(joborder)}, self.hints, self.requirements,
+                               runtimeContext, self.metadata)
         runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)
         return super(ArvadosWorkflowStep, self).job(joborder, output_callback, runtimeContext)
 
@@ -161,7 +162,7 @@ class ArvadosWorkflow(Workflow):
 
     def job(self, joborder, output_callback, runtimeContext):
 
-        builder = make_builder(joborder, self.hints, self.requirements, runtimeContext)
+        builder = make_builder(joborder, self.hints, self.requirements, runtimeContext, self.metadata)
         runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)
 
         req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
index ec91eea6aa807eaea1f37012b0fa1f04d21a1f1f..e8d1347ddfeec7545b8ab9740de38b78b55b4e75 100644 (file)
@@ -507,7 +507,7 @@ The 'jobs' API is no longer supported.
                                               }).execute(num_retries=self.num_retries)
             except Exception:
                 logger.exception("Setting container output")
-                return
+                raise
 
     def apply_reqs(self, job_order_object, tool):
         if "https://w3id.org/cwl/cwl#requirements" in job_order_object:
index bc2c5e34d7b6c2737cc8bdcb541fc1daf394d9ae..4688e65a3748348b8068bd27e1d9e78aa5a5e9de 100644 (file)
@@ -148,6 +148,11 @@ class CollectionFsAccess(cwltool.stdfsaccess.StdFsAccess):
                 return False
             else:
                 raise
+        except IOError as err:
+            if err.errno == errno.ENOENT:
+                return False
+            else:
+                raise
         if collection is not None:
             if rest:
                 return collection.exists(rest)
index 47a304372c58a27ecde8d8c13bb55d6435f9cf79..dcc2a51192dfc4d4b573da302b3373fd08d67fff 100644 (file)
@@ -16,6 +16,7 @@ import arvados.collection
 import urllib.parse
 import logging
 import calendar
+import urllib.parse
 
 logger = logging.getLogger('arvados.cwl-runner')
 
@@ -148,7 +149,9 @@ def http_to_keep(api, project_uuid, url, utcnow=datetime.datetime.utcnow):
                     logger.info("%d downloaded, %3.2f MiB/s", count, (bps / (1024*1024)))
                 checkpoint = loopnow
 
-    c.save_new(name="Downloaded from %s" % url, owner_uuid=project_uuid, ensure_unique_name=True)
+
+    collectionname = "Downloaded from %s" % urllib.parse.quote(url, safe='')
+    c.save_new(name=collectionname, owner_uuid=project_uuid, ensure_unique_name=True)
 
     api.collections().update(uuid=c.manifest_locator(), body={"collection":{"properties": properties}}).execute()
 
index 4cd204f7df83ba49197f2cdb6ab2a61673a40b28..5bad290773be9f49ef2e87b10b2dac48e70ef75b 100644 (file)
@@ -285,6 +285,7 @@ class StagingPathMapper(PathMapper):
     def visit(self, obj, stagedir, basedir, copy=False, staged=False):
         # type: (Dict[unicode, Any], unicode, unicode, bool) -> None
         loc = obj["location"]
+        stagedir = obj.get("dirname") or stagedir
         tgt = os.path.join(stagedir, obj["basename"])
         basetgt, baseext = os.path.splitext(tgt)
 
index 71e499ebcab0cca29ccbee7a350cfbbb5aaa6e19..b10f02d1401b9e31014eb30b32e18adfdcb394d2 100644 (file)
@@ -83,7 +83,7 @@ def find_defaults(d, op):
             for i in viewvalues(d):
                 find_defaults(i, op)
 
-def make_builder(joborder, hints, requirements, runtimeContext):
+def make_builder(joborder, hints, requirements, runtimeContext, metadata):
     return Builder(
                  job=joborder,
                  files=[],               # type: List[Dict[Text, Text]]
@@ -106,6 +106,7 @@ def make_builder(joborder, hints, requirements, runtimeContext):
                  outdir="",              # type: Text
                  tmpdir="",              # type: Text
                  stagedir="",            # type: Text
+                 cwlVersion=metadata.get("http://commonwl.org/cwltool#original_cwlVersion") or metadata.get("cwlVersion")
                 )
 
 def search_schemadef(name, reqs):
@@ -172,7 +173,10 @@ def set_secondary(fsaccess, builder, inputschema, secondaryspec, primary, discov
         specs = []
         primary["secondaryFiles"] = secondaryspec
         for i, sf in enumerate(aslist(secondaryspec)):
-            pattern = builder.do_eval(sf["pattern"], context=primary)
+            if builder.cwlVersion == "v1.0":
+                pattern = builder.do_eval(sf, context=primary)
+            else:
+                pattern = builder.do_eval(sf["pattern"], context=primary)
             if pattern is None:
                 continue
             if isinstance(pattern, list):
@@ -263,6 +267,8 @@ def upload_dependencies(arvrunner, name, document_loader,
         # that external references in $include and $mixin are captured.
         scanobj = loadref("", workflowobj["id"])
 
+    metadata = scanobj
+
     sc_result = scandeps(uri, scanobj,
                   loadref_fields,
                   set(("$include", "$schemas", "location")),
@@ -354,7 +360,8 @@ def upload_dependencies(arvrunner, name, document_loader,
         builder = make_builder(builder_job_order,
                                obj.get("hints", []),
                                obj.get("requirements", []),
-                               ArvRuntimeContext())
+                               ArvRuntimeContext(),
+                               metadata)
         discover_secondary_files(arvrunner.fs_access,
                                  builder,
                                  obj["inputs"],
@@ -516,7 +523,8 @@ def upload_job_order(arvrunner, name, tool, job_order):
     builder = make_builder(builder_job_order,
                            tool.hints,
                            tool.requirements,
-                           ArvRuntimeContext())
+                           ArvRuntimeContext(),
+                           tool.metadata)
     # Now update job_order with secondaryFiles
     discover_secondary_files(arvrunner.fs_access,
                              builder,
index 40ee679857f4429b0e32cf491339e144695489de..c8ab71e50b3dd5bc3cfd2e6f08fa03cdb46f3100 100644 (file)
@@ -30,7 +30,7 @@ setup(name='arvados-cwl-runner',
       download_url="https://github.com/arvados/arvados.git",
       license='Apache 2.0',
       packages=find_packages(),
-      package_data={'arvados_cwl': ['arv-cwl-schema-v1.0.yml', 'arv-cwl-schema-v1.1.yml']},
+      package_data={'arvados_cwl': ['arv-cwl-schema-v1.0.yml', 'arv-cwl-schema-v1.1.yml', 'arv-cwl-schema-v1.2.yml']},
       scripts=[
           'bin/cwl-runner',
           'bin/arvados-cwl-runner',
@@ -39,8 +39,8 @@ setup(name='arvados-cwl-runner',
       # file to determine what version of cwltool and schema-salad to
       # build.
       install_requires=[
-          'cwltool==3.0.20200530110633',
-          'schema-salad==6.0.20200601095207',
+          'cwltool==3.0.20200807132242',
+          'schema-salad==7.0.20200612160654',
           'arvados-python-client{}'.format(pysdk_dep),
           'setuptools',
           'ciso8601 >= 2.0.0'
index 7aa7b0aa43c06a0ae6e2d6615541de0cf428f94a..8d0dee971a89901c216e1223870662e49eb7a7e0 100644 (file)
@@ -1,11 +1,18 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
 cwlVersion: v1.0
 class: CommandLineTool
 requirements:
   - class: InlineJavascriptRequirement
+  - class: ShellCommandRequirement
 arguments:
+  - cd
+  - $(inputs.hello.dirname)
+  - {shellQuote: false, valueFrom: "&&"}
   - ls
-  - -l
-  - $(inputs.hello)
+stdout: hello.out
 inputs:
   hello:
     type: File
@@ -14,4 +21,8 @@ inputs:
       location: keep:4d8a70b1e63b2aad6984e40e338e2373+69/hello.txt
     secondaryFiles:
       - .idx
-outputs: []
\ No newline at end of file
+outputs:
+  out:
+    type: File
+    outputBinding:
+      glob: hello.out
diff --git a/sdk/cwl/tests/16377-missing-default.cwl b/sdk/cwl/tests/16377-missing-default.cwl
new file mode 100644 (file)
index 0000000..b8208e6
--- /dev/null
@@ -0,0 +1,28 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+cwlVersion: v1.0
+class: CommandLineTool
+requirements:
+  - class: InlineJavascriptRequirement
+  - class: ShellCommandRequirement
+arguments:
+  - cd
+  - $(inputs.hello.dirname)
+  - {shellQuote: false, valueFrom: "&&"}
+  - ls
+stdout: hello.out
+inputs:
+  hello:
+    type: File
+    default:
+      class: File
+      location: keep:ffffffffffffffffffffffffffaaaaaa+69/hello.txt
+    secondaryFiles:
+      - .idx
+outputs:
+  out:
+    type: File
+    outputBinding:
+      glob: hello.out
index c4c0968756a46b04ad8b201cbc66241fb4d6826d..a46decd9616cff63fe932e2320568b14c563b3b6 100644 (file)
 
 - job: null
   output:
-    out: null
+    "out": {
+        "location": "hello.out",
+        "class": "File",
+        "checksum": "sha1$ec5d3976351abab45a483a49ce714a8430cb203a",
+        "size": 24
+    }
   tool: 13976-keepref-wf.cwl
   doc: "Test issue 13976"
 
   }
   tool: 16169-no-listing-hint.cwl
   doc: "Test cwltool:LoadListingRequirement propagation"
+
+- job: hello.yml
+  output:
+    "out": {
+        "location": "hello.out",
+        "class": "File",
+        "checksum": "sha1$ec5d3976351abab45a483a49ce714a8430cb203a",
+        "size": 24
+    }
+  tool: 16377-missing-default.cwl
+  doc: "Test issue 16377 - missing default fails even when it should be overridden by valid input"
diff --git a/sdk/cwl/tests/hello.yml b/sdk/cwl/tests/hello.yml
new file mode 100644 (file)
index 0000000..e7a324e
--- /dev/null
@@ -0,0 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+hello:
+  class: File
+  location: keep:4d8a70b1e63b2aad6984e40e338e2373+69/hello.txt
index 4119fee383e27bcfe30a97d3de754d1879c067a9..650b5f0598514bbe9fd5ea0de96ab848d2375ad0 100644 (file)
@@ -61,7 +61,7 @@ class TestHttpToKeep(unittest.TestCase):
         getmock.assert_called_with("http://example.com/file1.txt", stream=True, allow_redirects=True)
 
         cm.open.assert_called_with("file1.txt", "wb")
-        cm.save_new.assert_called_with(name="Downloaded from http://example.com/file1.txt",
+        cm.save_new.assert_called_with(name="Downloaded from http%3A%2F%2Fexample.com%2Ffile1.txt",
                                        owner_uuid=None, ensure_unique_name=True)
 
         api.collections().update.assert_has_calls([
@@ -189,7 +189,7 @@ class TestHttpToKeep(unittest.TestCase):
         getmock.assert_called_with("http://example.com/file1.txt", stream=True, allow_redirects=True)
 
         cm.open.assert_called_with("file1.txt", "wb")
-        cm.save_new.assert_called_with(name="Downloaded from http://example.com/file1.txt",
+        cm.save_new.assert_called_with(name="Downloaded from http%3A%2F%2Fexample.com%2Ffile1.txt",
                                        owner_uuid=None, ensure_unique_name=True)
 
         api.collections().update.assert_has_calls([
@@ -280,7 +280,7 @@ class TestHttpToKeep(unittest.TestCase):
         getmock.assert_called_with("http://example.com/download?fn=/file1.txt", stream=True, allow_redirects=True)
 
         cm.open.assert_called_with("file1.txt", "wb")
-        cm.save_new.assert_called_with(name="Downloaded from http://example.com/download?fn=/file1.txt",
+        cm.save_new.assert_called_with(name="Downloaded from http%3A%2F%2Fexample.com%2Fdownload%3Ffn%3D%2Ffile1.txt",
                                        owner_uuid=None, ensure_unique_name=True)
 
         api.collections().update.assert_has_calls([
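Note: the updated assertions reflect that the source URL is now percent-encoded before being embedded in the collection name, so characters such as "/", "?" and "=" cannot be mistaken for path or query syntax in the name. A hedged sketch of the encoding these expected values correspond to (the helper name is made up; only the quoting behaviour is taken from the assertions):

    import urllib.parse

    def collection_name_for(url):
        # safe="" forces ":" and "/" to be encoded as well.
        return "Downloaded from " + urllib.parse.quote(url, safe="")

    # collection_name_for("http://example.com/file1.txt")
    # -> "Downloaded from http%3A%2F%2Fexample.com%2Ffile1.txt"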
index 562664c698b34df87ec2f19eb64d867ec5461698..0698db70ff68534ba70aa4176c5487f308cf2559 100644 (file)
@@ -527,9 +527,12 @@ class TestSubmit(unittest.TestCase):
 
     @mock.patch("arvados_cwl.task_queue.TaskQueue")
     @mock.patch("arvados_cwl.arvworkflow.ArvadosWorkflow.job")
-    @mock.patch("arvados_cwl.executor.ArvCwlExecutor.make_output_collection", return_value = (None, None))
+    @mock.patch("arvados_cwl.executor.ArvCwlExecutor.make_output_collection")
     @stubs
     def test_storage_classes_correctly_propagate_to_make_output_collection(self, stubs, make_output, job, tq):
+        final_output_c = arvados.collection.Collection()
+        make_output.return_value = ({}, final_output_c)
+
         def set_final_output(job_order, output_callback, runtimeContext):
             output_callback("zzzzz-4zz18-zzzzzzzzzzzzzzzz", "success")
             return []
@@ -538,16 +541,19 @@ class TestSubmit(unittest.TestCase):
         exited = arvados_cwl.main(
             ["--debug", "--local", "--storage-classes=foo",
                 "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
-            sys.stdin, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
+            stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
 
         make_output.assert_called_with(u'Output of submit_wf.cwl', ['foo'], '', 'zzzzz-4zz18-zzzzzzzzzzzzzzzz')
         self.assertEqual(exited, 0)
 
     @mock.patch("arvados_cwl.task_queue.TaskQueue")
     @mock.patch("arvados_cwl.arvworkflow.ArvadosWorkflow.job")
-    @mock.patch("arvados_cwl.executor.ArvCwlExecutor.make_output_collection", return_value = (None, None))
+    @mock.patch("arvados_cwl.executor.ArvCwlExecutor.make_output_collection")
     @stubs
     def test_default_storage_classes_correctly_propagate_to_make_output_collection(self, stubs, make_output, job, tq):
+        final_output_c = arvados.collection.Collection()
+        make_output.return_value = ({}, final_output_c)
+
         def set_final_output(job_order, output_callback, runtimeContext):
             output_callback("zzzzz-4zz18-zzzzzzzzzzzzzzzz", "success")
             return []
@@ -556,7 +562,7 @@ class TestSubmit(unittest.TestCase):
         exited = arvados_cwl.main(
             ["--debug", "--local",
                 "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
-            sys.stdin, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
+            stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
 
         make_output.assert_called_with(u'Output of submit_wf.cwl', ['default'], '', 'zzzzz-4zz18-zzzzzzzzzzzzzzzz')
         self.assertEqual(exited, 0)
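Note: these two tests previously stubbed make_output_collection to return (None, None), which would break any code that goes on to call methods on the final output collection. Returning a real, empty Collection keeps the stub cheap while still behaving like a collection object. A hedged sketch of the same mock setup in isolation (the tuple's first element is left as an empty dict to match the hunks; its meaning is not asserted here):

    from unittest import mock
    import arvados.collection

    make_output = mock.MagicMock(name="make_output_collection")
    final_output_c = arvados.collection.Collection()     # empty, in-memory
    make_output.return_value = ({}, final_output_c)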
@@ -1103,7 +1109,10 @@ class TestSubmit(unittest.TestCase):
                                 "outputs": [
                                     {
                                         "id": "#secret_job.cwl/out",
-                                        "type": "stdout"
+                                        "type": "File",
+                                        "outputBinding": {
+                                              "glob": "hashed_example.txt"
+                                        }
                                     }
                                 ],
                                 "stdout": "hashed_example.txt",
@@ -1312,7 +1321,7 @@ class TestSubmit(unittest.TestCase):
                     stubs.capture_stdout, capture_stderr, api_client=stubs.api, keep_client=stubs.keep_client)
 
                 self.assertEqual(exited, 1)
-                self.assertRegexpMatches(
+                self.assertRegex(
                     re.sub(r'[ \n]+', ' ', capture_stderr.getvalue()),
                     r"Expected collection uuid zzzzz-4zz18-zzzzzzzzzzzzzzz to be 99999999999999999999999999999998\+99 but API server reported 99999999999999999999999999999997\+99")
             finally:
@@ -1335,7 +1344,7 @@ class TestSubmit(unittest.TestCase):
 
         try:
             self.assertEqual(exited, 1)
-            self.assertRegexpMatches(
+            self.assertRegex(
                 capture_stderr.getvalue(),
                 r"Collection uuid zzzzz-4zz18-zzzzzzzzzzzzzzz not found")
         finally:
index c32f88864f88750c00fe896286e147ccd9d061ce..5a2cfb8800402496f3f8fe400cf38c786e57d6eb 100644 (file)
@@ -86,6 +86,7 @@ type ListOptions struct {
        IncludeTrash       bool                   `json:"include_trash"`
        IncludeOldVersions bool                   `json:"include_old_versions"`
        BypassFederation   bool                   `json:"bypass_federation"`
+       ForwardedFor       string                 `json:"forwarded_for,omitempty"`
 }
 
 type CreateOptions struct {
index da03fba7d9547589a80814264da93ae2be668911..41c20c8db2ee71cf4c4a024e7d1d73b72878a098 100644 (file)
@@ -278,6 +278,7 @@ type S3VolumeDriverParameters struct {
        Bucket             string
        LocationConstraint bool
        V2Signature        bool
+       UseAWSS3v2Driver   bool
        IndexPageSize      int
        ConnectTimeout     Duration
        ReadTimeout        Duration
@@ -314,7 +315,6 @@ type Services struct {
        Keepbalance    Service
        Keepproxy      Service
        Keepstore      Service
-       Nodemanager    Service
        RailsAPI       Service
        SSO            Service
        WebDAVDownload Service
@@ -553,7 +553,7 @@ func (ss *StringSet) UnmarshalJSON(data []byte) error {
                return err
        }
        *ss = make(map[string]struct{}, len(hash))
-       for t, _ := range hash {
+       for t := range hash {
                (*ss)[t] = struct{}{}
        }
 
@@ -567,7 +567,6 @@ const (
        ServiceNameController    ServiceName = "arvados-controller"
        ServiceNameDispatchCloud ServiceName = "arvados-dispatch-cloud"
        ServiceNameHealth        ServiceName = "arvados-health"
-       ServiceNameNodemanager   ServiceName = "arvados-node-manager"
        ServiceNameWorkbench1    ServiceName = "arvados-workbench1"
        ServiceNameWorkbench2    ServiceName = "arvados-workbench2"
        ServiceNameWebsocket     ServiceName = "arvados-ws"
@@ -585,7 +584,6 @@ func (svcs Services) Map() map[ServiceName]Service {
                ServiceNameController:    svcs.Controller,
                ServiceNameDispatchCloud: svcs.DispatchCloud,
                ServiceNameHealth:        svcs.Health,
-               ServiceNameNodemanager:   svcs.Nodemanager,
                ServiceNameWorkbench1:    svcs.Workbench1,
                ServiceNameWorkbench2:    svcs.Workbench2,
                ServiceNameWebsocket:     svcs.Websocket,
index 257a2b4ef54156d65b22bafb3152cc067de6cd13..86fe218c3361a1b986e0cd38ab09a39eac1ec9bc 100644 (file)
@@ -23,6 +23,7 @@ func (s *DurationSuite) TestMarshalJSON(c *check.C) {
        c.Check(err, check.IsNil)
        c.Check(d.D, check.Equals, Duration(time.Second+234*time.Millisecond))
        buf, err := json.Marshal(d)
+       c.Check(err, check.IsNil)
        c.Check(string(buf), check.Equals, `{"D":"1.234s"}`)
 
        for _, trial := range []struct {
index 1d091f5a8e014e87d47ad5abab680fa469169c5d..cb2e54bda261ed590b39b59a90235f6b991787a3 100644 (file)
@@ -52,6 +52,7 @@ func (s *SiteFSSuite) testHomeProject(c *check.C, path string) {
        f, err := s.fs.Open(path)
        c.Assert(err, check.IsNil)
        fis, err := f.Readdir(-1)
+       c.Assert(err, check.IsNil)
        c.Check(len(fis), check.Not(check.Equals), 0)
 
        ok := false
index d3111e1cb1adf1cd71bc49a0fa61ed2f78263a23..778b12015a6f3964be7db301f30cd8ca5db1a971 100644 (file)
@@ -86,6 +86,7 @@ func (s *SiteFSSuite) TestByUUIDAndPDH(c *check.C) {
                f, err = s.fs.Open("/by_id/" + path)
                c.Assert(err, check.IsNil)
                fis, err = f.Readdir(-1)
+               c.Assert(err, check.IsNil)
                var names []string
                for _, fi := range fis {
                        names = append(names, fi.Name())
@@ -96,6 +97,7 @@ func (s *SiteFSSuite) TestByUUIDAndPDH(c *check.C) {
        f, err = s.fs.Open("/by_id/" + fixtureAProjectUUID + "/A Subproject/baz_file")
        c.Assert(err, check.IsNil)
        fis, err = f.Readdir(-1)
+       c.Assert(err, check.IsNil)
        var names []string
        for _, fi := range fis {
                names = append(names, fi.Name())
index e0a41c924bb3de6a0f9ae7826fd4abd2febfd5eb..fc686ad63739e51340d5e254f8f68d65ac4db3e7 100644 (file)
@@ -63,6 +63,7 @@ func (s *ServerRequiredSuite) TestMakeArvadosClientInsecure(c *C) {
 
 func (s *ServerRequiredSuite) TestGetInvalidUUID(c *C) {
        arv, err := MakeArvadosClient()
+       c.Assert(err, IsNil)
 
        getback := make(Dict)
        err = arv.Get("collections", "", nil, &getback)
@@ -80,6 +81,7 @@ func (s *ServerRequiredSuite) TestGetInvalidUUID(c *C) {
 
 func (s *ServerRequiredSuite) TestGetValidUUID(c *C) {
        arv, err := MakeArvadosClient()
+       c.Assert(err, IsNil)
 
        getback := make(Dict)
        err = arv.Get("collections", "zzzzz-4zz18-abcdeabcdeabcde", nil, &getback)
@@ -95,6 +97,7 @@ func (s *ServerRequiredSuite) TestGetValidUUID(c *C) {
 
 func (s *ServerRequiredSuite) TestInvalidResourceType(c *C) {
        arv, err := MakeArvadosClient()
+       c.Assert(err, IsNil)
 
        getback := make(Dict)
        err = arv.Get("unicorns", "zzzzz-zebra-unicorn7unicorn", nil, &getback)
@@ -141,6 +144,7 @@ func (s *ServerRequiredSuite) TestErrorResponse(c *C) {
 
 func (s *ServerRequiredSuite) TestAPIDiscovery_Get_defaultCollectionReplication(c *C) {
        arv, err := MakeArvadosClient()
+       c.Assert(err, IsNil)
        value, err := arv.Discovery("defaultCollectionReplication")
        c.Assert(err, IsNil)
        c.Assert(value, NotNil)
@@ -148,6 +152,7 @@ func (s *ServerRequiredSuite) TestAPIDiscovery_Get_defaultCollectionReplication(
 
 func (s *ServerRequiredSuite) TestAPIDiscovery_Get_noSuchParameter(c *C) {
        arv, err := MakeArvadosClient()
+       c.Assert(err, IsNil)
        value, err := arv.Discovery("noSuchParameter")
        c.Assert(err, NotNil)
        c.Assert(value, IsNil)
diff --git a/sdk/go/arvadostest/db.go b/sdk/go/arvadostest/db.go
new file mode 100644 (file)
index 0000000..41ecfac
--- /dev/null
@@ -0,0 +1,33 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package arvadostest
+
+import (
+       "context"
+
+       "git.arvados.org/arvados.git/lib/ctrlctx"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "github.com/jmoiron/sqlx"
+       _ "github.com/lib/pq"
+       "gopkg.in/check.v1"
+)
+
+// DB returns a DB connection for the given cluster config.
+func DB(c *check.C, cluster *arvados.Cluster) *sqlx.DB {
+       db, err := sqlx.Open("postgres", cluster.PostgreSQL.Connection.String())
+       c.Assert(err, check.IsNil)
+       return db
+}
+
+// TransactionContext returns a context suitable for running a test
+// case in a new transaction, and a rollback func which the caller
+// should call after the test.
+func TransactionContext(c *check.C, db *sqlx.DB) (ctx context.Context, rollback func()) {
+       tx, err := db.Beginx()
+       c.Assert(err, check.IsNil)
+       return ctrlctx.NewWithTransaction(context.Background(), tx), func() {
+               c.Check(tx.Rollback(), check.IsNil)
+       }
+}
index f4b0a994366db603ba0284cb76f4a6573ff266c7..2acf3e59ab81ae10ff816577f5f33fdaea8b9922 100644 (file)
@@ -157,7 +157,6 @@ func (s *AggregatorSuite) setAllServiceURLs(listen string) {
                &svcs.Keepproxy,
                &svcs.Keepstore,
                &svcs.Health,
-               &svcs.Nodemanager,
                &svcs.RailsAPI,
                &svcs.WebDAV,
                &svcs.Websocket,
index 32802b6a9e00f341195712284b897e1dc083eb48..7d5eb2b64f326e2cd10b41bbeacd147a2bff682a 100644 (file)
@@ -58,6 +58,7 @@ func (s *Suite) TestLogRequests(c *check.C) {
 
        gotReq := make(map[string]interface{})
        err = dec.Decode(&gotReq)
+       c.Check(err, check.IsNil)
        c.Logf("%#v", gotReq)
        c.Check(gotReq["RequestID"], check.Matches, "req-[a-z0-9]{20}")
        c.Check(gotReq["reqForwardedFor"], check.Equals, "1.2.3.4:12345")
@@ -65,6 +66,7 @@ func (s *Suite) TestLogRequests(c *check.C) {
 
        gotResp := make(map[string]interface{})
        err = dec.Decode(&gotResp)
+       c.Check(err, check.IsNil)
        c.Logf("%#v", gotResp)
        c.Check(gotResp["RequestID"], check.Equals, gotReq["RequestID"])
        c.Check(gotResp["reqForwardedFor"], check.Equals, "1.2.3.4:12345")
@@ -111,9 +113,11 @@ func (s *Suite) TestLogErrorBody(c *check.C) {
 
                gotReq := make(map[string]interface{})
                err = dec.Decode(&gotReq)
+               c.Check(err, check.IsNil)
                c.Logf("%#v", gotReq)
                gotResp := make(map[string]interface{})
                err = dec.Decode(&gotResp)
+               c.Check(err, check.IsNil)
                c.Logf("%#v", gotResp)
                if trial.expectLog {
                        c.Check(gotResp["respBody"], check.Equals, trial.expectBody, comment)
index c6c9f044416a7fb93c4b898dfcafcdc121dfc1e4..75603f1baa2bcd59f3af7249e7fc540a06d63dae 100644 (file)
@@ -134,11 +134,11 @@ func (s *CollectionReaderUnit) TestCollectionReaderContent(c *check.C) {
                        c.Check(err, check.Equals, want)
                case string:
                        buf := make([]byte, len(want))
-                       n, err := io.ReadFull(rdr, buf)
+                       _, err := io.ReadFull(rdr, buf)
                        c.Check(err, check.IsNil)
                        for i := 0; i < 4; i++ {
                                c.Check(string(buf), check.Equals, want)
-                               n, err = rdr.Read(buf)
+                               n, err := rdr.Read(buf)
                                c.Check(n, check.Equals, 0)
                                c.Check(err, check.Equals, io.EOF)
                        }
@@ -173,6 +173,7 @@ func (s *CollectionReaderUnit) TestCollectionReaderManyBlocks(c *check.C) {
        filesize := 0
        for i := range locs {
                _, err := rand.Read(buf[:i])
+               c.Assert(err, check.IsNil)
                h.Write(buf[:i])
                locs[i], _, err = s.kc.PutB(buf[:i])
                c.Assert(err, check.IsNil)
@@ -202,9 +203,9 @@ func (s *CollectionReaderUnit) TestCollectionReaderManyBlocks(c *check.C) {
                offset := rand.Intn(len(buf) - 1)
                count := rand.Intn(len(buf) - offset)
                if rand.Intn(2) == 0 {
-                       curPos, err = rdr.Seek(int64(offset)-curPos, io.SeekCurrent)
+                       curPos, _ = rdr.Seek(int64(offset)-curPos, io.SeekCurrent)
                } else {
-                       curPos, err = rdr.Seek(int64(offset), io.SeekStart)
+                       curPos, _ = rdr.Seek(int64(offset), io.SeekStart)
                }
                c.Check(curPos, check.Equals, int64(offset))
                for count > 0 {
@@ -215,6 +216,7 @@ func (s *CollectionReaderUnit) TestCollectionReaderManyBlocks(c *check.C) {
                        count -= n
                }
                curPos, err = rdr.Seek(0, io.SeekCurrent)
+               c.Check(err, check.IsNil)
                c.Check(curPos, check.Equals, int64(offset))
        }
        c.Check(md5.Sum(buf), check.DeepEquals, md5.Sum(testdata))
index e25faed33c1d8c44341e88ec2938948f64140117..a1801b21456b9a6d8bbb716f4db19eaa78feaa4a 100644 (file)
@@ -80,10 +80,12 @@ func (s *ServerRequiredSuite) TestDefaultReplications(c *C) {
        c.Assert(err, Equals, nil)
 
        kc, err := MakeKeepClient(arv)
+       c.Check(err, IsNil)
        c.Assert(kc.Want_replicas, Equals, 2)
 
        arv.DiscoveryDoc["defaultCollectionReplication"] = 3.0
        kc, err = MakeKeepClient(arv)
+       c.Check(err, IsNil)
        c.Assert(kc.Want_replicas, Equals, 3)
 
        arv.DiscoveryDoc["defaultCollectionReplication"] = 1.0
@@ -367,6 +369,7 @@ func (s *StandaloneSuite) TestPutWithFail(c *C) {
                make(chan string, 1)}
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
 
        kc.Want_replicas = 2
@@ -426,6 +429,7 @@ func (s *StandaloneSuite) TestPutWithTooManyFail(c *C) {
                make(chan string, 4)}
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
 
        kc.Want_replicas = 2
@@ -487,6 +491,7 @@ func (s *StandaloneSuite) TestGet(c *C) {
        defer ks.listener.Close()
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        kc.SetServiceRoots(map[string]string{"x": ks.url}, nil, nil)
@@ -511,6 +516,7 @@ func (s *StandaloneSuite) TestGet404(c *C) {
        defer ks.listener.Close()
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        kc.SetServiceRoots(map[string]string{"x": ks.url}, nil, nil)
@@ -552,6 +558,7 @@ func (s *StandaloneSuite) TestGetFail(c *C) {
        defer ks.listener.Close()
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        kc.SetServiceRoots(map[string]string{"x": ks.url}, nil, nil)
@@ -583,6 +590,7 @@ func (s *StandaloneSuite) TestGetFailRetry(c *C) {
        defer ks.listener.Close()
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        kc.SetServiceRoots(map[string]string{"x": ks.url}, nil, nil)
@@ -609,6 +617,7 @@ func (s *StandaloneSuite) TestGetNetError(c *C) {
        hash := fmt.Sprintf("%x", md5.Sum([]byte("foo")))
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        kc.SetServiceRoots(map[string]string{"x": "http://localhost:62222"}, nil, nil)
@@ -645,6 +654,7 @@ func (s *StandaloneSuite) TestGetWithServiceHint(c *C) {
        defer ks.listener.Close()
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        kc.SetServiceRoots(
@@ -688,6 +698,7 @@ func (s *StandaloneSuite) TestGetWithLocalServiceHint(c *C) {
        defer ks.listener.Close()
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        kc.SetServiceRoots(
@@ -735,6 +746,7 @@ func (s *StandaloneSuite) TestGetWithServiceHintFailoverToLocals(c *C) {
        defer ksGateway.listener.Close()
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        kc.SetServiceRoots(
@@ -772,11 +784,13 @@ func (s *StandaloneSuite) TestChecksum(c *C) {
        defer ks.listener.Close()
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        kc.SetServiceRoots(map[string]string{"x": ks.url}, nil, nil)
 
        r, n, _, err := kc.Get(barhash)
+       c.Check(err, IsNil)
        _, err = ioutil.ReadAll(r)
        c.Check(n, Equals, int64(3))
        c.Check(err, Equals, nil)
@@ -784,6 +798,7 @@ func (s *StandaloneSuite) TestChecksum(c *C) {
        <-st.handled
 
        r, n, _, err = kc.Get(foohash)
+       c.Check(err, IsNil)
        _, err = ioutil.ReadAll(r)
        c.Check(n, Equals, int64(3))
        c.Check(err, Equals, BadChecksum)
@@ -806,6 +821,7 @@ func (s *StandaloneSuite) TestGetWithFailures(c *C) {
                content}
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        localRoots := make(map[string]string)
@@ -852,6 +868,7 @@ func (s *ServerRequiredSuite) TestPutGetHead(c *C) {
        content := []byte("TestPutGetHead")
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, err := MakeKeepClient(arv)
        c.Assert(err, Equals, nil)
 
@@ -912,6 +929,7 @@ func (s *StandaloneSuite) TestPutProxy(c *C) {
        st := StubProxyHandler{make(chan string, 1)}
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
 
        kc.Want_replicas = 2
@@ -940,6 +958,7 @@ func (s *StandaloneSuite) TestPutProxyInsufficientReplicas(c *C) {
        st := StubProxyHandler{make(chan string, 1)}
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
 
        kc.Want_replicas = 3
@@ -1133,6 +1152,7 @@ func (s *StandaloneSuite) TestGetIndexWithPrefix(c *C) {
        defer ks.listener.Close()
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        kc.SetServiceRoots(map[string]string{"x": ks.url}, nil, nil)
@@ -1159,6 +1179,7 @@ func (s *StandaloneSuite) TestGetIndexIncomplete(c *C) {
        defer ks.listener.Close()
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        kc.SetServiceRoots(map[string]string{"x": ks.url}, nil, nil)
@@ -1181,6 +1202,7 @@ func (s *StandaloneSuite) TestGetIndexWithNoSuchServer(c *C) {
        defer ks.listener.Close()
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        kc.SetServiceRoots(map[string]string{"x": ks.url}, nil, nil)
@@ -1201,6 +1223,7 @@ func (s *StandaloneSuite) TestGetIndexWithNoSuchPrefix(c *C) {
        defer ks.listener.Close()
 
        arv, err := arvadosclient.MakeArvadosClient()
+       c.Check(err, IsNil)
        kc, _ := MakeKeepClient(arv)
        arv.ApiToken = "abc123"
        kc.SetServiceRoots(map[string]string{"x": ks.url}, nil, nil)
diff --git a/sdk/pam/.gitignore b/sdk/pam/.gitignore
deleted file mode 120000 (symlink)
index 1399fd4..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../python/.gitignore
\ No newline at end of file
diff --git a/sdk/pam/Dockerfile b/sdk/pam/Dockerfile
deleted file mode 100644 (file)
index ff450d8..0000000
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-# These tests assume you have a real API server running on the docker host.
-#
-# Build the test container:
-#   First, replace 3000 below with your api server's port number if necessary.
-#   host$ python setup.py sdist rotate --keep=1 --match .tar.gz
-#   host$ docker build --tag=arvados/pam_test .
-#
-# Automated integration test:
-#   host$ docker run -it --add-host zzzzz.arvadosapi.com:"$(hostname -I |awk '{print $1}')" arvados/pam_test
-# You should see "=== OK ===", followed by a Perl stack trace due to a
-# yet-unidentified pam_python.so bug.
-#
-# Manual integration test:
-#   host$ docker run -it --add-host zzzzz.arvadosapi.com:"$(hostname -I |awk '{print $1}')" arvados/pam_test bash -c 'rsyslogd & tail -F /var/log/auth.log & sleep 1 & bash'
-#   container# login
-#   login: active
-#   Arvados API token: 3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi
-# You should now be logged in to the "active" shell account in the
-# container. You should also see arvados_pam log entries in
-# /var/log/auth.log (and in your terminal, thanks to "tail -F").
-
-FROM debian:wheezy
-RUN apt-get update
-RUN apt-get -qy dist-upgrade
-RUN apt-get -qy install python python-virtualenv libpam-python rsyslog
-# Packages required by pycurl, ciso8601
-RUN apt-get -qy install libcurl4-gnutls-dev python2.7-dev
-
-# for jessie (which also has other snags)
-# RUN apt-get -qy install python-pip libgnutls28-dev
-
-RUN pip install --upgrade setuptools
-RUN pip install python-pam
-ADD dist /dist
-RUN pip install /dist/arvados-pam-*.tar.gz
-
-# Configure and enable the module (hopefully vendor packages will offer a neater way)
-RUN perl -pi -e 's{api.example}{zzzzz.arvadosapi.com:3000}; s{shell\.example}{testvm2.shell insecure};' /usr/share/pam-configs/arvados
-RUN DEBIAN_FRONTEND=noninteractive pam-auth-update arvados --remove unix
-
-# Add a user account matching the fixture
-RUN useradd -ms /bin/bash active
-
-# Test with python (SIGSEGV during tests)
-#ADD . /pam
-#WORKDIR /pam
-#CMD rsyslogd & tail -F /var/log/auth.log & python setup.py test --test-suite integration_tests
-
-# Test with perl (SIGSEGV when program exits)
-RUN apt-get install -qy libauthen-pam-perl
-ADD tests/integration_test.pl /integration_test.pl
-CMD rsyslogd & tail -F /var/log/auth.log & sleep 1 && /integration_test.pl
diff --git a/sdk/pam/LICENSE-2.0.txt b/sdk/pam/LICENSE-2.0.txt
deleted file mode 100644 (file)
index d645695..0000000
+++ /dev/null
@@ -1,202 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/sdk/pam/MANIFEST.in b/sdk/pam/MANIFEST.in
deleted file mode 100644 (file)
index 48892fa..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-include LICENSE-2.0.txt
-include README.rst
-include examples/shellinabox
-include lib/libpam_arvados.py
-include pam-configs/arvados
-include arvados_version.py
\ No newline at end of file
diff --git a/sdk/pam/README.rst b/sdk/pam/README.rst
deleted file mode 100644 (file)
index 81be331..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-.. Copyright (C) The Arvados Authors. All rights reserved.
-..
-.. SPDX-License-Identifier: Apache-2.0
-
-==================
-Arvados PAM Module
-==================
-
-Overview
---------
-
-Accept Arvados API tokens to authenticate to shell accounts.
-
-.. _Arvados: https://arvados.org
-
-Installation
-------------
-
-See http://doc.arvados.org
-
-Testing and Development
------------------------
-
-https://arvados.org/projects/arvados/wiki/Hacking
-describes how to set up a development environment and run tests.
diff --git a/sdk/pam/arvados_pam/__init__.py b/sdk/pam/arvados_pam/__init__.py
deleted file mode 100644 (file)
index dd78d41..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import sys
-sys.argv=['']
-
-from . import auth_event
-
-def pam_sm_authenticate(pamh, flags, argv):
-    config = {}
-    config['arvados_api_host'] = argv[1]
-    config['virtual_machine_hostname'] = argv[2]
-    if len(argv) > 3:
-        for k in argv[3:]:
-            config[k] = True
-
-    try:
-        username = pamh.get_user(None)
-    except pamh.exception as e:
-        return e.pam_result
-
-    if not username:
-        return pamh.PAM_USER_UNKNOWN
-
-    try:
-        prompt = '' if config.get('noprompt') else 'Arvados API token: '
-        token = pamh.conversation(pamh.Message(pamh.PAM_PROMPT_ECHO_OFF, prompt)).resp
-    except pamh.exception as e:
-        return e.pam_result
-
-    if auth_event.AuthEvent(
-            config=config,
-            service=pamh.service,
-            client_host=pamh.rhost,
-            username=username,
-            token=token).can_login():
-        return pamh.PAM_SUCCESS
-    else:
-        return pamh.PAM_AUTH_ERR
-
-def pam_sm_setcred(pamh, flags, argv):
-    return pamh.PAM_SUCCESS
-
-def pam_sm_acct_mgmt(pamh, flags, argv):
-    return pamh.PAM_SUCCESS
-
-def pam_sm_open_session(pamh, flags, argv):
-    return pamh.PAM_SUCCESS
-
-def pam_sm_close_session(pamh, flags, argv):
-    return pamh.PAM_SUCCESS
-
-def pam_sm_chauthtok(pamh, flags, argv):
-    return pamh.PAM_SUCCESS
diff --git a/sdk/pam/arvados_pam/auth_event.py b/sdk/pam/arvados_pam/auth_event.py
deleted file mode 100644 (file)
index 4f2663c..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados
-import syslog
-
-def auth_log(msg):
-    """Log an authentication result to syslogd"""
-    syslog.openlog(facility=syslog.LOG_AUTH)
-    syslog.syslog('arvados_pam: ' + msg)
-    syslog.closelog()
-
-class AuthEvent(object):
-    def __init__(self, config, service, client_host, username, token):
-        self.config = config
-        self.service = service
-        self.client_host = client_host
-        self.username = username
-        self.token = token
-
-        self.api_host = None
-        self.vm_uuid = None
-        self.user = None
-
-    def can_login(self):
-        """Return truthy IFF credentials should be accepted."""
-        ok = False
-        try:
-            self.api_host = self.config['arvados_api_host']
-            self.arv = arvados.api('v1', host=self.api_host, token=self.token,
-                                   insecure=self.config.get('insecure', False),
-                                   cache=False)
-
-            vmname = self.config['virtual_machine_hostname']
-            vms = self.arv.virtual_machines().list(filters=[['hostname','=',vmname]]).execute()
-            if vms['items_available'] > 1:
-                raise Exception("lookup hostname %s returned %d records" % (vmname, vms['items_available']))
-            if vms['items_available'] == 0:
-                raise Exception("lookup hostname %s not found" % vmname)
-            vm = vms['items'][0]
-            if vm['hostname'] != vmname:
-                raise Exception("lookup hostname %s returned hostname %s" % (vmname, vm['hostname']))
-            self.vm_uuid = vm['uuid']
-
-            self.user = self.arv.users().current().execute()
-
-            filters = [
-                ['link_class','=','permission'],
-                ['name','=','can_login'],
-                ['head_uuid','=',self.vm_uuid],
-                ['tail_uuid','=',self.user['uuid']]]
-            for l in self.arv.links().list(filters=filters, limit=10000).execute()['items']:
-                if (l['properties']['username'] == self.username and
-                    l['tail_uuid'] == self.user['uuid'] and
-                    l['head_uuid'] == self.vm_uuid and
-                    l['link_class'] == 'permission' and
-                    l['name'] == 'can_login'):
-                    return self._report(True)
-
-            return self._report(False)
-
-        except Exception as e:
-            return self._report(e)
-
-    def _report(self, result):
-        """Log the result. Return truthy IFF result is True.
-
-        result must be True, False, or an exception.
-        """
-        self.result = result
-        auth_log(self.message())
-        return result == True
-
-    def message(self):
-        """Return a log message describing the event and its outcome."""
-        if isinstance(self.result, Exception):
-            outcome = 'Error: ' + repr(self.result)
-        elif self.result == True:
-            outcome = 'Allow'
-        else:
-            outcome = 'Deny'
-
-        if len(self.token) > 40:
-            log_token = self.token[0:15]
-        else:
-            log_token = '<invalid>'
-
-        log_label = [self.service, self.api_host, self.vm_uuid, self.client_host, self.username, log_token]
-        if self.user:
-            log_label += [self.user.get('uuid'), self.user.get('full_name')]
-        return str(log_label) + ': ' + outcome
diff --git a/sdk/pam/arvados_version.py b/sdk/pam/arvados_version.py
deleted file mode 100644 (file)
index 9aabff4..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import subprocess
-import time
-import os
-import re
-
-def git_version_at_commit():
-    curdir = os.path.dirname(os.path.abspath(__file__))
-    myhash = subprocess.check_output(['git', 'log', '-n1', '--first-parent',
-                                       '--format=%H', curdir]).strip()
-    myversion = subprocess.check_output([curdir+'/../../build/version-at-commit.sh', myhash]).strip().decode()
-    return myversion
-
-def save_version(setup_dir, module, v):
-  with open(os.path.join(setup_dir, module, "_version.py"), 'wt') as fp:
-      return fp.write("__version__ = '%s'\n" % v)
-
-def read_version(setup_dir, module):
-  with open(os.path.join(setup_dir, module, "_version.py"), 'rt') as fp:
-      return re.match("__version__ = '(.*)'$", fp.read()).groups()[0]
-
-def get_version(setup_dir, module):
-    env_version = os.environ.get("ARVADOS_BUILDING_VERSION")
-
-    if env_version:
-        save_version(setup_dir, module, env_version)
-    else:
-        try:
-            save_version(setup_dir, module, git_version_at_commit())
-        except (subprocess.CalledProcessError, OSError):
-            pass
-
-    return read_version(setup_dir, module)
diff --git a/sdk/pam/examples/shellinabox b/sdk/pam/examples/shellinabox
deleted file mode 100644 (file)
index 2d91ccb..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-# This example is a stock debian "login" file with libpam_arvados
-# replacing pam_unix, and the "noprompt" option in use. It can be
-# installed as /etc/pam.d/shellinabox .
-
-auth       optional   pam_faildelay.so  delay=3000000
-auth [success=ok new_authtok_reqd=ok ignore=ignore user_unknown=bad default=die] pam_securetty.so
-auth       requisite  pam_nologin.so
-session [success=ok ignore=ignore module_unknown=ignore default=bad] pam_selinux.so close
-session       required   pam_env.so readenv=1
-session       required   pam_env.so readenv=1 envfile=/etc/default/locale
-
-auth [success=1 default=ignore] pam_python.so /usr/local/lib/security/libpam_arvados.py api.example shell.example noprompt
-auth   requisite                       pam_deny.so
-auth   required                        pam_permit.so
-
-auth       optional   pam_group.so
-session    required   pam_limits.so
-session    optional   pam_lastlog.so
-session    optional   pam_motd.so  motd=/run/motd.dynamic
-session    optional   pam_motd.so
-session    optional   pam_mail.so standard
-
-@include common-account
-@include common-session
-@include common-password
-
-session [success=ok ignore=ignore module_unknown=ignore default=bad] pam_selinux.so open
diff --git a/sdk/pam/fpm-info.sh b/sdk/pam/fpm-info.sh
deleted file mode 100644 (file)
index 6c323f5..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-case "$TARGET" in
-    debian* | ubuntu*)
-        fpm_depends+=('libpam-python' 'libcurl3-gnutls')
-        ;;
-    centos*)
-        fpm_depends+=('python-pam')
-        ;;
-    *)
-        echo >&2 "ERROR: $PACKAGE: pam_python.so dependency unavailable in $TARGET."
-        return 1
-        ;;
-esac
-
-case "$FORMAT" in
-    deb)
-        fpm_args+=('--deb-recommends=system-log-daemon')
-        ;;
-esac
diff --git a/sdk/pam/gittaggers.py b/sdk/pam/gittaggers.py
deleted file mode 120000 (symlink)
index d59c02c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../python/gittaggers.py
\ No newline at end of file
diff --git a/sdk/pam/integration_tests/__init__.py b/sdk/pam/integration_tests/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/sdk/pam/integration_tests/test_pam.py b/sdk/pam/integration_tests/test_pam.py
deleted file mode 100644 (file)
index 32ae38d..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-"""These tests assume we are running (in a docker container) with
-arvados_pam configured and a test API server running.
-"""
-import pam
-import unittest
-
-# From services/api/test/fixtures/api_client_authorizations.yml
-# because that file is not available during integration tests:
-ACTIVE_TOKEN = '3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi'
-SPECTATOR_TOKEN = 'zw2f4gwx8hw8cjre7yp6v1zylhrhn3m5gvjq73rtpwhmknrybu'
-
-class IntegrationTest(unittest.TestCase):
-    def setUp(self):
-        self.p = pam.pam()
-
-    def test_allow(self):
-        self.assertTrue(self.p.authenticate('active', ACTIVE_TOKEN, service='login'))
-
-    def test_deny_bad_token(self):
-        self.assertFalse(self.p.authenticate('active', 'thisisaverybadtoken', service='login'))
-
-    def test_deny_empty_token(self):
-        self.assertFalse(self.p.authenticate('active', '', service='login'))
-
-    def test_deny_permission(self):
-        self.assertFalse(self.p.authenticate('spectator', SPECTATOR_TOKEN, service='login'))
diff --git a/sdk/pam/lib/libpam_arvados.py b/sdk/pam/lib/libpam_arvados.py
deleted file mode 100644 (file)
index 7c3406d..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import sys
-sys.path.append('/usr/share/python2.7/dist/libpam-arvados/lib/python2.7/site-packages')
-from arvados_pam import *
diff --git a/sdk/pam/pam-configs/arvados b/sdk/pam/pam-configs/arvados
deleted file mode 100644 (file)
index 086e176..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-# 1. Change "api.example" to your ARVADOS_API_HOST
-# 2. Change "shell.example" to this host's hostname
-#    (as it appears in the Arvados virtual_machines list)
-# 3. Install in /usr/share/pam-configs/arvados
-# 4. Run `pam-auth-update arvados`
-
-Name: Arvados authentication
-Default: yes
-Priority: 256
-Auth-Type: Primary
-Auth:
-       [success=end default=ignore]    pam_python.so /usr/local/lib/security/libpam_arvados.py api.example shell.example
-Auth-Initial:
-       [success=end default=ignore]    pam_python.so /usr/local/lib/security/libpam_arvados.py api.example shell.example
diff --git a/sdk/pam/setup.py b/sdk/pam/setup.py
deleted file mode 100755 (executable)
index 59b49a1..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from __future__ import absolute_import
-import glob
-import os
-import sys
-import re
-import subprocess
-
-from setuptools import setup, find_packages
-
-SETUP_DIR = os.path.dirname(__file__) or '.'
-README = os.path.join(SETUP_DIR, 'README.rst')
-
-import arvados_version
-version = arvados_version.get_version(SETUP_DIR, "arvados_pam")
-if os.environ.get('ARVADOS_BUILDING_VERSION', False):
-    pysdk_dep = "=={}".format(version)
-else:
-    # On dev releases, arvados-python-client may have a different timestamp
-    pysdk_dep = "<={}".format(version)
-
-short_tests_only = False
-if '--short-tests-only' in sys.argv:
-    short_tests_only = True
-    sys.argv.remove('--short-tests-only')
-
-setup(name='arvados-pam',
-      version=version,
-      description='Arvados PAM module',
-      long_description=open(README).read(),
-      author='Arvados',
-      author_email='info@arvados.org',
-      url='https://arvados.org',
-      download_url='https://github.com/arvados/arvados.git',
-      license='Apache 2.0',
-      packages=[
-          'arvados_pam',
-      ],
-      scripts=[
-      ],
-      data_files=[
-          ('lib/security', ['lib/libpam_arvados.py']),
-          ('share/pam-configs', ['pam-configs/arvados']),
-          ('share/doc/arvados-pam', ['LICENSE-2.0.txt', 'README.rst']),
-          ('share/doc/arvados-pam/examples', glob.glob('examples/*')),
-      ],
-      install_requires=[
-          'arvados-python-client{}'.format(pysdk_dep),
-      ],
-      test_suite='tests',
-      tests_require=['pbr<1.7.0', 'mock>=1.0', 'python-pam'],
-      zip_safe=False,
-)
diff --git a/sdk/pam/tests/__init__.py b/sdk/pam/tests/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/sdk/pam/tests/integration_test.pl b/sdk/pam/tests/integration_test.pl
deleted file mode 100755 (executable)
index cbe9b0a..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env perl
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-$ENV{ARVADOS_API_HOST_INSECURE} = 1;
-use Authen::PAM qw(:constants);
-
-for my $case (['good', 1, 'active', '3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi'],
-              ['badtoken', 0, 'active', 'badtokenmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi'],
-              ['badusername', 0, 'baduser', '3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi']) {
-    dotest(@$case);
-}
-print "=== OK ===\n";
-
-sub dotest {
-    my ($label, $expect_ok, $user, $token) = @_;
-    print "$label: ";
-    my $service_name = 'login';
-    $main::Token = $token;
-    my $pamh = new Authen::PAM($service_name, $user, \&token_conv_func);
-    ref($pamh) || die "Error code $pamh during PAM init!";
-    $pamh->pam_set_item(PAM_RHOST(), '::1');
-    $pamh->pam_set_item(PAM_RUSER(), 'none');
-    $pamh->pam_set_item(PAM_TTY(), '/dev/null');
-    my $flags = PAM_SILENT();
-    $res = $pamh->pam_authenticate($flags);
-    $msg = $pamh->pam_strerror($res);
-    print "Result (code $res): $msg\n";
-    if (($res == 0) != ($expect_ok == 1)) {
-        die "*** FAIL ***\n";
-    }
-}
-
-sub token_conv_func {
-    my @res;
-    while ( @_ ) {
-        my $code = shift;
-        my $msg = shift;
-        my $ans;
-        print "Message (type $code): $msg\n";
-        if ($code == PAM_PROMPT_ECHO_OFF() || $code == PAM_PROMPT_ECHO_ON()) {
-            $ans = $main::Token;
-        }
-        push @res, (0,$ans);
-    }
-    push @res, PAM_SUCCESS();
-    return @res;
-}
diff --git a/sdk/pam/tests/mocker.py b/sdk/pam/tests/mocker.py
deleted file mode 100644 (file)
index ec6f064..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import mock
-import unittest
-
-class Mocker(unittest.TestCase):
-    ACTIVE_TOKEN = '3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi'
-
-    default_config = {
-        'arvados_api_host': 'zzzzz.api_host.example',
-        'virtual_machine_hostname': 'testvm2.shell',
-    }
-    default_request = {
-        'client_host': '::1',
-        'token': ACTIVE_TOKEN,
-        'username': 'active',
-    }
-    default_response = {
-        'links': {
-            'items': [{
-                'uuid': 'zzzzz-o0j2j-rah2ya1ohx9xaev',
-                'tail_uuid': 'zzzzz-tpzed-xurymjxw79nv3jz',
-                'head_uuid': 'zzzzz-2x53u-382brsig8rp3065',
-                'link_class': 'permission',
-                'name': 'can_login',
-                'properties': {
-                    'username': 'active',
-                },
-            }],
-        },
-        'users': {
-            'uuid': 'zzzzz-tpzed-xurymjxw79nv3jz',
-            'full_name': 'Active User',
-        },
-        'virtual_machines': {
-            'items': [{
-                'uuid': 'zzzzz-2x53u-382brsig8rp3065',
-                'hostname': 'testvm2.shell',
-            }],
-            'items_available': 1,
-        },
-    }
-
-    def setUp(self):
-        self.config = self.default_config.copy()
-        self.request = self.default_request.copy()
-        self.response = self.default_response.copy()
-        self.api_client = mock.MagicMock(name='api_client')
-        self.api_client.users().current().execute.side_effect = lambda: self.response['users']
-        self.api_client.virtual_machines().list().execute.side_effect = lambda: self.response['virtual_machines']
-        self.api_client.links().list().execute.side_effect = lambda: self.response['links']
-        patcher = mock.patch('arvados.api')
-        self.api = patcher.start()
-        self.addCleanup(patcher.stop)
-        self.api.side_effect = [self.api_client]
-
-        self.syslogged = []
-        patcher = mock.patch('syslog.syslog')
-        self.syslog = patcher.start()
-        self.addCleanup(patcher.stop)
-        self.syslog.side_effect = lambda s: self.syslogged.append(s)
diff --git a/sdk/pam/tests/test_auth_event.py b/sdk/pam/tests/test_auth_event.py
deleted file mode 100644 (file)
index f907b31..0000000
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados_pam
-import re
-from . import mocker
-
-class AuthEventTest(mocker.Mocker):
-    def attempt(self):
-        return arvados_pam.auth_event.AuthEvent(config=self.config, service='test_service', **self.request).can_login()
-
-    def test_success(self):
-        self.assertTrue(self.attempt())
-
-        self.api_client.virtual_machines().list.assert_called_with(
-            filters=[['hostname','=',self.config['virtual_machine_hostname']]])
-        self.api.assert_called_with(
-            'v1',
-            host=self.config['arvados_api_host'], token=self.request['token'],
-            insecure=False,
-            cache=False)
-        self.assertEqual(1, len(self.syslogged))
-        for i in ['test_service',
-                  self.request['username'],
-                  self.config['arvados_api_host'],
-                  self.response['virtual_machines']['items'][0]['uuid']]:
-            self.assertRegexpMatches(self.syslogged[0], re.escape(i))
-        self.assertRegexpMatches(self.syslogged[0], re.escape(self.request['token'][0:15]), 'token prefix not logged')
-        self.assertNotRegexpMatches(self.syslogged[0], re.escape(self.request['token'][15:30]), 'too much token logged')
-
-    def test_fail_vm_lookup(self):
-        self.api_client.virtual_machines().list().execute.side_effect = Exception("Test-induced failure")
-        self.assertFalse(self.attempt())
-        self.assertRegexpMatches(self.syslogged[0], 'Test-induced failure')
-
-    def test_vm_hostname_not_found(self):
-        self.response['virtual_machines'] = {
-            'items': [],
-            'items_available': 0,
-        }
-        self.assertFalse(self.attempt())
-
-    def test_vm_hostname_ambiguous(self):
-        self.response['virtual_machines'] = {
-            'items': [
-                {
-                    'uuid': 'zzzzz-2x53u-382brsig8rp3065',
-                    'hostname': 'testvm2.shell',
-                },
-                {
-                    'uuid': 'zzzzz-2x53u-382brsig8rp3065',
-                    'hostname': 'testvm2.shell',
-                },
-            ],
-            'items_available': 2,
-        }
-        self.assertFalse(self.attempt())
-
-    def test_server_ignores_vm_filters(self):
-        self.response['virtual_machines'] = {
-            'items': [
-                {
-                    'uuid': 'zzzzz-2x53u-382brsig8rp3065',
-                    'hostname': 'testvm22.shell', # <-----
-                },
-            ],
-            'items_available': 1,
-        }
-        self.assertFalse(self.attempt())
-
-    def test_fail_user_lookup(self):
-        self.api_client.users().current().execute.side_effect = Exception("Test-induced failure")
-        self.assertFalse(self.attempt())
-
-    def test_fail_permission_check(self):
-        self.api_client.links().list().execute.side_effect = Exception("Test-induced failure")
-        self.assertFalse(self.attempt())
-
-    def test_no_login_permission(self):
-        self.response['links'] = {
-            'items': [],
-        }
-        self.assertFalse(self.attempt())
-
-    def test_server_ignores_permission_filters(self):
-        self.response['links'] = {
-            'items': [{
-                'uuid': 'zzzzz-o0j2j-rah2ya1ohx9xaev',
-                'tail_uuid': 'zzzzz-tpzed-xurymjxw79nv3jz',
-                'head_uuid': 'zzzzz-2x53u-382brsig8rp3065',
-                'link_class': 'permission',
-                'name': 'CANT_login', # <-----
-                'properties': {
-                    'username': 'active',
-                },
-            }],
-        }
-        self.assertFalse(self.attempt())
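
The deleted tests above describe the permission check the removed PAM module performed against the API server: authenticate with the supplied token, look up the virtual machine by hostname, and require a 'can_login' permission link naming the user. For reference, a minimal Python sketch of that flow reconstructed from the mocks and assertions above; the exact filters on the links query are an assumption, this is not the removed arvados_pam implementation itself.

    import arvados

    def can_login(api_host, token, vm_hostname, username):
        # Sketch reconstructed from the deleted tests, not the removed module.
        api = arvados.api('v1', host=api_host, token=token,
                          insecure=False, cache=False)
        user = api.users().current().execute()
        vms = api.virtual_machines().list(
            filters=[['hostname', '=', vm_hostname]]).execute()
        if len(vms['items']) != 1:
            # Hostname not found, or ambiguous.
            return False
        vm_uuid = vms['items'][0]['uuid']
        # Assumed filters: the tests only show that links().list() is consulted
        # and that a 'can_login' permission link whose properties include the
        # username is required.
        links = api.links().list(filters=[
            ['link_class', '=', 'permission'],
            ['name', '=', 'can_login'],
            ['head_uuid', '=', vm_uuid],
            ['tail_uuid', '=', user['uuid']]]).execute()
        return any(l['properties'].get('username') == username
                   for l in links['items'])
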
diff --git a/sdk/pam/tests/test_pam_sm.py b/sdk/pam/tests/test_pam_sm.py
deleted file mode 100644 (file)
index 53597c0..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import arvados_pam
-import mock
-from . import mocker
-
-class PamSMTest(mocker.Mocker):
-    def attempt(self):
-        return arvados_pam.pam_sm_authenticate(self.pamh, 0, self.argv)
-
-    def test_success(self):
-        self.assertEqual(self.pamh.PAM_SUCCESS, self.attempt())
-
-    def test_bad_user(self):
-        self.pamh.get_user = mock.MagicMock(return_value='badusername')
-        self.assertEqual(self.pamh.PAM_AUTH_ERR, self.attempt())
-
-    def test_bad_vm(self):
-        self.argv[2] = 'testvm22.shell'
-        self.assertEqual(self.pamh.PAM_AUTH_ERR, self.attempt())
-
-    def setUp(self):
-        super(PamSMTest, self).setUp()
-        self.pamh = mock.MagicMock()
-        self.pamh.get_user = mock.MagicMock(return_value='active')
-        self.pamh.PAM_SUCCESS = 12345
-        self.pamh.PAM_AUTH_ERR = 54321
-        self.argv = [__file__, 'zzzzz.arvadosapi.com', 'testvm2.shell']
index 445775ccedcd1f4ef246297c22a17e470e0f0e94..5c1bb29e764c549d1612bb13f2890a076866fc74 100755 (executable)
@@ -22,6 +22,7 @@ import hmac
 import urllib.parse
 import os
 import hashlib
+import re
 from arvados._version import __version__
 
 EMAIL=0
@@ -169,19 +170,20 @@ def read_migrations(args, by_email, by_username):
 
 def update_username(args, email, user_uuid, username, migratecluster, migratearv):
     print("(%s) Updating username of %s to '%s' on %s" % (email, user_uuid, username, migratecluster))
-    if not args.dry_run:
-        try:
-            conflicts = migratearv.users().list(filters=[["username", "=", username]], bypass_federation=True).execute()
-            if conflicts["items"]:
-                # There's already a user with the username, move the old user out of the way
-                migratearv.users().update(uuid=conflicts["items"][0]["uuid"],
-                                          bypass_federation=True,
-                                          body={"user": {"username": username+"migrate"}}).execute()
-            migratearv.users().update(uuid=user_uuid,
-                                      bypass_federation=True,
-                                      body={"user": {"username": username}}).execute()
-        except arvados.errors.ApiError as e:
-            print("(%s) Error updating username of %s to '%s' on %s: %s" % (email, user_uuid, username, migratecluster, e))
+    if args.dry_run:
+        return
+    try:
+        conflicts = migratearv.users().list(filters=[["username", "=", username]], bypass_federation=True).execute()
+        if conflicts["items"]:
+            # There's already a user with that username; move the old user out of the way
+            migratearv.users().update(uuid=conflicts["items"][0]["uuid"],
+                                      bypass_federation=True,
+                                      body={"user": {"username": username+"migrate"}}).execute()
+        migratearv.users().update(uuid=user_uuid,
+                                  bypass_federation=True,
+                                  body={"user": {"username": username}}).execute()
+    except arvados.errors.ApiError as e:
+        print("(%s) Error updating username of %s to '%s' on %s: %s" % (email, user_uuid, username, migratecluster, e))
 
 
 def choose_new_user(args, by_email, email, userhome, username, old_user_uuid, clusters):
@@ -212,11 +214,17 @@ def choose_new_user(args, by_email, email, userhome, username, old_user_uuid, cl
                 conflicts = homearv.users().list(filters=[["username", "=", username]],
                                                  bypass_federation=True).execute()
                 if conflicts["items"]:
-                    homearv.users().update(uuid=conflicts["items"][0]["uuid"],
-                                           bypass_federation=True,
-                                           body={"user": {"username": username+"migrate"}}).execute()
-                user = homearv.users().create(body={"user": {"email": email, "username": username,
-                                                             "is_active": olduser["is_active"]}}).execute()
+                    homearv.users().update(
+                        uuid=conflicts["items"][0]["uuid"],
+                        bypass_federation=True,
+                        body={"user": {"username": username+"migrate"}}).execute()
+                user = homearv.users().create(
+                    body={"user": {
+                        "email": email,
+                        "first_name": olduser["first_name"],
+                        "last_name": olduser["last_name"],
+                        "username": username,
+                        "is_active": olduser["is_active"]}}).execute()
             except arvados.errors.ApiError as e:
                 print("(%s) Could not create user: %s" % (email, str(e)))
                 return None
@@ -271,7 +279,7 @@ def activate_remote_user(args, email, homearv, migratearv, old_user_uuid, new_us
             newuser = arvados.api(host=ru.netloc, token=salted,
                                   insecure=os.environ.get("ARVADOS_API_HOST_INSECURE")).users().current().execute()
         else:
-            newuser = {"is_active": True, "username": username}
+            newuser = {"is_active": True, "username": email.split('@')[0], "is_admin": False}
     except arvados.errors.ApiError as e:
         print("(%s) Error getting user info for %s from %s: %s" % (email, new_user_uuid, migratecluster, e))
         return None
@@ -287,39 +295,48 @@ def activate_remote_user(args, email, homearv, migratearv, old_user_uuid, new_us
             return None
 
     if olduser["is_admin"] and not newuser["is_admin"]:
-        print("(%s) Not migrating %s because user is admin but target user %s is not admin on %s" % (email, old_user_uuid, new_user_uuid, migratecluster))
+        print("(%s) Not migrating %s because user is admin but target user %s is not admin on %s. Please ensure the user admin status is the same on both clusters. Note that a federated admin account has admin privileges on the entire federation." % (email, old_user_uuid, new_user_uuid, migratecluster))
         return None
 
     return newuser
 
 def migrate_user(args, migratearv, email, new_user_uuid, old_user_uuid):
+    if args.dry_run:
+        return
     try:
-        if not args.dry_run:
+        new_owner_uuid = new_user_uuid
+        if args.data_into_subproject:
             grp = migratearv.groups().create(body={
                 "owner_uuid": new_user_uuid,
                 "name": "Migrated from %s (%s)" % (email, old_user_uuid),
                 "group_class": "project"
             }, ensure_unique_name=True).execute()
-            migratearv.users().merge(old_user_uuid=old_user_uuid,
-                                     new_user_uuid=new_user_uuid,
-                                     new_owner_uuid=grp["uuid"],
-                                     redirect_to_new_user=True).execute()
+            new_owner_uuid = grp["uuid"]
+        migratearv.users().merge(old_user_uuid=old_user_uuid,
+                                 new_user_uuid=new_user_uuid,
+                                 new_owner_uuid=new_owner_uuid,
+                                 redirect_to_new_user=True).execute()
     except arvados.errors.ApiError as e:
-        print("(%s) Error migrating user: %s" % (email, e))
+        name_collision = re.search(r'Key \(owner_uuid, name\)=\((.*?), (.*?)\) already exists\.\n.*UPDATE "(.*?)"', e._get_reason())
+        if name_collision:
+            target_owner, rsc_name, rsc_type = name_collision.groups()
+            print("(%s) Cannot migrate to %s because both origin and target users have a %s named '%s'. Please rename the conflicting items or use --data-into-subproject to migrate all users' data into a special subproject." % (email, target_owner, rsc_type[:-1], rsc_name))
+        else:
+            print("(%s) Skipping user migration because of error: %s" % (email, e))
 
 
 def main():
-
     parser = argparse.ArgumentParser(description='Migrate users to federated identity, see https://doc.arvados.org/admin/merge-remote-account.html')
     parser.add_argument(
         '--version', action='version', version="%s %s" % (sys.argv[0], __version__),
         help='Print version and exit.')
-    parser.add_argument('--tokens', type=str, required=False)
+    parser.add_argument('--tokens', type=str, metavar='FILE', required=False, help="Read tokens from FILE. Not needed when using LoginCluster.")
+    parser.add_argument('--data-into-subproject', action="store_true", help="Migrate each user's data into a separate subproject. This can be used to avoid name collisions when merging data into an existing account.")
     group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument('--report', type=str, help="Generate report .csv file listing users by email address and their associated Arvados accounts")
-    group.add_argument('--migrate', type=str, help="Consume report .csv and migrate users to designated Arvados accounts")
-    group.add_argument('--dry-run', type=str, help="Consume report .csv and report how user would be migrated to designated Arvados accounts")
-    group.add_argument('--check', action="store_true", help="Check that tokens are usable and the federation is well connected")
+    group.add_argument('--report', type=str, metavar='FILE', help="Generate a report .csv file listing users by email address and their associated Arvados accounts.")
+    group.add_argument('--migrate', type=str, metavar='FILE', help="Consume the report .csv and migrate users to designated Arvados accounts.")
+    group.add_argument('--dry-run', type=str, metavar='FILE', help="Consume the report .csv and report how users would be migrated to designated Arvados accounts.")
+    group.add_argument('--check', action="store_true", help="Check that tokens are usable and the federation is well connected.")
     args = parser.parse_args()
 
     clusters, errors, loginCluster = connect_clusters(args)
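
The new error handling in migrate_user() above recognizes name collisions by matching the PostgreSQL unique-constraint text carried in the API error reason (obtained via e._get_reason()). A rough illustration of what that regular expression extracts; the sample reason string below is invented for illustration, shaped like such a message.

    import re

    # Hypothetical reason text, shaped like a unique-violation error relayed
    # by the API when an (owner_uuid, name) collision occurs during the merge.
    reason = ('Key (owner_uuid, name)=(zzzzz-tpzed-xurymjxw79nv3jz, my data) '
              'already exists.\n'
              'Failed SQL: UPDATE "collections" SET owner_uuid = ...')

    m = re.search(r'Key \(owner_uuid, name\)=\((.*?), (.*?)\) already exists\.\n'
                  r'.*UPDATE "(.*?)"', reason)
    if m:
        target_owner, rsc_name, rsc_type = m.groups()
        # rsc_type[:-1] turns the table name into a singular resource name,
        # e.g. "collections" -> "collection".
        print(target_owner, rsc_name, rsc_type[:-1])
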
index a2c0096165c74b7bc1fda0daf212177cb4d08ac2..e31ac05418a1154a14f87fc7ed4298e283564e3d 100644 (file)
@@ -1,4 +1,9 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
 import arvados
+import arvados.errors
 import json
 import sys
 
@@ -21,7 +26,7 @@ def check_A(users):
     for i in range(1, 10):
         found = False
         for u in users["items"]:
-            if u["username"] == ("case%d" % i) and u["email"] == ("case%d@test" % i):
+            if u["username"] == ("case%d" % i) and u["email"] == ("case%d@test" % i) and u["first_name"] == ("Case%d" % i) and u["last_name"] == "Testuser":
                 found = True
                 by_username[u["username"]] = u["uuid"]
         assert found
@@ -60,6 +65,7 @@ for i in range(2, 9):
     found = False
     for u in users["items"]:
         if (u["username"] == ("case%d" % i) and u["email"] == ("case%d@test" % i) and
+            u["first_name"] == ("Case%d" % i) and u["last_name"] == "Testuser" and
             u["uuid"] == by_username[u["username"]] and u["is_active"] is True):
             found = True
     assert found, "Not found case%i" % i
@@ -67,6 +73,7 @@ for i in range(2, 9):
 found = False
 for u in users["items"]:
     if (u["username"] == "case9" and u["email"] == "case9@test" and
+        u["first_name"] == "Case9" and u["last_name"] == "Testuser" and
         u["uuid"] == by_username[u["username"]] and u["is_active"] is False):
         found = True
 assert found
@@ -87,6 +94,7 @@ for i in (2, 4, 6, 7, 8):
     found = False
     for u in users["items"]:
         if (u["username"] == ("case%d" % i) and u["email"] == ("case%d@test" % i) and
+            u["first_name"] == ("Case%d" % i) and u["last_name"] == "Testuser" and
             u["uuid"] == by_username[u["username"]] and u["is_active"] is True):
             found = True
     assert found
@@ -97,6 +105,7 @@ for i in (3, 5, 9):
     found = False
     for u in users["items"]:
         if (u["username"] == ("case%d" % i) and u["email"] == ("case%d@test" % i) and
+            u["first_name"] == ("Case%d" % i) and u["last_name"] == "Testuser" and
             u["uuid"] == by_username[u["username"]] and u["is_active"] is True):
             found = True
     assert not found
@@ -105,4 +114,64 @@ for i in (3, 5, 9):
 users = apiC.users().list().execute()
 check_A(users)
 
+
+####
+# bug 16683 tests
+
+# Check that this query returns an empty result, instead of a 500 or
+# 502 error.
+# Yes, we're asking for a group from the users endpoint.  This is not a
+# mistake; it is something Workbench does to populate the sharing
+# dialog.
+clusterID_B = apiB.configs().get().execute()["ClusterID"]
+i = apiB.users().list(filters=[["uuid", "in", ["%s-j7d0g-fffffffffffffff" % clusterID_B]]], count="none").execute()
+assert len(i["items"]) == 0
+
+# Check that we can create a project and give a remote user access to it
+
+tok3 = apiA.api_client_authorizations().create(body={"api_client_authorization": {"owner_uuid": by_username["case3"]}}).execute()
+tok4 = apiA.api_client_authorizations().create(body={"api_client_authorization": {"owner_uuid": by_username["case4"]}}).execute()
+
+v2_token3 = "v2/%s/%s" % (tok3["uuid"], tok3["api_token"])
+v2_token4 = "v2/%s/%s" % (tok4["uuid"], tok4["api_token"])
+
+apiB_3 = arvados.api(host=j["arvados_api_hosts"][1], token=v2_token3, insecure=True)
+apiB_4 = arvados.api(host=j["arvados_api_hosts"][1], token=v2_token4, insecure=True)
+
+assert apiB_3.users().current().execute()["uuid"] == by_username["case3"]
+assert apiB_4.users().current().execute()["uuid"] == by_username["case4"]
+
+newproject = apiB_3.groups().create(body={"group_class": "project",
+                                           "name":"fed test project"},
+                                    ensure_unique_name=True).execute()
+
+try:
+    # Expect to fail
+    apiB_4.groups().get(uuid=newproject["uuid"]).execute()
+except arvados.errors.ApiError as e:
+    if e.resp['status'] == '404':
+        pass
+    else:
+        raise
+
+l = apiB_3.links().create(body={"link_class": "permission",
+                            "name":"can_read",
+                            "tail_uuid": by_username["case4"],
+                            "head_uuid": newproject["uuid"]}).execute()
+
+# Expect to succeed
+apiB_4.groups().get(uuid=newproject["uuid"]).execute()
+
+# remove permission
+apiB_3.links().delete(uuid=l["uuid"]).execute()
+
+try:
+    # Expect to fail again
+    apiB_4.groups().get(uuid=newproject["uuid"]).execute()
+except arvados.errors.ApiError as e:
+    if e.resp['status'] == '404':
+        pass
+    else:
+        raise
+
 print("Passed checks")
index cea624ec4c4e2290635e3949c97135b2c4c992c2..0b5732293d0982fb6f158366c0c2aa894f1674ab 100644 (file)
@@ -1,3 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
 import arvados
 import json
 import sys
@@ -11,13 +15,21 @@ apiC = arvados.api(host=j["arvados_api_hosts"][2], token=j["superuser_tokens"][2
 def maketoken(newtok):
     return 'v2/' + newtok["uuid"] + '/' + newtok["api_token"]
 
+def get_user_data(case_nr, is_active=True):
+    return {
+        "email": "case{}@test".format(case_nr),
+        "first_name": "Case{}".format(case_nr),
+        "last_name": "Testuser",
+        "is_active": is_active
+    }
+
 # case 1
 # user only exists on cluster A
-apiA.users().create(body={"user": {"email": "case1@test", "is_active": True}}).execute()
+apiA.users().create(body={"user": get_user_data(case_nr=1)}).execute()
 
 # case 2
 # user exists on cluster A and has remotes on B and C
-case2 = apiA.users().create(body={"user": {"email": "case2@test", "is_active": True}}).execute()
+case2 = apiA.users().create(body={"user": get_user_data(case_nr=2)}).execute()
 newtok = apiA.api_client_authorizations().create(body={
     "api_client_authorization": {'owner_uuid': case2["uuid"]}}).execute()
 arvados.api(host=j["arvados_api_hosts"][1], token=maketoken(newtok), insecure=True).users().current().execute()
@@ -25,11 +37,11 @@ arvados.api(host=j["arvados_api_hosts"][2], token=maketoken(newtok), insecure=Tr
 
 # case 3
 # user only exists on cluster B
-case3 = apiB.users().create(body={"user": {"email": "case3@test", "is_active": True}}).execute()
+case3 = apiB.users().create(body={"user": get_user_data(case_nr=3)}).execute()
 
 # case 4
 # user only exists on cluster B and has remotes on A and C
-case4 = apiB.users().create(body={"user": {"email": "case4@test", "is_active": True}}).execute()
+case4 = apiB.users().create(body={"user": get_user_data(case_nr=4)}).execute()
 newtok = apiB.api_client_authorizations().create(body={
     "api_client_authorization": {'owner_uuid': case4["uuid"]}}).execute()
 arvados.api(host=j["arvados_api_hosts"][0], token=maketoken(newtok), insecure=True).users().current().execute()
@@ -38,18 +50,18 @@ arvados.api(host=j["arvados_api_hosts"][2], token=maketoken(newtok), insecure=Tr
 
 # case 5
 # user exists on both cluster A and B
-case5 = apiA.users().create(body={"user": {"email": "case5@test", "is_active": True}}).execute()
-case5 = apiB.users().create(body={"user": {"email": "case5@test", "is_active": True}}).execute()
+case5 = apiA.users().create(body={"user": get_user_data(case_nr=5)}).execute()
+case5 = apiB.users().create(body={"user": get_user_data(case_nr=5)}).execute()
 
 # case 6
 # user exists on both cluster A and B, with remotes on A, B and C
-case6_A = apiA.users().create(body={"user": {"email": "case6@test", "is_active": True}}).execute()
+case6_A = apiA.users().create(body={"user": get_user_data(case_nr=6)}).execute()
 newtokA = apiA.api_client_authorizations().create(body={
     "api_client_authorization": {'owner_uuid': case6_A["uuid"]}}).execute()
 arvados.api(host=j["arvados_api_hosts"][1], token=maketoken(newtokA), insecure=True).users().current().execute()
 arvados.api(host=j["arvados_api_hosts"][2], token=maketoken(newtokA), insecure=True).users().current().execute()
 
-case6_B = apiB.users().create(body={"user": {"email": "case6@test", "is_active": True}}).execute()
+case6_B = apiB.users().create(body={"user": get_user_data(case_nr=6)}).execute()
 newtokB = apiB.api_client_authorizations().create(body={
     "api_client_authorization": {'owner_uuid': case6_B["uuid"]}}).execute()
 arvados.api(host=j["arvados_api_hosts"][0], token=maketoken(newtokB), insecure=True).users().current().execute()
@@ -57,13 +69,13 @@ arvados.api(host=j["arvados_api_hosts"][2], token=maketoken(newtokB), insecure=T
 
 # case 7
 # user exists on both cluster B and A, with remotes on A, B and C
-case7_B = apiB.users().create(body={"user": {"email": "case7@test", "is_active": True}}).execute()
+case7_B = apiB.users().create(body={"user": get_user_data(case_nr=7)}).execute()
 newtokB = apiB.api_client_authorizations().create(body={
     "api_client_authorization": {'owner_uuid': case7_B["uuid"]}}).execute()
 arvados.api(host=j["arvados_api_hosts"][0], token=maketoken(newtokB), insecure=True).users().current().execute()
 arvados.api(host=j["arvados_api_hosts"][2], token=maketoken(newtokB), insecure=True).users().current().execute()
 
-case7_A = apiA.users().create(body={"user": {"email": "case7@test", "is_active": True}}).execute()
+case7_A = apiA.users().create(body={"user": get_user_data(case_nr=7)}).execute()
 newtokA = apiA.api_client_authorizations().create(body={
     "api_client_authorization": {'owner_uuid': case7_A["uuid"]}}).execute()
 arvados.api(host=j["arvados_api_hosts"][1], token=maketoken(newtokA), insecure=True).users().current().execute()
@@ -71,13 +83,13 @@ arvados.api(host=j["arvados_api_hosts"][2], token=maketoken(newtokA), insecure=T
 
 # case 8
 # user exists on both cluster B and C, with remotes on A, B and C
-case8_B = apiB.users().create(body={"user": {"email": "case8@test", "is_active": True}}).execute()
+case8_B = apiB.users().create(body={"user": get_user_data(case_nr=8)}).execute()
 newtokB = apiB.api_client_authorizations().create(body={
     "api_client_authorization": {'owner_uuid': case8_B["uuid"]}}).execute()
 arvados.api(host=j["arvados_api_hosts"][0], token=maketoken(newtokB), insecure=True).users().current().execute()
 arvados.api(host=j["arvados_api_hosts"][2], token=maketoken(newtokB), insecure=True).users().current().execute()
 
-case8_C = apiC.users().create(body={"user": {"email": "case8@test", "is_active": True}}).execute()
+case8_C = apiC.users().create(body={"user": get_user_data(case_nr=8)}).execute()
 newtokC = apiC.api_client_authorizations().create(body={
     "api_client_authorization": {'owner_uuid': case8_C["uuid"]}}).execute()
 arvados.api(host=j["arvados_api_hosts"][0], token=maketoken(newtokC), insecure=True).users().current().execute()
@@ -85,4 +97,4 @@ arvados.api(host=j["arvados_api_hosts"][1], token=maketoken(newtokC), insecure=T
 
 # case 9
 # user only exists on cluster B, but is inactive
-case9 = apiB.users().create(body={"user": {"email": "case9@test", "is_active": False}}).execute()
+case9 = apiB.users().create(body={"user": get_user_data(case_nr=9, is_active=False)}).execute()
index 18797d69c68e6fc0d9d39550a86c3a2ba916cb24..1e12d6a4ce790ec9f9abdfe77ee08044795f8a71 100644 (file)
@@ -4,12 +4,11 @@
 
 source 'https://rubygems.org'
 
-gem 'rails', '~> 5.0.0'
+gem 'rails', '~> 5.2.0'
 gem 'responders', '~> 2.0'
 
 group :test, :development do
   gem 'factory_bot_rails'
-  gem 'database_cleaner'
 
   # As of now (2019-03-27) There's an open issue about incompatibilities with
   # newer versions of this gem: https://github.com/rails/rails-perftest/issues/38
@@ -23,8 +22,12 @@ group :test, :development do
   gem 'simplecov-rcov', require: false
   gem 'mocha', require: false
   gem 'byebug'
+  gem 'listen'
 end
 
+# Fast app boot times
+gem 'bootsnap', require: false
+
 gem 'pg', '~> 1.0'
 
 gem 'multi_json'
index 127a09ee2db71a00bc7c05ee5e2e651ea379a33d..4279151899da9a0051e8e69476f9f4abee672803 100644 (file)
@@ -22,39 +22,43 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
-    actioncable (5.0.7.2)
-      actionpack (= 5.0.7.2)
-      nio4r (>= 1.2, < 3.0)
-      websocket-driver (~> 0.6.1)
-    actionmailer (5.0.7.2)
-      actionpack (= 5.0.7.2)
-      actionview (= 5.0.7.2)
-      activejob (= 5.0.7.2)
+    actioncable (5.2.4.3)
+      actionpack (= 5.2.4.3)
+      nio4r (~> 2.0)
+      websocket-driver (>= 0.6.1)
+    actionmailer (5.2.4.3)
+      actionpack (= 5.2.4.3)
+      actionview (= 5.2.4.3)
+      activejob (= 5.2.4.3)
       mail (~> 2.5, >= 2.5.4)
       rails-dom-testing (~> 2.0)
-    actionpack (5.0.7.2)
-      actionview (= 5.0.7.2)
-      activesupport (= 5.0.7.2)
-      rack (~> 2.0)
-      rack-test (~> 0.6.3)
+    actionpack (5.2.4.3)
+      actionview (= 5.2.4.3)
+      activesupport (= 5.2.4.3)
+      rack (~> 2.0, >= 2.0.8)
+      rack-test (>= 0.6.3)
       rails-dom-testing (~> 2.0)
       rails-html-sanitizer (~> 1.0, >= 1.0.2)
-    actionview (5.0.7.2)
-      activesupport (= 5.0.7.2)
+    actionview (5.2.4.3)
+      activesupport (= 5.2.4.3)
       builder (~> 3.1)
-      erubis (~> 2.7.0)
+      erubi (~> 1.4)
       rails-dom-testing (~> 2.0)
       rails-html-sanitizer (~> 1.0, >= 1.0.3)
-    activejob (5.0.7.2)
-      activesupport (= 5.0.7.2)
+    activejob (5.2.4.3)
+      activesupport (= 5.2.4.3)
       globalid (>= 0.3.6)
-    activemodel (5.0.7.2)
-      activesupport (= 5.0.7.2)
-    activerecord (5.0.7.2)
-      activemodel (= 5.0.7.2)
-      activesupport (= 5.0.7.2)
-      arel (~> 7.0)
-    activesupport (5.0.7.2)
+    activemodel (5.2.4.3)
+      activesupport (= 5.2.4.3)
+    activerecord (5.2.4.3)
+      activemodel (= 5.2.4.3)
+      activesupport (= 5.2.4.3)
+      arel (>= 9.0)
+    activestorage (5.2.4.3)
+      actionpack (= 5.2.4.3)
+      activerecord (= 5.2.4.3)
+      marcel (~> 0.3.1)
+    activesupport (5.2.4.3)
       concurrent-ruby (~> 1.0, >= 1.0.2)
       i18n (>= 0.7, < 2)
       minitest (~> 5.1)
@@ -66,9 +70,9 @@ GEM
     addressable (2.7.0)
       public_suffix (>= 2.0.2, < 5.0)
     andand (1.3.3)
-    arel (7.1.4)
-    arvados-google-api-client (0.8.7.3)
-      activesupport (>= 3.2, < 5.1)
+    arel (9.0.0)
+    arvados-google-api-client (0.8.7.4)
+      activesupport (>= 3.2, < 5.3)
       addressable (~> 2.3)
       autoparse (~> 0.3)
       extlib (~> 0.9)
@@ -82,7 +86,9 @@ GEM
       addressable (>= 2.3.1)
       extlib (>= 0.9.15)
       multi_json (>= 1.0.0)
-    builder (3.2.3)
+    bootsnap (1.4.7)
+      msgpack (~> 1.0)
+    builder (3.2.4)
     byebug (11.0.1)
     capistrano (2.15.9)
       highline
@@ -90,10 +96,9 @@ GEM
       net-sftp (>= 2.0.0)
       net-ssh (>= 2.0.14)
       net-ssh-gateway (>= 1.1.0)
-    concurrent-ruby (1.1.5)
-    crass (1.0.4)
-    database_cleaner (1.7.0)
-    erubis (2.7.0)
+    concurrent-ruby (1.1.6)
+    crass (1.0.6)
+    erubi (1.9.0)
     execjs (2.7.0)
     extlib (0.9.16)
     factory_bot (5.0.2)
@@ -127,25 +132,32 @@ GEM
     launchy (2.4.3)
       addressable (~> 2.3)
     libv8 (3.16.14.19)
+    listen (3.2.1)
+      rb-fsevent (~> 0.10, >= 0.10.3)
+      rb-inotify (~> 0.9, >= 0.9.10)
     lograge (0.10.0)
       actionpack (>= 4)
       activesupport (>= 4)
       railties (>= 4)
       request_store (~> 1.0)
     logstash-event (1.2.02)
-    loofah (2.2.3)
+    loofah (2.6.0)
       crass (~> 1.0.2)
       nokogiri (>= 1.5.9)
     mail (2.7.1)
       mini_mime (>= 0.1.1)
+    marcel (0.3.3)
+      mimemagic (~> 0.3.2)
     memoist (0.16.2)
     metaclass (0.0.4)
-    method_source (0.9.2)
-    mini_mime (1.0.1)
+    method_source (1.0.0)
+    mimemagic (0.3.5)
+    mini_mime (1.0.2)
     mini_portile2 (2.4.0)
     minitest (5.10.3)
     mocha (1.8.0)
       metaclass (~> 0.0.1)
+    msgpack (1.3.3)
     multi_json (1.14.1)
     multi_xml (0.6.0)
     multipart-post (2.1.1)
@@ -156,8 +168,8 @@ GEM
     net-ssh (5.2.0)
     net-ssh-gateway (2.0.0)
       net-ssh (>= 4.0.0)
-    nio4r (2.3.1)
-    nokogiri (1.10.8)
+    nio4r (2.5.2)
+    nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
     oauth2 (1.4.1)
       faraday (>= 0.8, < 0.16.0)
@@ -181,19 +193,20 @@ GEM
     power_assert (1.1.4)
     public_suffix (4.0.3)
     rack (2.2.3)
-    rack-test (0.6.3)
-      rack (>= 1.0)
-    rails (5.0.7.2)
-      actioncable (= 5.0.7.2)
-      actionmailer (= 5.0.7.2)
-      actionpack (= 5.0.7.2)
-      actionview (= 5.0.7.2)
-      activejob (= 5.0.7.2)
-      activemodel (= 5.0.7.2)
-      activerecord (= 5.0.7.2)
-      activesupport (= 5.0.7.2)
+    rack-test (1.1.0)
+      rack (>= 1.0, < 3)
+    rails (5.2.4.3)
+      actioncable (= 5.2.4.3)
+      actionmailer (= 5.2.4.3)
+      actionpack (= 5.2.4.3)
+      actionview (= 5.2.4.3)
+      activejob (= 5.2.4.3)
+      activemodel (= 5.2.4.3)
+      activerecord (= 5.2.4.3)
+      activestorage (= 5.2.4.3)
+      activesupport (= 5.2.4.3)
       bundler (>= 1.3.0)
-      railties (= 5.0.7.2)
+      railties (= 5.2.4.3)
       sprockets-rails (>= 2.0.0)
     rails-controller-testing (1.0.4)
       actionpack (>= 5.0.1.x)
@@ -202,17 +215,17 @@ GEM
     rails-dom-testing (2.0.3)
       activesupport (>= 4.2.0)
       nokogiri (>= 1.6)
-    rails-html-sanitizer (1.0.4)
-      loofah (~> 2.2, >= 2.2.2)
+    rails-html-sanitizer (1.3.0)
+      loofah (~> 2.3)
     rails-observers (0.1.5)
       activemodel (>= 4.0)
     rails-perftest (0.0.7)
-    railties (5.0.7.2)
-      actionpack (= 5.0.7.2)
-      activesupport (= 5.0.7.2)
+    railties (5.2.4.3)
+      actionpack (= 5.2.4.3)
+      activesupport (= 5.2.4.3)
       method_source
       rake (>= 0.8.7)
-      thor (>= 0.18.1, < 2.0)
+      thor (>= 0.19.0, < 2.0)
     rake (13.0.1)
     rb-fsevent (0.10.3)
     rb-inotify (0.9.10)
@@ -263,15 +276,15 @@ GEM
     therubyracer (0.12.3)
       libv8 (~> 3.16.14.15)
       ref
-    thor (0.20.3)
+    thor (1.0.1)
     thread_safe (0.3.6)
     tilt (2.0.8)
-    tzinfo (1.2.6)
+    tzinfo (1.2.7)
       thread_safe (~> 0.1)
     uglifier (2.7.2)
       execjs (>= 0.3.0)
       json (>= 1.8.0)
-    websocket-driver (0.6.5)
+    websocket-driver (0.7.3)
       websocket-extensions (>= 0.1.0)
     websocket-extensions (0.1.5)
 
@@ -282,11 +295,12 @@ DEPENDENCIES
   acts_as_api
   andand
   arvados!
+  bootsnap
   byebug
-  database_cleaner
   factory_bot_rails
   httpclient
   jquery-rails
+  listen
   lograge
   logstash-event
   minitest (= 5.10.3)
@@ -298,7 +312,7 @@ DEPENDENCIES
   optimist
   passenger
   pg (~> 1.0)
-  rails (~> 5.0.0)
+  rails (~> 5.2.0)
   rails-controller-testing
   rails-observers
   rails-perftest
@@ -317,4 +331,4 @@ DEPENDENCIES
   uglifier (~> 2.0)
 
 BUNDLED WITH
-   1.16.6
+   1.17.3
index 83a233cd54681b18b9fb6bb12c72642a2e95cae4..2644a06579787082d8e1c7421a5288a085450684 100644 (file)
@@ -63,7 +63,6 @@ class ApplicationController < ActionController::Base
                 :with => :render_error)
     rescue_from(ActiveRecord::RecordNotFound,
                 ActionController::RoutingError,
-                ActionController::UnknownController,
                 AbstractController::ActionNotFound,
                 :with => :render_not_found)
   end
@@ -361,7 +360,7 @@ class ApplicationController < ActionController::Base
     %w(created_at modified_by_client_uuid modified_by_user_uuid modified_at).each do |x|
       @attrs.delete x.to_sym
     end
-    @attrs = @attrs.symbolize_keys if @attrs.is_a? HashWithIndifferentAccess
+    @attrs = @attrs.symbolize_keys if @attrs.is_a? ActiveSupport::HashWithIndifferentAccess
     @attrs
   end
 
index 6057c4d2698c8e1bb3d131d7dfcd9d0a8c85ea0d..a4d49c35c1fc4c73c490375921c7b2bf3a94c97b 100644 (file)
@@ -325,6 +325,7 @@ class ApiClientAuthorization < ArvadosModel
   end
 
   def log_update
-    super unless (changed - UNLOGGED_CHANGES).empty?
+
+    super unless (saved_changes.keys - UNLOGGED_CHANGES).empty?
   end
 end
index 01a31adb91967c0cb3648e364b3af7c891fd28f0..6fb8ff2b33549af8e4e512a1374363f8dee8fa64 100644 (file)
@@ -16,6 +16,7 @@ class ArvadosModel < ApplicationRecord
   include DbCurrentTime
   extend RecordFilters
 
+  after_find :schedule_restoring_changes
   after_initialize :log_start_state
   before_save :ensure_permission_to_save
   before_save :ensure_owner_uuid_is_permitted
@@ -137,6 +138,7 @@ class ArvadosModel < ApplicationRecord
   def reload(*args)
     super
     log_start_state
+    self
   end
 
   def self.create raw_params={}, *args
@@ -749,6 +751,20 @@ class ArvadosModel < ApplicationRecord
     %r/[a-z0-9]{5}-#{uuid_prefix}-[a-z0-9]{15}/
   end
 
+  def check_readable_uuid attr, attr_value
+    return if attr_value.nil?
+    if (r = ArvadosModel::resource_class_for_uuid attr_value)
+      unless skip_uuid_read_permission_check.include? attr
+        r = r.readable_by(current_user)
+      end
+      if r.where(uuid: attr_value).count == 0
+        errors.add(attr, "'#{attr_value}' not found")
+      end
+    else
+      # Not a valid uuid or PDH, but that (currently) is not an error.
+    end
+  end
+
   def ensure_valid_uuids
     specials = [system_user_uuid]
 
@@ -757,16 +773,7 @@ class ArvadosModel < ApplicationRecord
         next if skip_uuid_existence_check.include? attr
         attr_value = send attr
         next if specials.include? attr_value
-        if attr_value
-          if (r = ArvadosModel::resource_class_for_uuid attr_value)
-            unless skip_uuid_read_permission_check.include? attr
-              r = r.readable_by(current_user)
-            end
-            if r.where(uuid: attr_value).count == 0
-              errors.add(attr, "'#{attr_value}' not found")
-            end
-          end
-        end
+        check_readable_uuid attr, attr_value
       end
     end
   end
@@ -833,10 +840,24 @@ class ArvadosModel < ApplicationRecord
              Rails.configuration.AuditLogs.MaxDeleteBatch.to_i > 0)
   end
 
+  def schedule_restoring_changes
+    # This will be checked at log_start_state, to reset any (virtual) changes
+    # produced by the act of reading a serialized attribute.
+    @fresh_from_database = true
+  end
+
   def log_start_state
     if is_audit_logging_enabled?
       @old_attributes = Marshal.load(Marshal.dump(attributes))
       @old_logged_attributes = Marshal.load(Marshal.dump(logged_attributes))
+      if @fresh_from_database
+        # This instance was created from reading a database record. Attributes
+        # haven't been changed, but those serialized attributes will be reported
+        # as unpersisted, so we restore them to avoid issues with lock!() and
+        # with_lock().
+        restore_attributes
+        @fresh_from_database = nil
+      end
     end
   end
 
index caac5611e79c8baa43d30e396b33cc4a92f9d146..8b549a71ab4fba348ab9279f456595912fb693db 100644 (file)
@@ -259,9 +259,10 @@ class Collection < ArvadosModel
     should_preserve_version = should_preserve_version? # Time sensitive, cache value
     return(yield) unless (should_preserve_version || syncable_updates.any?)
 
-    # Put aside the changes because with_lock forces a record reload
+    # Put aside the changes because with_lock does a record reload
     changes = self.changes
     snapshot = nil
+    restore_attributes
     with_lock do
       # Copy the original state to save it as old version
       if should_preserve_version
@@ -303,12 +304,18 @@ class Collection < ArvadosModel
 
   def syncable_updates
     updates = {}
-    (syncable_attrs & self.changes.keys).each do |attr|
+    if self.changes.any?
+      changes = self.changes
+    else
+      # If called after save, self.changes is empty, so use saved_changes.
+      changes = self.saved_changes
+    end
+    (syncable_attrs & changes.keys).each do |attr|
       if attr == 'uuid'
         # Point old versions to current version's new UUID
-        updates['current_version_uuid'] = self.changes[attr].last
+        updates['current_version_uuid'] = changes[attr].last
       else
-        updates[attr] = self.changes[attr].last
+        updates[attr] = changes[attr].last
       end
     end
     return updates
@@ -316,7 +323,7 @@ class Collection < ArvadosModel
 
   def sync_past_versions
     updates = self.syncable_updates
-    Collection.where('current_version_uuid = ? AND uuid != ?', self.uuid_was, self.uuid_was).each do |c|
+    Collection.where('current_version_uuid = ? AND uuid != ?', self.uuid_before_last_save, self.uuid_before_last_save).each do |c|
       c.attributes = updates
       # Use a different validation context to skip the 'past_versions_cannot_be_updated'
       # validator, as on this case it is legal to update some fields.
index 912a801a6fb1820724489216f0ec38d99bd80210..5833c2251f9b8db26a5ebf5834130d96fc4690d0 100644 (file)
@@ -138,7 +138,7 @@ class Container < ArvadosModel
   end
 
   def propagate_priority
-    return true unless priority_changed?
+    return true unless saved_change_to_priority?
     act_as_system_user do
       # Update the priority of child container requests to match new
       # priority of the parent container (ignoring requests with no
@@ -387,7 +387,7 @@ class Container < ArvadosModel
     if users_list.select { |u| u.is_admin }.any?
       return super
     end
-    Container.where(ContainerRequest.readable_by(*users_list).where("containers.uuid = container_requests.container_uuid").exists)
+    Container.where(ContainerRequest.readable_by(*users_list).where("containers.uuid = container_requests.container_uuid").arel.exists)
   end
 
   def final?
@@ -556,7 +556,7 @@ class Container < ArvadosModel
     # If self.final?, this update is superfluous: the final log/output
     # update will be done when handle_completed calls finalize! on
     # each requesting CR.
-    return if self.final? || !self.log_changed?
+    return if self.final? || !saved_change_to_log?
     leave_modified_by_user_alone do
       ContainerRequest.where(container_uuid: self.uuid).each do |cr|
         cr.update_collections(container: self, collections: ['log'])
@@ -653,11 +653,11 @@ class Container < ArvadosModel
   def handle_completed
     # This container is finished so finalize any associated container requests
     # that are associated with this container.
-    if self.state_changed? and self.final?
+    if saved_change_to_state? and self.final?
       # These get wiped out by with_lock (which reloads the record),
       # so record them now in case we need to schedule a retry.
-      prev_secret_mounts = self.secret_mounts_was
-      prev_runtime_token = self.runtime_token_was
+      prev_secret_mounts = secret_mounts_before_last_save
+      prev_runtime_token = runtime_token_before_last_save
 
       # Need to take a lock on the container to ensure that any
       # concurrent container requests that might try to reuse this
index b30b8cc1d9b24cc2bfcbeac7400afaa38cd03fa4..77536eee4f28f53a2acae66cc90d647967ff6b51 100644 (file)
@@ -472,10 +472,10 @@ class ContainerRequest < ArvadosModel
   end
 
   def update_priority
-    return unless state_changed? || priority_changed? || container_uuid_changed?
+    return unless saved_change_to_state? || saved_change_to_priority? || saved_change_to_container_uuid?
     act_as_system_user do
       Container.
-        where('uuid in (?)', [self.container_uuid_was, self.container_uuid].compact).
+        where('uuid in (?)', [container_uuid_before_last_save, self.container_uuid].compact).
         map(&:update_priority!)
     end
   end
index 02c6a242f911ddcaebd3a4ae68113c546d5487bd..7e015f3564e7475f6103e8f4a42c5beb5bf53c83 100644 (file)
@@ -57,7 +57,7 @@ class Group < ArvadosModel
   end
 
   def update_trash
-    if trash_at_changed? or owner_uuid_changed?
+    if saved_change_to_trash_at? or saved_change_to_owner_uuid?
       # The group was added or removed from the trash.
       #
       # Strategy:
@@ -97,7 +97,7 @@ on conflict (group_uuid) do update set trash_at=EXCLUDED.trash_at;
   end
 
   def after_ownership_change
-    if owner_uuid_changed?
+    if saved_change_to_owner_uuid?
       update_permissions self.owner_uuid, self.uuid, CAN_MANAGE_PERM
     end
   end
index e4ba7f3de1ef8f20833355efb0dae1a153b05113..0d7334e44e85440d37a530e6316d338f125b92aa 100644 (file)
@@ -43,6 +43,28 @@ class Link < ArvadosModel
 
   protected
 
+  def check_readable_uuid attr, attr_value
+    if attr == 'tail_uuid' &&
+       !attr_value.nil? &&
+       self.link_class == 'permission' &&
+       attr_value[0..4] != Rails.configuration.ClusterID &&
+       ApiClientAuthorization.remote_host(uuid_prefix: attr_value[0..4]) &&
+       ArvadosModel::resource_class_for_uuid(attr_value) == User
+      # Permission link tail is a remote user (the user permissions
+      # are being granted to), so bypass the standard check that a
+      # referenced object uuid is readable by current user.
+      #
+      # We could do a call to the remote cluster to check if the user
+      # in tail_uuid exists.  This would detect copy-and-paste errors,
+      # but add another way for the request to fail, and I don't think
+      # it would improve security.  It doesn't seem to be worth the
+      # complexity tradeoff.
+      true
+    else
+      super
+    end
+  end
+
   def permission_to_attach_to_objects
     # Anonymous users cannot write links
     return false if !current_user
@@ -76,6 +98,11 @@ class Link < ArvadosModel
 
     head_obj = ArvadosModel.find_by_uuid(head_uuid)
 
+    if head_obj.nil?
+      errors.add(:head_uuid, "does not exist")
+      return false
+    end
+
     # No permission links can be pointed to past collection versions
     if head_obj.is_a?(Collection) && head_obj.current_version_uuid != head_uuid
       errors.add(:head_uuid, "cannot point to a past version of a collection")
index d200bb80110869ade17386d3ebbac9cf9b8de979..c8b463696bb5423b1d5a5f7f5533b95637246165 100644 (file)
@@ -168,7 +168,7 @@ class Node < ArvadosModel
   end
 
   def dns_server_update
-    if ip_address_changed? && ip_address
+    if saved_change_to_ip_address? && ip_address
       Node.where('id != ? and ip_address = ?',
                  id, ip_address).each do |stale_node|
         # One or more(!) stale node records have the same IP address
@@ -178,10 +178,10 @@ class Node < ArvadosModel
         stale_node.update_attributes!(ip_address: nil)
       end
     end
-    if hostname_was && hostname_changed?
-      self.class.dns_server_update(hostname_was, UNUSED_NODE_IP)
+    if hostname_before_last_save && saved_change_to_hostname?
+      self.class.dns_server_update(hostname_before_last_save, UNUSED_NODE_IP)
     end
-    if hostname && (hostname_changed? || ip_address_changed?)
+    if hostname && (saved_change_to_hostname? || saved_change_to_ip_address?)
       self.class.dns_server_update(hostname, ip_address || UNUSED_NODE_IP)
     end
   end
index 64facaa98e84c2eacfdc6fed38372f2dff22fdde..778ad7d0bb1728c22ad45dcfecdc5264f1c65312 100644 (file)
@@ -23,32 +23,32 @@ class User < ArvadosModel
   validate :must_unsetup_to_deactivate
   before_update :prevent_privilege_escalation
   before_update :prevent_inactive_admin
-  before_update :verify_repositories_empty, :if => Proc.new { |user|
-    user.username.nil? and user.username_changed?
+  before_update :verify_repositories_empty, :if => Proc.new {
+    username.nil? and username_changed?
   }
   before_update :setup_on_activate
 
   before_create :check_auto_admin
-  before_create :set_initial_username, :if => Proc.new { |user|
-    user.username.nil? and user.email
+  before_create :set_initial_username, :if => Proc.new {
+    username.nil? and email
   }
   after_create :after_ownership_change
   after_create :setup_on_activate
   after_create :add_system_group_permission_link
-  after_create :auto_setup_new_user, :if => Proc.new { |user|
+  after_create :auto_setup_new_user, :if => Proc.new {
     Rails.configuration.Users.AutoSetupNewUsers and
-    (user.uuid != system_user_uuid) and
-    (user.uuid != anonymous_user_uuid)
+    (uuid != system_user_uuid) and
+    (uuid != anonymous_user_uuid)
   }
   after_create :send_admin_notifications
 
   before_update :before_ownership_change
   after_update :after_ownership_change
   after_update :send_profile_created_notification
-  after_update :sync_repository_names, :if => Proc.new { |user|
-    (user.uuid != system_user_uuid) and
-    user.username_changed? and
-    (not user.username_was.nil?)
+  after_update :sync_repository_names, :if => Proc.new {
+    (uuid != system_user_uuid) and
+    saved_change_to_username? and
+    (not username_before_last_save.nil?)
   }
   before_destroy :clear_permissions
   after_destroy :remove_self_from_permissions
@@ -151,7 +151,7 @@ SELECT 1 FROM #{PERMISSION_VIEW}
   end
 
   def after_ownership_change
-    if owner_uuid_changed?
+    if saved_change_to_owner_uuid?
       update_permissions self.owner_uuid, self.uuid, CAN_MANAGE_PERM
     end
   end
@@ -241,11 +241,8 @@ SELECT target_uuid, perm_level
                      name: 'can_login').destroy_all
 
     # delete "All users" group read permissions for this user
-    group = Group.where(name: 'All users').select do |g|
-      g[:uuid].match(/-f+$/)
-    end.first
     Link.where(tail_uuid: self.uuid,
-                     head_uuid: group[:uuid],
+                     head_uuid: all_users_group_uuid,
                      link_class: 'permission',
                      name: 'can_read').destroy_all
 
@@ -272,10 +269,6 @@ SELECT target_uuid, perm_level
        self.is_active_was &&
        !self.is_active
 
-      group = Group.where(name: 'All users').select do |g|
-        g[:uuid].match(/-f+$/)
-      end.first
-
       # When a user is set up, they are added to the "All users"
       # group.  A user that is part of the "All users" group is
       # allowed to self-activate.
@@ -290,7 +283,7 @@ SELECT target_uuid, perm_level
       # explaining the correct way to deactivate a user.
       #
       if Link.where(tail_uuid: self.uuid,
-                    head_uuid: group[:uuid],
+                    head_uuid: all_users_group_uuid,
                     link_class: 'permission',
                     name: 'can_read').any?
         errors.add :is_active, "cannot be set to false directly, use the 'Deactivate' button on Workbench, or the 'unsetup' API call"
@@ -711,11 +704,11 @@ update #{PERMISSION_VIEW} set target_uuid=$1 where target_uuid = $2
   # add the user to the 'All users' group
   def create_user_group_link
     return (Link.where(tail_uuid: self.uuid,
-                       head_uuid: all_users_group[:uuid],
+                       head_uuid: all_users_group_uuid,
                        link_class: 'permission',
                        name: 'can_read').first or
             Link.create(tail_uuid: self.uuid,
-                        head_uuid: all_users_group[:uuid],
+                        head_uuid: all_users_group_uuid,
                         link_class: 'permission',
                         name: 'can_read'))
   end
@@ -743,7 +736,8 @@ update #{PERMISSION_VIEW} set target_uuid=$1 where target_uuid = $2
   # Automatically setup if is_active flag turns on
   def setup_on_activate
     return if [system_user_uuid, anonymous_user_uuid].include?(self.uuid)
-    if is_active && (new_record? || is_active_changed?)
+    if is_active &&
+      (new_record? || saved_change_to_is_active? || will_save_change_to_is_active?)
       setup
     end
   end
@@ -766,8 +760,8 @@ update #{PERMISSION_VIEW} set target_uuid=$1 where target_uuid = $2
 
   # Send notification if the user saved profile for the first time
   def send_profile_created_notification
-    if self.prefs_changed?
-      if self.prefs_was.andand.empty? || !self.prefs_was.andand['profile']
+    if saved_change_to_prefs?
+      if prefs_before_last_save.andand.empty? || !prefs_before_last_save.andand['profile']
         profile_notification_address = Rails.configuration.Users.UserProfileNotificationAddress
         ProfileNotifier.profile_created(self, profile_notification_address).deliver_now if profile_notification_address and !profile_notification_address.empty?
       end
@@ -782,7 +776,7 @@ update #{PERMISSION_VIEW} set target_uuid=$1 where target_uuid = $2
   end
 
   def sync_repository_names
-    old_name_re = /^#{Regexp.escape(username_was)}\//
+    old_name_re = /^#{Regexp.escape(username_before_last_save)}\//
     name_sub = "#{username}/"
     repositories.find_each do |repo|
       repo.name = repo.name.sub(old_name_re, name_sub)
index 044b5ca2318afe4f90c913d94cadf9ab5ddf7964..00d640cf7cf156097b9739a34f71e65eb284d48d 100755 (executable)
@@ -4,5 +4,5 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../../Gemfile', __FILE__)
+ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../Gemfile', __dir__)
 load Gem.bin_path('bundler', 'bundle')
index 2e4d28c58d85e8640cf46a2b11a9e112575c7c13..c9142b942ed12a848a4497a01ad7393dfd78d370 100755 (executable)
@@ -4,12 +4,11 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-require 'pathname'
 require 'fileutils'
 include FileUtils
 
 # path to your application root.
-APP_ROOT = Pathname.new File.expand_path('../../', __FILE__)
+APP_ROOT = File.expand_path('..', __dir__)
 
 def system!(*args)
   system(*args) || abort("\n== Command #{args} failed ==")
index 07a3df93e48b0b2eaacc35e59683caefe9ff2efb..201287ef61e8859930cb93cc03cb81f20c12b4ff 100755 (executable)
@@ -4,12 +4,11 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-require 'pathname'
 require 'fileutils'
 include FileUtils
 
 # path to your application root.
-APP_ROOT = Pathname.new File.expand_path('../../', __FILE__)
+APP_ROOT = File.expand_path('..', __dir__)
 
 def system!(*args)
   system(*args) || abort("\n== Command #{args} failed ==")
diff --git a/services/api/bin/yarn b/services/api/bin/yarn
new file mode 100755 (executable)
index 0000000..cc54a3b
--- /dev/null
@@ -0,0 +1,16 @@
+#!/usr/bin/env ruby
+
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+APP_ROOT = File.expand_path('..', __dir__)
+Dir.chdir(APP_ROOT) do
+  begin
+    exec "yarnpkg", *ARGV
+  rescue Errno::ENOENT
+    $stderr.puts "Yarn executable was not detected in the system."
+    $stderr.puts "Download Yarn at https://yarnpkg.com/en/docs/install"
+    exit 1
+  end
+end
index b6174a0d8989f36e2e851431b18fe1627a33dbb8..369294e8a79278ffb437571e74cb726af527e845 100644 (file)
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-require File.expand_path('../boot', __FILE__)
+require_relative 'boot'
 
 require "rails"
 # Pick only the frameworks we need:
@@ -12,10 +12,11 @@ require "active_record/railtie"
 require "action_controller/railtie"
 require "action_mailer/railtie"
 require "action_view/railtie"
-# Skip ActionCable (new in Rails 5.0) as it adds '/cable' routes that we're not using
-# require "action_cable/engine"
 require "sprockets/railtie"
 require "rails/test_unit/railtie"
+# Skipping the following:
+# * ActionCable (new in Rails 5.0) as it adds '/cable' routes that we're not using
+# * ActiveStorage (new in Rails 5.1)
 
 require 'digest'
 
index f63f8af0335884c606ba2c52117d939657b4ff1e..035a3972f86c318e758318330c7aa63af44ff9c5 100644 (file)
@@ -16,6 +16,7 @@
 # config:migrate to /etc/arvados/config.yml, you will be able to
 # delete application.yml and database.yml.
 
+require "cgi"
 require 'config_loader'
 require 'open3'
 
@@ -277,14 +278,16 @@ end
 # For config migration, we've previously populated the PostgreSQL
 # section of the config from database.yml
 #
-ENV["DATABASE_URL"] = "postgresql://#{$arvados_config["PostgreSQL"]["Connection"]["user"]}:"+
-                      "#{$arvados_config["PostgreSQL"]["Connection"]["password"]}@"+
-                      "#{dbhost}/#{$arvados_config["PostgreSQL"]["Connection"]["dbname"]}?"+
+database_url = "postgresql://#{CGI.escape $arvados_config["PostgreSQL"]["Connection"]["user"]}:"+
+                      "#{CGI.escape $arvados_config["PostgreSQL"]["Connection"]["password"]}@"+
+                      "#{dbhost}/#{CGI.escape $arvados_config["PostgreSQL"]["Connection"]["dbname"]}?"+
                       "template=#{$arvados_config["PostgreSQL"]["Connection"]["template"]}&"+
                       "encoding=#{$arvados_config["PostgreSQL"]["Connection"]["client_encoding"]}&"+
                       "collation=#{$arvados_config["PostgreSQL"]["Connection"]["collation"]}&"+
                       "pool=#{$arvados_config["PostgreSQL"]["ConnectionPool"]}"
 
+ENV["DATABASE_URL"] = database_url
+
 Server::Application.configure do
   # Copy into the Rails config object.  This also turns Hash into
   # OrderedOptions so that application code can use
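
Percent-encoding the credentials matters because DATABASE_URL is parsed as a URL: a password containing characters such as '@', '/' or ':' would otherwise make the string ambiguous or unparseable. A small illustration with made-up credentials:

    require 'cgi'
    password = 'p@ss/word:1'
    "postgresql://arvados:#{password}@dbhost/arvados"
    # => ambiguous: the '@', '/' and ':' inside the password break URL parsing
    "postgresql://arvados:#{CGI.escape password}@dbhost/arvados"
    # => "postgresql://arvados:p%40ss%2Fword%3A1@dbhost/arvados"
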
index 717101c2b2b6ccbacb9e01c587195b38e1bd8bb4..9605b584e9b4c94f42753fd58ac95fb35a04b048 100644 (file)
@@ -5,4 +5,5 @@
 # Set up gems listed in the Gemfile.
 ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../Gemfile', __dir__)
 
-require 'bundler/setup'
+require 'bundler/setup' # Set up gems listed in the Gemfile.
+require 'bootsnap/setup' # Speed up boot time by caching expensive operations.
\ No newline at end of file
index 56a4ed6dcd9ecad7b92ccdbd18fb28633acb869c..f5ab77a4df285283dab8e2c3ef1f0fe35b7da2d4 100644 (file)
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-Server::Application.configure do
+Rails.application.configure do
   # Settings specified here will take precedence over those in config/application.rb
 
   # In the development environment your application's code is reloaded on
index 6c48dcd0196209f3b16a31f64f48ad93fa06244b..c8194057ccfc731d5fbf91b2fdfd55d0c417f812 100644 (file)
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-Server::Application.configure do
+Rails.application.configure do
   # Settings specified here will take precedence over those in config/application.rb
 
   # Code is not reloaded between requests
index 6b550587cbb28b95d7b07bf1f0841afe6ec5bdc4..9cdf5d9cd137aa0342a932c6c875c8a17b4f2ae7 100644 (file)
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-Server::Application.configure do
+Rails.application.configure do
   # Settings specified here will take precedence over those in config/application.rb
 
   # The test environment is used exclusively to run your application's
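
These three environment files (development, production, test) switch from the Server::Application constant to Rails.application, the Rails 5 convention. Both names refer to the same application object, so the configure blocks behave identically; for illustration, in a Rails console for this app one would expect roughly:

    Rails.application.class                                   # => Server::Application
    Rails.application.equal?(Server::Application.instance)    # => true
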
diff --git a/services/api/config/initializers/content_security_policy.rb b/services/api/config/initializers/content_security_policy.rb
new file mode 100644 (file)
index 0000000..853ecde
--- /dev/null
@@ -0,0 +1,29 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+# Be sure to restart your server when you modify this file.
+
+# Define an application-wide content security policy
+# For further information see the following documentation
+# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Security-Policy
+
+# Rails.application.config.content_security_policy do |policy|
+#   policy.default_src :self, :https
+#   policy.font_src    :self, :https, :data
+#   policy.img_src     :self, :https, :data
+#   policy.object_src  :none
+#   policy.script_src  :self, :https
+#   policy.style_src   :self, :https
+
+#   # Specify URI for violation reports
+#   # policy.report_uri "/csp-violation-report-endpoint"
+# end
+
+# If you are using UJS then enable automatic nonce generation
+# Rails.application.config.content_security_policy_nonce_generator = -> request { SecureRandom.base64(16) }
+
+# Report CSP violations to a specified URI
+# For further information see the following documentation:
+# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Security-Policy-Report-Only
+# Rails.application.config.content_security_policy_report_only = true
index 8f3b3cb5f8e951df55979a1f74adce8b847de652..2abe40566ecf03cc0d48054b74690c6d1d7048b6 100644 (file)
@@ -8,8 +8,13 @@
 
 require 'enable_jobs_api'
 
-Server::Application.configure do
-  if ActiveRecord::Base.connection.tables.include?('jobs')
-    check_enable_legacy_jobs_api
+Rails.application.configure do
+  begin
+    if ActiveRecord::Base.connection.tables.include?('jobs')
+      check_enable_legacy_jobs_api
+    end
+  rescue ActiveRecord::NoDatabaseError
+    # Since rails 5.2, all initializers are run by rake tasks (like db:create),
+    # see: https://github.com/rails/rails/issues/32870
   end
 end
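
The added rescue covers the case where initializers run before the database exists, which Rails 5.2 made routine: rake tasks such as db:create now load the initializers too. A hedged sketch of the failure mode being tolerated:

    begin
      ActiveRecord::Base.connection.tables   # first use of the connection touches the database
    rescue ActiveRecord::NoDatabaseError
      # reached during e.g. rake db:create on a fresh checkout, before the database exists
    end
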
diff --git a/services/api/config/initializers/new_framework_defaults_5_2.rb b/services/api/config/initializers/new_framework_defaults_5_2.rb
new file mode 100644 (file)
index 0000000..93a8d52
--- /dev/null
@@ -0,0 +1,42 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+# Be sure to restart your server when you modify this file.
+#
+# This file contains migration options to ease your Rails 5.2 upgrade.
+#
+# Once upgraded flip defaults one by one to migrate to the new default.
+#
+# Read the Guide for Upgrading Ruby on Rails for more info on each option.
+
+# Make Active Record use stable #cache_key alongside new #cache_version method.
+# This is needed for recyclable cache keys.
+# Rails.application.config.active_record.cache_versioning = true
+
+# Use AES-256-GCM authenticated encryption for encrypted cookies.
+# Also, embed cookie expiry in signed or encrypted cookies for increased security.
+#
+# This option is not backwards compatible with earlier Rails versions.
+# It's best enabled when your entire app is migrated and stable on 5.2.
+#
+# Existing cookies will be converted on read then written with the new scheme.
+# Rails.application.config.action_dispatch.use_authenticated_cookie_encryption = true
+
+# Use AES-256-GCM authenticated encryption as default cipher for encrypting messages
+# instead of AES-256-CBC, when use_authenticated_message_encryption is set to true.
+# Rails.application.config.active_support.use_authenticated_message_encryption = true
+
+# Add default protection from forgery to ActionController::Base instead of in
+# ApplicationController.
+# Rails.application.config.action_controller.default_protect_from_forgery = true
+
+# Store boolean values in sqlite3 databases as 1 and 0 instead of 't' and
+# 'f' after migrating old data.
+# Rails.application.config.active_record.sqlite3.represent_boolean_as_integer = true
+
+# Use SHA-1 instead of MD5 to generate non-sensitive digests, such as the ETag header.
+# Rails.application.config.active_support.use_sha1_digests = true
+
+# Make `form_with` generate id attributes for any generated HTML tags.
+# Rails.application.config.action_view.form_with_generates_ids = true
diff --git a/services/api/config/initializers/preload_all_models.rb b/services/api/config/initializers/preload_all_models.rb
deleted file mode 100644 (file)
index 713c61f..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-# See http://aaronvb.com/articles/37-rails-caching-and-undefined-class-module
-
-# Config must be done before we load model class files; otherwise they
-# won't be able to use Rails.configuration.* to initialize their
-# classes.
-
-if Rails.env == 'development'
-  Dir.foreach("#{Rails.root}/app/models") do |model_file|
-    require_dependency model_file if model_file.match(/\.rb$/)
-  end
-end
index cedd8f3e4a325b4e438febdc7d8cc9a7367c1a56..26681d613fa60b1daaa8857bdf4bebe3bd082096 100644 (file)
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-ActiveRecord::Base.connection.class.set_callback :checkout, :after do
+ActiveRecord::ConnectionAdapters::AbstractAdapter.set_callback :checkout, :before, ->(conn) do
   # If the database connection is in a time zone other than UTC,
   # "timestamp" values don't behave as desired.
   #
@@ -11,5 +11,5 @@ ActiveRecord::Base.connection.class.set_callback :checkout, :after do
   # before now()), but false in time zone -0100 (now() returns an
   # earlier clock time, and its time zone is dropped when comparing to
   # a "timestamp without time zone").
-  raw_connection.sync_exec("SET TIME ZONE 'UTC'")
+  conn.execute("SET TIME ZONE 'UTC'")
 end
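
The callback now attaches to the adapter class and runs before checkout, issuing the SET through the adapter instead of reaching into the raw PG connection. One hedged way to confirm the session setting from application code:

    ActiveRecord::Base.connection.execute("SHOW TIME ZONE").first
    # => {"TimeZone"=>"UTC"}
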
index 976777723a970cf79600b13399f871ee7dafba12..6fb9786504ea5247982f342ac7dfc6d426486b46 100644 (file)
@@ -9,7 +9,7 @@
 
 # Enable parameter wrapping for JSON. You can disable this by setting :format to an empty array.
 ActiveSupport.on_load(:action_controller) do
-  wrap_parameters :format => [:json]
+  wrap_parameters format: [:json]
 end
 
 # Disable root element in JSON by default.
index 8afd22192a62f56c002b363bf63625e07009fcec..69758580356ba771ac05a70e022735fe092962d5 100644 (file)
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-Server::Application.routes.draw do
+Rails.application.routes.draw do
   themes_for_rails
 
   # OPTIONS requests are not allowed at routes that use cookies.
diff --git a/services/api/config/secrets.yml b/services/api/config/secrets.yml
new file mode 100644 (file)
index 0000000..293b93b
--- /dev/null
@@ -0,0 +1,31 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+# Be sure to restart your server when you modify this file.
+
+# Your secret key is used for verifying the integrity of signed cookies.
+# If you change this key, all old signed cookies will become invalid!
+
+# Make sure the secret is at least 30 characters and all random,
+# no regular words or you'll be exposed to dictionary attacks.
+# You can use `rails secret` to generate a secure secret key.
+
+# NOTE that these get overridden by Arvados' own configuration system.
+
+# shared:
+#   api_key: a1B2c3D4e5F6
+
+# Environmental secrets are only available for that specific environment.
+
+# development:
+#   secret_key_base: <%= rand(1<<255).to_s(36) %>
+
+# test:
+#   secret_key_base: <%= rand(1<<255).to_s(36) %>
+
+# In case this doesn't get overridden for some reason, assign a random key
+# to gracefully degrade by rejecting cookies instead of by opening a
+# vulnerability.
+production:
+  secret_key_base: <%= rand(1<<255).to_s(36) %>
index 886c8873891c044270313e3563c73e4fe950c5cb..2b5e3b8abff2d14ddebea3008aa6774280c466f9 100644 (file)
@@ -62,7 +62,12 @@ module AuditLogs
       rescue => e
         Rails.logger.error "#{e.class}: #{e}\n#{e.backtrace.join("\n\t")}"
       ensure
-        ActiveRecord::Base.connection.close
+        # Rails 5.1+ makes test threads share a database connection, so we can't
+        # close a connection shared with other threads.
+        # https://github.com/rails/rails/commit/deba47799ff905f778e0c98a015789a1327d5087
+        if Rails.env != "test"
+          ActiveRecord::Base.connection.close
+        end
       end
     end
   end
index 8613c749cf247c6c11f309c4d43cddc544e99b4f..c09896567f3ac1291d8cbe0632393ac60d2ac8fc 100644 (file)
@@ -69,7 +69,12 @@ module SweepTrashedObjects
         rescue => e
           Rails.logger.error "#{e.class}: #{e}\n#{e.backtrace.join("\n\t")}"
         ensure
-          ActiveRecord::Base.connection.close
+          # Rails 5.1+ makes test threads share a database connection, so we can't
+          # close a connection shared with other threads.
+          # https://github.com/rails/rails/commit/deba47799ff905f778e0c98a015789a1327d5087
+          if Rails.env != "test"
+            ActiveRecord::Base.connection.close
+          end
         end
       end
     end
index c688ac008b44b21944e86b36cdb3abbb15273e12..6c17f1bd03bf5bae3d1dbd9a2a9e4123ee99b715 100644 (file)
@@ -33,7 +33,7 @@ module UpdatePriority
       # priority==0 but should be >0:
       act_as_system_user do
         Container.
-          joins("JOIN container_requests ON container_requests.container_uuid=containers.uuid AND container_requests.state=#{Container.sanitize(ContainerRequest::Committed)} AND container_requests.priority>0").
+          joins("JOIN container_requests ON container_requests.container_uuid=containers.uuid AND container_requests.state=#{ActiveRecord::Base.connection.quote(ContainerRequest::Committed)} AND container_requests.priority>0").
           where('containers.state IN (?) AND containers.priority=0 AND container_requests.uuid IS NOT NULL',
                 [Container::Queued, Container::Locked, Container::Running]).
           map(&:update_priority!)
@@ -55,7 +55,12 @@ module UpdatePriority
       rescue => e
         Rails.logger.error "#{e.class}: #{e}\n#{e.backtrace.join("\n\t")}"
       ensure
-        ActiveRecord::Base.connection.close
+        # Rails 5.1+ makes test threads share a database connection, so we can't
+        # close a connection shared with other threads.
+        # https://github.com/rails/rails/commit/deba47799ff905f778e0c98a015789a1327d5087
+        if Rails.env != "test"
+          ActiveRecord::Base.connection.close
+        end
       end
     end
   end
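
The quoting change in the first hunk replaces Container.sanitize, which was dropped from Active Record's public API around Rails 5.2, with the connection's own quoting; both produce a safely escaped SQL string literal. Illustrative values only:

    ActiveRecord::Base.connection.quote("Committed")   # => "'Committed'"
    ActiveRecord::Base.connection.quote("O'Reilly")    # => "'O''Reilly'"
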
index ce1d447f16ad0f950327ecfa1e47f7cb24fcd76f..0fbc7625ceb0d985d4c26d10e9cc2b636574378e 100644 (file)
@@ -50,8 +50,7 @@ class Arvados::V1::KeepServicesControllerTest < ActionController::TestCase
     refute_empty expect_rvz
     authorize_with :active
     get :index,
-      params: {:format => :json},
-      headers: auth(:active)
+      params: {:format => :json}
     assert_response :success
     json_response['items'].each do |svc|
       url = "#{svc['service_ssl_flag'] ? 'https' : 'http'}://#{svc['service_host']}:#{svc['service_port']}/"
index c1db8c8b5db1aa48fe4a843fb2a573f0b0966a3f..64f78071350a6736994986eff3267c541e72b4f6 100644 (file)
@@ -295,4 +295,29 @@ class ArvadosModelTest < ActiveSupport::TestCase
     c.reload
     assert_equal({'foo' => 'bar'}, c.properties)
   end
+
+  test 'serialized attributes dirty tracking with audit log settings' do
+    Rails.configuration.AuditLogs.MaxDeleteBatch = 1000
+    set_user_from_auth :admin
+    [false, true].each do |auditlogs_enabled|
+      if auditlogs_enabled
+        Rails.configuration.AuditLogs.MaxAge = 3600
+      else
+        Rails.configuration.AuditLogs.MaxAge = 0
+      end
+      [
+        User.find_by_uuid(users(:active).uuid),
+        ContainerRequest.find_by_uuid(container_requests(:queued).uuid),
+        Container.find_by_uuid(containers(:queued).uuid),
+        PipelineInstance.find_by_uuid(pipeline_instances(:has_component_with_completed_jobs).uuid),
+        PipelineTemplate.find_by_uuid(pipeline_templates(:two_part).uuid),
+        Job.find_by_uuid(jobs(:running).uuid)
+      ].each do |obj|
+        assert_not(obj.class.serialized_attributes.empty?,
+          "#{obj.class} model doesn't have serialized attributes")
+        # obj shouldn't have changed since it's just retrieved from the database
+        assert_not(obj.changed?, "#{obj.class} model's attribute(s) appear as changed: '#{obj.changes.keys.join(',')}' with audit logs #{auditlogs_enabled ? '': 'not '}enabled.")
+      end
+    end
+  end
 end
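
The new test pins down a Rails 5.2 upgrade hazard: if the serialize coder round-trips a stored value into a different representation, a freshly loaded record already reports changes, which can in turn trigger spurious updates and (with audit logs enabled) spurious log entries. The expectation in miniature, with uuid standing in for any stored fixture:

    obj = ContainerRequest.find_by_uuid(uuid)
    obj.changed?   # => false expected; true would mean a serialized attribute
                   #    was rewritten merely by loading the record
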
index 00f3cc291352493b11258aa0f9750fc883a263ff..c7d21bdc4da721d51f40c0cb235a15a8e3c3db96 100644 (file)
@@ -58,6 +58,14 @@ class LinkTest < ActiveSupport::TestCase
                                   users(:active).uuid.sub(/-\w+$/, "-#{'z' * 15}"))
   end
 
+  test "link granting permission to remote user is valid" do
+    refute new_active_link_valid?(tail_uuid:
+                                  users(:active).uuid.sub(/^\w+-/, "foooo-"))
+    Rails.configuration.RemoteClusters = Rails.configuration.RemoteClusters.merge({foooo: ActiveSupport::InheritableOptions.new({Host: "bar.com"})})
+    assert new_active_link_valid?(tail_uuid:
+                                  users(:active).uuid.sub(/^\w+-/, "foooo-"))
+  end
+
   test "link granting non-project permission to unreadable user is invalid" do
     refute new_active_link_valid?(tail_uuid: users(:admin).uuid,
                                   head_uuid: collections(:bar_file).uuid)
index a1c8ff8a921d214d2ea27608708c0b3d19caa8f9..016a0e4eb4a9b6a59717de2c75a634b3182dd82f 100644 (file)
@@ -378,19 +378,6 @@ class LogTest < ActiveSupport::TestCase
         sleep 0.1
       end
       assert_operator remaining_audit_logs.count, :<, initial_log_count
-    ensure
-      # The test framework rolls back our transactions, but that
-      # doesn't undo the deletes we did from separate threads.
-      ActiveRecord::Base.connection.exec_query 'ROLLBACK'
-      Thread.new do
-        begin
-          dc = DatabaseController.new
-          dc.define_singleton_method :render do |*args| end
-          dc.reset
-        ensure
-          ActiveRecord::Base.connection.close
-        end
-      end.join
     end
   end
 end
index b54e8d9de64f970726dc49d07ca47e368491986a..9fa3febe1e75fddf4227fae9801cda7d976b4149 100644 (file)
@@ -141,7 +141,7 @@ class NodeTest < ActiveSupport::TestCase
     assert_equal "custom1", node2.hostname
   end
 
-  test "update dns when nodemanager clears hostname and ip_address" do
+  test "update dns when hostname and ip_address are cleared" do
     act_as_system_user do
       node = ping_node(:new_with_custom_hostname, {})
       Node.expects(:dns_server_update).with(node.hostname, Node::UNUSED_NODE_IP)
diff --git a/services/dockercleaner/fpm-info.sh b/services/dockercleaner/fpm-info.sh
new file mode 100644 (file)
index 0000000..d678fdf
--- /dev/null
@@ -0,0 +1,12 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+case "$TARGET" in
+    debian9 | ubuntu1604)
+        fpm_depends+=()
+        ;;
+    debian* | ubuntu*)
+        fpm_depends+=(python3-distutils)
+        ;;
+esac
index a6445506e5fd3b3c1500caf6d3d8cb98d63b5f08..b154f6e99848a3623b167726412ce5b48a59c715 100644 (file)
@@ -31,6 +31,7 @@ func (s *mainSuite) TestHTTPServer(c *check.C) {
                c.Fatal(err)
        }
        _, p, err := net.SplitHostPort(ln.Addr().String())
+       c.Check(err, check.IsNil)
        ln.Close()
        config := "Clusters:\n zzzzz:\n  ManagementToken: abcdefg\n  Services: {Keepbalance: {InternalURLs: {'http://localhost:" + p + "/': {}}}}\n"
 
diff --git a/services/keep/tools/traffic_test.py b/services/keep/tools/traffic_test.py
deleted file mode 100755 (executable)
index cd50a52..0000000
+++ /dev/null
@@ -1,129 +0,0 @@
-#! /usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-# traffic_test.py
-#
-# Launch a test Keep and API server and PUT and GET a bunch of blocks.
-# Can be used to simulate client traffic in Keep to evaluate memory usage,
-# error logging, performance, etc.
-#
-# This script is warty and is relatively environment-specific, but the
-# example run described below should execute cleanly.
-#
-# Usage:
-#   traffic_test.py start
-#       Starts the test servers.
-#   traffic_test.py put file1 file2 file3 ....
-#       Runs arv-put on each file.
-#   traffic_test.py get hash1 hash2 hash3 ....
-#       Loops forever issuing GET requests for specified blocks.
-#   traffic_test.py stop
-#       Stops the test servers.
-#
-# Example:
-#
-#   $ ./traffic_test.py start
-#   $ ./traffic_test.py put GS00253-DNA_A02_200_37.tsv.bz2 \
-#         GS00253-DNA_B01_200_37.tsv.bz2 \
-#         GS00253-DNA_B02_200_37.tsv.bz2
-#   $ ./traffic_test.py get $(find /tmp/tmp* -type f -printf "%f ")
-#     [loops forever]
-#     ^C
-#   $ ./traffic_test.py stop
-#
-# Multiple "get" runs may be run concurrently to evaluate Keep's handling
-# of additional concurrent clients.
-
-PYSDK_DIR    = "../../../sdk/python"
-PYTEST_DIR   = PYSDK_DIR + "/tests"
-ARV_PUT_PATH = PYSDK_DIR + "/bin/arv-put"
-ARV_GET_PATH = PYSDK_DIR + "/bin/arv-get"
-SECONDS_BETWEEN_GETS = 1
-
-import argparse
-import httplib2
-import os
-import random
-import subprocess
-import sys
-import time
-
-# for run_test_server.py
-sys.path.insert(0, PYSDK_DIR)
-sys.path.insert(0, PYTEST_DIR)
-import arvados
-import run_test_server
-
-def arv_cmd(*args):
-    p = subprocess.Popen([sys.executable] + list(args),
-                         stdout=subprocess.PIPE)
-    (arvout, arverr) = p.communicate()
-    if p.returncode != 0:
-        print "error {} from {} {}: {}".format(
-            p.returncode, sys.executable, args, arverr)
-        sys.exit(p.returncode)
-    return arvout
-
-def start():
-    run_test_server.run()
-    run_test_server.run_keep()
-
-def put(files):
-    os.environ["ARVADOS_API_HOST"] = "127.0.0.1:3000"
-    run_test_server.authorize_with('active')
-    for v in ["ARVADOS_API_HOST",
-              "ARVADOS_API_HOST_INSECURE",
-              "ARVADOS_API_TOKEN"]:
-        os.environ[v] = arvados.config.settings()[v]
-
-    if not os.environ.has_key('PYTHONPATH'):
-        os.environ['PYTHONPATH'] = ''
-    os.environ['PYTHONPATH'] = "{}:{}:{}".format(
-        PYSDK_DIR, PYTEST_DIR, os.environ['PYTHONPATH'])
-
-    for c in files:
-        manifest_uuid = arv_cmd(ARV_PUT_PATH, c)
-
-def get(blocks):
-    os.environ["ARVADOS_API_HOST"] = "127.0.0.1:3000"
-
-    run_test_server.authorize_with('active')
-    for v in ["ARVADOS_API_HOST",
-              "ARVADOS_API_HOST_INSECURE",
-              "ARVADOS_API_TOKEN"]:
-        os.environ[v] = arvados.config.settings()[v]
-
-    nqueries = 0
-    while True:
-        b = random.choice(blocks)
-        print "GET /" + b
-        body = arv_cmd(ARV_GET_PATH, b)
-        print "got {} bytes".format(len(body))
-        time.sleep(SECONDS_BETWEEN_GETS)
-        nqueries = nqueries + 1
-
-def stop():
-    run_test_server.stop_keep()
-    run_test_server.stop()
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('action',
-                        type=str,
-                        nargs='+',
-                        help='''"start", "put", "get", "stop"''')
-    args = parser.parse_args()
-
-    if args.action[0] == 'start':
-        start()
-    elif args.action[0] == 'put':
-        put(args.action[1:])
-    elif args.action[0] == 'get':
-        get(args.action[1:])
-    elif args.action[0] == 'stop':
-        stop()
-    else:
-        print('Unrecognized action "{}"'.format(args.action))
-        print('actions are "start", "put", "get", "stop"')
index 96f2e7db3965704570f3906c78ab6e624072e013..235d369b5a67f780fb1cb29794ed65294b7a150c 100644 (file)
@@ -32,12 +32,12 @@ import (
 )
 
 func init() {
-       driver["S3"] = newS3Volume
+       driver["S3"] = chooseS3VolumeDriver
 }
 
 func newS3Volume(cluster *arvados.Cluster, volume arvados.Volume, logger logrus.FieldLogger, metrics *volumeMetricsVecs) (Volume, error) {
        v := &S3Volume{cluster: cluster, volume: volume, metrics: metrics}
-       err := json.Unmarshal(volume.DriverParameters, &v)
+       err := json.Unmarshal(volume.DriverParameters, v)
        if err != nil {
                return nil, err
        }
diff --git a/services/keepstore/s3aws_volume.go b/services/keepstore/s3aws_volume.go
new file mode 100644 (file)
index 0000000..c9fa7fc
--- /dev/null
@@ -0,0 +1,900 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package main
+
+import (
+       "bytes"
+       "context"
+       "encoding/base64"
+       "encoding/hex"
+       "encoding/json"
+       "errors"
+       "fmt"
+       "io"
+       "os"
+       "regexp"
+       "strings"
+       "sync"
+       "sync/atomic"
+       "time"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "github.com/aws/aws-sdk-go-v2/aws"
+       "github.com/aws/aws-sdk-go-v2/aws/awserr"
+       "github.com/aws/aws-sdk-go-v2/aws/defaults"
+       "github.com/aws/aws-sdk-go-v2/aws/ec2metadata"
+       "github.com/aws/aws-sdk-go-v2/aws/ec2rolecreds"
+       "github.com/aws/aws-sdk-go-v2/aws/endpoints"
+       "github.com/aws/aws-sdk-go-v2/service/s3"
+       "github.com/aws/aws-sdk-go-v2/service/s3/s3manager"
+       "github.com/prometheus/client_golang/prometheus"
+       "github.com/sirupsen/logrus"
+)
+
+// S3AWSVolume implements Volume using an S3 bucket.
+type S3AWSVolume struct {
+       arvados.S3VolumeDriverParameters
+       AuthToken      string    // populated automatically when IAMRole is used
+       AuthExpiration time.Time // populated automatically when IAMRole is used
+
+       cluster   *arvados.Cluster
+       volume    arvados.Volume
+       logger    logrus.FieldLogger
+       metrics   *volumeMetricsVecs
+       bucket    *s3AWSbucket
+       region    string
+       startOnce sync.Once
+}
+
+// s3AWSbucket wraps s3.Client and counts I/O and API usage stats. The
+// wrapped bucket can be replaced atomically with SetBucket in order
+// to update credentials.
+type s3AWSbucket struct {
+       bucket string
+       svc    *s3.Client
+       stats  s3awsbucketStats
+       mu     sync.Mutex
+}
+
+// chooseS3VolumeDriver distinguishes between the old goamz driver and the
+// new aws-sdk-go-v2 driver based on the UseAWSS3v2Driver feature flag.
+func chooseS3VolumeDriver(cluster *arvados.Cluster, volume arvados.Volume, logger logrus.FieldLogger, metrics *volumeMetricsVecs) (Volume, error) {
+       v := &S3Volume{cluster: cluster, volume: volume, metrics: metrics}
+       err := json.Unmarshal(volume.DriverParameters, v)
+       if err != nil {
+               return nil, err
+       }
+       if v.UseAWSS3v2Driver {
+               logger.Debugln("Using AWS S3 v2 driver")
+               return newS3AWSVolume(cluster, volume, logger, metrics)
+       } else {
+               logger.Debugln("Using goamz S3 driver")
+               return newS3Volume(cluster, volume, logger, metrics)
+       }
+}
+
+const (
+       PartSize         = 5 * 1024 * 1024
+       ReadConcurrency  = 13
+       WriteConcurrency = 5
+)
+
+var s3AWSKeepBlockRegexp = regexp.MustCompile(`^[0-9a-f]{32}$`)
+var s3AWSZeroTime time.Time
+
+func (v *S3AWSVolume) isKeepBlock(s string) bool {
+       return s3AWSKeepBlockRegexp.MatchString(s)
+}
+
+func newS3AWSVolume(cluster *arvados.Cluster, volume arvados.Volume, logger logrus.FieldLogger, metrics *volumeMetricsVecs) (Volume, error) {
+       v := &S3AWSVolume{cluster: cluster, volume: volume, metrics: metrics}
+       err := json.Unmarshal(volume.DriverParameters, v)
+       if err != nil {
+               return nil, err
+       }
+       v.logger = logger.WithField("Volume", v.String())
+       return v, v.check("")
+}
+
+func (v *S3AWSVolume) translateError(err error) error {
+       if aerr, ok := err.(awserr.Error); ok {
+               switch aerr.Code() {
+               case "NotFound":
+                       return os.ErrNotExist
+               case "NoSuchKey":
+                       return os.ErrNotExist
+               }
+       }
+       return err
+}
+
+// safeCopy calls CopyObjectRequest, and checks the response to make sure the
+// copy succeeded and updated the timestamp on the destination object
+//
+// (If something goes wrong during the copy, the error will be embedded in the
+// 200 OK response)
+func (v *S3AWSVolume) safeCopy(dst, src string) error {
+       input := &s3.CopyObjectInput{
+               Bucket:      aws.String(v.bucket.bucket),
+               ContentType: aws.String("application/octet-stream"),
+               CopySource:  aws.String(v.bucket.bucket + "/" + src),
+               Key:         aws.String(dst),
+       }
+
+       req := v.bucket.svc.CopyObjectRequest(input)
+       resp, err := req.Send(context.Background())
+
+       err = v.translateError(err)
+       if os.IsNotExist(err) {
+               return err
+       } else if err != nil {
+               return fmt.Errorf("PutCopy(%q ← %q): %s", dst, v.bucket.bucket+"/"+src, err)
+       }
+
+       if resp.CopyObjectResult.LastModified == nil {
+               return fmt.Errorf("PutCopy succeeded but did not return a timestamp: %q: %s", resp.CopyObjectResult.LastModified, err)
+       } else if time.Now().Sub(*resp.CopyObjectResult.LastModified) > maxClockSkew {
+               return fmt.Errorf("PutCopy succeeded but returned an old timestamp: %q: %s", resp.CopyObjectResult.LastModified, resp.CopyObjectResult.LastModified)
+       }
+       return nil
+}
+
+func (v *S3AWSVolume) check(ec2metadataHostname string) error {
+       if v.Bucket == "" {
+               return errors.New("DriverParameters: Bucket must be provided")
+       }
+       if v.IndexPageSize == 0 {
+               v.IndexPageSize = 1000
+       }
+       if v.RaceWindow < 0 {
+               return errors.New("DriverParameters: RaceWindow must not be negative")
+       }
+
+       if v.V2Signature {
+               return errors.New("DriverParameters: V2Signature is not supported")
+       }
+
+       defaultResolver := endpoints.NewDefaultResolver()
+
+       cfg := defaults.Config()
+
+       if v.Endpoint == "" && v.Region == "" {
+               return fmt.Errorf("AWS region or endpoint must be specified")
+       } else if v.Endpoint != "" || ec2metadataHostname != "" {
+               myCustomResolver := func(service, region string) (aws.Endpoint, error) {
+                       if v.Endpoint != "" && service == "s3" {
+                               return aws.Endpoint{
+                                       URL:           v.Endpoint,
+                                       SigningRegion: v.Region,
+                               }, nil
+                       } else if service == "ec2metadata" && ec2metadataHostname != "" {
+                               return aws.Endpoint{
+                                       URL: ec2metadataHostname,
+                               }, nil
+                       }
+
+                       return defaultResolver.ResolveEndpoint(service, region)
+               }
+               cfg.EndpointResolver = aws.EndpointResolverFunc(myCustomResolver)
+       }
+
+       cfg.Region = v.Region
+
+       // Zero timeouts mean "wait forever", which is a bad
+       // default. Default to long timeouts instead.
+       if v.ConnectTimeout == 0 {
+               v.ConnectTimeout = s3DefaultConnectTimeout
+       }
+       if v.ReadTimeout == 0 {
+               v.ReadTimeout = s3DefaultReadTimeout
+       }
+
+       creds := aws.NewChainProvider(
+               []aws.CredentialsProvider{
+                       aws.NewStaticCredentialsProvider(v.AccessKey, v.SecretKey, v.AuthToken),
+                       ec2rolecreds.New(ec2metadata.New(cfg)),
+               })
+
+       cfg.Credentials = creds
+
+       v.bucket = &s3AWSbucket{
+               bucket: v.Bucket,
+               svc:    s3.New(cfg),
+       }
+
+       // Set up prometheus metrics
+       lbls := prometheus.Labels{"device_id": v.GetDeviceID()}
+       v.bucket.stats.opsCounters, v.bucket.stats.errCounters, v.bucket.stats.ioBytes = v.metrics.getCounterVecsFor(lbls)
+
+       return nil
+}
+
+// String implements fmt.Stringer.
+func (v *S3AWSVolume) String() string {
+       return fmt.Sprintf("s3-bucket:%+q", v.Bucket)
+}
+
+// GetDeviceID returns a globally unique ID for the storage bucket.
+func (v *S3AWSVolume) GetDeviceID() string {
+       return "s3://" + v.Endpoint + "/" + v.Bucket
+}
+
+// Compare the given data with the stored data.
+func (v *S3AWSVolume) Compare(ctx context.Context, loc string, expect []byte) error {
+       errChan := make(chan error, 1)
+       go func() {
+               _, err := v.Head("recent/" + loc)
+               errChan <- err
+       }()
+       var err error
+       select {
+       case <-ctx.Done():
+               return ctx.Err()
+       case err = <-errChan:
+       }
+       if err != nil {
+               // Checking for "loc" itself here would interfere with
+               // future GET requests.
+               //
+               // On AWS, if X doesn't exist, a HEAD or GET request
+               // for X causes X's non-existence to be cached. Thus,
+               // if we test for X, then create X and return a
+               // signature to our client, the client might still get
+               // 404 from all keepstores when trying to read it.
+               //
+               // To avoid this, we avoid doing HEAD X or GET X until
+               // we know X has been written.
+               //
+               // Note that X might exist even though recent/X
+               // doesn't: for example, the response to HEAD recent/X
+               // might itself come from a stale cache. In such
+               // cases, we will return a false negative and
+               // PutHandler might needlessly create another replica
+               // on a different volume. That's not ideal, but it's
+               // better than passing the eventually-consistent
+               // problem on to our clients.
+               return v.translateError(err)
+       }
+
+       input := &s3.GetObjectInput{
+               Bucket: aws.String(v.bucket.bucket),
+               Key:    aws.String(loc),
+       }
+
+       req := v.bucket.svc.GetObjectRequest(input)
+       result, err := req.Send(ctx)
+       if err != nil {
+               return v.translateError(err)
+       }
+       return v.translateError(compareReaderWithBuf(ctx, result.Body, expect, loc[:32]))
+}
+
+// EmptyTrash looks for trashed blocks that exceeded BlobTrashLifetime
+// and deletes them from the volume.
+func (v *S3AWSVolume) EmptyTrash() {
+       if v.cluster.Collections.BlobDeleteConcurrency < 1 {
+               return
+       }
+
+       var bytesInTrash, blocksInTrash, bytesDeleted, blocksDeleted int64
+
+       // Define "ready to delete" as "...when EmptyTrash started".
+       startT := time.Now()
+
+       emptyOneKey := func(trash *s3.Object) {
+               loc := strings.TrimPrefix(*trash.Key, "trash/")
+               if !v.isKeepBlock(loc) {
+                       return
+               }
+               atomic.AddInt64(&bytesInTrash, *trash.Size)
+               atomic.AddInt64(&blocksInTrash, 1)
+
+               trashT := *(trash.LastModified)
+               recent, err := v.Head("recent/" + loc)
+               if err != nil && os.IsNotExist(v.translateError(err)) {
+                       v.logger.Warnf("EmptyTrash: found trash marker %q but no %q (%s); calling Untrash", trash.Key, "recent/"+loc, err)
+                       err = v.Untrash(loc)
+                       if err != nil {
+                               v.logger.WithError(err).Errorf("EmptyTrash: Untrash(%q) failed", loc)
+                       }
+                       return
+               } else if err != nil {
+                       v.logger.WithError(err).Warnf("EmptyTrash: HEAD %q failed", "recent/"+loc)
+                       return
+               }
+               if trashT.Sub(*recent.LastModified) < v.cluster.Collections.BlobSigningTTL.Duration() {
+                       if age := startT.Sub(*recent.LastModified); age >= v.cluster.Collections.BlobSigningTTL.Duration()-time.Duration(v.RaceWindow) {
+                               // recent/loc is too old to protect
+                               // loc from being Trashed again during
+                               // the raceWindow that starts if we
+                               // delete trash/X now.
+                               //
+                               // Note this means (TrashSweepInterval
+                               // < BlobSigningTTL - raceWindow) is
+                               // necessary to avoid starvation.
+                               v.logger.Infof("EmptyTrash: detected old race for %q, calling fixRace + Touch", loc)
+                               v.fixRace(loc)
+                               v.Touch(loc)
+                               return
+                       }
+                       _, err := v.Head(loc)
+                       if os.IsNotExist(err) {
+                               v.logger.Infof("EmptyTrash: detected recent race for %q, calling fixRace", loc)
+                               v.fixRace(loc)
+                               return
+                       } else if err != nil {
+                               v.logger.WithError(err).Warnf("EmptyTrash: HEAD %q failed", loc)
+                               return
+                       }
+               }
+               if startT.Sub(trashT) < v.cluster.Collections.BlobTrashLifetime.Duration() {
+                       return
+               }
+               err = v.bucket.Del(*trash.Key)
+               if err != nil {
+                       v.logger.WithError(err).Errorf("EmptyTrash: error deleting %q", *trash.Key)
+                       return
+               }
+               atomic.AddInt64(&bytesDeleted, *trash.Size)
+               atomic.AddInt64(&blocksDeleted, 1)
+
+               _, err = v.Head(loc)
+               if err == nil {
+                       v.logger.Warnf("EmptyTrash: HEAD %q succeeded immediately after deleting %q", loc, loc)
+                       return
+               }
+               if !os.IsNotExist(v.translateError(err)) {
+                       v.logger.WithError(err).Warnf("EmptyTrash: HEAD %q failed", loc)
+                       return
+               }
+               err = v.bucket.Del("recent/" + loc)
+               if err != nil {
+                       v.logger.WithError(err).Warnf("EmptyTrash: error deleting %q", "recent/"+loc)
+               }
+       }
+
+       var wg sync.WaitGroup
+       todo := make(chan *s3.Object, v.cluster.Collections.BlobDeleteConcurrency)
+       for i := 0; i < v.cluster.Collections.BlobDeleteConcurrency; i++ {
+               wg.Add(1)
+               go func() {
+                       defer wg.Done()
+                       for key := range todo {
+                               emptyOneKey(key)
+                       }
+               }()
+       }
+
+       trashL := s3awsLister{
+               Logger:   v.logger,
+               Bucket:   v.bucket,
+               Prefix:   "trash/",
+               PageSize: v.IndexPageSize,
+               Stats:    &v.bucket.stats,
+       }
+       for trash := trashL.First(); trash != nil; trash = trashL.Next() {
+               todo <- trash
+       }
+       close(todo)
+       wg.Wait()
+
+       if err := trashL.Error(); err != nil {
+               v.logger.WithError(err).Error("EmptyTrash: lister failed")
+       }
+       v.logger.Infof("EmptyTrash: stats for %v: Deleted %v bytes in %v blocks. Remaining in trash: %v bytes in %v blocks.", v.String(), bytesDeleted, blocksDeleted, bytesInTrash-bytesDeleted, blocksInTrash-blocksDeleted)
+}
+
+// fixRace(X) is called when "recent/X" exists but "X" doesn't
+// exist. If the timestamps on "recent/"+loc and "trash/"+loc indicate
+// there was a race between Put and Trash, fixRace recovers from the
+// race by Untrashing the block.
+func (v *S3AWSVolume) fixRace(loc string) bool {
+       trash, err := v.Head("trash/" + loc)
+       if err != nil {
+               if !os.IsNotExist(v.translateError(err)) {
+                       v.logger.WithError(err).Errorf("fixRace: HEAD %q failed", "trash/"+loc)
+               }
+               return false
+       }
+
+       recent, err := v.Head("recent/" + loc)
+       if err != nil {
+               v.logger.WithError(err).Errorf("fixRace: HEAD %q failed", "recent/"+loc)
+               return false
+       }
+
+       recentTime := *recent.LastModified
+       trashTime := *trash.LastModified
+       ageWhenTrashed := trashTime.Sub(recentTime)
+       if ageWhenTrashed >= v.cluster.Collections.BlobSigningTTL.Duration() {
+               // No evidence of a race: block hasn't been written
+               // since it became eligible for Trash. No fix needed.
+               return false
+       }
+
+       v.logger.Infof("fixRace: %q: trashed at %s but touched at %s (age when trashed = %s < %s)", loc, trashTime, recentTime, ageWhenTrashed, v.cluster.Collections.BlobSigningTTL)
+       v.logger.Infof("fixRace: copying %q to %q to recover from race between Put/Touch and Trash", "recent/"+loc, loc)
+       err = v.safeCopy(loc, "trash/"+loc)
+       if err != nil {
+               v.logger.WithError(err).Error("fixRace: copy failed")
+               return false
+       }
+       return true
+}
+
+func (v *S3AWSVolume) Head(loc string) (result *s3.HeadObjectOutput, err error) {
+       input := &s3.HeadObjectInput{
+               Bucket: aws.String(v.bucket.bucket),
+               Key:    aws.String(loc),
+       }
+
+       req := v.bucket.svc.HeadObjectRequest(input)
+       res, err := req.Send(context.TODO())
+
+       v.bucket.stats.TickOps("head")
+       v.bucket.stats.Tick(&v.bucket.stats.Ops, &v.bucket.stats.HeadOps)
+       v.bucket.stats.TickErr(err)
+
+       if err != nil {
+               return nil, v.translateError(err)
+       }
+       result = res.HeadObjectOutput
+       return
+}
+
+// Get a block: copy the block data into buf, and return the number of
+// bytes copied.
+func (v *S3AWSVolume) Get(ctx context.Context, loc string, buf []byte) (int, error) {
+       return getWithPipe(ctx, loc, buf, v)
+}
+
+func (v *S3AWSVolume) readWorker(ctx context.Context, loc string) (rdr io.ReadCloser, err error) {
+       buf := make([]byte, 0, 67108864)
+       awsBuf := aws.NewWriteAtBuffer(buf)
+
+       downloader := s3manager.NewDownloaderWithClient(v.bucket.svc, func(u *s3manager.Downloader) {
+               u.PartSize = PartSize
+               u.Concurrency = ReadConcurrency
+       })
+
+       v.logger.Debugf("Partsize: %d; Concurrency: %d\n", downloader.PartSize, downloader.Concurrency)
+
+       _, err = downloader.DownloadWithContext(ctx, awsBuf, &s3.GetObjectInput{
+               Bucket: aws.String(v.bucket.bucket),
+               Key:    aws.String(loc),
+       })
+       v.bucket.stats.TickOps("get")
+       v.bucket.stats.Tick(&v.bucket.stats.Ops, &v.bucket.stats.GetOps)
+       v.bucket.stats.TickErr(err)
+       if err != nil {
+               return nil, v.translateError(err)
+       }
+       buf = awsBuf.Bytes()
+
+       rdr = NewCountingReader(bytes.NewReader(buf), v.bucket.stats.TickInBytes)
+       return
+}
+
+// ReadBlock implements BlockReader.
+func (v *S3AWSVolume) ReadBlock(ctx context.Context, loc string, w io.Writer) error {
+       rdr, err := v.readWorker(ctx, loc)
+
+       if err == nil {
+               _, err2 := io.Copy(w, rdr)
+               if err2 != nil {
+                       return err2
+               }
+               return err
+       }
+
+       err = v.translateError(err)
+       if !os.IsNotExist(err) {
+               return err
+       }
+
+       _, err = v.Head("recent/" + loc)
+       err = v.translateError(err)
+       if err != nil {
+               // If we can't read recent/X, there's no point in
+               // trying fixRace. Give up.
+               return err
+       }
+       if !v.fixRace(loc) {
+               err = os.ErrNotExist
+               return err
+       }
+
+       rdr, err = v.readWorker(ctx, loc)
+       if err != nil {
+               v.logger.Warnf("reading %s after successful fixRace: %s", loc, err)
+               err = v.translateError(err)
+               return err
+       }
+
+       _, err = io.Copy(w, rdr)
+
+       return err
+}
+
+func (v *S3AWSVolume) writeObject(ctx context.Context, name string, r io.Reader) error {
+       if r == nil {
+               // r == nil leads to a memory violation in func readFillBuf in
+               // aws-sdk-go-v2@v0.23.0/service/s3/s3manager/upload.go
+               r = bytes.NewReader(nil)
+       }
+
+       uploadInput := s3manager.UploadInput{
+               Bucket: aws.String(v.bucket.bucket),
+               Key:    aws.String(name),
+               Body:   r,
+       }
+
+       if len(name) == 32 {
+               var contentMD5 string
+               md5, err := hex.DecodeString(name)
+               if err != nil {
+                       return err
+               }
+               contentMD5 = base64.StdEncoding.EncodeToString(md5)
+               uploadInput.ContentMD5 = &contentMD5
+       }
+
+       // Experimentation indicated that using concurrency 5 yields the best
+       // throughput, better than higher concurrency (10 or 13) by ~5%.
+       // Defining u.BufferProvider = s3manager.NewBufferedReadSeekerWriteToPool(64 * 1024 * 1024)
+       // is detrimental to throughput (roughly 15% lower).
+       uploader := s3manager.NewUploaderWithClient(v.bucket.svc, func(u *s3manager.Uploader) {
+               u.PartSize = PartSize
+               u.Concurrency = WriteConcurrency
+       })
+
+       // Unlike the goamz S3 driver, we don't need to precompute ContentSHA256:
+       // the aws-sdk-go v2 SDK uses a ReadSeeker to avoid having to copy the
+       // block, so there is no extra memory use to be concerned about. See
+       // makeSha256Reader in aws/signer/v4/v4.go. In fact, we explicitly disable
+       // calculating the Sha-256 because we don't need it; we already use md5sum
+       // hashes that match the name of the block.
+       _, err := uploader.UploadWithContext(ctx, &uploadInput, s3manager.WithUploaderRequestOptions(func(r *aws.Request) {
+               r.HTTPRequest.Header.Set("X-Amz-Content-Sha256", "UNSIGNED-PAYLOAD")
+       }))
+
+       v.bucket.stats.TickOps("put")
+       v.bucket.stats.Tick(&v.bucket.stats.Ops, &v.bucket.stats.PutOps)
+       v.bucket.stats.TickErr(err)
+
+       return err
+}
+
+// Put writes a block.
+func (v *S3AWSVolume) Put(ctx context.Context, loc string, block []byte) error {
+       return putWithPipe(ctx, loc, block, v)
+}
+
+// WriteBlock implements BlockWriter.
+func (v *S3AWSVolume) WriteBlock(ctx context.Context, loc string, rdr io.Reader) error {
+       if v.volume.ReadOnly {
+               return MethodDisabledError
+       }
+
+       r := NewCountingReader(rdr, v.bucket.stats.TickOutBytes)
+       err := v.writeObject(ctx, loc, r)
+       if err != nil {
+               return err
+       }
+       return v.writeObject(ctx, "recent/"+loc, nil)
+}
+
+type s3awsLister struct {
+       Logger            logrus.FieldLogger
+       Bucket            *s3AWSbucket
+       Prefix            string
+       PageSize          int
+       Stats             *s3awsbucketStats
+       ContinuationToken string
+       buf               []s3.Object
+       err               error
+}
+
+// First fetches the first page and returns the first item. It returns
+// nil if the response is the empty set or an error occurs.
+func (lister *s3awsLister) First() *s3.Object {
+       lister.getPage()
+       return lister.pop()
+}
+
+// Next returns the next item, fetching the next page if necessary. It
+// returns nil if the last available item has already been fetched, or
+// an error occurs.
+func (lister *s3awsLister) Next() *s3.Object {
+       if len(lister.buf) == 0 && lister.ContinuationToken != "" {
+               lister.getPage()
+       }
+       return lister.pop()
+}
+
+// Return the most recent error encountered by First or Next.
+func (lister *s3awsLister) Error() error {
+       return lister.err
+}
+
+func (lister *s3awsLister) getPage() {
+       lister.Stats.TickOps("list")
+       lister.Stats.Tick(&lister.Stats.Ops, &lister.Stats.ListOps)
+
+       var input *s3.ListObjectsV2Input
+       if lister.ContinuationToken == "" {
+               input = &s3.ListObjectsV2Input{
+                       Bucket:  aws.String(lister.Bucket.bucket),
+                       MaxKeys: aws.Int64(int64(lister.PageSize)),
+                       Prefix:  aws.String(lister.Prefix),
+               }
+       } else {
+               input = &s3.ListObjectsV2Input{
+                       Bucket:            aws.String(lister.Bucket.bucket),
+                       MaxKeys:           aws.Int64(int64(lister.PageSize)),
+                       Prefix:            aws.String(lister.Prefix),
+                       ContinuationToken: &lister.ContinuationToken,
+               }
+       }
+
+       req := lister.Bucket.svc.ListObjectsV2Request(input)
+       resp, err := req.Send(context.Background())
+       if err != nil {
+               if aerr, ok := err.(awserr.Error); ok {
+                       lister.err = aerr
+               } else {
+                       lister.err = err
+               }
+               return
+       }
+
+       if *resp.IsTruncated {
+               lister.ContinuationToken = *resp.NextContinuationToken
+       } else {
+               lister.ContinuationToken = ""
+       }
+       lister.buf = make([]s3.Object, 0, len(resp.Contents))
+       for _, key := range resp.Contents {
+               if !strings.HasPrefix(*key.Key, lister.Prefix) {
+                       lister.Logger.Warnf("s3awsLister: S3 Bucket.List(prefix=%q) returned key %q", lister.Prefix, *key.Key)
+                       continue
+               }
+               lister.buf = append(lister.buf, key)
+       }
+}
+
+func (lister *s3awsLister) pop() (k *s3.Object) {
+       if len(lister.buf) > 0 {
+               k = &lister.buf[0]
+               lister.buf = lister.buf[1:]
+       }
+       return
+}
+
+// IndexTo writes a complete list of locators with the given prefix
+// for which Get() can retrieve data.
+func (v *S3AWSVolume) IndexTo(prefix string, writer io.Writer) error {
+       // Use a merge sort to find matching sets of X and recent/X.
+       dataL := s3awsLister{
+               Logger:   v.logger,
+               Bucket:   v.bucket,
+               Prefix:   prefix,
+               PageSize: v.IndexPageSize,
+               Stats:    &v.bucket.stats,
+       }
+       recentL := s3awsLister{
+               Logger:   v.logger,
+               Bucket:   v.bucket,
+               Prefix:   "recent/" + prefix,
+               PageSize: v.IndexPageSize,
+               Stats:    &v.bucket.stats,
+       }
+       for data, recent := dataL.First(), recentL.First(); data != nil && dataL.Error() == nil; data = dataL.Next() {
+               if *data.Key >= "g" {
+                       // Conveniently, "recent/*" and "trash/*" are
+                       // lexically greater than all hex-encoded data
+                       // hashes, so stopping here avoids iterating
+                       // over all of them needlessly with dataL.
+                       break
+               }
+               if !v.isKeepBlock(*data.Key) {
+                       continue
+               }
+
+               // stamp is the list entry we should use to report the
+               // last-modified time for this data block: it will be
+               // the recent/X entry if one exists, otherwise the
+               // entry for the data block itself.
+               stamp := data
+
+               // Advance to the corresponding recent/X marker, if any
+               for recent != nil && recentL.Error() == nil {
+                       if cmp := strings.Compare((*recent.Key)[7:], *data.Key); cmp < 0 {
+                               recent = recentL.Next()
+                               continue
+                       } else if cmp == 0 {
+                               stamp = recent
+                               recent = recentL.Next()
+                               break
+                       } else {
+                               // recent/X marker is missing: we'll
+                               // use the timestamp on the data
+                               // object.
+                               break
+                       }
+               }
+               if err := recentL.Error(); err != nil {
+                       return err
+               }
+               fmt.Fprintf(writer, "%s+%d %d\n", *data.Key, *data.Size, stamp.LastModified.UnixNano())
+       }
+       return dataL.Error()
+}
+
+// Mtime returns the stored timestamp for the given locator.
+func (v *S3AWSVolume) Mtime(loc string) (time.Time, error) {
+       _, err := v.Head(loc)
+       if err != nil {
+               return s3AWSZeroTime, v.translateError(err)
+       }
+       resp, err := v.Head("recent/" + loc)
+       err = v.translateError(err)
+       if os.IsNotExist(err) {
+               // The data object X exists, but recent/X is missing.
+               err = v.writeObject(context.Background(), "recent/"+loc, nil)
+               if err != nil {
+                       v.logger.WithError(err).Errorf("error creating %q", "recent/"+loc)
+                       return s3AWSZeroTime, v.translateError(err)
+               }
+               v.logger.Infof("Mtime: created %q to migrate existing block to new storage scheme", "recent/"+loc)
+               resp, err = v.Head("recent/" + loc)
+               if err != nil {
+                       v.logger.WithError(err).Errorf("HEAD failed after creating %q", "recent/"+loc)
+                       return s3AWSZeroTime, v.translateError(err)
+               }
+       } else if err != nil {
+               // HEAD recent/X failed for some other reason.
+               return s3AWSZeroTime, err
+       }
+       return *resp.LastModified, err
+}
+
+// Status returns a *VolumeStatus representing the current in-use
+// storage capacity and a fake available capacity that doesn't make
+// the volume seem full or nearly-full.
+func (v *S3AWSVolume) Status() *VolumeStatus {
+       return &VolumeStatus{
+               DeviceNum: 1,
+               BytesFree: BlockSize * 1000,
+               BytesUsed: 1,
+       }
+}
+
+// InternalStats returns bucket I/O and API call counters.
+func (v *S3AWSVolume) InternalStats() interface{} {
+       return &v.bucket.stats
+}
+
+// Touch sets the timestamp for the given locator to the current time.
+func (v *S3AWSVolume) Touch(loc string) error {
+       if v.volume.ReadOnly {
+               return MethodDisabledError
+       }
+       _, err := v.Head(loc)
+       err = v.translateError(err)
+       if os.IsNotExist(err) && v.fixRace(loc) {
+               // The data object got trashed in a race, but fixRace
+               // rescued it.
+       } else if err != nil {
+               return err
+       }
+       err = v.writeObject(context.Background(), "recent/"+loc, nil)
+       return v.translateError(err)
+}
+
+// checkRaceWindow returns a non-nil error if trash/loc is, or might
+// be, in the race window (i.e., it's not safe to trash loc).
+func (v *S3AWSVolume) checkRaceWindow(loc string) error {
+       resp, err := v.Head("trash/" + loc)
+       err = v.translateError(err)
+       if os.IsNotExist(err) {
+               // OK, trash/X doesn't exist so we're not in the race
+               // window
+               return nil
+       } else if err != nil {
+               // Error looking up trash/X. We don't know whether
+               // we're in the race window
+               return err
+       }
+       t := resp.LastModified
+       safeWindow := t.Add(v.cluster.Collections.BlobTrashLifetime.Duration()).Sub(time.Now().Add(time.Duration(v.RaceWindow)))
+       if safeWindow <= 0 {
+               // We can't count on "touch trash/X" to prolong
+               // trash/X's lifetime. The new timestamp might not
+               // become visible until now+raceWindow, and EmptyTrash
+               // is allowed to delete trash/X before then.
+               return fmt.Errorf("same block is already in trash, and safe window ended %s ago", -safeWindow)
+       }
+       // trash/X exists, but it won't be eligible for deletion until
+       // after now+raceWindow, so it's safe to overwrite it.
+       return nil
+}
+
+func (b *s3AWSbucket) Del(path string) error {
+       input := &s3.DeleteObjectInput{
+               Bucket: aws.String(b.bucket),
+               Key:    aws.String(path),
+       }
+       req := b.svc.DeleteObjectRequest(input)
+       _, err := req.Send(context.Background())
+       //err := b.Bucket().Del(path)
+       b.stats.TickOps("delete")
+       b.stats.Tick(&b.stats.Ops, &b.stats.DelOps)
+       b.stats.TickErr(err)
+       return err
+}
+
+// Trash a Keep block.
+func (v *S3AWSVolume) Trash(loc string) error {
+       if v.volume.ReadOnly {
+               return MethodDisabledError
+       }
+       if t, err := v.Mtime(loc); err != nil {
+               return err
+       } else if time.Since(t) < v.cluster.Collections.BlobSigningTTL.Duration() {
+               return nil
+       }
+       if v.cluster.Collections.BlobTrashLifetime == 0 {
+               if !v.UnsafeDelete {
+                       return ErrS3TrashDisabled
+               }
+               return v.translateError(v.bucket.Del(loc))
+       }
+       err := v.checkRaceWindow(loc)
+       if err != nil {
+               return err
+       }
+       err = v.safeCopy("trash/"+loc, loc)
+       if err != nil {
+               return err
+       }
+       return v.translateError(v.bucket.Del(loc))
+}
+
+// Untrash moves a block from trash back into the store.
+func (v *S3AWSVolume) Untrash(loc string) error {
+       err := v.safeCopy(loc, "trash/"+loc)
+       if err != nil {
+               return err
+       }
+       err = v.writeObject(context.Background(), "recent/"+loc, nil)
+       return v.translateError(err)
+}
+
+type s3awsbucketStats struct {
+       statsTicker
+       Ops     uint64
+       GetOps  uint64
+       PutOps  uint64
+       HeadOps uint64
+       DelOps  uint64
+       ListOps uint64
+}
+
+func (s *s3awsbucketStats) TickErr(err error) {
+       if err == nil {
+               return
+       }
+       errType := fmt.Sprintf("%T", err)
+       if aerr, ok := err.(awserr.Error); ok {
+               if reqErr, ok := err.(awserr.RequestFailure); ok {
+                       // A service error occurred
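+                       // e.g. "s3.requestFailure 404 NoSuchKey" for a
+                       // request for a nonexistent key.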
+                       errType = errType + fmt.Sprintf(" %d %s", reqErr.StatusCode(), aerr.Code())
+               } else {
+                       errType = errType + fmt.Sprintf(" 000 %s", aerr.Code())
+               }
+       }
+       s.statsTicker.TickErr(err, errType)
+}
diff --git a/services/keepstore/s3aws_volume_test.go b/services/keepstore/s3aws_volume_test.go
new file mode 100644 (file)
index 0000000..d9886c0
--- /dev/null
@@ -0,0 +1,660 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package main
+
+import (
+       "bytes"
+       "context"
+       "crypto/md5"
+       "encoding/json"
+       "fmt"
+       "io"
+       "net/http"
+       "net/http/httptest"
+       "os"
+       "strings"
+       "time"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
+
+       "github.com/aws/aws-sdk-go-v2/aws"
+       "github.com/aws/aws-sdk-go-v2/service/s3"
+       "github.com/aws/aws-sdk-go-v2/service/s3/s3manager"
+
+       "github.com/johannesboyne/gofakes3"
+       "github.com/johannesboyne/gofakes3/backend/s3mem"
+       "github.com/prometheus/client_golang/prometheus"
+       "github.com/sirupsen/logrus"
+       check "gopkg.in/check.v1"
+)
+
+const (
+       S3AWSTestBucketName = "testbucket"
+)
+
+type s3AWSFakeClock struct {
+       now *time.Time
+}
+
+func (c *s3AWSFakeClock) Now() time.Time {
+       if c.now == nil {
+               return time.Now().UTC()
+       }
+       return c.now.UTC()
+}
+
+func (c *s3AWSFakeClock) Since(t time.Time) time.Duration {
+       return c.Now().Sub(t)
+}
+
+var _ = check.Suite(&StubbedS3AWSSuite{})
+
+var srv httptest.Server
+
+type StubbedS3AWSSuite struct {
+       s3server *httptest.Server
+       metadata *httptest.Server
+       cluster  *arvados.Cluster
+       handler  *handler
+       volumes  []*TestableS3AWSVolume
+}
+
+func (s *StubbedS3AWSSuite) SetUpTest(c *check.C) {
+       s.s3server = nil
+       s.metadata = nil
+       s.cluster = testCluster(c)
+       s.cluster.Volumes = map[string]arvados.Volume{
+               "zzzzz-nyw5e-000000000000000": {Driver: "S3"},
+               "zzzzz-nyw5e-111111111111111": {Driver: "S3"},
+       }
+       s.handler = &handler{}
+}
+
+func (s *StubbedS3AWSSuite) TestGeneric(c *check.C) {
+       DoGenericVolumeTests(c, false, func(t TB, cluster *arvados.Cluster, volume arvados.Volume, logger logrus.FieldLogger, metrics *volumeMetricsVecs) TestableVolume {
+               // Use a negative raceWindow so the fake S3 server's
+               // 1-second timestamp precision doesn't confuse fixRace.
+               return s.newTestableVolume(c, cluster, volume, metrics, -2*time.Second)
+       })
+}
+
+func (s *StubbedS3AWSSuite) TestGenericReadOnly(c *check.C) {
+       DoGenericVolumeTests(c, true, func(t TB, cluster *arvados.Cluster, volume arvados.Volume, logger logrus.FieldLogger, metrics *volumeMetricsVecs) TestableVolume {
+               return s.newTestableVolume(c, cluster, volume, metrics, -2*time.Second)
+       })
+}
+
+func (s *StubbedS3AWSSuite) TestIndex(c *check.C) {
+       v := s.newTestableVolume(c, s.cluster, arvados.Volume{Replication: 2}, newVolumeMetricsVecs(prometheus.NewRegistry()), 0)
+       v.IndexPageSize = 3
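+       // Write 256 blocks whose 32-hex-digit locators start with 00
+       // through ff, so each prefix below matches a predictable number
+       // of index entries.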
+       for i := 0; i < 256; i++ {
+               v.PutRaw(fmt.Sprintf("%02x%030x", i, i), []byte{102, 111, 111})
+       }
+       for _, spec := range []struct {
+               prefix      string
+               expectMatch int
+       }{
+               {"", 256},
+               {"c", 16},
+               {"bc", 1},
+               {"abc", 0},
+       } {
+               buf := new(bytes.Buffer)
+               err := v.IndexTo(spec.prefix, buf)
+               c.Check(err, check.IsNil)
+
+               idx := bytes.SplitAfter(buf.Bytes(), []byte{10})
+               c.Check(len(idx), check.Equals, spec.expectMatch+1)
+               c.Check(len(idx[len(idx)-1]), check.Equals, 0)
+       }
+}
+
+func (s *StubbedS3AWSSuite) TestSignature(c *check.C) {
+       var header http.Header
+       stub := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+               header = r.Header
+       }))
+       defer stub.Close()
+
+       // The aws-sdk-go-v2 driver only supports S3 V4 signatures. S3 v2 signatures are being phased out
+       // as of June 24, 2020. Cf. https://forums.aws.amazon.com/ann.jspa?annID=5816
+       vol := S3AWSVolume{
+               S3VolumeDriverParameters: arvados.S3VolumeDriverParameters{
+                       AccessKey: "xxx",
+                       SecretKey: "xxx",
+                       Endpoint:  stub.URL,
+                       Region:    "test-region-1",
+                       Bucket:    "test-bucket-name",
+               },
+               cluster: s.cluster,
+               logger:  ctxlog.TestLogger(c),
+               metrics: newVolumeMetricsVecs(prometheus.NewRegistry()),
+       }
+       err := vol.check("")
+       // Our test S3 server uses the older path-style addressing.
+       vol.bucket.svc.ForcePathStyle = true
+
+       c.Check(err, check.IsNil)
+       err = vol.Put(context.Background(), "acbd18db4cc2f85cedef654fccc4a4d8", []byte("foo"))
+       c.Check(err, check.IsNil)
+       c.Check(header.Get("Authorization"), check.Matches, `AWS4-HMAC-SHA256 .*`)
+}
+
+func (s *StubbedS3AWSSuite) TestIAMRoleCredentials(c *check.C) {
+       s.metadata = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+               upd := time.Now().UTC().Add(-time.Hour).Format(time.RFC3339)
+               exp := time.Now().UTC().Add(time.Hour).Format(time.RFC3339)
+               // Literal example from
+               // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html#instance-metadata-security-credentials
+               // but with updated timestamps
+               io.WriteString(w, `{"Code":"Success","LastUpdated":"`+upd+`","Type":"AWS-HMAC","AccessKeyId":"ASIAIOSFODNN7EXAMPLE","SecretAccessKey":"wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY","Token":"token","Expiration":"`+exp+`"}`)
+       }))
+       defer s.metadata.Close()
+
+       v := &S3AWSVolume{
+               S3VolumeDriverParameters: arvados.S3VolumeDriverParameters{
+                       IAMRole:  s.metadata.URL + "/latest/api/token",
+                       Endpoint: "http://localhost:12345",
+                       Region:   "test-region-1",
+                       Bucket:   "test-bucket-name",
+               },
+               cluster: s.cluster,
+               logger:  ctxlog.TestLogger(c),
+               metrics: newVolumeMetricsVecs(prometheus.NewRegistry()),
+       }
+       err := v.check(s.metadata.URL + "/latest")
+       c.Check(err, check.IsNil)
+       creds, err := v.bucket.svc.Client.Config.Credentials.Retrieve(context.Background())
+       c.Check(err, check.IsNil)
+       c.Check(creds.AccessKeyID, check.Equals, "ASIAIOSFODNN7EXAMPLE")
+       c.Check(creds.SecretAccessKey, check.Equals, "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY")
+
+       s.metadata = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+               w.WriteHeader(http.StatusNotFound)
+       }))
+       deadv := &S3AWSVolume{
+               S3VolumeDriverParameters: arvados.S3VolumeDriverParameters{
+                       IAMRole:  s.metadata.URL + "/fake-metadata/test-role",
+                       Endpoint: "http://localhost:12345",
+                       Region:   "test-region-1",
+                       Bucket:   "test-bucket-name",
+               },
+               cluster: s.cluster,
+               logger:  ctxlog.TestLogger(c),
+               metrics: newVolumeMetricsVecs(prometheus.NewRegistry()),
+       }
+       err = deadv.check(s.metadata.URL + "/latest")
+       c.Check(err, check.IsNil)
+       _, err = deadv.bucket.svc.Client.Config.Credentials.Retrieve(context.Background())
+       c.Check(err, check.ErrorMatches, `(?s).*EC2RoleRequestError: no EC2 instance role found.*`)
+       c.Check(err, check.ErrorMatches, `(?s).*404.*`)
+}
+
+func (s *StubbedS3AWSSuite) TestStats(c *check.C) {
+       v := s.newTestableVolume(c, s.cluster, arvados.Volume{Replication: 2}, newVolumeMetricsVecs(prometheus.NewRegistry()), 5*time.Minute)
+       stats := func() string {
+               buf, err := json.Marshal(v.InternalStats())
+               c.Check(err, check.IsNil)
+               return string(buf)
+       }
+
+       c.Check(stats(), check.Matches, `.*"Ops":0,.*`)
+
+       loc := "acbd18db4cc2f85cedef654fccc4a4d8"
+       _, err := v.Get(context.Background(), loc, make([]byte, 3))
+       c.Check(err, check.NotNil)
+       c.Check(stats(), check.Matches, `.*"Ops":[^0],.*`)
+       c.Check(stats(), check.Matches, `.*"s3.requestFailure 404 NoSuchKey[^"]*":[^0].*`)
+       c.Check(stats(), check.Matches, `.*"InBytes":0,.*`)
+
+       err = v.Put(context.Background(), loc, []byte("foo"))
+       c.Check(err, check.IsNil)
+       c.Check(stats(), check.Matches, `.*"OutBytes":3,.*`)
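+       // Put writes both the data object and its recent/ marker, so a
+       // single Put is expected to count as two PutOps.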
+       c.Check(stats(), check.Matches, `.*"PutOps":2,.*`)
+
+       _, err = v.Get(context.Background(), loc, make([]byte, 3))
+       c.Check(err, check.IsNil)
+       _, err = v.Get(context.Background(), loc, make([]byte, 3))
+       c.Check(err, check.IsNil)
+       c.Check(stats(), check.Matches, `.*"InBytes":6,.*`)
+}
+
+type s3AWSBlockingHandler struct {
+       requested chan *http.Request
+       unblock   chan struct{}
+}
+
+func (h *s3AWSBlockingHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+       if r.Method == "PUT" && !strings.Contains(strings.Trim(r.URL.Path, "/"), "/") {
+               // Accept PutBucket ("PUT /bucketname/"), called by
+               // newTestableVolume
+               return
+       }
+       if h.requested != nil {
+               h.requested <- r
+       }
+       if h.unblock != nil {
+               <-h.unblock
+       }
+       http.Error(w, "nothing here", http.StatusNotFound)
+}
+
+func (s *StubbedS3AWSSuite) TestGetContextCancel(c *check.C) {
+       loc := "acbd18db4cc2f85cedef654fccc4a4d8"
+       buf := make([]byte, 3)
+
+       s.testContextCancel(c, func(ctx context.Context, v *TestableS3AWSVolume) error {
+               _, err := v.Get(ctx, loc, buf)
+               return err
+       })
+}
+
+func (s *StubbedS3AWSSuite) TestCompareContextCancel(c *check.C) {
+       loc := "acbd18db4cc2f85cedef654fccc4a4d8"
+       buf := []byte("bar")
+
+       s.testContextCancel(c, func(ctx context.Context, v *TestableS3AWSVolume) error {
+               return v.Compare(ctx, loc, buf)
+       })
+}
+
+func (s *StubbedS3AWSSuite) TestPutContextCancel(c *check.C) {
+       loc := "acbd18db4cc2f85cedef654fccc4a4d8"
+       buf := []byte("foo")
+
+       s.testContextCancel(c, func(ctx context.Context, v *TestableS3AWSVolume) error {
+               return v.Put(ctx, loc, buf)
+       })
+}
+
+func (s *StubbedS3AWSSuite) testContextCancel(c *check.C, testFunc func(context.Context, *TestableS3AWSVolume) error) {
+       handler := &s3AWSBlockingHandler{}
+       s.s3server = httptest.NewServer(handler)
+       defer s.s3server.Close()
+
+       v := s.newTestableVolume(c, s.cluster, arvados.Volume{Replication: 2}, newVolumeMetricsVecs(prometheus.NewRegistry()), 5*time.Minute)
+
+       ctx, cancel := context.WithCancel(context.Background())
+
+       handler.requested = make(chan *http.Request)
+       handler.unblock = make(chan struct{})
+       defer close(handler.unblock)
+
+       doneFunc := make(chan struct{})
+       go func() {
+               err := testFunc(ctx, v)
+               c.Check(err, check.Equals, context.Canceled)
+               close(doneFunc)
+       }()
+
+       timeout := time.After(10 * time.Second)
+
+       // Wait for the stub server to receive a request, meaning
+       // Get() is waiting for an s3 operation.
+       select {
+       case <-timeout:
+               c.Fatal("timed out waiting for test func to call our handler")
+       case <-doneFunc:
+               c.Fatal("test func finished without even calling our handler!")
+       case <-handler.requested:
+       }
+
+       cancel()
+
+       select {
+       case <-timeout:
+               c.Fatal("timed out")
+       case <-doneFunc:
+       }
+}
+
+func (s *StubbedS3AWSSuite) TestBackendStates(c *check.C) {
+       s.cluster.Collections.BlobTrashLifetime.Set("1h")
+       s.cluster.Collections.BlobSigningTTL.Set("1h")
+
+       v := s.newTestableVolume(c, s.cluster, arvados.Volume{Replication: 2}, newVolumeMetricsVecs(prometheus.NewRegistry()), 5*time.Minute)
+       var none time.Time
+
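+       // putS3Obj writes key directly to the fake S3 backend, backdating
+       // its Last-Modified time to t; a zero t means the object should
+       // not exist at all.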
+       putS3Obj := func(t time.Time, key string, data []byte) {
+               if t == none {
+                       return
+               }
+               v.serverClock.now = &t
+               uploader := s3manager.NewUploaderWithClient(v.bucket.svc)
+               _, err := uploader.UploadWithContext(context.Background(), &s3manager.UploadInput{
+                       Bucket: aws.String(v.bucket.bucket),
+                       Key:    aws.String(key),
+                       Body:   bytes.NewReader(data),
+               })
+               if err != nil {
+                       panic(err)
+               }
+               v.serverClock.now = nil
+               _, err = v.Head(key)
+               if err != nil {
+                       panic(err)
+               }
+       }
+
+       t0 := time.Now()
+       nextKey := 0
+       for _, scenario := range []struct {
+               label               string
+               dataT               time.Time
+               recentT             time.Time
+               trashT              time.Time
+               canGet              bool
+               canTrash            bool
+               canGetAfterTrash    bool
+               canUntrash          bool
+               haveTrashAfterEmpty bool
+               freshAfterEmpty     bool
+       }{
+               {
+                       "No related objects",
+                       none, none, none,
+                       false, false, false, false, false, false,
+               },
+               {
+                       // Stored by older version, or there was a
+                       // race between EmptyTrash and Put: Trash is a
+                       // no-op even though the data object is very
+                       // old
+                       "No recent/X",
+                       t0.Add(-48 * time.Hour), none, none,
+                       true, true, true, false, false, false,
+               },
+               {
+                       "Not trash, but old enough to be eligible for trash",
+                       t0.Add(-24 * time.Hour), t0.Add(-2 * time.Hour), none,
+                       true, true, false, false, false, false,
+               },
+               {
+                       "Not trash, and not old enough to be eligible for trash",
+                       t0.Add(-24 * time.Hour), t0.Add(-30 * time.Minute), none,
+                       true, true, true, false, false, false,
+               },
+               {
+                       "Trashed + untrashed copies exist, due to recent race between Trash and Put",
+                       t0.Add(-24 * time.Hour), t0.Add(-3 * time.Minute), t0.Add(-2 * time.Minute),
+                       true, true, true, true, true, false,
+               },
+               {
+                       "Trashed + untrashed copies exist, trash nearly eligible for deletion: prone to Trash race",
+                       t0.Add(-24 * time.Hour), t0.Add(-12 * time.Hour), t0.Add(-59 * time.Minute),
+                       true, false, true, true, true, false,
+               },
+               {
+                       "Trashed + untrashed copies exist, trash is eligible for deletion: prone to Trash race",
+                       t0.Add(-24 * time.Hour), t0.Add(-12 * time.Hour), t0.Add(-61 * time.Minute),
+                       true, false, true, true, false, false,
+               },
+               {
+                       "Trashed + untrashed copies exist, due to old race between Put and unfinished Trash: emptying trash is unsafe",
+                       t0.Add(-24 * time.Hour), t0.Add(-12 * time.Hour), t0.Add(-12 * time.Hour),
+                       true, false, true, true, true, true,
+               },
+               {
+                       "Trashed + untrashed copies exist, used to be unsafe to empty, but since made safe by fixRace+Touch",
+                       t0.Add(-time.Second), t0.Add(-time.Second), t0.Add(-12 * time.Hour),
+                       true, true, true, true, false, false,
+               },
+               {
+                       "Trashed + untrashed copies exist because Trash operation was interrupted (no race)",
+                       t0.Add(-24 * time.Hour), t0.Add(-24 * time.Hour), t0.Add(-12 * time.Hour),
+                       true, false, true, true, false, false,
+               },
+               {
+                       "Trash, not yet eligible for deletion",
+                       none, t0.Add(-12 * time.Hour), t0.Add(-time.Minute),
+                       false, false, false, true, true, false,
+               },
+               {
+                       "Trash, not yet eligible for deletion, prone to races",
+                       none, t0.Add(-12 * time.Hour), t0.Add(-59 * time.Minute),
+                       false, false, false, true, true, false,
+               },
+               {
+                       "Trash, eligible for deletion",
+                       none, t0.Add(-12 * time.Hour), t0.Add(-2 * time.Hour),
+                       false, false, false, true, false, false,
+               },
+               {
+                       "Erroneously trashed during a race, detected before BlobTrashLifetime",
+                       none, t0.Add(-30 * time.Minute), t0.Add(-29 * time.Minute),
+                       true, false, true, true, true, false,
+               },
+               {
+                       "Erroneously trashed during a race, rescue during EmptyTrash despite reaching BlobTrashLifetime",
+                       none, t0.Add(-90 * time.Minute), t0.Add(-89 * time.Minute),
+                       true, false, true, true, true, false,
+               },
+               {
+                       "Trashed copy exists with no recent/* marker (cause unknown); repair by untrashing",
+                       none, none, t0.Add(-time.Minute),
+                       false, false, false, true, true, true,
+               },
+       } {
+               c.Log("Scenario: ", scenario.label)
+
+               // We have a few tests to run for each scenario, and
+               // the tests are expected to change state. By calling
+               // this setup func between tests, we (re)create the
+               // scenario as specified, using a new unique block
+               // locator to prevent interference from previous
+               // tests.
+
+               setupScenario := func() (string, []byte) {
+                       nextKey++
+                       blk := []byte(fmt.Sprintf("%d", nextKey))
+                       loc := fmt.Sprintf("%x", md5.Sum(blk))
+                       c.Log("\t", loc)
+                       putS3Obj(scenario.dataT, loc, blk)
+                       putS3Obj(scenario.recentT, "recent/"+loc, nil)
+                       putS3Obj(scenario.trashT, "trash/"+loc, blk)
+                       v.serverClock.now = &t0
+                       return loc, blk
+               }
+
+               // Check canGet
+               loc, blk := setupScenario()
+               buf := make([]byte, len(blk))
+               _, err := v.Get(context.Background(), loc, buf)
+               c.Check(err == nil, check.Equals, scenario.canGet)
+               if err != nil {
+                       c.Check(os.IsNotExist(err), check.Equals, true)
+               }
+
+               // Call Trash, then check canTrash and canGetAfterTrash
+               loc, _ = setupScenario()
+               err = v.Trash(loc)
+               c.Check(err == nil, check.Equals, scenario.canTrash)
+               _, err = v.Get(context.Background(), loc, buf)
+               c.Check(err == nil, check.Equals, scenario.canGetAfterTrash)
+               if err != nil {
+                       c.Check(os.IsNotExist(err), check.Equals, true)
+               }
+
+               // Call Untrash, then check canUntrash
+               loc, _ = setupScenario()
+               err = v.Untrash(loc)
+               c.Check(err == nil, check.Equals, scenario.canUntrash)
+               if scenario.dataT != none || scenario.trashT != none {
+                       // In all scenarios where the data exists, we
+                       // should be able to Get after Untrash --
+                       // regardless of timestamps, errors, race
+                       // conditions, etc.
+                       _, err = v.Get(context.Background(), loc, buf)
+                       c.Check(err, check.IsNil)
+               }
+
+               // Call EmptyTrash, then check haveTrashAfterEmpty and
+               // freshAfterEmpty
+               loc, _ = setupScenario()
+               v.EmptyTrash()
+               _, err = v.Head("trash/" + loc)
+               c.Check(err == nil, check.Equals, scenario.haveTrashAfterEmpty)
+               if scenario.freshAfterEmpty {
+                       t, err := v.Mtime(loc)
+                       c.Check(err, check.IsNil)
+                       // new mtime must be current (with an
+                       // allowance for 1s timestamp precision)
+                       c.Check(t.After(t0.Add(-time.Second)), check.Equals, true)
+               }
+
+               // Check for current Mtime after Put (applies to all
+               // scenarios)
+               loc, blk = setupScenario()
+               err = v.Put(context.Background(), loc, blk)
+               c.Check(err, check.IsNil)
+               t, err := v.Mtime(loc)
+               c.Check(err, check.IsNil)
+               c.Check(t.After(t0.Add(-time.Second)), check.Equals, true)
+       }
+}
+
+type TestableS3AWSVolume struct {
+       *S3AWSVolume
+       server      *httptest.Server
+       c           *check.C
+       serverClock *s3AWSFakeClock
+}
+
+type LogrusLog struct {
+       log *logrus.FieldLogger
+}
+
+func (l LogrusLog) Print(level gofakes3.LogLevel, v ...interface{}) {
+       switch level {
+       case gofakes3.LogErr:
+               (*l.log).Errorln(v...)
+       case gofakes3.LogWarn:
+               (*l.log).Warnln(v...)
+       case gofakes3.LogInfo:
+               (*l.log).Infoln(v...)
+       default:
+               panic("unknown level")
+       }
+}
+
+func (s *StubbedS3AWSSuite) newTestableVolume(c *check.C, cluster *arvados.Cluster, volume arvados.Volume, metrics *volumeMetricsVecs, raceWindow time.Duration) *TestableS3AWSVolume {
+
+       clock := &s3AWSFakeClock{}
+       // Set up an in-memory fake S3 backend.
+       backend := s3mem.New(s3mem.WithTimeSource(clock))
+
+       // To enable GoFakeS3 debug logging, pass logger to gofakes3.WithLogger()
+       /* logger := new(LogrusLog)
+       ctxLogger := ctxlog.FromContext(context.Background())
+       logger.log = &ctxLogger */
+       faker := gofakes3.New(backend, gofakes3.WithTimeSource(clock), gofakes3.WithLogger(nil), gofakes3.WithTimeSkewLimit(0))
+       srv := httptest.NewServer(faker.Server())
+
+       endpoint := srv.URL
+       if s.s3server != nil {
+               endpoint = s.s3server.URL
+       }
+
+       iamRole, accessKey, secretKey := "", "xxx", "xxx"
+       if s.metadata != nil {
+               iamRole, accessKey, secretKey = s.metadata.URL+"/fake-metadata/test-role", "", ""
+       }
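+       // Tests that set s.s3server or s.metadata get a volume that talks
+       // to their own stub S3 server or fake instance-metadata service
+       // instead of the defaults above.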
+
+       v := &TestableS3AWSVolume{
+               S3AWSVolume: &S3AWSVolume{
+                       S3VolumeDriverParameters: arvados.S3VolumeDriverParameters{
+                               IAMRole:            iamRole,
+                               AccessKey:          accessKey,
+                               SecretKey:          secretKey,
+                               Bucket:             S3AWSTestBucketName,
+                               Endpoint:           endpoint,
+                               Region:             "test-region-1",
+                               LocationConstraint: true,
+                               UnsafeDelete:       true,
+                               IndexPageSize:      1000,
+                       },
+                       cluster: cluster,
+                       volume:  volume,
+                       logger:  ctxlog.TestLogger(c),
+                       metrics: metrics,
+               },
+               c:           c,
+               server:      srv,
+               serverClock: clock,
+       }
+       c.Assert(v.S3AWSVolume.check(""), check.IsNil)
+       // Our test S3 server uses the older path-style addressing.
+       v.S3AWSVolume.bucket.svc.ForcePathStyle = true
+       // Create the testbucket
+       input := &s3.CreateBucketInput{
+               Bucket: aws.String(S3AWSTestBucketName),
+       }
+       req := v.S3AWSVolume.bucket.svc.CreateBucketRequest(input)
+       _, err := req.Send(context.Background())
+       c.Assert(err, check.IsNil)
+       // We couldn't set RaceWindow until now because check()
+       // rejects negative values.
+       v.S3AWSVolume.RaceWindow = arvados.Duration(raceWindow)
+       return v
+}
+
+// PutRaw skips the ContentMD5 test
+func (v *TestableS3AWSVolume) PutRaw(loc string, block []byte) {
+
+       r := NewCountingReader(bytes.NewReader(block), v.bucket.stats.TickOutBytes)
+
+       uploader := s3manager.NewUploaderWithClient(v.bucket.svc, func(u *s3manager.Uploader) {
+               u.PartSize = 5 * 1024 * 1024
+               u.Concurrency = 13
+       })
+
+       _, err := uploader.Upload(&s3manager.UploadInput{
+               Bucket: aws.String(v.bucket.bucket),
+               Key:    aws.String(loc),
+               Body:   r,
+       })
+       if err != nil {
+               v.logger.Printf("PutRaw: %s: %+v", loc, err)
+       }
+
+       empty := bytes.NewReader([]byte{})
+       _, err = uploader.Upload(&s3manager.UploadInput{
+               Bucket: aws.String(v.bucket.bucket),
+               Key:    aws.String("recent/" + loc),
+               Body:   empty,
+       })
+       if err != nil {
+               v.logger.Printf("PutRaw: recent/%s: %+v", loc, err)
+       }
+}
+
+// TouchWithDate turns back the clock while doing a Touch(). We assume
+// there are no other operations happening on the same fake S3 server
+// while we do this.
+func (v *TestableS3AWSVolume) TouchWithDate(locator string, lastPut time.Time) {
+       v.serverClock.now = &lastPut
+
+       uploader := s3manager.NewUploaderWithClient(v.bucket.svc)
+       empty := bytes.NewReader([]byte{})
+       _, err := uploader.UploadWithContext(context.Background(), &s3manager.UploadInput{
+               Bucket: aws.String(v.bucket.bucket),
+               Key:    aws.String("recent/" + locator),
+               Body:   empty,
+       })
+       if err != nil {
+               panic(err)
+       }
+
+       v.serverClock.now = nil
+}
+
+func (v *TestableS3AWSVolume) Teardown() {
+       v.server.Close()
+}
+
+func (v *TestableS3AWSVolume) ReadWriteOperationLabelValues() (r, w string) {
+       return "get", "put"
+}
diff --git a/services/nodemanager/.gitignore b/services/nodemanager/.gitignore
deleted file mode 120000 (symlink)
index ed3b362..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../sdk/python/.gitignore
\ No newline at end of file
diff --git a/services/nodemanager/MANIFEST.in b/services/nodemanager/MANIFEST.in
deleted file mode 100644 (file)
index 8410420..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-include agpl-3.0.txt
-include README.rst
-include arvados_version.py
-include arvados-node-manager.service
diff --git a/services/nodemanager/README.rst b/services/nodemanager/README.rst
deleted file mode 100644 (file)
index 1d725e0..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-.. Copyright (C) The Arvados Authors. All rights reserved.
-..
-.. SPDX-License-Identifier: AGPL-3.0
-
-====================
-Arvados Node Manager
-====================
-
-Overview
---------
-
-This package provides ``arvados-node-manager``.  It dynamically starts
-and stops compute nodes on an Arvados_ cloud installation based on job
-demand.
-
-.. _Arvados: https://arvados.org/
-
-Setup
------
-
-1. Install the package.
-
-2. Write a configuration file.  ``doc/ec2.example.cfg`` documents all
-   of the options available, with specific tunables for EC2 clouds.
-
-3. Run ``arvados-node-manager --config YOURCONFIGFILE`` using whatever
-   supervisor you like (e.g., runit).
-
-Testing and Development
------------------------
-
-To run tests, just run::
-
-  python setup.py test
-
-Our `hacking guide
-<https://arvados.org/projects/arvados/wiki/Hacking_Node_Manager>`_
-provides an architectural overview of the Arvados Node Manager to help
-you find your way around the source.  The `Lifecycle of an Arvados
-compute node
-<https://arvados.org/projects/arvados/wiki/Lifecycle_of_an_Arvados_compute_node>`_
-page explains how it works in concert with other Arvados components to
-prepare a node for compute work.
diff --git a/services/nodemanager/agpl-3.0.txt b/services/nodemanager/agpl-3.0.txt
deleted file mode 100644 (file)
index dba13ed..0000000
+++ /dev/null
@@ -1,661 +0,0 @@
-                    GNU AFFERO GENERAL PUBLIC LICENSE
-                       Version 3, 19 November 2007
-
- Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-                            Preamble
-
-  The GNU Affero General Public License is a free, copyleft license for
-software and other kinds of works, specifically designed to ensure
-cooperation with the community in the case of network server software.
-
-  The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works.  By contrast,
-our General Public Licenses are intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users.
-
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
-
-  Developers that use our General Public Licenses protect your rights
-with two steps: (1) assert copyright on the software, and (2) offer
-you this License which gives you legal permission to copy, distribute
-and/or modify the software.
-
-  A secondary benefit of defending all users' freedom is that
-improvements made in alternate versions of the program, if they
-receive widespread use, become available for other developers to
-incorporate.  Many developers of free software are heartened and
-encouraged by the resulting cooperation.  However, in the case of
-software used on network servers, this result may fail to come about.
-The GNU General Public License permits making a modified version and
-letting the public access it on a server without ever releasing its
-source code to the public.
-
-  The GNU Affero General Public License is designed specifically to
-ensure that, in such cases, the modified source code becomes available
-to the community.  It requires the operator of a network server to
-provide the source code of the modified version running there to the
-users of that server.  Therefore, public use of a modified version, on
-a publicly accessible server, gives the public access to the source
-code of the modified version.
-
-  An older license, called the Affero General Public License and
-published by Affero, was designed to accomplish similar goals.  This is
-a different license, not a version of the Affero GPL, but Affero has
-released a new version of the Affero GPL which permits relicensing under
-this license.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.
-
-                       TERMS AND CONDITIONS
-
-  0. Definitions.
-
-  "This License" refers to version 3 of the GNU Affero General Public License.
-
-  "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-
-  "The Program" refers to any copyrightable work licensed under this
-License.  Each licensee is addressed as "you".  "Licensees" and
-"recipients" may be individuals or organizations.
-
-  To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy.  The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-
-  A "covered work" means either the unmodified Program or a work based
-on the Program.
-
-  To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy.  Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-
-  To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies.  Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-
-  An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License.  If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-
-  1. Source Code.
-
-  The "source code" for a work means the preferred form of the work
-for making modifications to it.  "Object code" means any non-source
-form of a work.
-
-  A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-
-  The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form.  A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-
-  The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities.  However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work.  For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-
-  The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-Source.
-
-  The Corresponding Source for a work in source code form is that
-same work.
-
-  2. Basic Permissions.
-
-  All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met.  This License explicitly affirms your unlimited
-permission to run the unmodified Program.  The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work.  This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-
-  You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force.  You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright.  Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-
-  Conveying under any other circumstances is permitted solely under
-the conditions stated below.  Sublicensing is not allowed; section 10
-makes it unnecessary.
-
-  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-
-  No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-measures.
-
-  When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-
-  4. Conveying Verbatim Copies.
-
-  You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-
-  You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-
-  5. Conveying Modified Source Versions.
-
-  You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-
-    a) The work must carry prominent notices stating that you modified
-    it, and giving a relevant date.
-
-    b) The work must carry prominent notices stating that it is
-    released under this License and any conditions added under section
-    7.  This requirement modifies the requirement in section 4 to
-    "keep intact all notices".
-
-    c) You must license the entire work, as a whole, under this
-    License to anyone who comes into possession of a copy.  This
-    License will therefore apply, along with any applicable section 7
-    additional terms, to the whole of the work, and all its parts,
-    regardless of how they are packaged.  This License gives no
-    permission to license the work in any other way, but it does not
-    invalidate such permission if you have separately received it.
-
-    d) If the work has interactive user interfaces, each must display
-    Appropriate Legal Notices; however, if the Program has interactive
-    interfaces that do not display Appropriate Legal Notices, your
-    work need not make them do so.
-
-  A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit.  Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-
-  6. Conveying Non-Source Forms.
-
-  You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-
-    a) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by the
-    Corresponding Source fixed on a durable physical medium
-    customarily used for software interchange.
-
-    b) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by a
-    written offer, valid for at least three years and valid for as
-    long as you offer spare parts or customer support for that product
-    model, to give anyone who possesses the object code either (1) a
-    copy of the Corresponding Source for all the software in the
-    product that is covered by this License, on a durable physical
-    medium customarily used for software interchange, for a price no
-    more than your reasonable cost of physically performing this
-    conveying of source, or (2) access to copy the
-    Corresponding Source from a network server at no charge.
-
-    c) Convey individual copies of the object code with a copy of the
-    written offer to provide the Corresponding Source.  This
-    alternative is allowed only occasionally and noncommercially, and
-    only if you received the object code with such an offer, in accord
-    with subsection 6b.
-
-    d) Convey the object code by offering access from a designated
-    place (gratis or for a charge), and offer equivalent access to the
-    Corresponding Source in the same way through the same place at no
-    further charge.  You need not require recipients to copy the
-    Corresponding Source along with the object code.  If the place to
-    copy the object code is a network server, the Corresponding Source
-    may be on a different server (operated by you or a third party)
-    that supports equivalent copying facilities, provided you maintain
-    clear directions next to the object code saying where to find the
-    Corresponding Source.  Regardless of what server hosts the
-    Corresponding Source, you remain obligated to ensure that it is
-    available for as long as needed to satisfy these requirements.
-
-    e) Convey the object code using peer-to-peer transmission, provided
-    you inform other peers where the object code and Corresponding
-    Source of the work are being offered to the general public at no
-    charge under subsection 6d.
-
-  A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-
-  A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling.  In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage.  For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product.  A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-
-  "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source.  The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-
-  If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information.  But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-
-  The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed.  Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-
-  Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-
-  7. Additional Terms.
-
-  "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law.  If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-
-  When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it.  (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.)  You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-
-  Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-
-    a) Disclaiming warranty or limiting liability differently from the
-    terms of sections 15 and 16 of this License; or
-
-    b) Requiring preservation of specified reasonable legal notices or
-    author attributions in that material or in the Appropriate Legal
-    Notices displayed by works containing it; or
-
-    c) Prohibiting misrepresentation of the origin of that material, or
-    requiring that modified versions of such material be marked in
-    reasonable ways as different from the original version; or
-
-    d) Limiting the use for publicity purposes of names of licensors or
-    authors of the material; or
-
-    e) Declining to grant rights under trademark law for use of some
-    trade names, trademarks, or service marks; or
-
-    f) Requiring indemnification of licensors and authors of that
-    material by anyone who conveys the material (or modified versions of
-    it) with contractual assumptions of liability to the recipient, for
-    any liability that these contractual assumptions directly impose on
-    those licensors and authors.
-
-  All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10.  If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term.  If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-
-  If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-
-  Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-
-  8. Termination.
-
-  You may not propagate or modify a covered work except as expressly
-provided under this License.  Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-
-  However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-
-  Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
-  Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License.  If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-
-  9. Acceptance Not Required for Having Copies.
-
-  You are not required to accept this License in order to receive or
-run a copy of the Program.  Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance.  However,
-nothing other than this License grants you permission to propagate or
-modify any covered work.  These actions infringe copyright if you do
-not accept this License.  Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-
-  10. Automatic Licensing of Downstream Recipients.
-
-  Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License.  You are not responsible
-for enforcing compliance by third parties with this License.
-
-  An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations.  If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-
-  You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License.  For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-
-  11. Patents.
-
-  A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based.  The
-work thus licensed is called the contributor's "contributor version".
-
-  A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version.  For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-
-  Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-
-  In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement).  To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-
-  If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients.  "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-
-  If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-
-  A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License.  You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-
-  Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-
-  12. No Surrender of Others' Freedom.
-
-  If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all.  For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-
-  13. Remote Network Interaction; Use with the GNU General Public License.
-
-  Notwithstanding any other provision of this License, if you modify the
-Program, your modified version must prominently offer all users
-interacting with it remotely through a computer network (if your version
-supports such interaction) an opportunity to receive the Corresponding
-Source of your version by providing access to the Corresponding Source
-from a network server at no charge, through some standard or customary
-means of facilitating copying of software.  This Corresponding Source
-shall include the Corresponding Source for any work covered by version 3
-of the GNU General Public License that is incorporated pursuant to the
-following paragraph.
-
-  Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU General Public License into a single
-combined work, and to convey the resulting work.  The terms of this
-License will continue to apply to the part which is the covered work,
-but the work with which it is combined will remain governed by version
-3 of the GNU General Public License.
-
-  14. Revised Versions of this License.
-
-  The Free Software Foundation may publish revised and/or new versions of
-the GNU Affero General Public License from time to time.  Such new versions
-will be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
-  Each version is given a distinguishing version number.  If the
-Program specifies that a certain numbered version of the GNU Affero General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation.  If the Program does not specify a version number of the
-GNU Affero General Public License, you may choose any version ever published
-by the Free Software Foundation.
-
-  If the Program specifies that a proxy can decide which future
-versions of the GNU Affero General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-
-  Later license versions may give you additional or different
-permissions.  However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
-
-  15. Disclaimer of Warranty.
-
-  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-  16. Limitation of Liability.
-
-  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-
-  17. Interpretation of Sections 15 and 16.
-
-  If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-
-                     END OF TERMS AND CONDITIONS
-
-            How to Apply These Terms to Your New Programs
-
-  If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
-  To do so, attach the following notices to the program.  It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-    <one line to give the program's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Affero General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Affero General Public License for more details.
-
-    You should have received a copy of the GNU Affero General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-Also add information on how to contact you by electronic and paper mail.
-
-  If your software can interact with users remotely through a computer
-network, you should also make sure that it provides a way for users to
-get its source.  For example, if your program is a web application, its
-interface could display a "Source" link that leads users to an archive
-of the code.  There are many ways you could offer source, and different
-solutions will be better for different programs; see section 13 for the
-specific requirements.
-
-  You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU AGPL, see
-<http://www.gnu.org/licenses/>.
diff --git a/services/nodemanager/arvados-node-manager.service b/services/nodemanager/arvados-node-manager.service
deleted file mode 100644 (file)
index 38c525b..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-[Unit]
-Description=Arvados Node Manager Daemon
-Documentation=https://doc.arvados.org/
-After=network.target
-AssertPathExists=/etc/arvados-node-manager/config.ini
-
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-StartLimitInterval=0
-
-# systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
-StartLimitIntervalSec=0
-
-[Service]
-EnvironmentFile=-/etc/default/arvados-node-manager
-LimitDATA=3145728K
-LimitRSS=3145728K
-LimitMEMLOCK=3145728K
-LimitNOFILE=10240
-Type=simple
-ExecStart=/usr/bin/env sh -c '/usr/bin/arvados-node-manager --foreground --config /etc/arvados-node-manager/config.ini 2>&1 | cat'
-Restart=always
-RestartSec=1
-
-# systemd<=219 (centos:7, debian:8, ubuntu:trusty) obeys StartLimitInterval in the [Service] section
-StartLimitInterval=0
-
-[Install]
-WantedBy=multi-user.target
diff --git a/services/nodemanager/arvados_version.py b/services/nodemanager/arvados_version.py
deleted file mode 100644 (file)
index 0c65369..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import subprocess
-import time
-import os
-import re
-
-SETUP_DIR = os.path.dirname(os.path.abspath(__file__))
-
-def choose_version_from():
-    sdk_ts = subprocess.check_output(
-        ['git', 'log', '--first-parent', '--max-count=1',
-         '--format=format:%ct', os.path.join(SETUP_DIR, "../../sdk/python")]).strip()
-    cwl_ts = subprocess.check_output(
-        ['git', 'log', '--first-parent', '--max-count=1',
-         '--format=format:%ct', SETUP_DIR]).strip()
-    if int(sdk_ts) > int(cwl_ts):
-        getver = os.path.join(SETUP_DIR, "../../sdk/python")
-    else:
-        getver = SETUP_DIR
-    return getver
-
-def git_version_at_commit():
-    curdir = choose_version_from()
-    myhash = subprocess.check_output(['git', 'log', '-n1', '--first-parent',
-                                       '--format=%H', curdir]).strip()
-    myversion = subprocess.check_output([curdir+'/../../build/version-at-commit.sh', myhash]).strip().decode()
-    return myversion
-
-def save_version(setup_dir, module, v):
-  with open(os.path.join(setup_dir, module, "_version.py"), 'wt') as fp:
-      return fp.write("__version__ = '%s'\n" % v)
-
-def read_version(setup_dir, module):
-  with open(os.path.join(setup_dir, module, "_version.py"), 'rt') as fp:
-      return re.match("__version__ = '(.*)'$", fp.read()).groups()[0]
-
-def get_version(setup_dir, module):
-    env_version = os.environ.get("ARVADOS_BUILDING_VERSION")
-
-    if env_version:
-        save_version(setup_dir, module, env_version)
-    else:
-        try:
-            save_version(setup_dir, module, git_version_at_commit())
-        except (subprocess.CalledProcessError, OSError):
-            pass
-
-    return read_version(setup_dir, module)
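
For context, a minimal sketch of how this version helper would typically be consumed (the "arvnodeman" module name is an assumption; the consuming setup.py is not shown in this diff):

    import arvados_version

    # Writes/reads arvnodeman/_version.py and returns the version string,
    # honoring ARVADOS_BUILDING_VERSION if it is set in the environment.
    version = arvados_version.get_version(arvados_version.SETUP_DIR, "arvnodeman")
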
diff --git a/services/nodemanager/arvnodeman/__init__.py b/services/nodemanager/arvnodeman/__init__.py
deleted file mode 100644 (file)
index 3f94807..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import _strptime  # See <http://bugs.python.org/issue7980#msg221094>.
-import logging
-
-logger = logging.getLogger('arvnodeman')
-logger.addHandler(logging.NullHandler())
diff --git a/services/nodemanager/arvnodeman/baseactor.py b/services/nodemanager/arvnodeman/baseactor.py
deleted file mode 100644 (file)
index bdfe5d4..0000000
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import errno
-import logging
-import os
-import signal
-import time
-import threading
-import traceback
-
-import pykka
-
-from .status import tracker
-
-class _TellCallableProxy(object):
-    """Internal helper class for proxying callables."""
-
-    def __init__(self, ref, attr_path):
-        self.actor_ref = ref
-        self._attr_path = attr_path
-
-    def __call__(self, *args, **kwargs):
-        message = {
-            'command': 'pykka_call',
-            'attr_path': self._attr_path,
-            'args': args,
-            'kwargs': kwargs,
-        }
-        self.actor_ref.tell(message)
-
-
-class TellActorProxy(pykka.ActorProxy):
-    """ActorProxy in which all calls are implemented as using tell().
-
-    The standard pykka.ActorProxy always uses ask() and returns a Future.  If
-    the target method raises an exception, it is placed in the Future object
-    and re-raised when get() is called on the Future.  Unfortunately, most
-    messaging in Node Manager is asynchronous and the caller does not store the
-    Future object returned by the call to ActorProxy.  As a result, exceptions
-    resulting from these calls end up in limbo, neither reported in the logs
-    nor handled by on_failure().
-
-    The TellActorProxy uses tell() instead of ask() and does not return a
-    Future object.  As a result, if the target method raises an exception, it
-    will be logged and on_failure() will be called as intended.
-
-    """
-
-    def __repr__(self):
-        return '<ActorProxy for %s, attr_path=%s>' % (
-            self.actor_ref, self._attr_path)
-
-    def __getattr__(self, name):
-        """Get a callable from the actor."""
-        attr_path = self._attr_path + (name,)
-        if attr_path not in self._known_attrs:
-            self._known_attrs = self._get_attributes()
-        attr_info = self._known_attrs.get(attr_path)
-        if attr_info is None:
-            raise AttributeError('%s has no attribute "%s"' % (self, name))
-        if attr_info['callable']:
-            if attr_path not in self._callable_proxies:
-                self._callable_proxies[attr_path] = _TellCallableProxy(
-                    self.actor_ref, attr_path)
-            return self._callable_proxies[attr_path]
-        else:
-            raise AttributeError('attribute "%s" is not a callable on %s' % (name, self))
-
-class TellableActorRef(pykka.ActorRef):
-    """ActorRef adding the tell_proxy() method to get TellActorProxy."""
-
-    def tell_proxy(self):
-        return TellActorProxy(self)
-
-class BaseNodeManagerActor(pykka.ThreadingActor):
-    """Base class for actors in node manager, redefining actor_ref as a
-    TellableActorRef and providing a default on_failure handler.
-    """
-
-    def __init__(self, *args, **kwargs):
-         super(pykka.ThreadingActor, self).__init__(*args, **kwargs)
-         self.actor_ref = TellableActorRef(self)
-         self._killfunc = kwargs.get("killfunc", os.kill)
-
-    def on_failure(self, exception_type, exception_value, tb):
-        lg = getattr(self, "_logger", logging)
-        if (exception_type in (threading.ThreadError, MemoryError) or
-            exception_type is OSError and exception_value.errno == errno.ENOMEM):
-            lg.critical("Unhandled exception is a fatal error, killing Node Manager")
-            self._killfunc(os.getpid(), signal.SIGKILL)
-        tracker.counter_add('actor_exceptions')
-
-    def ping(self):
-        return True
-
-    def get_thread(self):
-        return threading.current_thread()
-
-class WatchdogActor(pykka.ThreadingActor):
-    def __init__(self, timeout, *args, **kwargs):
-         super(pykka.ThreadingActor, self).__init__(*args, **kwargs)
-         self.timeout = timeout
-         self.actors = [a.proxy() for a in args]
-         self.actor_ref = TellableActorRef(self)
-         self._later = self.actor_ref.tell_proxy()
-         self._killfunc = kwargs.get("killfunc", os.kill)
-
-    def kill_self(self, e, act):
-        lg = getattr(self, "_logger", logging)
-        lg.critical("Watchdog exception", exc_info=e)
-        lg.critical("Actor %s watchdog ping time out, killing Node Manager", act)
-        self._killfunc(os.getpid(), signal.SIGKILL)
-
-    def on_start(self):
-        self._later.run()
-
-    def run(self):
-        a = None
-        try:
-            for a in self.actors:
-                a.ping().get(self.timeout)
-            time.sleep(20)
-            self._later.run()
-        except Exception as e:
-            self.kill_self(e, a)
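
The TellActorProxy docstring above contrasts tell() with pykka's default ask()-based proxy. A minimal sketch of that difference, assuming the pykka-1.x message protocol this file relies on (the Worker actor is hypothetical, not from the removed source):

    import pykka

    class Worker(pykka.ThreadingActor):
        def do_work(self):
            raise RuntimeError("boom")

    proxy = Worker.start().proxy()

    # ask()-style call: the exception ends up in the returned Future and only
    # surfaces if the caller keeps the Future and calls get() on it.
    future = proxy.do_work()
    # future.get()  # would re-raise RuntimeError here

    # tell()-style call (the message _TellCallableProxy sends): fire-and-forget,
    # so an exception raised by do_work() reaches the actor's on_failure() hook
    # instead of being lost in a discarded Future.
    proxy.actor_ref.tell({
        'command': 'pykka_call',
        'attr_path': ('do_work',),
        'args': (),
        'kwargs': {},
    })

    proxy.actor_ref.stop()
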
diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py
deleted file mode 100644 (file)
index afc4f1c..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import logging
-import time
-
-import pykka
-
-from .config import actor_class
-
-def _notify_subscribers(response, subscribers):
-    """Send the response to all the subscriber methods.
-
-    If any of the subscriber actors have stopped, remove them from the
-    subscriber set.
-    """
-    dead_subscribers = set()
-    for subscriber in subscribers:
-        try:
-            subscriber(response)
-        except pykka.ActorDeadError:
-            dead_subscribers.add(subscriber)
-    subscribers.difference_update(dead_subscribers)
-
-class RemotePollLoopActor(actor_class):
-    """Abstract actor class to regularly poll a remote service.
-
-    This actor sends regular requests to a remote service, and sends each
-    response to subscribers.  It takes care of error handling, and retrying
-    requests with exponential backoff.
-
-    To use this actor, define the _send_request method.  If you also
-    define an _item_key method, this class will support subscribing to
-    a specific item by key in responses.
-    """
-    def __init__(self, client, timer_actor, poll_wait=60, max_poll_wait=180):
-        super(RemotePollLoopActor, self).__init__()
-        self._client = client
-        self._timer = timer_actor
-        self._later = self.actor_ref.tell_proxy()
-        self._polling_started = False
-        self.min_poll_wait = poll_wait
-        self.max_poll_wait = max_poll_wait
-        self.poll_wait = self.min_poll_wait
-        self.all_subscribers = set()
-        self.key_subscribers = {}
-        if hasattr(self, '_item_key'):
-            self.subscribe_to = self._subscribe_to
-
-    def on_start(self):
-        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, id(self.actor_urn[9:])))
-
-    def _start_polling(self):
-        if not self._polling_started:
-            self._polling_started = True
-            self._later.poll()
-
-    def subscribe(self, subscriber):
-        self.all_subscribers.add(subscriber)
-        self._logger.debug("%s subscribed to all events", subscriber.actor_ref.actor_urn)
-        self._start_polling()
-
-    # __init__ exposes this method to the proxy if the subclass defines
-    # _item_key.
-    def _subscribe_to(self, key, subscriber):
-        self.key_subscribers.setdefault(key, set()).add(subscriber)
-        self._logger.debug("%s subscribed to events for '%s'", subscriber.actor_ref.actor_urn, key)
-        self._start_polling()
-
-    def _send_request(self):
-        raise NotImplementedError("subclasses must implement request method")
-
-    def _got_response(self, response):
-        self.poll_wait = self.min_poll_wait
-        _notify_subscribers(response, self.all_subscribers)
-        if hasattr(self, '_item_key'):
-            items = {self._item_key(x): x for x in response}
-            for key, subscribers in self.key_subscribers.iteritems():
-                _notify_subscribers(items.get(key), subscribers)
-
-    def _got_error(self, error):
-        self.poll_wait = min(self.poll_wait * 2, self.max_poll_wait)
-        return "got error: {} - will try again in {} seconds".format(
-            error, self.poll_wait)
-
-    def is_common_error(self, exception):
-        return False
-
-    def poll(self, scheduled_start=None):
-        self._logger.debug("sending request")
-        start_time = time.time()
-        if scheduled_start is None:
-            scheduled_start = start_time
-        try:
-            response = self._send_request()
-        except Exception as error:
-            errmsg = self._got_error(error)
-            if self.is_common_error(error):
-                self._logger.warning(errmsg)
-            else:
-                self._logger.exception(errmsg)
-            next_poll = start_time + self.poll_wait
-        else:
-            self._got_response(response)
-            next_poll = scheduled_start + self.poll_wait
-            self._logger.info("got response with %d items in %s seconds, next poll at %s",
-                              len(response), (time.time() - scheduled_start),
-                              time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_poll)))
-        end_time = time.time()
-        if next_poll < end_time:  # We've drifted too much; start fresh.
-            next_poll = end_time + self.poll_wait
-        self._timer.schedule(next_poll, self._later.poll, next_poll)
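
A sketch of the subclassing contract described in the RemotePollLoopActor docstring above; the poller name and the list_nodes() client call are hypothetical:

    class NodeListPoller(RemotePollLoopActor):
        def _send_request(self):
            # self._client is the client object passed to __init__; assumed
            # here to expose a list_nodes() call returning a list of dicts.
            return self._client.list_nodes()

        def _item_key(self, item):
            # Defining _item_key makes subscribe_to(key, subscriber) available.
            return item['uuid']
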
diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
deleted file mode 100644 (file)
index b124c66..0000000
+++ /dev/null
@@ -1,201 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import calendar
-import functools
-import itertools
-import re
-import time
-
-from ..config import CLOUD_ERRORS
-from ..status import tracker
-from libcloud.common.exceptions import BaseHTTPError, RateLimitReachedError
-
-ARVADOS_TIMEFMT = '%Y-%m-%dT%H:%M:%SZ'
-ARVADOS_TIMESUBSEC_RE = re.compile(r'(\.\d+)Z$')
-
-def arvados_node_fqdn(arvados_node, default_hostname='dynamic.compute'):
-    hostname = arvados_node.get('hostname') or default_hostname
-    return '{}.{}'.format(hostname, arvados_node['domain'])
-
-def arvados_node_mtime(node):
-    return arvados_timestamp(node['modified_at'])
-
-def arvados_timestamp(timestr):
-    subsec_match = ARVADOS_TIMESUBSEC_RE.search(timestr)
-    if subsec_match is None:
-        subsecs = .0
-    else:
-        subsecs = float(subsec_match.group(1))
-        timestr = timestr[:subsec_match.start()] + 'Z'
-    return calendar.timegm(time.strptime(timestr + 'UTC',
-                                         ARVADOS_TIMEFMT + '%Z')) + subsecs
-
-def timestamp_fresh(timestamp, fresh_time):
-    return (time.time() - timestamp) < fresh_time
-
-def arvados_node_missing(arvados_node, fresh_time):
-    """Indicate if cloud node corresponding to the arvados
-    node is "missing".
-
-    If True, this means the node has not pinged the API server within the timeout
-    period.  If False, the ping is up to date.  If the node has never pinged,
-    returns None.
-    """
-    if arvados_node["last_ping_at"] is None:
-        return None
-    else:
-        return not timestamp_fresh(arvados_timestamp(arvados_node["last_ping_at"]), fresh_time)
-
-class RetryMixin(object):
-    """Retry decorator for an method that makes remote requests.
-
-    Use this function to decorate method, and pass in a tuple of exceptions to
-    catch.  If the original method raises a known cloud driver error, or any of
-    the given exception types, this decorator will either go into a
-    sleep-and-retry loop with exponential backoff either by sleeping (if
-    self._timer is None) or by scheduling retries of the method (if self._timer
-    is a timer actor.)
-
-    """
-    def __init__(self, retry_wait, max_retry_wait, logger, cloud, timer=None):
-        self.min_retry_wait = max(1, retry_wait)
-        self.max_retry_wait = max(self.min_retry_wait, max_retry_wait)
-        self.retry_wait = retry_wait
-        self._logger = logger
-        self._cloud = cloud
-        self._timer = timer
-
-    @staticmethod
-    def _retry(errors=()):
-        def decorator(orig_func):
-            @functools.wraps(orig_func)
-            def retry_wrapper(self, *args, **kwargs):
-                while True:
-                    should_retry = False
-                    try:
-                        ret = orig_func(self, *args, **kwargs)
-                    except RateLimitReachedError as error:
-                        # If retry-after is zero, continue with exponential
-                        # backoff.
-                        if error.retry_after != 0:
-                            self.retry_wait = error.retry_after
-                        should_retry = True
-                    except BaseHTTPError as error:
-                        if error.headers and error.headers.get("retry-after"):
-                            try:
-                                retry_after = int(error.headers["retry-after"])
-                                # If retry-after is zero, continue with
-                                # exponential backoff.
-                                if retry_after != 0:
-                                    self.retry_wait = retry_after
-                                should_retry = True
-                            except ValueError:
-                                self._logger.warning(
-                                    "Unrecognizable Retry-After header: %r",
-                                    error.headers["retry-after"],
-                                    exc_info=error)
-                        if error.code == 429 or error.code >= 500:
-                            should_retry = True
-                    except CLOUD_ERRORS as error:
-                        tracker.counter_add('cloud_errors')
-                        should_retry = True
-                    except errors as error:
-                        should_retry = True
-                    except Exception as error:
-                        # As a libcloud workaround for drivers that don't use
-                        # typed exceptions, consider bare Exception() objects
-                        # retryable.
-                        if type(error) is Exception:
-                            tracker.counter_add('cloud_errors')
-                            should_retry = True
-                    else:
-                        # No exception
-                        self.retry_wait = self.min_retry_wait
-                        return ret
-
-                    # Only got here if an exception was caught.  Now determine what to do about it.
-                    if not should_retry:
-                        self.retry_wait = self.min_retry_wait
-                        self._logger.warning(
-                            "Re-raising error (no retry): %s",
-                            error, exc_info=error)
-                        raise
-
-                    # Retry wait out of bounds?
-                    if self.retry_wait < self.min_retry_wait:
-                        self.retry_wait = self.min_retry_wait
-                    elif self.retry_wait > self.max_retry_wait:
-                        self.retry_wait = self.max_retry_wait
-
-                    self._logger.warning(
-                        "Client error: %s - %s %s seconds",
-                        error,
-                        "scheduling retry in" if self._timer else "sleeping",
-                        self.retry_wait,
-                        exc_info=error)
-
-                    if self._timer:
-                        start_time = time.time()
-                        # reschedule to be called again
-                        self._timer.schedule(start_time + self.retry_wait,
-                                             getattr(self._later,
-                                                     orig_func.__name__),
-                                             *args, **kwargs)
-                    else:
-                        # sleep on it.
-                        time.sleep(self.retry_wait)
-
-                    self.retry_wait = min(self.retry_wait * 2,
-                                          self.max_retry_wait)
-                    if self._timer:
-                        # expect to be called again by timer so don't loop
-                        return
-
-            return retry_wrapper
-        return decorator
-
-class ShutdownTimer(object):
-    """Keep track of a cloud node's shutdown windows.
-
-    Instantiate this class with a timestamp of when a cloud node started,
-    and a list of durations (in minutes) of when the node must not and may
-    be shut down, alternating.  The class will tell you when a shutdown
-    window is open, and when the next open window will start.
-    """
-    def __init__(self, start_time, shutdown_windows):
-        # The implementation is easiest if we have an even number of windows,
-        # because then windows always alternate between open and closed.
-        # Rig that up: calculate the first shutdown window based on what's
-        # passed in.  Then, if we were given an odd number of windows, merge
-        # that first window into the last one, since they both represent
-        # closed state.
-        first_window = shutdown_windows[0]
-        shutdown_windows = list(shutdown_windows[1:])
-        self._next_opening = start_time + (60 * first_window)
-        if len(shutdown_windows) % 2:
-            shutdown_windows.append(first_window)
-        else:
-            shutdown_windows[-1] += first_window
-        self.shutdown_windows = itertools.cycle([60 * n
-                                                 for n in shutdown_windows])
-        self._open_start = self._next_opening
-        self._open_for = next(self.shutdown_windows)
-
-    def _advance_opening(self):
-        while self._next_opening < time.time():
-            self._open_start = self._next_opening
-            self._next_opening += self._open_for + next(self.shutdown_windows)
-            self._open_for = next(self.shutdown_windows)
-
-    def next_opening(self):
-        self._advance_opening()
-        return self._next_opening
-
-    def window_open(self):
-        self._advance_opening()
-        return 0 < (time.time() - self._open_start) < self._open_for
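
A usage sketch of ShutdownTimer as described in its docstring; the window durations below are purely illustrative:

    import time

    # Per the docstring: the node must not be shut down for the first 54
    # minutes, may be shut down for the next 5 minutes, must not for the
    # next 1 minute, and so on (values are illustrative only).
    timer = ShutdownTimer(time.time(), [54, 5, 1])

    if timer.window_open():
        print("shutdown window is open now")
    else:
        next_open = timer.next_opening()
        print("next shutdown window opens at",
              time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_open)))
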
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
deleted file mode 100644 (file)
index 77c515d..0000000
+++ /dev/null
@@ -1,536 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import functools
-import logging
-import time
-import re
-
-import libcloud.common.types as cloud_types
-from libcloud.common.exceptions import BaseHTTPError
-
-import pykka
-
-from .. import \
-    arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh, \
-    arvados_node_missing, RetryMixin
-from ...clientactor import _notify_subscribers
-from ... import config
-from ... import status
-from .transitions import transitions
-
-QuotaExceeded = "QuotaExceeded"
-
-class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
-    """Base class for actors that change a compute node's state.
-
-    This base class takes care of retrying changes and notifying
-    subscribers when the change is finished.
-    """
-    def __init__(self, cloud_client, arvados_client, timer_actor,
-                 retry_wait, max_retry_wait):
-        super(ComputeNodeStateChangeBase, self).__init__()
-        RetryMixin.__init__(self, retry_wait, max_retry_wait,
-                            None, cloud_client, timer_actor)
-        self._later = self.actor_ref.tell_proxy()
-        self._arvados = arvados_client
-        self.subscribers = set()
-
-    def _set_logger(self):
-        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
-
-    def on_start(self):
-        self._set_logger()
-
-    def _finished(self):
-        if self.subscribers is None:
-            raise Exception("Actor tried to finish twice")
-        _notify_subscribers(self.actor_ref.proxy(), self.subscribers)
-        self.subscribers = None
-        self._logger.info("finished")
-
-    def subscribe(self, subscriber):
-        if self.subscribers is None:
-            try:
-                subscriber(self.actor_ref.proxy())
-            except pykka.ActorDeadError:
-                pass
-        else:
-            self.subscribers.add(subscriber)
-
-    def _clean_arvados_node(self, arvados_node, explanation):
-        return self._arvados.nodes().update(
-            uuid=arvados_node['uuid'],
-            body={'hostname': None,
-                  'ip_address': None,
-                  'slot_number': None,
-                  'first_ping_at': None,
-                  'last_ping_at': None,
-                  'properties': {},
-                  'info': {'ec2_instance_id': None,
-                           'last_action': explanation}},
-            ).execute()
-
-    @staticmethod
-    def _finish_on_exception(orig_func):
-        @functools.wraps(orig_func)
-        def finish_wrapper(self, *args, **kwargs):
-            try:
-                return orig_func(self, *args, **kwargs)
-            except Exception as error:
-                self._logger.error("Actor error %s", error)
-                self._finished()
-        return finish_wrapper
-
-
-class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
-    """Actor to create and set up a cloud compute node.
-
-    This actor prepares an Arvados node record for a new compute node
-    (either creating one or cleaning one passed in), then boots the
-    actual compute node.  It notifies subscribers when the cloud node
-    is successfully created (the last step in the process for Node
-    Manager to handle).
-    """
-    def __init__(self, timer_actor, arvados_client, cloud_client,
-                 cloud_size, arvados_node=None,
-                 retry_wait=1, max_retry_wait=180):
-        super(ComputeNodeSetupActor, self).__init__(
-            cloud_client, arvados_client, timer_actor,
-            retry_wait, max_retry_wait)
-        self.cloud_size = cloud_size
-        self.arvados_node = None
-        self.cloud_node = None
-        self.error = None
-        if arvados_node is None:
-            self._later.create_arvados_node()
-        else:
-            self._later.prepare_arvados_node(arvados_node)
-
-    @ComputeNodeStateChangeBase._finish_on_exception
-    @RetryMixin._retry(config.ARVADOS_ERRORS)
-    def create_arvados_node(self):
-        self.arvados_node = self._arvados.nodes().create(
-            body={}, assign_slot=True).execute()
-        self._later.create_cloud_node()
-
-    @ComputeNodeStateChangeBase._finish_on_exception
-    @RetryMixin._retry(config.ARVADOS_ERRORS)
-    def prepare_arvados_node(self, node):
-        self._clean_arvados_node(node, "Prepared by Node Manager")
-        self.arvados_node = self._arvados.nodes().update(
-            uuid=node['uuid'], body={}, assign_slot=True).execute()
-        self._later.create_cloud_node()
-
-    @ComputeNodeStateChangeBase._finish_on_exception
-    @RetryMixin._retry()
-    def create_cloud_node(self):
-        self._logger.info("Sending create_node request for node size %s.",
-                          self.cloud_size.id)
-        try:
-            self.cloud_node = self._cloud.create_node(self.cloud_size,
-                                                      self.arvados_node)
-        except BaseHTTPError as e:
-            if e.code == 429 or "RequestLimitExceeded" in e.message:
-                # Don't consider API rate limits to be quota errors.
-                # re-raise so the Retry logic applies.
-                raise
-
-            # The set of possible error codes / messages isn't documented for
-            # all clouds, so use a keyword heuristic to determine if the
-            # failure is likely due to a quota.
-            if re.search(r'(exceed|quota|limit)', e.message, re.I):
-                self.error = QuotaExceeded
-                self._logger.warning("Quota exceeded: %s", e)
-                self._finished()
-                return
-            else:
-                # Something else happened, re-raise so the Retry logic applies.
-                raise
-        except Exception as e:
-            raise
-
-        # The information included in the node size object we get from libcloud
-        # is inconsistent between cloud drivers.  Replace libcloud NodeSize
-        # object with compatible CloudSizeWrapper object which merges the size
-        # info reported from the cloud with size information from the
-        # configuration file.
-        self.cloud_node.size = self.cloud_size
-
-        self._logger.info("Cloud node %s created.", self.cloud_node.id)
-        self._later.update_arvados_node_properties()
-
-    @ComputeNodeStateChangeBase._finish_on_exception
-    @RetryMixin._retry(config.ARVADOS_ERRORS)
-    def update_arvados_node_properties(self):
-        """Tell Arvados some details about the cloud node.
-
-        Currently we only include size/price from our request, which
-        we already knew before create_cloud_node(), but doing it here
-        gives us an opportunity to provide more detail from
-        self.cloud_node, too.
-        """
-        self.arvados_node['properties']['cloud_node'] = {
-            # Note this 'size' is the node size we asked the cloud
-            # driver to create -- not necessarily equal to the size
-            # reported by the cloud driver for the node that was
-            # created.
-            'size': self.cloud_size.id,
-            'price': self.cloud_size.price,
-        }
-        self.arvados_node = self._arvados.nodes().update(
-            uuid=self.arvados_node['uuid'],
-            body={'properties': self.arvados_node['properties']},
-        ).execute()
-        self._logger.info("%s updated properties.", self.arvados_node['uuid'])
-        self._later.post_create()
-
-    @RetryMixin._retry()
-    def post_create(self):
-        self._cloud.post_create_node(self.cloud_node)
-        self._logger.info("%s post-create work done.", self.cloud_node.id)
-        self._finished()
-
-    def stop_if_no_cloud_node(self):
-        if self.cloud_node is not None:
-            return False
-        self.stop()
-        return True
-
-
-class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
-    """Actor to shut down a compute node.
-
-    This actor simply destroys a cloud node, retrying as needed.
-    """
-    # Reasons for a shutdown to be cancelled.
-    WINDOW_CLOSED = "shutdown window closed"
-    DESTROY_FAILED = "destroy_node failed"
-
-    def __init__(self, timer_actor, cloud_client, arvados_client, node_monitor,
-                 cancellable=True, retry_wait=1, max_retry_wait=180):
-        # If a ShutdownActor is cancellable, it will ask the
-        # ComputeNodeMonitorActor if it's still eligible before taking each
-        # action, and stop the shutdown process if the node is no longer
-        # eligible.  Normal shutdowns based on job demand should be
-        # cancellable; shutdowns based on node misbehavior should not.
-        super(ComputeNodeShutdownActor, self).__init__(
-            cloud_client, arvados_client, timer_actor,
-            retry_wait, max_retry_wait)
-        self._monitor = node_monitor.proxy()
-        self.cloud_node = self._monitor.cloud_node.get()
-        self.cancellable = cancellable
-        self.cancel_reason = None
-        self.success = None
-
-    def _set_logger(self):
-        self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name))
-
-    def on_start(self):
-        super(ComputeNodeShutdownActor, self).on_start()
-        self._later.shutdown_node()
-
-    def _arvados_node(self):
-        return self._monitor.arvados_node.get()
-
-    def _finished(self, success_flag=None):
-        if success_flag is not None:
-            self.success = success_flag
-        return super(ComputeNodeShutdownActor, self)._finished()
-
-    def cancel_shutdown(self, reason, **kwargs):
-        if not self.cancellable:
-            return False
-        if self.cancel_reason is not None:
-            # already cancelled
-            return False
-        self.cancel_reason = reason
-        self._logger.info("Shutdown cancelled: %s.", reason)
-        self._finished(success_flag=False)
-        return True
-
-    def _cancel_on_exception(orig_func):
-        @functools.wraps(orig_func)
-        def finish_wrapper(self, *args, **kwargs):
-            try:
-                return orig_func(self, *args, **kwargs)
-            except Exception as error:
-                self._logger.error("Actor error %s", error)
-                self._logger.debug("", exc_info=True)
-                self._later.cancel_shutdown("Unhandled exception %s" % error, try_resume=False)
-        return finish_wrapper
-
-    @_cancel_on_exception
-    def shutdown_node(self):
-        if self.cancel_reason is not None:
-            # already cancelled
-            return
-        if self.cancellable:
-            self._logger.info("Checking that node is still eligible for shutdown")
-            eligible, reason = self._monitor.shutdown_eligible().get()
-            if not eligible:
-                self.cancel_shutdown("No longer eligible for shut down because %s" % reason,
-                                     try_resume=True)
-                return
-        # If boot failed, count the event
-        if self._monitor.get_state().get() == 'unpaired':
-            status.tracker.counter_add('boot_failures')
-        self._destroy_node()
-
-    def _destroy_node(self):
-        self._logger.info("Starting shutdown")
-        arv_node = self._arvados_node()
-        if self._cloud.destroy_node(self.cloud_node):
-            self.cancellable = False
-            self._logger.info("Shutdown success")
-            if arv_node:
-                self._later.clean_arvados_node(arv_node)
-            else:
-                self._finished(success_flag=True)
-        else:
-            self.cancel_shutdown(self.DESTROY_FAILED, try_resume=False)
-
-    @ComputeNodeStateChangeBase._finish_on_exception
-    @RetryMixin._retry(config.ARVADOS_ERRORS)
-    def clean_arvados_node(self, arvados_node):
-        self._clean_arvados_node(arvados_node, "Shut down by Node Manager")
-        self._finished(success_flag=True)
-
-
-class ComputeNodeUpdateActor(config.actor_class, RetryMixin):
-    """Actor to dispatch one-off cloud management requests.
-
-    This actor receives requests for small cloud updates, and
-    dispatches them to a real driver.  ComputeNodeMonitorActors use
-    this to perform maintenance tasks on themselves.  Having a
-    dedicated actor for this gives us the opportunity to control the
-    flow of requests; e.g., by backing off when errors occur.
-    """
-    def __init__(self, cloud_factory, timer_actor, max_retry_wait=180):
-        super(ComputeNodeUpdateActor, self).__init__()
-        RetryMixin.__init__(self, 1, max_retry_wait,
-                            None, cloud_factory(), timer_actor)
-        self._cloud = cloud_factory()
-        self._later = self.actor_ref.tell_proxy()
-
-    def _set_logger(self):
-        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
-
-    def on_start(self):
-        self._set_logger()
-
-    @RetryMixin._retry()
-    def sync_node(self, cloud_node, arvados_node):
-        if self._cloud.node_fqdn(cloud_node) != arvados_node_fqdn(arvados_node):
-            return self._cloud.sync_node(cloud_node, arvados_node)
-
-
-class ComputeNodeMonitorActor(config.actor_class):
-    """Actor to manage a running compute node.
-
-    This actor gets updates about a compute node's cloud and Arvados records.
-    It uses this information to notify subscribers when the node is eligible
-    for shutdown.
-    """
-    def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
-                 timer_actor, update_actor, cloud_client,
-                 arvados_node=None, poll_stale_after=600, node_stale_after=3600,
-                 boot_fail_after=1800, consecutive_idle_count=0
-    ):
-        super(ComputeNodeMonitorActor, self).__init__()
-        self._later = self.actor_ref.tell_proxy()
-        self._shutdowns = shutdown_timer
-        self._timer = timer_actor
-        self._update = update_actor
-        self._cloud = cloud_client
-        self.cloud_node = cloud_node
-        self.cloud_node_start_time = cloud_node_start_time
-        self.poll_stale_after = poll_stale_after
-        self.node_stale_after = node_stale_after
-        self.boot_fail_after = boot_fail_after
-        self.subscribers = set()
-        self.arvados_node = None
-        self.consecutive_idle_count = consecutive_idle_count
-        self.consecutive_idle = 0
-        self._later.update_arvados_node(arvados_node)
-        self.last_shutdown_opening = None
-        self._later.consider_shutdown()
-
-    def _set_logger(self):
-        self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name))
-
-    def on_start(self):
-        self._set_logger()
-        self._timer.schedule(self.cloud_node_start_time + self.boot_fail_after, self._later.consider_shutdown)
-
-    def subscribe(self, subscriber):
-        self.subscribers.add(subscriber)
-
-    def _debug(self, msg, *args):
-        self._logger.debug(msg, *args)
-
-    def get_state(self):
-        """Get node state, one of ['unpaired', 'busy', 'idle', 'down']."""
-
-        # If this node is not associated with an Arvados node, return
-        # 'unpaired' if we're in the boot grace period, and 'down' if not,
-        # so it isn't counted towards usable nodes.
-        if self.arvados_node is None:
-            if timestamp_fresh(self.cloud_node_start_time,
-                               self.boot_fail_after):
-                return 'unpaired'
-            else:
-                return 'down'
-
-        state = self.arvados_node['crunch_worker_state']
-
-        # If state information is not available because it is missing or the
-        # record is stale, return 'down'.
-        if not state or not timestamp_fresh(arvados_node_mtime(self.arvados_node),
-                                            self.node_stale_after):
-            state = 'down'
-
-        # There's a window between when a node pings for the first time and the
-        # value of 'slurm_state' is synchronized by crunch-dispatch.  In this
-        # window, the node will still report as 'down'.  Check that
-        # first_ping_at is truthy and consider the node 'idle' during the
-        # initial boot grace period.
-        if (state == 'down' and
-            self.arvados_node['first_ping_at'] and
-            timestamp_fresh(self.cloud_node_start_time,
-                            self.boot_fail_after) and
-            not self._cloud.broken(self.cloud_node)):
-            state = 'idle'
-
-        # "missing" means last_ping_at is stale, this should be
-        # considered "down"
-        if arvados_node_missing(self.arvados_node, self.node_stale_after):
-            state = 'down'
-
-        # Turns out using 'job_uuid' this way is a bad idea.  The node record
-        # is assigned the job_uuid before the job is locked (which removes it
-        # from the queue), which means the job will be double-counted as both
-        # in the wishlist and as keeping a node busy.  The end result is
-        # excess nodes being booted.
-        #if state == 'idle' and self.arvados_node['job_uuid']:
-        #    state = 'busy'
-
-        # Update idle node times tracker
-        if state == 'idle':
-            status.tracker.idle_in(self.arvados_node['hostname'])
-        else:
-            status.tracker.idle_out(self.arvados_node['hostname'])
-
-        return state
-
-    def in_state(self, *states):
-        return self.get_state() in states
-
-    def shutdown_eligible(self):
-        """Determine if node is candidate for shut down.
-
-        Returns a tuple of (boolean, string) where the first value is whether
-        the node is candidate for shut down, and the second value is the
-        reason for the decision.
-        """
-
-        # If this node's size is invalid (because it has a stale arvados_node_size
-        # tag), return True so that it's properly shut down.
-        if self.cloud_node.size.id == 'invalid':
-            return (True, "node's size tag '%s' not recognizable" % (self.cloud_node.extra['arvados_node_size'],))
-
-        # Collect states and then consult state transition table whether we
-        # should shut down.  Possible states are:
-        # crunch_worker_state = ['unpaired', 'busy', 'idle', 'down']
-        # window = ["open", "closed"]
-        # boot_grace = ["boot wait", "boot exceeded"]
-        # idle_grace = ["not idle", "idle wait", "idle exceeded"]
-
-        if self.arvados_node and not timestamp_fresh(arvados_node_mtime(self.arvados_node), self.node_stale_after):
-            return (False, "node state is stale")
-
-        crunch_worker_state = self.get_state()
-
-        window = "open" if self._shutdowns.window_open() else "closed"
-
-        if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
-            boot_grace = "boot wait"
-        else:
-            boot_grace = "boot exceeded"
-
-        if crunch_worker_state == "idle":
-            # Must report as "idle" at least "consecutive_idle_count" times
-            if self.consecutive_idle < self.consecutive_idle_count:
-                idle_grace = 'idle wait'
-            else:
-                idle_grace = 'idle exceeded'
-        else:
-            idle_grace = 'not idle'
-
-        node_state = (crunch_worker_state, window, boot_grace, idle_grace)
-        t = transitions[node_state]
-        if t is not None:
-            # yes, shutdown eligible
-            return (True, "node state is %s" % (node_state,))
-        else:
-            # no, return a reason
-            return (False, "node state is %s" % (node_state,))
-
-    def consider_shutdown(self):
-        try:
-            eligible, reason = self.shutdown_eligible()
-            next_opening = self._shutdowns.next_opening()
-            if eligible:
-                self._debug("Suggesting shutdown because %s", reason)
-                _notify_subscribers(self.actor_ref.proxy(), self.subscribers)
-            else:
-                self._debug("Not eligible for shut down because %s", reason)
-
-                if self.last_shutdown_opening != next_opening:
-                    self._debug("Shutdown window closed.  Next at %s.",
-                                time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
-                    self._timer.schedule(next_opening, self._later.consider_shutdown)
-                    self.last_shutdown_opening = next_opening
-        except Exception:
-            self._logger.exception("Unexpected exception")
-
-    def offer_arvados_pair(self, arvados_node):
-        first_ping_s = arvados_node.get('first_ping_at')
-        if (self.arvados_node is not None) or (not first_ping_s):
-            return None
-        elif ((arvados_node['info'].get('ec2_instance_id') == self._cloud.node_id(self.cloud_node)) and
-              (arvados_timestamp(first_ping_s) >= self.cloud_node_start_time)):
-            self._later.update_arvados_node(arvados_node)
-            return self.cloud_node.id
-        else:
-            return None
-
-    def update_cloud_node(self, cloud_node):
-        if cloud_node is not None:
-            self.cloud_node = cloud_node
-            self._later.consider_shutdown()
-
-    def update_arvados_node(self, arvados_node):
-        """Called when the latest Arvados node record is retrieved.
-
-        Calls the updater's sync_node() method.
-
-        """
-        # This method is a little unusual in the way it just fires off the
-        # request without checking the result or retrying errors.  That's
-        # because this update happens every time we reload the Arvados node
-        # list: if a previous sync attempt failed, we'll see that the names
-        # are out of sync and just try again.  ComputeNodeUpdateActor has
-        # the logic to throttle those effective retries when there's trouble.
-        if arvados_node is not None:
-            self.arvados_node = arvados_node
-            self._update.sync_node(self.cloud_node, self.arvados_node)
-            if self.arvados_node['crunch_worker_state'] == "idle":
-                self.consecutive_idle += 1
-            else:
-                self.consecutive_idle = 0
-            self._later.consider_shutdown()
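
For reference, a sketch of the table lookup that shutdown_eligible() performs against the transitions module removed below; the state tuple is one of the entries visible in that table:

    from .transitions import transitions

    # (crunch_worker_state, shutdown window, boot grace, idle grace)
    node_state = ('down', 'closed', 'boot exceeded', 'idle exceeded')

    action = transitions[node_state]      # e.g. "START_SHUTDOWN"
    if action is not None:
        print("node state %s is eligible for shutdown" % (node_state,))
    else:
        print("node state %s is not eligible for shutdown" % (node_state,))
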
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
deleted file mode 100644 (file)
index 5b7785a..0000000
+++ /dev/null
@@ -1,118 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import subprocess32 as subprocess
-import time
-
-from . import ComputeNodeMonitorActor
-from . import ComputeNodeSetupActor as SetupActorBase
-from . import ComputeNodeShutdownActor as ShutdownActorBase
-from . import ComputeNodeUpdateActor as UpdateActorBase
-from .. import RetryMixin
-
-class SlurmMixin(object):
-    SLURM_END_STATES = frozenset(['down\n', 'down*\n',
-                                  'drain\n', 'drain*\n',
-                                  'fail\n', 'fail*\n'])
-    SLURM_DRAIN_STATES = frozenset(['drain\n', 'drng\n'])
-
-    def _update_slurm_node(self, nodename, updates):
-        cmd = ['scontrol', 'update', 'NodeName=' + nodename] + updates
-        try:
-            subprocess.check_output(cmd)
-        except:
-            self._logger.error(
-                "SLURM update %r failed", cmd, exc_info=True)
-
-    def _update_slurm_size_attrs(self, nodename, size):
-        self._update_slurm_node(nodename, [
-            'Weight=%i' % int(size.price * 1000),
-            'Features=instancetype=' + size.id,
-        ])
-
-    def _get_slurm_state(self, nodename):
-        return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', nodename])
-
-
-class ComputeNodeSetupActor(SlurmMixin, SetupActorBase):
-    def create_cloud_node(self):
-        hostname = self.arvados_node.get("hostname")
-        if hostname:
-            self._update_slurm_size_attrs(hostname, self.cloud_size)
-        return super(ComputeNodeSetupActor, self).create_cloud_node()
-
-
-class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
-    def on_start(self):
-        arv_node = self._arvados_node()
-        if arv_node is None:
-            self._nodename = None
-            return super(ComputeNodeShutdownActor, self).on_start()
-        else:
-            self._set_logger()
-            self._nodename = arv_node['hostname']
-            self._logger.info("Draining SLURM node %s", self._nodename)
-            self._later.issue_slurm_drain()
-
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
-    def cancel_shutdown(self, reason, try_resume=True):
-        if self._nodename:
-            if try_resume and self._get_slurm_state(self._nodename) in self.SLURM_DRAIN_STATES:
-                # Resume from "drng" or "drain"
-                self._update_slurm_node(self._nodename, ['State=RESUME'])
-            else:
-                # Node is in a state such as 'idle' or 'alloc' so don't
-                # try to resume it because that will just raise an error.
-                pass
-        return super(ComputeNodeShutdownActor, self).cancel_shutdown(reason)
-
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
-    def issue_slurm_drain(self):
-        if self.cancel_reason is not None:
-            return
-        if self._nodename:
-            self._update_slurm_node(self._nodename, [
-                'State=DRAIN', 'Reason=Node Manager shutdown'])
-            self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
-            self._later.await_slurm_drain()
-        else:
-            self._later.shutdown_node()
-
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
-    def await_slurm_drain(self):
-        if self.cancel_reason is not None:
-            return
-        output = self._get_slurm_state(self._nodename)
-        if output in ("drng\n", "alloc\n", "drng*\n", "alloc*\n"):
-            self._timer.schedule(time.time() + 10,
-                                 self._later.await_slurm_drain)
-        elif output in ("idle\n",):
-            # Not in "drng" but idle, don't shut down
-            self.cancel_shutdown("slurm state is %s" % output.strip(), try_resume=False)
-        else:
-            # Any other state: proceed with the shutdown.
-            self._later.shutdown_node()
-
-    def _destroy_node(self):
-        if self._nodename:
-            self._update_slurm_node(self._nodename, [
-                'State=DOWN', 'Reason=Node Manager shutdown'])
-        super(ComputeNodeShutdownActor, self)._destroy_node()
-
-
-class ComputeNodeUpdateActor(SlurmMixin, UpdateActorBase):
-    def sync_node(self, cloud_node, arvados_node):
-        """Keep SLURM's node properties up to date."""
-        hostname = arvados_node.get("hostname")
-        features = arvados_node.get("slurm_node_features", "").split(",")
-        sizefeature = "instancetype=" + cloud_node.size.id
-        if hostname and sizefeature not in features:
-            # This probably means SLURM has restarted and lost our
-            # dynamically configured node weights and features.
-            self._update_slurm_size_attrs(hostname, cloud_node.size)
-        return super(ComputeNodeUpdateActor, self).sync_node(
-            cloud_node, arvados_node)
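For orientation, the scontrol/sinfo interaction wrapped by the actors above can be sketched as a plain drain-and-wait helper. The command lines mirror the deleted code; the blocking poll loop is a simplification (the real shutdown actor reschedules itself via a timer) and assumes the SLURM tools are on PATH:

    import subprocess
    import time

    def slurm_state(nodename):
        # Same sinfo invocation as _get_slurm_state above.
        return subprocess.check_output(
            ['sinfo', '--noheader', '-o', '%t', '-n', nodename]).decode()

    def drain_and_wait(nodename, poll=10):
        # Ask SLURM to drain the node, then poll until it is no longer draining
        # or allocated, mirroring issue_slurm_drain/await_slurm_drain above.
        subprocess.check_output(
            ['scontrol', 'update', 'NodeName=' + nodename,
             'State=DRAIN', 'Reason=Node Manager shutdown'])
        while slurm_state(nodename) in ('drng\n', 'drng*\n', 'alloc\n', 'alloc*\n'):
            time.sleep(poll)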
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/transitions.py b/services/nodemanager/arvnodeman/computenode/dispatch/transitions.py
deleted file mode 100644 (file)
index 93f50c1..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-transitions = {
- ('busy', 'closed', 'boot exceeded', 'idle exceeded'): None,
- ('busy', 'closed', 'boot exceeded', 'idle wait'): None,
- ('busy', 'closed', 'boot exceeded', 'not idle'): None,
- ('busy', 'closed', 'boot wait', 'idle exceeded'): None,
- ('busy', 'closed', 'boot wait', 'idle wait'): None,
- ('busy', 'closed', 'boot wait', 'not idle'): None,
- ('busy', 'open', 'boot exceeded', 'idle exceeded'): None,
- ('busy', 'open', 'boot exceeded', 'idle wait'): None,
- ('busy', 'open', 'boot exceeded', 'not idle'): None,
- ('busy', 'open', 'boot wait', 'idle exceeded'): None,
- ('busy', 'open', 'boot wait', 'idle wait'): None,
- ('busy', 'open', 'boot wait', 'not idle'): None,
-
- ('down', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
- ('down', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
- ('down', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
- ('down', 'closed', 'boot wait', 'idle exceeded'): None,
- ('down', 'closed', 'boot wait', 'idle wait'): None,
- ('down', 'closed', 'boot wait', 'not idle'): None,
- ('down', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
- ('down', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
- ('down', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
- ('down', 'open', 'boot wait', 'idle exceeded'): "START_SHUTDOWN",
- ('down', 'open', 'boot wait', 'idle wait'): "START_SHUTDOWN",
- ('down', 'open', 'boot wait', 'not idle'): "START_SHUTDOWN",
-
- ('idle', 'closed', 'boot exceeded', 'idle exceeded'): None,
- ('idle', 'closed', 'boot exceeded', 'idle wait'): None,
- ('idle', 'closed', 'boot exceeded', 'not idle'): None,
- ('idle', 'closed', 'boot wait', 'idle exceeded'): None,
- ('idle', 'closed', 'boot wait', 'idle wait'): None,
- ('idle', 'closed', 'boot wait', 'not idle'): None,
- ('idle', 'open', 'boot exceeded', 'idle exceeded'): "START_DRAIN",
- ('idle', 'open', 'boot exceeded', 'idle wait'): None,
- ('idle', 'open', 'boot exceeded', 'not idle'): None,
- ('idle', 'open', 'boot wait', 'idle exceeded'): "START_DRAIN",
- ('idle', 'open', 'boot wait', 'idle wait'): None,
- ('idle', 'open', 'boot wait', 'not idle'): None,
-
- ('unpaired', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
- ('unpaired', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
- ('unpaired', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
- ('unpaired', 'closed', 'boot wait', 'idle exceeded'): None,
- ('unpaired', 'closed', 'boot wait', 'idle wait'): None,
- ('unpaired', 'closed', 'boot wait', 'not idle'): None,
- ('unpaired', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
- ('unpaired', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
- ('unpaired', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
- ('unpaired', 'open', 'boot wait', 'idle exceeded'): None,
- ('unpaired', 'open', 'boot wait', 'idle wait'): None,
- ('unpaired', 'open', 'boot wait', 'not idle'): None,
-
- ('fail', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
- ('fail', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
- ('fail', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
- ('fail', 'closed', 'boot wait', 'idle exceeded'): "START_SHUTDOWN",
- ('fail', 'closed', 'boot wait', 'idle wait'): "START_SHUTDOWN",
- ('fail', 'closed', 'boot wait', 'not idle'): "START_SHUTDOWN",
- ('fail', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
- ('fail', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
- ('fail', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
- ('fail', 'open', 'boot wait', 'idle exceeded'): "START_SHUTDOWN",
- ('fail', 'open', 'boot wait', 'idle wait'): "START_SHUTDOWN",
- ('fail', 'open', 'boot wait', 'not idle'): "START_SHUTDOWN"}
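The table is consumed as a plain dictionary lookup. A hedged usage sketch, assuming the transitions dict above is in scope; the key field names are inferred from the entries rather than documented in the source:

    # Key: (crunch worker state, shutdown window, boot grace, idle grace).
    # Value: the state change to request, or None for "do nothing".
    assert transitions[('idle', 'open', 'boot exceeded', 'idle exceeded')] == "START_DRAIN"
    assert transitions[('busy', 'open', 'boot wait', 'not idle')] is None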
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
deleted file mode 100644 (file)
index 48d19f5..0000000
+++ /dev/null
@@ -1,253 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import logging
-from operator import attrgetter
-
-import libcloud.common.types as cloud_types
-from libcloud.compute.base import NodeDriver, NodeAuthSSHKey
-
-from ...config import CLOUD_ERRORS
-from ...status import tracker
-from .. import RetryMixin
-
-class BaseComputeNodeDriver(RetryMixin):
-    """Abstract base class for compute node drivers.
-
-    libcloud drivers abstract away many of the differences between
-    cloud providers, but managing compute nodes requires some
-    cloud-specific features (e.g., keeping track of node FQDNs and
-    boot times).  Compute node drivers are responsible for translating
-    the node manager's cloud requests to a specific cloud's
-    vocabulary.
-
-    Subclasses must implement arvados_create_kwargs, sync_node,
-    node_fqdn, and node_start_time.
-    """
-
-
-    @RetryMixin._retry()
-    def _create_driver(self, driver_class, **auth_kwargs):
-        return driver_class(**auth_kwargs)
-
-    @RetryMixin._retry()
-    def sizes(self):
-        if self._sizes is None:
-            self._sizes = {sz.id: sz for sz in self.real.list_sizes()}
-        return self._sizes
-
-    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
-                 driver_class, retry_wait=1, max_retry_wait=180):
-        """Base initializer for compute node drivers.
-
-        Arguments:
-        * auth_kwargs: A dictionary of arguments that are passed into the
-          driver_class constructor to instantiate a libcloud driver.
-        * list_kwargs: A dictionary of arguments that are passed to the
-          libcloud driver's list_nodes method to return the list of compute
-          nodes.
-        * create_kwargs: A dictionary of arguments that are passed to the
-          libcloud driver's create_node method to create a new compute node.
-        * driver_class: The class of a libcloud driver to use.
-        """
-
-        super(BaseComputeNodeDriver, self).__init__(retry_wait, max_retry_wait,
-                                         logging.getLogger(self.__class__.__name__),
-                                         type(self),
-                                         None)
-        self.real = self._create_driver(driver_class, **auth_kwargs)
-        self.list_kwargs = list_kwargs
-        self.create_kwargs = create_kwargs
-        # Transform entries in create_kwargs.  For each key K, if this class
-        # has an _init_K method, remove the entry and call _init_K with the
-        # corresponding value.  If _init_K returns None, the entry stays out
-        # of the dictionary (we expect we're holding the value somewhere
-        # else, like an instance variable).  Otherwise, _init_K returns a
-        # key-value tuple pair, and we add that entry to create_kwargs.
-        for key in self.create_kwargs.keys():
-            init_method = getattr(self, '_init_' + key, None)
-            if init_method is not None:
-                new_pair = init_method(self.create_kwargs.pop(key))
-                if new_pair is not None:
-                    self.create_kwargs[new_pair[0]] = new_pair[1]
-
-        self._sizes = None
-
-    def _init_ping_host(self, ping_host):
-        self.ping_host = ping_host
-
-    def _init_ssh_key(self, filename):
-        with open(filename) as ssh_file:
-            key = NodeAuthSSHKey(ssh_file.read())
-        return 'auth', key
-
-    def search_for_now(self, term, list_method, key=attrgetter('id'), **kwargs):
-        """Return one matching item from a list of cloud objects.
-
-        Raises ValueError if the number of matching objects is not exactly 1.
-
-        Arguments:
-        * term: The value that identifies a matching item.
-        * list_method: A string that names the method to call for a
-          list of objects.
-        * key: A function that accepts a cloud object and returns a
-          value search for a `term` match on each item.  Returns the
-          object's 'id' attribute by default.
-        """
-        try:
-            list_func = getattr(self, list_method)
-        except AttributeError:
-            list_func = getattr(self.real, list_method)
-        items = list_func(**kwargs)
-        results = [item for item in items if key(item) == term]
-        count = len(results)
-        if count != 1:
-            raise ValueError("{} returned {} results for {!r}".format(
-                    list_method, count, term))
-        return results[0]
-
-    def search_for(self, term, list_method, key=attrgetter('id'), **kwargs):
-        """Return one cached matching item from a list of cloud objects.
-
-        See search_for_now() for details of arguments and exceptions.
-        This method caches results, so it is best suited to static cloud objects
-        like node sizes, regions, etc.
-        """
-        cache_key = (list_method, term)
-        if cache_key not in self.SEARCH_CACHE:
-            self.SEARCH_CACHE[cache_key] = self.search_for_now(
-                term, list_method, key, **kwargs)
-        return self.SEARCH_CACHE[cache_key]
-
-    def list_nodes(self, **kwargs):
-        l = self.list_kwargs.copy()
-        l.update(kwargs)
-        try:
-            return self.real.list_nodes(**l)
-        except CLOUD_ERRORS:
-            tracker.counter_add('list_nodes_errors')
-            raise
-
-    def create_cloud_name(self, arvados_node):
-        """Return a cloud node name for the given Arvados node record.
-
-        Subclasses must override this method.  It should return a string
-        that can be used as the name for a newly-created cloud node,
-        based on identifying information in the Arvados node record.
-
-        Arguments:
-        * arvados_node: The Arvados node record to seed the new cloud node.
-        """
-        raise NotImplementedError("BaseComputeNodeDriver.create_cloud_name")
-
-    def arvados_create_kwargs(self, size, arvados_node):
-        """Return dynamic keyword arguments for create_node.
-
-        Subclasses must override this method.  It should return a dictionary
-        of keyword arguments to pass to the libcloud driver's create_node
-        method.  These arguments will extend the static arguments in
-        create_kwargs.
-
-        Arguments:
-        * size: The node size that will be created (libcloud NodeSize object)
-        * arvados_node: The Arvados node record that will be associated
-          with this cloud node, as returned from the API server.
-        """
-        raise NotImplementedError("BaseComputeNodeDriver.arvados_create_kwargs")
-
-    def broken(self, cloud_node):
-        """Return true if libcloud has indicated the node is in a "broken" state."""
-        return False
-
-    def _make_ping_url(self, arvados_node):
-        return 'https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.format(
-            self.ping_host, arvados_node['uuid'],
-            arvados_node['info']['ping_secret'])
-
-    @staticmethod
-    def _name_key(cloud_object):
-        return cloud_object.name
-
-    def create_node(self, size, arvados_node):
-        try:
-            kwargs = self.create_kwargs.copy()
-            kwargs.update(self.arvados_create_kwargs(size, arvados_node))
-            kwargs['size'] = size.real
-            return self.real.create_node(**kwargs)
-        except CLOUD_ERRORS as create_error:
-            # Workaround for bug #6702: sometimes the create node request
-            # succeeds but times out and raises an exception instead of
-            # returning a result.  If this happens, we get stuck in a retry
-            # loop forever because subsequent create_node attempts will fail
-            # due to node name collision.  So check if the node we intended to
-            # create shows up in the cloud node list and return it if found.
-            try:
-                return self.search_for_now(kwargs['name'], 'list_nodes', self._name_key)
-            except ValueError:
-                tracker.counter_add('create_node_errors')
-                raise create_error
-
-    def post_create_node(self, cloud_node):
-        # ComputeNodeSetupActor calls this method after the cloud node is
-        # created.  Any setup tasks that need to happen afterward (e.g.,
-        # tagging) should be done in this method.
-        pass
-
-    def sync_node(self, cloud_node, arvados_node):
-        # When a compute node first pings the API server, the API server
-        # will automatically assign some attributes on the corresponding
-        # node record, like hostname.  This method should propagate that
-        # information back to the cloud node appropriately.
-        raise NotImplementedError("BaseComputeNodeDriver.sync_node")
-
-    @classmethod
-    def node_fqdn(cls, node):
-        # This method should return the FQDN of the node object argument.
-        # Different clouds store this in different places.
-        raise NotImplementedError("BaseComputeNodeDriver.node_fqdn")
-
-    @classmethod
-    def node_start_time(cls, node):
-        # This method should return the time the node was started, in
-        # seconds since the epoch UTC.
-        raise NotImplementedError("BaseComputeNodeDriver.node_start_time")
-
-    def destroy_node(self, cloud_node):
-        try:
-            return self.real.destroy_node(cloud_node)
-        except CLOUD_ERRORS:
-            # Sometimes the destroy node request succeeds but times out and
-            # raises an exception instead of returning success.  If this
-            # happens, we get a noisy stack trace.  Check if the node is still
-            # on the node list.  If it is gone, we can declare victory.
-            try:
-                self.search_for_now(cloud_node.id, 'list_nodes')
-            except ValueError:
-                # If we catch ValueError, that means search_for_now didn't find
-                # it, which means destroy_node actually succeeded.
-                return True
-            # The node is still on the list.  Re-raise.
-            tracker.counter_add('destroy_node_errors')
-            raise
-
-    # Now that we've defined all our own methods, delegate generic, public
-    # attributes of libcloud drivers that we haven't defined ourselves.
-    def _delegate_to_real(attr_name):
-        return property(
-            lambda self: getattr(self.real, attr_name),
-            lambda self, value: setattr(self.real, attr_name, value),
-            doc=getattr(getattr(NodeDriver, attr_name), '__doc__', None))
-
-    # node id
-    @classmethod
-    def node_id(cls):
-        raise NotImplementedError("BaseComputeNodeDriver.node_id")
-
-    _locals = locals()
-    for _attr_name in dir(NodeDriver):
-        if (not _attr_name.startswith('_')) and (_attr_name not in _locals):
-            _locals[_attr_name] = _delegate_to_real(_attr_name)
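The create_kwargs rewriting described in the constructor comment above can be shown in isolation. A minimal sketch with a hypothetical driver class (SketchDriver, _init_ping_host, and _init_image_id here are illustrative, not the shipped drivers):

    class SketchDriver:
        # Mirrors the _init_<key> convention: a None return keeps the entry out
        # of create_kwargs (held elsewhere), a (key, value) return rewrites it.
        def __init__(self, create_kwargs):
            self.create_kwargs = dict(create_kwargs)
            for key in list(self.create_kwargs.keys()):
                init_method = getattr(self, '_init_' + key, None)
                if init_method is not None:
                    new_pair = init_method(self.create_kwargs.pop(key))
                    if new_pair is not None:
                        self.create_kwargs[new_pair[0]] = new_pair[1]

        def _init_ping_host(self, ping_host):
            self.ping_host = ping_host                  # stored on the instance

        def _init_image_id(self, image_id):
            return 'image', 'looked-up:' + image_id     # rewritten entry

    d = SketchDriver({'ping_host': 'api.example', 'image_id': 'img-1', 'size': 'm1'})
    print(d.create_kwargs)   # {'size': 'm1', 'image': 'looked-up:img-1'}
    print(d.ping_host)       # api.example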
diff --git a/services/nodemanager/arvnodeman/computenode/driver/azure.py b/services/nodemanager/arvnodeman/computenode/driver/azure.py
deleted file mode 100644 (file)
index 35c8b5a..0000000
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import pipes
-import time
-
-import libcloud.compute.base as cloud_base
-import libcloud.compute.providers as cloud_provider
-import libcloud.compute.types as cloud_types
-from libcloud.common.exceptions import BaseHTTPError
-
-from . import BaseComputeNodeDriver
-from .. import arvados_node_fqdn, arvados_timestamp, ARVADOS_TIMEFMT
-
-class ComputeNodeDriver(BaseComputeNodeDriver):
-
-    DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.AZURE_ARM)
-    SEARCH_CACHE = {}
-
-    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
-                 driver_class=DEFAULT_DRIVER):
-
-        if not list_kwargs.get("ex_resource_group"):
-            raise Exception("Must include ex_resource_group in Cloud List configuration (list_kwargs)")
-
-        create_kwargs["ex_resource_group"] = list_kwargs["ex_resource_group"]
-
-        self.tags = {key[4:]: value
-                     for key, value in create_kwargs.iteritems()
-                     if key.startswith('tag_')}
-        # filter out tags from create_kwargs
-        create_kwargs = {key: value
-                         for key, value in create_kwargs.iteritems()
-                         if not key.startswith('tag_')}
-        super(ComputeNodeDriver, self).__init__(
-            auth_kwargs, list_kwargs, create_kwargs,
-            driver_class)
-
-    def create_cloud_name(self, arvados_node):
-        uuid_parts = arvados_node['uuid'].split('-', 2)
-        return 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
-
-    def arvados_create_kwargs(self, size, arvados_node):
-        tags = {
-            # Set up a tag indicating the Arvados-assigned Cloud Size id.
-            'arvados_node_size': size.id,
-            'booted_at': time.strftime(ARVADOS_TIMEFMT, time.gmtime()),
-            'arv-ping-url': self._make_ping_url(arvados_node)
-        }
-        tags.update(self.tags)
-
-        name = self.create_cloud_name(arvados_node)
-        customdata = """#!/bin/sh
-mkdir -p    /var/tmp/arv-node-data/meta-data
-echo %s > /var/tmp/arv-node-data/arv-ping-url
-echo %s > /var/tmp/arv-node-data/meta-data/instance-id
-echo %s > /var/tmp/arv-node-data/meta-data/instance-type
-""" % (pipes.quote(tags['arv-ping-url']),
-       pipes.quote(name),
-       pipes.quote(size.id))
-
-        return {
-            'name': name,
-            'ex_tags': tags,
-            'ex_customdata': customdata
-        }
-
-    def sync_node(self, cloud_node, arvados_node):
-        try:
-            self.real.ex_create_tags(cloud_node,
-                                     {'hostname': arvados_node_fqdn(arvados_node)})
-            return True
-        except BaseHTTPError as b:
-            return False
-
-    def _init_image(self, urn):
-        return "image", self.get_image(urn)
-
-    def list_nodes(self):
-        # Azure only supports filtering node lists by resource group.
-        # Do our own filtering based on tag.
-        nodes = [node for node in
-                super(ComputeNodeDriver, self).list_nodes(ex_fetch_nic=False, ex_fetch_power_state=False)
-                if node.extra.get("tags", {}).get("arvados-class") == self.tags["arvados-class"]]
-        for n in nodes:
-            # Need to populate Node.size
-            if not n.size:
-                n.size = self.sizes()[n.extra["properties"]["hardwareProfile"]["vmSize"]]
-            n.extra['arvados_node_size'] = n.extra.get('tags', {}).get('arvados_node_size') or n.size.id
-        return nodes
-
-    def broken(self, cloud_node):
-        """Return true if libcloud has indicated the node is in a "broken" state."""
-        # UNKNOWN means the node state is unrecognized, which in practice means some
-        # combination of failures that the Azure libcloud driver doesn't know how to interpret.
-        return (cloud_node.state in (cloud_types.NodeState.ERROR, cloud_types.NodeState.UNKNOWN))
-
-    @classmethod
-    def node_fqdn(cls, node):
-        return node.extra["tags"].get("hostname")
-
-    @classmethod
-    def node_start_time(cls, node):
-        return arvados_timestamp(node.extra["tags"].get("booted_at"))
-
-    @classmethod
-    def node_id(cls, node):
-        return node.name
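The ex_customdata boot script assembled in arvados_create_kwargs above is easy to preview on its own. A hedged sketch using shlex.quote (the Python 3 counterpart of the pipes.quote call in the deleted code); the argument values are illustrative:

    import shlex

    def make_customdata(ping_url, instance_id, size_id):
        # Same boot-time script the Azure driver passes as ex_customdata,
        # with each substituted value shell-quoted.
        return '\n'.join([
            '#!/bin/sh',
            'mkdir -p /var/tmp/arv-node-data/meta-data',
            'echo %s > /var/tmp/arv-node-data/arv-ping-url' % shlex.quote(ping_url),
            'echo %s > /var/tmp/arv-node-data/meta-data/instance-id' % shlex.quote(instance_id),
            'echo %s > /var/tmp/arv-node-data/meta-data/instance-type' % shlex.quote(size_id),
            '',
        ])

    print(make_customdata('https://zzzzz.example/ping?ping_secret=xyz',
                          'compute-abc123', 'Standard_D2_v3'))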
diff --git a/services/nodemanager/arvnodeman/computenode/driver/dummy.py b/services/nodemanager/arvnodeman/computenode/driver/dummy.py
deleted file mode 100644 (file)
index 14845ac..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import time
-
-import libcloud.compute.providers as cloud_provider
-import libcloud.compute.types as cloud_types
-
-from . import BaseComputeNodeDriver
-from .. import arvados_node_fqdn
-
-class ComputeNodeDriver(BaseComputeNodeDriver):
-    """Compute node driver wrapper for libcloud's dummy driver.
-
-    This class provides the glue necessary to run the node manager with a
-    dummy cloud.  It's useful for testing.
-    """
-    DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.DUMMY)
-    DEFAULT_REAL = DEFAULT_DRIVER('ComputeNodeDriver')
-    DUMMY_START_TIME = time.time()
-
-    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
-                 driver_class=DEFAULT_DRIVER):
-        super(ComputeNodeDriver, self).__init__(
-            auth_kwargs, list_kwargs, create_kwargs, driver_class)
-        if driver_class is self.DEFAULT_DRIVER:
-            self.real = self.DEFAULT_REAL
-
-    def _ensure_private_ip(self, node):
-        if not node.private_ips:
-            node.private_ips = ['10.10.0.{}'.format(node.id)]
-
-    def arvados_create_kwargs(self, size, arvados_node):
-        return {}
-
-    def list_nodes(self):
-        nodelist = super(ComputeNodeDriver, self).list_nodes()
-        for node in nodelist:
-            self._ensure_private_ip(node)
-            node.size = self.sizes()["1"]
-        return nodelist
-
-    def create_node(self, size, arvados_node):
-        node = super(ComputeNodeDriver, self).create_node(size, arvados_node)
-        self._ensure_private_ip(node)
-        return node
-
-    def sync_node(self, cloud_node, arvados_node):
-        cloud_node.name = arvados_node_fqdn(arvados_node)
-
-    @classmethod
-    def node_fqdn(cls, node):
-        return node.name
-
-    @classmethod
-    def node_start_time(cls, node):
-        return cls.DUMMY_START_TIME
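For context, the libcloud dummy provider this wrapper builds on can be exercised directly, which is what makes it convenient for testing. A brief sketch against the libcloud API only, independent of the deleted wrapper; the node and size contents are whatever the dummy driver ships with:

    from libcloud.compute.providers import get_driver
    from libcloud.compute.types import Provider

    # The dummy driver takes a single credential string, as in DEFAULT_REAL above.
    driver = get_driver(Provider.DUMMY)('ComputeNodeDriver')
    print([n.name for n in driver.list_nodes()])
    print([s.id for s in driver.list_sizes()][:3])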
diff --git a/services/nodemanager/arvnodeman/computenode/driver/ec2.py b/services/nodemanager/arvnodeman/computenode/driver/ec2.py
deleted file mode 100644 (file)
index 418a9f9..0000000
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import time
-
-import libcloud.compute.base as cloud_base
-import libcloud.compute.providers as cloud_provider
-import libcloud.compute.types as cloud_types
-from libcloud.compute.drivers import ec2 as cloud_ec2
-
-from . import BaseComputeNodeDriver
-from .. import arvados_node_fqdn
-
-### Monkeypatch libcloud to support AWS' new SecurityGroup API.
-# These classes can be removed when libcloud supports specifying
-# security groups with the SecurityGroupId parameter.
-class ANMEC2Connection(cloud_ec2.EC2Connection):
-    def request(self, *args, **kwargs):
-        params = kwargs.get('params')
-        if (params is not None) and (params.get('Action') == 'RunInstances'):
-            for key in params.keys():
-                if key.startswith('SecurityGroup.'):
-                    new_key = key.replace('Group.', 'GroupId.', 1)
-                    params[new_key] = params.pop(key).id
-            kwargs['params'] = params
-        return super(ANMEC2Connection, self).request(*args, **kwargs)
-
-
-class ANMEC2NodeDriver(cloud_ec2.EC2NodeDriver):
-    connectionCls = ANMEC2Connection
-
-
-class ComputeNodeDriver(BaseComputeNodeDriver):
-    """Compute node driver wrapper for EC2.
-
-    This translates cloud driver requests to EC2's specific parameters.
-    """
-    DEFAULT_DRIVER = ANMEC2NodeDriver
-### End monkeypatch
-    SEARCH_CACHE = {}
-
-    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
-                 driver_class=DEFAULT_DRIVER):
-        # We need full lists of keys up front because these loops modify
-        # dictionaries in-place.
-        for key in list_kwargs.keys():
-            list_kwargs[key.replace('_', ':')] = list_kwargs.pop(key)
-        self.tags = {key[4:]: value
-                     for key, value in list_kwargs.iteritems()
-                     if key.startswith('tag:')}
-        # Tags are assigned at instance creation time
-        create_kwargs.setdefault('ex_metadata', {})
-        create_kwargs['ex_metadata'].update(self.tags)
-        super(ComputeNodeDriver, self).__init__(
-            auth_kwargs, {'ex_filters': list_kwargs}, create_kwargs,
-            driver_class)
-
-    def _init_image_id(self, image_id):
-        return 'image', self.search_for(image_id, 'list_images', ex_owner='self')
-
-    def _init_security_groups(self, group_names):
-        return 'ex_security_groups', [
-            self.search_for(gname.strip(), 'ex_get_security_groups')
-            for gname in group_names.split(',')]
-
-    def _init_subnet_id(self, subnet_id):
-        return 'ex_subnet', self.search_for(subnet_id, 'ex_list_subnets')
-
-    create_cloud_name = staticmethod(arvados_node_fqdn)
-
-    def arvados_create_kwargs(self, size, arvados_node):
-        kw = {'name': self.create_cloud_name(arvados_node),
-                'ex_userdata': self._make_ping_url(arvados_node)}
-        # libcloud/ec2 disk sizes are in GB, Arvados/SLURM "scratch" value is in MB
-        scratch = int(size.scratch / 1000) + 1
-        if scratch > size.disk:
-            volsize = scratch - size.disk
-            if volsize > 16384:
-                # Must be 1-16384 for General Purpose SSD (gp2) devices
-                # https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_EbsBlockDevice.html
-                self._logger.warning("Requested EBS volume size %d is too large, capping size request to 16384 GB", volsize)
-                volsize = 16384
-            kw["ex_blockdevicemappings"] = [{
-                "DeviceName": "/dev/xvdt",
-                "Ebs": {
-                    "DeleteOnTermination": True,
-                    "VolumeSize": volsize,
-                    "VolumeType": "gp2"
-                }}]
-        if size.preemptible:
-            # Request a Spot instance for this node
-            kw['ex_spot_market'] = True
-        return kw
-
-    def sync_node(self, cloud_node, arvados_node):
-        self.real.ex_create_tags(cloud_node,
-                                 {'Name': arvados_node_fqdn(arvados_node)})
-
-    def create_node(self, size, arvados_node):
-        # Set up a tag indicating the Arvados-assigned Cloud Size id.
-        self.create_kwargs['ex_metadata'].update({'arvados_node_size': size.id})
-        return super(ComputeNodeDriver, self).create_node(size, arvados_node)
-
-    def list_nodes(self):
-        # Need to populate Node.size
-        nodes = super(ComputeNodeDriver, self).list_nodes()
-        for n in nodes:
-            if not n.size:
-                n.size = self.sizes()[n.extra["instance_type"]]
-            n.extra['arvados_node_size'] = n.extra.get('tags', {}).get('arvados_node_size') or n.size.id
-        return nodes
-
-    @classmethod
-    def node_fqdn(cls, node):
-        return node.name
-
-    @classmethod
-    def node_start_time(cls, node):
-        time_str = node.extra['launch_time'].split('.', 2)[0] + 'UTC'
-        return time.mktime(time.strptime(
-                time_str,'%Y-%m-%dT%H:%M:%S%Z')) - time.timezone
-
-    @classmethod
-    def node_id(cls, node):
-        return node.id
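The scratch-space arithmetic in arvados_create_kwargs above (MB-to-GB conversion, subtraction of the instance store, and the gp2 16384 GB cap) is easy to check in isolation. A small sketch with illustrative numbers:

    def ebs_volume_size(scratch_mb, instance_disk_gb, cap_gb=16384):
        # Arvados/SLURM "scratch" is in MB, EBS volumes are in GB; request an
        # extra volume only when the instance store is too small, capped at the
        # gp2 maximum, mirroring the deleted driver above.
        scratch_gb = int(scratch_mb / 1000) + 1
        if scratch_gb <= instance_disk_gb:
            return 0
        return min(scratch_gb - instance_disk_gb, cap_gb)

    print(ebs_volume_size(2000000, 800))   # 1201 -- extra gp2 volume, in GB
    print(ebs_volume_size(100000, 800))    # 0 -- instance store is enough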
diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py
deleted file mode 100644 (file)
index 23a1017..0000000
+++ /dev/null
@@ -1,181 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import functools
-import json
-import time
-
-import libcloud.compute.providers as cloud_provider
-import libcloud.compute.types as cloud_types
-
-from . import BaseComputeNodeDriver
-from .. import arvados_node_fqdn, arvados_timestamp, ARVADOS_TIMEFMT
-
-class ComputeNodeDriver(BaseComputeNodeDriver):
-    """Compute node driver wrapper for GCE
-
-    This translates cloud driver requests to GCE's specific parameters.
-    """
-    DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.GCE)
-    SEARCH_CACHE = {}
-
-    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
-                 driver_class=DEFAULT_DRIVER):
-        list_kwargs = list_kwargs.copy()
-        tags_str = list_kwargs.pop('tags', '')
-        if not tags_str.strip():
-            self.node_tags = frozenset()
-        else:
-            self.node_tags = frozenset(t.strip() for t in tags_str.split(','))
-        create_kwargs = create_kwargs.copy()
-        create_kwargs.setdefault('external_ip', None)
-        create_kwargs.setdefault('ex_metadata', {})
-        self._project = auth_kwargs.get("project")
-        super(ComputeNodeDriver, self).__init__(
-            auth_kwargs, list_kwargs, create_kwargs,
-            driver_class)
-        self._disktype_links = {dt.name: self._object_link(dt)
-                                for dt in self.real.ex_list_disktypes()}
-
-    @staticmethod
-    def _object_link(cloud_object):
-        return cloud_object.extra.get('selfLink')
-
-    def _init_image(self, image_name):
-        return 'image', self.search_for(
-            image_name, 'list_images', self._name_key, ex_project=self._project)
-
-    def _init_network(self, network_name):
-        return 'ex_network', self.search_for(
-            network_name, 'ex_list_networks', self._name_key)
-
-    def _init_service_accounts(self, service_accounts_str):
-        return 'ex_service_accounts', json.loads(service_accounts_str)
-
-    def _init_ssh_key(self, filename):
-        # SSH keys are delivered to GCE nodes via ex_metadata: see
-        # http://stackoverflow.com/questions/26752617/creating-sshkeys-for-gce-instance-using-libcloud
-        with open(filename) as ssh_file:
-            self.create_kwargs['ex_metadata']['sshKeys'] = (
-                'root:' + ssh_file.read().strip())
-
-    def create_cloud_name(self, arvados_node):
-        uuid_parts = arvados_node['uuid'].split('-', 2)
-        return 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
-
-    def arvados_create_kwargs(self, size, arvados_node):
-        name = self.create_cloud_name(arvados_node)
-
-        if size.scratch > 375000:
-            self._logger.warning("Requested %d MB scratch space, but GCE driver currently only supports attaching a single 375 GB disk.", size.scratch)
-
-        disks = [
-            {'autoDelete': True,
-             'boot': True,
-             'deviceName': name,
-             'initializeParams':
-                 {'diskName': name,
-                  'diskType': self._disktype_links['pd-standard'],
-                  'sourceImage': self._object_link(self.create_kwargs['image']),
-                  },
-             'type': 'PERSISTENT',
-             },
-            {'autoDelete': True,
-             'boot': False,
-             # Boot images rely on this device name to find the SSD.
-             # Any change must be coordinated in the image.
-             'deviceName': 'tmp',
-             'initializeParams':
-                 {'diskType': self._disktype_links['local-ssd'],
-                  },
-             'type': 'SCRATCH',
-             },
-            ]
-        result = {'name': name,
-                  'ex_metadata': self.create_kwargs['ex_metadata'].copy(),
-                  'ex_tags': list(self.node_tags),
-                  'ex_disks_gce_struct': disks,
-                  }
-        result['ex_metadata'].update({
-            'arvados_node_size': size.id,
-            'arv-ping-url': self._make_ping_url(arvados_node),
-            'booted_at': time.strftime(ARVADOS_TIMEFMT, time.gmtime()),
-            'hostname': arvados_node_fqdn(arvados_node),
-        })
-        return result
-
-    def list_nodes(self):
-        # The GCE libcloud driver only supports filtering node lists by zone.
-        # Do our own filtering based on tag list.
-        nodelist = [node for node in
-                    super(ComputeNodeDriver, self).list_nodes()
-                    if self.node_tags.issubset(node.extra.get('tags', []))]
-        for node in nodelist:
-            # As of 0.18, the libcloud GCE driver sets node.size to the size's name.
-            # It's supposed to be the actual size object.  Check that it's not,
-            # and monkeypatch the results when that's the case.
-            if not hasattr(node.size, 'id'):
-                node.size = self.sizes()[node.size]
-            # Get arvados-assigned cloud size id
-            node.extra['arvados_node_size'] = node.extra.get('metadata', {}).get('arvados_node_size') or node.size.id
-        return nodelist
-
-    @classmethod
-    def _find_metadata(cls, metadata_items, key):
-        # Given a list of two-item metadata dictionaries, return the one with
-        # the named key.  Raise KeyError if not found.
-        try:
-            return next(data_dict for data_dict in metadata_items
-                        if data_dict.get('key') == key)
-        except StopIteration:
-            raise KeyError(key)
-
-    @classmethod
-    def _get_metadata(cls, metadata_items, key, *default):
-        try:
-            return cls._find_metadata(metadata_items, key)['value']
-        except KeyError:
-            if default:
-                return default[0]
-            raise
-
-    def sync_node(self, cloud_node, arvados_node):
-        # Update the cloud node record to ensure we have the correct metadata
-        # fingerprint.
-        cloud_node = self.real.ex_get_node(cloud_node.name, cloud_node.extra['zone'])
-
-        # We can't store the FQDN on the name attribute or anything like it,
-        # because (a) names are static throughout the node's life (so FQDN
-        # isn't available because we don't know it at node creation time) and
-        # (b) it can't contain dots.  Instead stash it in metadata.
-        hostname = arvados_node_fqdn(arvados_node)
-        metadata_req = cloud_node.extra['metadata'].copy()
-        metadata_items = metadata_req.setdefault('items', [])
-        try:
-            self._find_metadata(metadata_items, 'hostname')['value'] = hostname
-        except KeyError:
-            metadata_items.append({'key': 'hostname', 'value': hostname})
-
-        self.real.ex_set_node_metadata(cloud_node, metadata_items)
-
-    @classmethod
-    def node_fqdn(cls, node):
-        # See sync_node comment.
-        return cls._get_metadata(node.extra['metadata'].get('items', []),
-                                 'hostname', '')
-
-    @classmethod
-    def node_start_time(cls, node):
-        try:
-            return arvados_timestamp(cls._get_metadata(
-                    node.extra['metadata']['items'], 'booted_at'))
-        except KeyError:
-            return 0
-
-    @classmethod
-    def node_id(cls, node):
-        return node.id
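A brief sketch of the GCE metadata-items shape that _find_metadata and _get_metadata handle above: metadata arrives as a list of {'key': ..., 'value': ...} dicts rather than a plain mapping. Standalone illustration with made-up values:

    def find_metadata(metadata_items, key):
        # Same search as the class method above; KeyError when absent.
        for item in metadata_items:
            if item.get('key') == key:
                return item
        raise KeyError(key)

    items = [{'key': 'hostname', 'value': 'compute0.zzzzz.example'},
             {'key': 'booted_at', 'value': '2020-08-18T20:02:56Z'}]
    print(find_metadata(items, 'hostname')['value'])   # compute0.zzzzz.example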
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
deleted file mode 100644 (file)
index 4857e89..0000000
+++ /dev/null
@@ -1,184 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import ConfigParser
-import importlib
-import logging
-import sys
-
-import arvados
-import httplib2
-import pykka
-from apiclient import errors as apierror
-
-from .baseactor import BaseNodeManagerActor
-
-from functools import partial
-from libcloud.common.types import LibcloudError
-from libcloud.common.exceptions import BaseHTTPError
-
-# IOError is the base class for socket.error, ssl.SSLError, and friends.
-# It seems like it hits the sweet spot for operations we want to retry:
-# it's low-level, but unlikely to catch code bugs.
-NETWORK_ERRORS = (IOError,)
-ARVADOS_ERRORS = NETWORK_ERRORS + (apierror.Error,)
-CLOUD_ERRORS = NETWORK_ERRORS + (LibcloudError, BaseHTTPError)
-
-actor_class = BaseNodeManagerActor
-
-class NodeManagerConfig(ConfigParser.SafeConfigParser):
-    """Node Manager Configuration class.
-
-    This is a standard Python ConfigParser, with additional helper methods to
-    create objects instantiated with configuration information.
-    """
-
-    LOGGING_NONLEVELS = frozenset(['file'])
-
-    def __init__(self, *args, **kwargs):
-        # Can't use super() because SafeConfigParser is an old-style class.
-        ConfigParser.SafeConfigParser.__init__(self, *args, **kwargs)
-        for sec_name, settings in {
-            'Arvados': {'insecure': 'no',
-                        'timeout': '15',
-                        'jobs_queue': 'yes',
-                        'slurm_queue': 'yes'
-                    },
-            'Daemon': {'min_nodes': '0',
-                       'max_nodes': '1',
-                       'poll_time': '60',
-                       'cloudlist_poll_time': '0',
-                       'nodelist_poll_time': '0',
-                       'wishlist_poll_time': '0',
-                       'max_poll_time': '300',
-                       'poll_stale_after': '600',
-                       'max_total_price': '0',
-                       'boot_fail_after': str(sys.maxint),
-                       'node_stale_after': str(60 * 60 * 2),
-                       'watchdog': '600',
-                       'node_mem_scaling': '0.95',
-                       'consecutive_idle_count': '2'},
-            'Manage': {'address': '127.0.0.1',
-                       'port': '-1',
-                       'ManagementToken': ''},
-            'Logging': {'file': '/dev/stderr',
-                        'level': 'WARNING'}
-        }.iteritems():
-            if not self.has_section(sec_name):
-                self.add_section(sec_name)
-            for opt_name, value in settings.iteritems():
-                if not self.has_option(sec_name, opt_name):
-                    self.set(sec_name, opt_name, value)
-
-    def get_section(self, section, transformers={}, default_transformer=None):
-        transformer_map = {
-            str: self.get,
-            int: self.getint,
-            bool: self.getboolean,
-            float: self.getfloat,
-        }
-        result = self._dict()
-        for key, value in self.items(section):
-            transformer = None
-            if transformers.get(key) in transformer_map:
-                transformer = partial(transformer_map[transformers[key]], section)
-            elif default_transformer in transformer_map:
-                transformer = partial(transformer_map[default_transformer], section)
-            if transformer is not None:
-                try:
-                    value = transformer(key)
-                except (TypeError, ValueError):
-                    pass
-            result[key] = value
-        return result
-
-    def log_levels(self):
-        return {key: getattr(logging, self.get('Logging', key).upper())
-                for key in self.options('Logging')
-                if key not in self.LOGGING_NONLEVELS}
-
-    def dispatch_classes(self):
-        mod_name = 'arvnodeman.computenode.dispatch'
-        if self.has_option('Daemon', 'dispatcher'):
-            mod_name = '{}.{}'.format(mod_name,
-                                      self.get('Daemon', 'dispatcher'))
-        module = importlib.import_module(mod_name)
-        return (module.ComputeNodeSetupActor,
-                module.ComputeNodeShutdownActor,
-                module.ComputeNodeUpdateActor,
-                module.ComputeNodeMonitorActor)
-
-    def new_arvados_client(self):
-        if self.has_option('Daemon', 'certs_file'):
-            certs_file = self.get('Daemon', 'certs_file')
-        else:
-            certs_file = None
-        insecure = self.getboolean('Arvados', 'insecure')
-        http = httplib2.Http(timeout=self.getint('Arvados', 'timeout'),
-                             ca_certs=certs_file,
-                             disable_ssl_certificate_validation=insecure)
-        return arvados.api(version='v1',
-                           host=self.get('Arvados', 'host'),
-                           token=self.get('Arvados', 'token'),
-                           insecure=insecure,
-                           http=http)
-
-    def new_cloud_client(self):
-        module = importlib.import_module('arvnodeman.computenode.driver.' +
-                                         self.get('Cloud', 'provider'))
-        driver_class = module.ComputeNodeDriver.DEFAULT_DRIVER
-        if self.has_option('Cloud', 'driver_class'):
-            d = self.get('Cloud', 'driver_class').split('.')
-            mod = '.'.join(d[:-1])
-            cls = d[-1]
-            driver_class = importlib.import_module(mod).__dict__[cls]
-        auth_kwargs = self.get_section('Cloud Credentials')
-        if 'timeout' in auth_kwargs:
-            auth_kwargs['timeout'] = int(auth_kwargs['timeout'])
-        return module.ComputeNodeDriver(auth_kwargs,
-                                        self.get_section('Cloud List'),
-                                        self.get_section('Cloud Create'),
-                                        driver_class=driver_class)
-
-    def node_sizes(self):
-        """Finds all acceptable NodeSizes for our installation.
-
-        Returns a list of (NodeSize, kwargs) pairs for each NodeSize object
-        returned by libcloud that matches a size listed in our config file.
-        """
-        all_sizes = self.new_cloud_client().list_sizes()
-        size_kwargs = {}
-        section_types = {
-            'instance_type': str,
-            'price': float,
-            'preemptible': bool,
-        }
-        for sec_name in self.sections():
-            sec_words = sec_name.split(None, 2)
-            if sec_words[0] != 'Size':
-                continue
-            size_spec = self.get_section(sec_name, section_types, int)
-            if 'preemptible' not in size_spec:
-                size_spec['preemptible'] = False
-            if 'instance_type' not in size_spec:
-                # Assume instance type is Size name if missing
-                size_spec['instance_type'] = sec_words[1]
-            size_spec['id'] = sec_words[1]
-            size_kwargs[sec_words[1]] = size_spec
-        # EC2 node sizes are identified by id. GCE sizes are identified by name.
-        matching_sizes = []
-        for size in all_sizes:
-            matching_sizes += [
-                (size, size_kwargs[s]) for s in size_kwargs
-                if size_kwargs[s]['instance_type'] == size.id
-                or size_kwargs[s]['instance_type'] == size.name
-            ]
-        return matching_sizes
-
-    def shutdown_windows(self):
-        return [float(n)
-                for n in self.get('Cloud', 'shutdown_windows').split(',')]
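The get_section transformer idea above can be illustrated with the standard library alone. A hedged sketch in Python 3 spelling (the deleted code targets Python 2's ConfigParser); the section and option names here are illustrative:

    import configparser

    cfg = configparser.ConfigParser()
    cfg.read_dict({'Daemon': {'max_nodes': '8', 'node_mem_scaling': '0.95'}})

    # Coerce selected options to typed values, as get_section does with its
    # transformers / default_transformer arguments.
    transformers = {'max_nodes': int, 'node_mem_scaling': float}
    section = {key: transformers.get(key, str)(value)
               for key, value in cfg.items('Daemon')}
    print(section)   # {'max_nodes': 8, 'node_mem_scaling': 0.95}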
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
deleted file mode 100644 (file)
index 1edf4dc..0000000
+++ /dev/null
@@ -1,583 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import functools
-import logging
-import time
-
-import pykka
-
-from . import computenode as cnode
-from . import status
-from .computenode import dispatch
-from .config import actor_class
-
-class _ComputeNodeRecord(object):
-    def __init__(self, actor=None, cloud_node=None, arvados_node=None,
-                 assignment_time=float('-inf')):
-        self.actor = actor
-        self.cloud_node = cloud_node
-        self.arvados_node = arvados_node
-        self.assignment_time = assignment_time
-        self.shutdown_actor = None
-
-class _BaseNodeTracker(object):
-    def __init__(self):
-        self.nodes = {}
-        self.orphans = {}
-
-    # Proxy the methods listed below to self.nodes.
-    def _proxy_method(name):
-        method = getattr(dict, name)
-        @functools.wraps(method, ('__name__', '__doc__'))
-        def wrapper(self, *args, **kwargs):
-            return method(self.nodes, *args, **kwargs)
-        return wrapper
-
-    for _method_name in ['__contains__', '__getitem__', '__len__', 'get']:
-        locals()[_method_name] = _proxy_method(_method_name)
-
-    def record_key(self, record):
-        return self.item_key(getattr(record, self.RECORD_ATTR))
-
-    def add(self, record):
-        self.nodes[self.record_key(record)] = record
-
-    def update_record(self, key, item):
-        setattr(self.nodes[key], self.RECORD_ATTR, item)
-
-    def update_from(self, response):
-        unseen = set(self.nodes.iterkeys())
-        for item in response:
-            key = self.item_key(item)
-            if key in unseen:
-                unseen.remove(key)
-                self.update_record(key, item)
-            else:
-                yield key, item
-        self.orphans = {key: self.nodes.pop(key) for key in unseen}
-
-    def unpaired(self):
-        return (record for record in self.nodes.itervalues()
-                if getattr(record, self.PAIR_ATTR) is None)
-
-
-class _CloudNodeTracker(_BaseNodeTracker):
-    RECORD_ATTR = 'cloud_node'
-    PAIR_ATTR = 'arvados_node'
-    item_key = staticmethod(lambda cloud_node: cloud_node.id)
-
-
-class _ArvadosNodeTracker(_BaseNodeTracker):
-    RECORD_ATTR = 'arvados_node'
-    PAIR_ATTR = 'cloud_node'
-    item_key = staticmethod(lambda arvados_node: arvados_node['uuid'])
-
-    def find_stale_node(self, stale_time):
-        # Try to select a stale node record that has an assigned slot first
-        for record in sorted(self.nodes.itervalues(),
-                             key=lambda r: r.arvados_node['slot_number'],
-                             reverse=True):
-            node = record.arvados_node
-            if (not cnode.timestamp_fresh(cnode.arvados_node_mtime(node),
-                                          stale_time) and
-                  not cnode.timestamp_fresh(record.assignment_time,
-                                            stale_time)):
-                return node
-        return None
-
-
-class NodeManagerDaemonActor(actor_class):
-    """Node Manager daemon.
-
-    This actor subscribes to all information polls about cloud nodes,
-    Arvados nodes, and the job queue.  It creates a ComputeNodeMonitorActor
-    for every cloud node, subscribing them to poll updates
-    appropriately.  It creates and destroys cloud nodes based on job queue
-    demand, and stops the corresponding ComputeNode actors when their work
-    is done.
-    """
-    def __init__(self, server_wishlist_actor, arvados_nodes_actor,
-                 cloud_nodes_actor, cloud_update_actor, timer_actor,
-                 arvados_factory, cloud_factory,
-                 shutdown_windows, server_calculator,
-                 min_nodes, max_nodes,
-                 poll_stale_after=600,
-                 boot_fail_after=1800,
-                 node_stale_after=7200,
-                 node_setup_class=dispatch.ComputeNodeSetupActor,
-                 node_shutdown_class=dispatch.ComputeNodeShutdownActor,
-                 node_actor_class=dispatch.ComputeNodeMonitorActor,
-                 max_total_price=0,
-                 consecutive_idle_count=1):
-        super(NodeManagerDaemonActor, self).__init__()
-        self._node_setup = node_setup_class
-        self._node_shutdown = node_shutdown_class
-        self._node_actor = node_actor_class
-        self._cloud_updater = cloud_update_actor
-        self._timer = timer_actor
-        self._new_arvados = arvados_factory
-        self._new_cloud = cloud_factory
-        self._cloud_driver = self._new_cloud()
-        self._later = self.actor_ref.tell_proxy()
-        self.shutdown_windows = shutdown_windows
-        self.server_calculator = server_calculator
-        self.min_cloud_size = self.server_calculator.cheapest_size()
-        self.min_nodes = min_nodes
-        self.max_nodes = max_nodes
-        self.node_quota = max_nodes
-        self.max_total_price = max_total_price
-        self.poll_stale_after = poll_stale_after
-        self.boot_fail_after = boot_fail_after
-        self.node_stale_after = node_stale_after
-        self.consecutive_idle_count = consecutive_idle_count
-        self.last_polls = {}
-        for poll_name in ['server_wishlist', 'arvados_nodes', 'cloud_nodes']:
-            poll_actor = locals()[poll_name + '_actor']
-            poll_actor.subscribe(getattr(self._later, 'update_' + poll_name))
-            setattr(self, '_{}_actor'.format(poll_name), poll_actor)
-            self.last_polls[poll_name] = -self.poll_stale_after
-        self.cloud_nodes = _CloudNodeTracker()
-        self.arvados_nodes = _ArvadosNodeTracker()
-        self.booting = {}       # Actor IDs to ComputeNodeSetupActors
-        self.sizes_booting = {} # Actor IDs to node size
-
-    def on_start(self):
-        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
-        self._logger.debug("Daemon started")
-
-    def _update_poll_time(self, poll_key):
-        self.last_polls[poll_key] = time.time()
-
-    def _pair_nodes(self, node_record, arvados_node):
-        self._logger.info("Cloud node %s is now paired with Arvados node %s with hostname %s",
-                          node_record.cloud_node.name, arvados_node['uuid'], arvados_node['hostname'])
-        self._arvados_nodes_actor.subscribe_to(
-            arvados_node['uuid'], node_record.actor.update_arvados_node)
-        node_record.arvados_node = arvados_node
-        self.arvados_nodes.add(node_record)
-
-    def _new_node(self, cloud_node):
-        start_time = self._cloud_driver.node_start_time(cloud_node)
-        shutdown_timer = cnode.ShutdownTimer(start_time,
-                                             self.shutdown_windows)
-        actor = self._node_actor.start(
-            cloud_node=cloud_node,
-            cloud_node_start_time=start_time,
-            shutdown_timer=shutdown_timer,
-            update_actor=self._cloud_updater,
-            timer_actor=self._timer,
-            arvados_node=None,
-            poll_stale_after=self.poll_stale_after,
-            node_stale_after=self.node_stale_after,
-            cloud_client=self._cloud_driver,
-            boot_fail_after=self.boot_fail_after,
-            consecutive_idle_count=self.consecutive_idle_count)
-        actorTell = actor.tell_proxy()
-        actorTell.subscribe(self._later.node_can_shutdown)
-        self._cloud_nodes_actor.subscribe_to(cloud_node.id,
-                                             actorTell.update_cloud_node)
-        record = _ComputeNodeRecord(actor.proxy(), cloud_node)
-        return record
-
-    def _register_cloud_node(self, node):
-        rec = self.cloud_nodes.get(node.id)
-        if rec is None:
-            self._logger.info("Registering new cloud node %s", node.id)
-            record = self._new_node(node)
-            self.cloud_nodes.add(record)
-        else:
-            rec.cloud_node = node
-
-    def update_cloud_nodes(self, nodelist):
-        self._update_poll_time('cloud_nodes')
-        for _, node in self.cloud_nodes.update_from(nodelist):
-            self._register_cloud_node(node)
-
-        self.try_pairing()
-
-        for record in self.cloud_nodes.orphans.itervalues():
-            if record.shutdown_actor:
-                try:
-                    record.shutdown_actor.stop()
-                except pykka.ActorDeadError:
-                    pass
-                record.shutdown_actor = None
-
-            # A recently booted node is a node that successfully completed the
-            # setup actor but has not yet appeared in the cloud node list.
-            # This will have the tag _nodemanager_recently_booted on it, which
-            # means (if we're not shutting it down) we want to put it back into
-            # the cloud node list.  Once it really appears in the cloud list,
-            # the object in record.cloud_node will be replaced by a new one
-            # that lacks the "_nodemanager_recently_booted" tag.
-            if hasattr(record.cloud_node, "_nodemanager_recently_booted"):
-                self.cloud_nodes.add(record)
-            else:
-                # Node disappeared from the cloud node list. If it's paired,
-                # remove its idle time counter.
-                if record.arvados_node:
-                    status.tracker.idle_out(record.arvados_node.get('hostname'))
-                # Stop the monitor actor if necessary and forget about the node.
-                if record.actor:
-                    try:
-                        record.actor.stop()
-                    except pykka.ActorDeadError:
-                        pass
-                    record.actor = None
-                record.cloud_node = None
-
-    def _register_arvados_node(self, key, arv_node):
-        self._logger.info("Registering new Arvados node %s", key)
-        record = _ComputeNodeRecord(arvados_node=arv_node)
-        self.arvados_nodes.add(record)
-
-    def update_arvados_nodes(self, nodelist):
-        self._update_poll_time('arvados_nodes')
-        for key, node in self.arvados_nodes.update_from(nodelist):
-            self._register_arvados_node(key, node)
-        self.try_pairing()
-
-    def try_pairing(self):
-        for record in self.cloud_nodes.unpaired():
-            for arv_rec in self.arvados_nodes.unpaired():
-                if record.actor is not None and record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
-                    self._pair_nodes(record, arv_rec.arvados_node)
-                    break
-
-    def _nodes_booting(self, size):
-        s = sum(1
-                for c in self.booting.iterkeys()
-                if size is None or self.sizes_booting[c].id == size.id)
-        return s
-
-    def _node_states(self, size):
-        proxy_states = []
-        states = []
-        for rec in self.cloud_nodes.nodes.itervalues():
-            if size is None or rec.cloud_node.size.id == size.id:
-                if rec.shutdown_actor is None and rec.actor is not None:
-                    proxy_states.append(rec.actor.get_state())
-                else:
-                    states.append("shutdown")
-        return states + pykka.get_all(proxy_states)
-
-    def _update_tracker(self):
-        updates = {
-            k: 0
-            for k in status.tracker.keys()
-            if k.startswith('nodes_')
-        }
-        for s in self._node_states(size=None):
-            updates.setdefault('nodes_'+s, 0)
-            updates['nodes_'+s] += 1
-        updates['nodes_wish'] = len(self.last_wishlist)
-        updates['node_quota'] = self.node_quota
-        status.tracker.update(updates)
-
-    def _state_counts(self, size):
-        states = self._node_states(size)
-        counts = {
-            "booting": self._nodes_booting(size),
-            "unpaired": 0,
-            "busy": 0,
-            "idle": 0,
-            "fail": 0,
-            "down": 0,
-            "shutdown": 0
-        }
-        for s in states:
-            counts[s] = counts[s] + 1
-        return counts
-
-    def _nodes_up(self, counts):
-        up = counts["booting"] + counts["unpaired"] + counts["idle"] + counts["busy"]
-        return up
-
-    def _total_price(self):
-        cost = 0
-        cost += sum(self.sizes_booting[c].price
-                    for c in self.booting.iterkeys())
-        cost += sum(c.cloud_node.size.price
-                    for c in self.cloud_nodes.nodes.itervalues())
-        return cost
-
-    def _size_wishlist(self, size):
-        return sum(1 for c in self.last_wishlist if c.id == size.id)
-
-    def _nodes_wanted(self, size):
-        total_node_count = self._nodes_booting(None) + len(self.cloud_nodes)
-        under_min = self.min_nodes - total_node_count
-        over_max = total_node_count - self.node_quota
-        total_price = self._total_price()
-
-        counts = self._state_counts(size)
-
-        up_count = self._nodes_up(counts)
-        busy_count = counts["busy"]
-        wishlist_count = self._size_wishlist(size)
-
-        self._logger.info("%s: wishlist %i, up %i (booting %i, unpaired %i, idle %i, busy %i), down %i, shutdown %i", size.id,
-                          wishlist_count,
-                          up_count,
-                          counts["booting"],
-                          counts["unpaired"],
-                          counts["idle"],
-                          busy_count,
-                          counts["down"]+counts["fail"],
-                          counts["shutdown"])
-
-        if over_max >= 0:
-            return -over_max
-        elif under_min > 0 and size.id == self.min_cloud_size.id:
-            return under_min
-
-        wanted = wishlist_count - (up_count - busy_count)
-        if wanted > 0 and self.max_total_price and ((total_price + (size.price*wanted)) > self.max_total_price):
-            can_boot = int((self.max_total_price - total_price) / size.price)
-            if can_boot == 0:
-                self._logger.info("Not booting %s (price %s) because with it would exceed max_total_price of %s (current total_price is %s)",
-                                  size.id, size.price, self.max_total_price, total_price)
-            return can_boot
-        else:
-            return wanted
-
-    def _nodes_excess(self, size):
-        counts = self._state_counts(size)
-        up_count = self._nodes_up(counts)
-        if size.id == self.min_cloud_size.id:
-            up_count -= self.min_nodes
-        return up_count - (counts["busy"] + self._size_wishlist(size))
-
-    def update_server_wishlist(self, wishlist):
-        self._update_poll_time('server_wishlist')
-        requestable_nodes = self.node_quota - (self._nodes_booting(None) + len(self.cloud_nodes))
-        self.last_wishlist = wishlist[:requestable_nodes]
-        for size in reversed(self.server_calculator.cloud_sizes):
-            try:
-                nodes_wanted = self._nodes_wanted(size)
-                if nodes_wanted > 0:
-                    self._later.start_node(size)
-                elif (nodes_wanted < 0) and self.booting:
-                    self._later.stop_booting_node(size)
-            except Exception:
-                self._logger.exception("while calculating nodes wanted for size %s", getattr(size, "id", "(id not available)"))
-        try:
-            self._update_tracker()
-        except:
-            self._logger.exception("while updating tracker")
-
-    def _check_poll_freshness(orig_func):
-        """Decorator to inhibit a method when poll information is stale.
-
-        This decorator checks the timestamps of all the poll information the
-        daemon has received.  The decorated method is only called if none
-        of the timestamps are considered stale.
-        """
-        @functools.wraps(orig_func)
-        def wrapper(self, *args, **kwargs):
-            now = time.time()
-            if all(now - t < self.poll_stale_after
-                   for t in self.last_polls.itervalues()):
-                return orig_func(self, *args, **kwargs)
-            else:
-                return None
-        return wrapper
-
-    @_check_poll_freshness
-    def start_node(self, cloud_size):
-        nodes_wanted = self._nodes_wanted(cloud_size)
-        if nodes_wanted < 1:
-            return None
-
-        if not self.cancel_node_shutdown(cloud_size):
-            arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
-            self._logger.info("Want %i more %s nodes.  Booting a node.",
-                              nodes_wanted, cloud_size.id)
-            new_setup = self._node_setup.start(
-                timer_actor=self._timer,
-                arvados_client=self._new_arvados(),
-                arvados_node=arvados_node,
-                cloud_client=self._new_cloud(),
-                cloud_size=self.server_calculator.find_size(cloud_size.id))
-            self.booting[new_setup.actor_urn] = new_setup.proxy()
-            self.sizes_booting[new_setup.actor_urn] = cloud_size
-
-            if arvados_node is not None:
-                self.arvados_nodes[arvados_node['uuid']].assignment_time = (
-                    time.time())
-            new_setup.tell_proxy().subscribe(self._later.node_setup_finished)
-
-        if nodes_wanted > 1:
-            self._later.start_node(cloud_size)
-
-    def _get_actor_attrs(self, actor, *attr_names):
-        return pykka.get_all([getattr(actor, name) for name in attr_names])
-
-    def node_setup_finished(self, setup_proxy):
-        # Called when a SetupActor has completed.
-        cloud_node, arvados_node, error = self._get_actor_attrs(
-            setup_proxy, 'cloud_node', 'arvados_node', 'error')
-        setup_proxy.stop()
-
-        if cloud_node is None:
-            # If cloud_node is None then the node creation wasn't successful.
-            if error == dispatch.QuotaExceeded:
-                # We've hit a quota limit, so adjust node_quota to stop trying to
-                # boot new nodes until the node count goes down.
-                self.node_quota = len(self.cloud_nodes)
-                self._logger.warning("After quota exceeded error setting node quota to %s", self.node_quota)
-        else:
-            # Node creation succeeded.  Update cloud node list.
-            cloud_node._nodemanager_recently_booted = True
-            self._register_cloud_node(cloud_node)
-
-            # Different quota policies may be in force depending on the cloud
-            # provider, account limits, and the specific mix of node sizes
-            # that are already created.  If we are right at the quota limit,
-            # we want to probe to see if the last quota still applies or if we
-            # are allowed to create more nodes.
-            #
-            # For example, if the quota is actually based on core count, the
-            # quota might be 20 single-core machines or 10 dual-core machines.
-            # If we previously set node_quota to 10 dual core machines, but are
-            # now booting single core machines (actual quota 20), we want to
-            # allow the quota to expand so we don't get stuck at 10 machines
-            # forever.
-            if len(self.cloud_nodes) >= self.node_quota:
-                self.node_quota = len(self.cloud_nodes)+1
-                self._logger.warning("After successful boot setting node quota to %s", self.node_quota)
-
-        self.node_quota = min(self.node_quota, self.max_nodes)
-        del self.booting[setup_proxy.actor_ref.actor_urn]
-        del self.sizes_booting[setup_proxy.actor_ref.actor_urn]
-
-    @_check_poll_freshness
-    def stop_booting_node(self, size):
-        nodes_excess = self._nodes_excess(size)
-        if (nodes_excess < 1) or not self.booting:
-            return None
-        for key, node in self.booting.iteritems():
-            try:
-                if node and node.cloud_size.get().id == size.id and node.stop_if_no_cloud_node().get(2):
-                    del self.booting[key]
-                    del self.sizes_booting[key]
-                    if nodes_excess > 1:
-                        self._later.stop_booting_node(size)
-                    return
-            except pykka.Timeout:
-                pass
-
-    @_check_poll_freshness
-    def cancel_node_shutdown(self, size):
-        # Go through shutdown actors and see if there are any of the appropriate size that can be cancelled
-        for record in self.cloud_nodes.nodes.itervalues():
-            try:
-                if (record.shutdown_actor is not None and
-                    record.cloud_node.size.id == size.id and
-                    record.shutdown_actor.cancel_shutdown("Node size is in wishlist").get(2)):
-                        return True
-            except (pykka.ActorDeadError, pykka.Timeout) as e:
-                pass
-        return False
-
-    def _begin_node_shutdown(self, node_actor, cancellable):
-        cloud_node_obj = node_actor.cloud_node.get()
-        cloud_node_id = cloud_node_obj.id
-        record = self.cloud_nodes[cloud_node_id]
-        if record.shutdown_actor is not None:
-            return None
-        shutdown = self._node_shutdown.start(
-            timer_actor=self._timer, cloud_client=self._new_cloud(),
-            arvados_client=self._new_arvados(),
-            node_monitor=node_actor.actor_ref, cancellable=cancellable)
-        record.shutdown_actor = shutdown.proxy()
-        shutdown.tell_proxy().subscribe(self._later.node_finished_shutdown)
-
-    @_check_poll_freshness
-    def node_can_shutdown(self, node_actor):
-        try:
-            if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
-                self._begin_node_shutdown(node_actor, cancellable=True)
-            elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
-                # Node is unpaired, which means it probably exceeded its booting
-                # grace period without a ping, so shut it down so we can boot a new
-                # node in its place.
-                self._begin_node_shutdown(node_actor, cancellable=False)
-            elif node_actor.in_state('down', 'fail').get():
-                # Node is down and unlikely to come back.
-                self._begin_node_shutdown(node_actor, cancellable=False)
-        except pykka.ActorDeadError as e:
-            # The monitor actor sends shutdown suggestions every time the
-            # node's state is updated, and these go into the daemon actor's
-            # message queue.  It's possible that the node has already been shut
-            # down (which shuts down the node monitor actor).  In that case,
-            # this message is stale and we'll get ActorDeadError when we try to
-            # access node_actor.  Log the error.
-            self._logger.debug("ActorDeadError in node_can_shutdown: %s", e)
-
-    def node_finished_shutdown(self, shutdown_actor):
-        try:
-            cloud_node, success = self._get_actor_attrs(
-                shutdown_actor, 'cloud_node', 'success')
-        except pykka.ActorDeadError:
-            return
-        cloud_node_id = cloud_node.id
-
-        try:
-            shutdown_actor.stop()
-        except pykka.ActorDeadError:
-            pass
-
-        try:
-            record = self.cloud_nodes[cloud_node_id]
-        except KeyError:
-            # Cloud node was already removed from the cloud node list,
-            # presumably while the destroy_node call was finishing its
-            # job.
-            return
-        record.shutdown_actor = None
-
-        if not success:
-            return
-
-        # Shutdown was successful, so stop the monitor actor, otherwise it
-        # will keep offering the node as a candidate for shutdown.
-        record.actor.stop()
-        record.actor = None
-
-        # If the node went from being booted to being shut down without ever
-        # appearing in the cloud node list, it will have the
-        # _nodemanager_recently_booted tag, so get rid of it so that the node
-        # can be forgotten completely.
-        if hasattr(record.cloud_node, "_nodemanager_recently_booted"):
-            del record.cloud_node._nodemanager_recently_booted
-
-    def shutdown(self):
-        self._logger.info("Shutting down after signal.")
-        self.poll_stale_after = -1  # Inhibit starting/stopping nodes
-
-        # Shut down pollers
-        self._server_wishlist_actor.stop()
-        self._arvados_nodes_actor.stop()
-        self._cloud_nodes_actor.stop()
-
-        # Clear cloud node list
-        self.update_cloud_nodes([])
-
-        # Stop setup actors unless they are in the middle of setup.
-        setup_stops = {key: node.stop_if_no_cloud_node()
-                       for key, node in self.booting.iteritems()}
-        self.booting = {key: self.booting[key]
-                        for key in setup_stops if not setup_stops[key].get()}
-        self._later.await_shutdown()
-
-    def await_shutdown(self):
-        if self.booting:
-            self._timer.schedule(time.time() + 1, self._later.await_shutdown)
-        else:
-            self.stop()
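
A minimal standalone sketch of the _check_poll_freshness pattern used in the daemon above; the stand-in class, its attributes, and the 600-second threshold (the poll_stale_after value from the example configurations) are illustrative only:

    import functools
    import time

    def check_poll_freshness(orig_func):
        """Call the wrapped method only while every poll timestamp is fresh."""
        @functools.wraps(orig_func)
        def wrapper(self, *args, **kwargs):
            now = time.time()
            if all(now - t < self.poll_stale_after for t in self.last_polls.values()):
                return orig_func(self, *args, **kwargs)
            return None
        return wrapper

    class DaemonStandIn(object):
        poll_stale_after = 600  # seconds, as in the example configurations

        def __init__(self):
            self.last_polls = {'cloud_nodes': time.time(),
                               'arvados_nodes': time.time(),
                               'server_wishlist': time.time()}

        @check_poll_freshness
        def start_node(self, size):
            return "booting %s" % size

    print(DaemonStandIn().start_node("Standard_D3"))  # runs; stale polls would return None
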
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
deleted file mode 100644 (file)
index 7ca9c95..0000000
+++ /dev/null
@@ -1,255 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import logging
-import re
-import subprocess32 as subprocess
-
-import arvados.util
-
-from . import clientactor
-from .config import ARVADOS_ERRORS
-
-
-class ServerCalculator(object):
-    """Generate cloud server wishlists from an Arvados job queue.
-
-    Instantiate this class with a list of cloud node sizes you're willing to
-    use, plus keyword overrides from the configuration.  Then you can pass
-    job queues to servers_for_queue.  It will return a list of node sizes
-    that would best satisfy the jobs, choosing the cheapest size that
-    satisfies each job, and ignoring jobs that can't be satisfied.
-    """
-    class InvalidCloudSize(object):
-        """
-        Dummy CloudSizeWrapper-like class, to be used when a cloud node doesn't
-        have a recognizable arvados_node_size tag.
-        """
-        def __init__(self):
-            self.id = 'invalid'
-            self.name = 'invalid'
-            self.ram = 0
-            self.disk = 0
-            self.scratch = 0
-            self.cores = 0
-            self.bandwidth = 0
-            # price is multiplied by 1000 to get the node weight
-            # the maximum node weight is                  4294967280
-            # so use invalid node weight 4294967 * 1000 = 4294967000
-            self.price = 4294967
-            self.preemptible = False
-            self.extra = {}
-
-        def meets_constraints(self, **kwargs):
-            return False
-
-
-    class CloudSizeWrapper(object):
-        def __init__(self, real_size, node_mem_scaling, **kwargs):
-            self.real = real_size
-            for name in ['id', 'name', 'ram', 'disk', 'bandwidth', 'price',
-                         'extra']:
-                setattr(self, name, getattr(self.real, name))
-            self.cores = kwargs.pop('cores')
-            # libcloud disk sizes are in GB, Arvados/SLURM are in MB
-            # multiply by 1000 instead of 1024 to err on low side
-            if self.disk is None:
-                self.disk = 0
-            self.scratch = self.disk * 1000
-            self.ram = int(self.ram * node_mem_scaling)
-            self.preemptible = False
-            for name, override in kwargs.iteritems():
-                if name == 'instance_type': continue
-                if not hasattr(self, name):
-                    raise ValueError("unrecognized size field '%s'" % (name,))
-                setattr(self, name, override)
-
-            if self.price is None:
-                raise ValueError("Required field 'price' is None")
-
-        def meets_constraints(self, **kwargs):
-            for name, want_value in kwargs.iteritems():
-                have_value = getattr(self, name)
-                if (have_value != 0) and (have_value < want_value):
-                    return False
-            return True
-
-
-    def __init__(self, server_list, max_nodes=None, max_price=None,
-                 node_mem_scaling=0.95):
-        self.cloud_sizes = [self.CloudSizeWrapper(s, node_mem_scaling, **kws)
-                            for s, kws in server_list]
-        self.cloud_sizes.sort(key=lambda s: s.price)
-        self.max_nodes = max_nodes or float('inf')
-        self.max_price = max_price or float('inf')
-        self.logger = logging.getLogger('arvnodeman.jobqueue')
-
-        self.logger.info("Using cloud node sizes:")
-        for s in self.cloud_sizes:
-            self.logger.info(str(s.__dict__))
-
-    @staticmethod
-    def coerce_int(x, fallback):
-        try:
-            return int(x)
-        except (TypeError, ValueError):
-            return fallback
-
-    def cloud_size_for_constraints(self, constraints):
-        specified_size = constraints.get('instance_type')
-        want_value = lambda key: self.coerce_int(constraints.get(key), 0)
-        wants = {'cores': want_value('min_cores_per_node'),
-                 'ram': want_value('min_ram_mb_per_node'),
-                 'scratch': want_value('min_scratch_mb_per_node')}
-        # EC2 node sizes are identified by id. GCE sizes are identified by name.
-        for size in self.cloud_sizes:
-            if (size.meets_constraints(**wants) and
-                (specified_size is None or
-                 size.id == specified_size or size.name == specified_size)):
-                return size
-        return None
-
-    def servers_for_queue(self, queue):
-        servers = []
-        unsatisfiable_jobs = {}
-        for job in queue:
-            constraints = job['runtime_constraints']
-            want_count = max(1, self.coerce_int(constraints.get('min_nodes'), 1))
-            cloud_size = self.cloud_size_for_constraints(constraints)
-            if cloud_size is None:
-                unsatisfiable_jobs[job['uuid']] = (
-                    "Constraints cannot be satisfied by any node type")
-            elif (want_count > self.max_nodes):
-                unsatisfiable_jobs[job['uuid']] = (
-                    "Job's min_nodes constraint is greater than the configured "
-                    "max_nodes (%d)" % self.max_nodes)
-            elif (want_count*cloud_size.price <= self.max_price):
-                servers.extend([cloud_size] * want_count)
-            else:
-                unsatisfiable_jobs[job['uuid']] = (
-                    "Job's price (%d) is above system's max_price "
-                    "limit (%d)" % (want_count*cloud_size.price, self.max_price))
-        return (servers, unsatisfiable_jobs)
-
-    def cheapest_size(self):
-        return self.cloud_sizes[0]
-
-    def find_size(self, sizeid):
-        for s in self.cloud_sizes:
-            if s.id == sizeid:
-                return s
-        return self.InvalidCloudSize()
-
-
-class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
-    """Actor to generate server wishlists from the job queue.
-
-    This actor regularly polls Arvados' job queue, and uses the provided
-    ServerCalculator to turn that into a list of requested node sizes.  That
-    list is sent to subscribers on every poll.
-    """
-
-    CLIENT_ERRORS = ARVADOS_ERRORS
-
-    def __init__(self, client, timer_actor, server_calc,
-                 jobs_queue, slurm_queue, *args, **kwargs):
-        super(JobQueueMonitorActor, self).__init__(
-            client, timer_actor, *args, **kwargs)
-        self.jobs_queue = jobs_queue
-        self.slurm_queue = slurm_queue
-        self._calculator = server_calc
-
-    @staticmethod
-    def coerce_to_mb(x):
-        v, u = x[:-1], x[-1]
-        if u in ("M", "m"):
-            return int(v)
-        elif u in ("G", "g"):
-            return float(v) * 2**10
-        elif u in ("T", "t"):
-            return float(v) * 2**20
-        elif u in ("P", "p"):
-            return float(v) * 2**30
-        else:
-            return int(x)
-
-    def _send_request(self):
-        queuelist = []
-        if self.slurm_queue:
-            # cpus, memory, temporary disk space, reason, job name, feature constraints, priority
-            squeue_out = subprocess.check_output(["squeue", "--state=PENDING", "--noheader", "--format=%c|%m|%d|%r|%j|%f|%Q"])
-            for out in squeue_out.splitlines():
-                try:
-                    cpu, ram, disk, reason, jobname, features, priority = out.split("|", 6)
-                except ValueError:
-                    self._logger.warning("ignored malformed line in squeue output: %r", out)
-                    continue
-                if '-dz642-' not in jobname:
-                    continue
-                if not re.search(r'BadConstraints|ReqNodeNotAvail|Resources|Priority', reason):
-                    continue
-
-                for feature in features.split(','):
-                    m = re.match(r'instancetype=(.*)', feature)
-                    if not m:
-                        continue
-                    instance_type = m.group(1)
-                    # Ignore cpu/ram/scratch requirements, bring up
-                    # the requested node type.
-                    queuelist.append({
-                        "uuid": jobname,
-                        "runtime_constraints": {
-                            "instance_type": instance_type,
-                        },
-                        "priority": int(priority)
-                    })
-                    break
-                else:
-                    # No instance type specified. Choose a node type
-                    # to suit cpu/ram/scratch requirements.
-                    queuelist.append({
-                        "uuid": jobname,
-                        "runtime_constraints": {
-                            "min_cores_per_node": cpu,
-                            "min_ram_mb_per_node": self.coerce_to_mb(ram),
-                            "min_scratch_mb_per_node": self.coerce_to_mb(disk)
-                        },
-                        "priority": int(priority)
-                    })
-            queuelist.sort(key=lambda x: x.get('priority', 1), reverse=True)
-
-        if self.jobs_queue:
-            queuelist.extend(self._client.jobs().queue().execute()['items'])
-
-        return queuelist
-
-    def _got_response(self, queue):
-        server_list, unsatisfiable_jobs = self._calculator.servers_for_queue(queue)
-        # Cancel any job/container with unsatisfiable requirements, emitting
-        # a log explaining why.
-        for job_uuid, reason in unsatisfiable_jobs.iteritems():
-            try:
-                self._client.logs().create(body={
-                    'object_uuid': job_uuid,
-                    'event_type': 'stderr',
-                    'properties': {'text': reason},
-                }).execute()
-                # Cancel the job depending on its type
-                if arvados.util.container_uuid_pattern.match(job_uuid):
-                    subprocess.check_call(['scancel', '--name='+job_uuid])
-                elif arvados.util.job_uuid_pattern.match(job_uuid):
-                    self._client.jobs().cancel(uuid=job_uuid).execute()
-                else:
-                    raise Exception('Unknown job type')
-                self._logger.debug("Cancelled unsatisfiable job '%s'", job_uuid)
-            except Exception as error:
-                self._logger.error("Trying to cancel job '%s': %s",
-                                   job_uuid,
-                                   error)
-        self._logger.debug("Calculated wishlist: %s",
-                           ', '.join(s.id for s in server_list) or "(empty)")
-        return super(JobQueueMonitorActor, self)._got_response(server_list)
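
The servers_for_queue logic above picks, for each job, the cheapest size that satisfies its constraints. A minimal standalone sketch of that selection step; the size names, prices, and constraint values are illustrative, not taken from any real cloud:

    class Size(object):
        def __init__(self, id, cores, ram, price):
            self.id, self.cores, self.ram, self.price = id, cores, ram, price

    # Sizes kept sorted by price, as ServerCalculator does.
    sizes = sorted([Size("small", 2, 3500, 0.10), Size("big", 8, 14000, 0.40)],
                   key=lambda s: s.price)

    def cheapest_size_for(constraints):
        for s in sizes:
            if (s.cores >= constraints.get('min_cores_per_node', 0) and
                    s.ram >= constraints.get('min_ram_mb_per_node', 0)):
                return s
        return None  # unsatisfiable with the configured sizes

    wishlist = []
    for job in [{'min_cores_per_node': 1}, {'min_ram_mb_per_node': 8000}]:
        size = cheapest_size_for(job)
        if size:
            wishlist.append(size.id)
    print(wishlist)  # ['small', 'big']
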
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
deleted file mode 100644 (file)
index 34ea9ad..0000000
+++ /dev/null
@@ -1,171 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import argparse
-import logging
-import signal
-import sys
-import time
-
-import daemon
-import pykka
-import libcloud
-
-from . import config as nmconfig
-from . import status
-from .baseactor import WatchdogActor
-from .daemon import NodeManagerDaemonActor
-from .jobqueue import JobQueueMonitorActor, ServerCalculator
-from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor
-from .timedcallback import TimedCallBackActor
-from ._version import __version__
-
-node_daemon = None
-watchdog = None
-
-def abort(msg, code=1):
-    print("arvados-node-manager: " + msg)
-    sys.exit(code)
-
-def parse_cli(args):
-    parser = argparse.ArgumentParser(
-        prog='arvados-node-manager',
-        description="Dynamically allocate Arvados cloud compute nodes")
-    parser.add_argument(
-        '--version', action='version',
-        version="%s %s" % (sys.argv[0], __version__),
-        help='Print version and exit.')
-    parser.add_argument(
-        '--foreground', action='store_true', default=False,
-        help="Run in the foreground.  Don't daemonize.")
-    parser.add_argument(
-        '--config', help="Path to configuration file")
-    return parser.parse_args(args)
-
-def load_config(path):
-    if not path:
-        abort("No --config file specified", 2)
-    config = nmconfig.NodeManagerConfig()
-    try:
-        with open(path) as config_file:
-            config.readfp(config_file)
-    except (IOError, OSError) as error:
-        abort("Error reading configuration file {}: {}".format(path, error))
-    return config
-
-def setup_logging(path, level, **sublevels):
-    handler = logging.FileHandler(path)
-    handler.setFormatter(logging.Formatter(
-            '%(asctime)s %(name)s[%(process)d] %(levelname)s: %(message)s',
-            '%Y-%m-%d %H:%M:%S'))
-    root_logger = logging.getLogger()
-    root_logger.addHandler(handler)
-    root_logger.setLevel(level)
-    for logger_name, sublevel in sublevels.iteritems():
-        sublogger = logging.getLogger(logger_name)
-        sublogger.setLevel(sublevel)
-    return root_logger
-
-def build_server_calculator(config):
-    cloud_size_list = config.node_sizes()
-    if not cloud_size_list:
-        abort("No valid node sizes configured")
-    return ServerCalculator(cloud_size_list,
-                            config.getint('Daemon', 'max_nodes'),
-                            config.getfloat('Daemon', 'max_total_price'),
-                            config.getfloat('Daemon', 'node_mem_scaling'))
-
-def launch_pollers(config, server_calculator):
-    poll_time = config.getfloat('Daemon', 'poll_time')
-    max_poll_time = config.getint('Daemon', 'max_poll_time')
-
-    cloudlist_poll_time = config.getfloat('Daemon', 'cloudlist_poll_time') or poll_time
-    nodelist_poll_time = config.getfloat('Daemon', 'nodelist_poll_time') or poll_time
-    wishlist_poll_time = config.getfloat('Daemon', 'wishlist_poll_time') or poll_time
-
-    timer = TimedCallBackActor.start(poll_time / 10.0).tell_proxy()
-    cloud_node_poller = CloudNodeListMonitorActor.start(
-        config.new_cloud_client(), timer, server_calculator, cloudlist_poll_time, max_poll_time).tell_proxy()
-    arvados_node_poller = ArvadosNodeListMonitorActor.start(
-        config.new_arvados_client(), timer, nodelist_poll_time, max_poll_time).tell_proxy()
-    job_queue_poller = JobQueueMonitorActor.start(
-        config.new_arvados_client(), timer, server_calculator,
-        config.getboolean('Arvados', 'jobs_queue'),
-        config.getboolean('Arvados', 'slurm_queue'),
-        wishlist_poll_time, max_poll_time
-    ).tell_proxy()
-    return timer, cloud_node_poller, arvados_node_poller, job_queue_poller
-
-_caught_signals = {}
-def shutdown_signal(signal_code, frame):
-    current_count = _caught_signals.get(signal_code, 0)
-    _caught_signals[signal_code] = current_count + 1
-    if node_daemon is None:
-        pykka.ActorRegistry.stop_all()
-        sys.exit(-signal_code)
-    elif current_count == 0:
-        watchdog.stop()
-        node_daemon.shutdown()
-    elif current_count == 1:
-        pykka.ActorRegistry.stop_all()
-    else:
-        sys.exit(-signal_code)
-
-def main(args=None):
-    global node_daemon, watchdog
-    args = parse_cli(args)
-    config = load_config(args.config)
-
-    if not args.foreground:
-        daemon.DaemonContext().open()
-    for sigcode in [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]:
-        signal.signal(sigcode, shutdown_signal)
-
-    status.Server(config).start()
-
-    try:
-        root_logger = setup_logging(config.get('Logging', 'file'), **config.log_levels())
-        root_logger.info("%s %s started, libcloud %s", sys.argv[0], __version__, libcloud.__version__)
-        node_setup, node_shutdown, node_update, node_monitor = \
-            config.dispatch_classes()
-        server_calculator = build_server_calculator(config)
-        timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
-            launch_pollers(config, server_calculator)
-        cloud_node_updater = node_update.start(config.new_cloud_client, timer).tell_proxy()
-        node_daemon = NodeManagerDaemonActor.start(
-            job_queue_poller, arvados_node_poller, cloud_node_poller,
-            cloud_node_updater, timer,
-            config.new_arvados_client, config.new_cloud_client,
-            config.shutdown_windows(),
-            server_calculator,
-            config.getint('Daemon', 'min_nodes'),
-            config.getint('Daemon', 'max_nodes'),
-            config.getint('Daemon', 'poll_stale_after'),
-            config.getint('Daemon', 'boot_fail_after'),
-            config.getint('Daemon', 'node_stale_after'),
-            node_setup, node_shutdown, node_monitor,
-            max_total_price=config.getfloat('Daemon', 'max_total_price'),
-            consecutive_idle_count=config.getint('Daemon', 'consecutive_idle_count'),).tell_proxy()
-
-        watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
-                            cloud_node_poller.actor_ref,
-                            arvados_node_poller.actor_ref,
-                            job_queue_poller.actor_ref,
-                            node_daemon.actor_ref)
-
-        signal.pause()
-        daemon_stopped = node_daemon.actor_ref.actor_stopped.is_set
-        while not daemon_stopped():
-            time.sleep(1)
-    except Exception:
-        logging.exception("Uncaught exception during setup")
-    finally:
-        pykka.ActorRegistry.stop_all()
-
-
-if __name__ == '__main__':
-    main()
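
The shutdown_signal handler above escalates on repeated signals: the first requests a graceful daemon shutdown, the second stops all actors, and any further signal exits the process. A small sketch of that escalation pattern on its own; the printed messages stand in for the real shutdown calls:

    import signal
    import sys

    _caught = {}

    def shutdown_signal(signum, frame):
        """First signal: graceful shutdown; second: stop actors; third: exit."""
        count = _caught.get(signum, 0)
        _caught[signum] = count + 1
        if count == 0:
            print("graceful shutdown requested")
        elif count == 1:
            print("forcing actor shutdown")
        else:
            sys.exit(-signum)

    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, shutdown_signal)
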
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
deleted file mode 100644 (file)
index 0abb3b3..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import subprocess32 as subprocess
-
-from . import clientactor
-from . import config
-
-import arvados.util
-
-class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
-    """Actor to poll the Arvados node list.
-
-    This actor regularly polls the list of Arvados node records,
-    augments it with the latest SLURM node info (`sinfo`), and sends
-    it to subscribers.
-    """
-
-    def is_common_error(self, exception):
-        return isinstance(exception, config.ARVADOS_ERRORS)
-
-    def _item_key(self, node):
-        return node['uuid']
-
-    def _send_request(self):
-        nodelist = arvados.util.list_all(self._client.nodes().list)
-
-        # node hostname, state, features
-        sinfo_out = subprocess.check_output(["sinfo", "--noheader", "--format=%n|%t|%f"])
-        nodestates = {}
-        nodefeatures = {}
-        for out in sinfo_out.splitlines():
-            try:
-                nodename, state, features = out.split("|", 3)
-            except ValueError:
-                continue
-            if state in ('alloc', 'alloc*',
-                         'comp',  'comp*',
-                         'mix',   'mix*',
-                         'drng',  'drng*'):
-                nodestates[nodename] = 'busy'
-            elif state in ('idle', 'fail'):
-                nodestates[nodename] = state
-            else:
-                nodestates[nodename] = 'down'
-            if features != "(null)":
-                nodefeatures[nodename] = features
-
-        for n in nodelist:
-            if n["slot_number"] and n["hostname"] and n["hostname"] in nodestates:
-                n["crunch_worker_state"] = nodestates[n["hostname"]]
-            else:
-                n["crunch_worker_state"] = 'down'
-            n["slurm_node_features"] = nodefeatures.get(n["hostname"], "")
-
-        return nodelist
-
-class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
-    """Actor to poll the cloud node list.
-
-    This actor regularly polls the cloud to get a list of running compute
-    nodes, and sends it to subscribers.
-    """
-
-    def __init__(self, client, timer_actor, server_calc, *args, **kwargs):
-        super(CloudNodeListMonitorActor, self).__init__(
-            client, timer_actor, *args, **kwargs)
-        self._calculator = server_calc
-
-    def is_common_error(self, exception):
-        return isinstance(exception, config.CLOUD_ERRORS)
-
-    def _item_key(self, node):
-        return node.id
-
-    def _send_request(self):
-        nodes = self._client.list_nodes()
-        for n in nodes:
-            # Replace the libcloud NodeSize object with compatible
-            # CloudSizeWrapper object which merges the size info reported from
-            # the cloud with size information from the configuration file.
-            n.size = self._calculator.find_size(n.extra['arvados_node_size'])
-        return nodes
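
ArvadosNodeListMonitorActor above folds SLURM state into each node record. A standalone sketch of the same state mapping, run against one line of `sinfo --noheader --format=%n|%t|%f` output; the hostname and feature string are made up:

    busy_states = ('alloc', 'alloc*', 'comp', 'comp*', 'mix', 'mix*', 'drng', 'drng*')

    def worker_state(sinfo_line):
        nodename, state, features = sinfo_line.split("|", 2)
        if state in busy_states:
            crunch_state = 'busy'
        elif state in ('idle', 'fail'):
            crunch_state = state
        else:
            crunch_state = 'down'
        return nodename, crunch_state, ("" if features == "(null)" else features)

    print(worker_state("compute0|alloc|instancetype=m4.xlarge"))
    # ('compute0', 'busy', 'instancetype=m4.xlarge')
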
diff --git a/services/nodemanager/arvnodeman/status.py b/services/nodemanager/arvnodeman/status.py
deleted file mode 100644 (file)
index 1e18996..0000000
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-from future import standard_library
-
-import http.server
-import time
-import json
-import logging
-import socketserver
-import threading
-
-from ._version import __version__
-
-_logger = logging.getLogger('status.Handler')
-
-
-class Server(socketserver.ThreadingMixIn, http.server.HTTPServer, object):
-    def __init__(self, config):
-        port = config.getint('Manage', 'port')
-        self.enabled = port >= 0
-        if not self.enabled:
-            _logger.warning("Management server disabled. "+
-                            "Use [Manage] config section to enable.")
-            return
-        self._config = config
-        self._tracker = tracker
-        self._tracker.update({'config_max_nodes': config.getint('Daemon', 'max_nodes')})
-        super(Server, self).__init__(
-            (config.get('Manage', 'address'), port), Handler)
-        self._thread = threading.Thread(target=self.serve_forever)
-        self._thread.daemon = True
-
-    def start(self):
-        if self.enabled:
-            self._thread.start()
-
-
-class Handler(http.server.BaseHTTPRequestHandler, object):
-    def do_GET(self):
-        if self.path == '/status.json':
-            self.send_response(200)
-            self.send_header('Content-type', 'application/json')
-            self.end_headers()
-            self.wfile.write(tracker.get_json())
-        elif self.path == '/_health/ping':
-            code, msg = self.check_auth()
-
-            if code != 200:
-              self.send_response(code)
-              self.wfile.write(msg)
-            else:
-              self.send_response(200)
-              self.send_header('Content-type', 'application/json')
-              self.end_headers()
-              self.wfile.write(json.dumps({"health":"OK"}))
-        else:
-            self.send_response(404)
-
-    def log_message(self, fmt, *args, **kwargs):
-        _logger.info(fmt, *args, **kwargs)
-
-    def check_auth(self):
-        mgmt_token = self.server._config.get('Manage', 'ManagementToken')
-        auth_header = self.headers.get('Authorization', None)
-
-        if mgmt_token == '':
-          return 404, "disabled"
-        elif auth_header is None:
-          return 401, "authorization required"
-        elif auth_header != 'Bearer '+mgmt_token:
-          return 403, "authorization error"
-        return 200, ""
-
-class Tracker(object):
-    def __init__(self):
-        self._mtx = threading.Lock()
-        self._latest = {
-            'list_nodes_errors': 0,
-            'create_node_errors': 0,
-            'destroy_node_errors': 0,
-            'boot_failures': 0,
-            'actor_exceptions': 0
-        }
-        self._version = {'Version' : __version__}
-        self._idle_nodes = {}
-
-    def get_json(self):
-        with self._mtx:
-            times = {'idle_times' : {}}
-            now = time.time()
-            for node, ts in self._idle_nodes.items():
-                times['idle_times'][node] = int(now - ts)
-            return json.dumps(
-                dict(dict(self._latest, **self._version), **times))
-
-    def keys(self):
-        with self._mtx:
-            return self._latest.keys()
-
-    def get(self, key):
-        with self._mtx:
-            return self._latest.get(key)
-
-    def update(self, updates):
-        with self._mtx:
-            self._latest.update(updates)
-
-    def counter_add(self, counter, value=1):
-        with self._mtx:
-            self._latest.setdefault(counter, 0)
-            self._latest[counter] += value
-
-    def idle_in(self, nodename):
-        with self._mtx:
-            if self._idle_nodes.get(nodename):
-                return
-            self._idle_nodes[nodename] = time.time()
-
-    def idle_out(self, nodename):
-        with self._mtx:
-            try:
-                del self._idle_nodes[nodename]
-            except KeyError:
-                pass
-
-tracker = Tracker()
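
The management server above serves /status.json and a token-protected /_health/ping. A sketch of how an operator might probe both endpoints, assuming the [Manage] port 8989 from the example configurations and a placeholder ManagementToken; the requests library is used here for brevity and is not part of this module:

    import requests

    base = "http://127.0.0.1:8989"   # [Manage] address and port (assumed)
    token = "xyzzy"                  # [Manage] ManagementToken (placeholder)

    status = requests.get(base + "/status.json").json()
    print(status.get("nodes_idle", 0), "idle nodes")

    ping = requests.get(base + "/_health/ping",
                        headers={"Authorization": "Bearer " + token})
    print(ping.status_code, ping.text)   # expect 200 and {"health": "OK"}
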
diff --git a/services/nodemanager/arvnodeman/test/fake_driver.py b/services/nodemanager/arvnodeman/test/fake_driver.py
deleted file mode 100644 (file)
index 2a592f9..0000000
+++ /dev/null
@@ -1,226 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-import re
-import urllib
-import ssl
-import time
-
-from arvnodeman.computenode import ARVADOS_TIMEFMT
-
-from libcloud.compute.base import NodeSize, Node, NodeDriver, NodeState, NodeImage
-from libcloud.compute.drivers.gce import GCEDiskType
-from libcloud.common.exceptions import BaseHTTPError, RateLimitReachedError
-
-all_nodes = []
-create_calls = 0
-quota = 2
-
-class FakeDriver(NodeDriver):
-    def __init__(self, *args, **kwargs):
-        self.name = "FakeDriver"
-
-    def list_sizes(self, **kwargs):
-        return [NodeSize("Standard_D3", "Standard_D3", 3500, 200, 0, 0, self),
-                NodeSize("Standard_D4", "Standard_D4", 7000, 400, 0, 0, self)]
-
-    def list_nodes(self, **kwargs):
-        return all_nodes
-
-    def create_node(self, name=None,
-                    size=None,
-                    image=None,
-                    auth=None,
-                    ex_storage_account=None,
-                    ex_customdata=None,
-                    ex_resource_group=None,
-                    ex_user_name=None,
-                    ex_tags=None,
-                    ex_metadata=None,
-                    ex_network=None,
-                    ex_userdata=None):
-        global all_nodes, create_calls
-        create_calls += 1
-        nodeid = "node%i" % create_calls
-        if ex_tags is None:
-            ex_tags = {}
-        ex_tags.update({'arvados_node_size': size.id})
-        n = Node(nodeid, nodeid, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags})
-        all_nodes.append(n)
-        if ex_customdata:
-            ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0]
-        if ex_userdata:
-            ping_url = ex_userdata
-        elif ex_metadata:
-            ping_url = ex_metadata["arv-ping-url"]
-        ping_url += "&instance_id=" + nodeid
-        ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
-        ctx.verify_mode = ssl.CERT_NONE
-        f = urllib.urlopen(ping_url, "", context=ctx)
-        f.close()
-        return n
-
-    def destroy_node(self, cloud_node):
-        global all_nodes
-        all_nodes = [n for n in all_nodes if n.id != cloud_node.id]
-        return True
-
-    def get_image(self, img):
-        pass
-
-    def ex_create_tags(self, cloud_node, tags):
-        pass
-
-class QuotaDriver(FakeDriver):
-    def create_node(self, name=None,
-                    size=None,
-                    image=None,
-                    auth=None,
-                    ex_storage_account=None,
-                    ex_customdata=None,
-                    ex_resource_group=None,
-                    ex_user_name=None,
-                    ex_tags=None,
-                    ex_network=None):
-        global all_nodes, create_calls, quota
-        if len(all_nodes) >= quota:
-            raise BaseHTTPError(503, "Quota exceeded")
-        else:
-            return super(QuotaDriver, self).create_node(name=name,
-                    size=size,
-                    image=image,
-                    auth=auth,
-                    ex_storage_account=ex_storage_account,
-                    ex_customdata=ex_customdata,
-                    ex_resource_group=ex_resource_group,
-                    ex_user_name=ex_user_name,
-                    ex_tags=ex_tags,
-                    ex_network=ex_network)
-
-    def destroy_node(self, cloud_node):
-        global all_nodes, quota
-        all_nodes = [n for n in all_nodes if n.id != cloud_node.id]
-        if len(all_nodes) == 0:
-            quota = 4
-        return True
-
-class FailingDriver(FakeDriver):
-    def create_node(self, name=None,
-                    size=None,
-                    image=None,
-                    auth=None,
-                    ex_storage_account=None,
-                    ex_customdata=None,
-                    ex_resource_group=None,
-                    ex_user_name=None,
-                    ex_tags=None,
-                    ex_network=None):
-        raise Exception("nope")
-
-class RetryDriver(FakeDriver):
-    def create_node(self, name=None,
-                    size=None,
-                    image=None,
-                    auth=None,
-                    ex_storage_account=None,
-                    ex_customdata=None,
-                    ex_resource_group=None,
-                    ex_user_name=None,
-                    ex_tags=None,
-                    ex_network=None):
-        global create_calls
-        create_calls += 1
-        if create_calls < 2:
-            raise RateLimitReachedError(429, "Rate limit exceeded",
-                                        headers={'retry-after': '2'})
-        elif create_calls < 3:
-            raise BaseHTTPError(429, "Rate limit exceeded",
-                                {'retry-after': '1'})
-        else:
-            return super(RetryDriver, self).create_node(name=name,
-                    size=size,
-                    image=image,
-                    auth=auth,
-                    ex_storage_account=ex_storage_account,
-                    ex_customdata=ex_customdata,
-                    ex_resource_group=ex_resource_group,
-                    ex_user_name=ex_user_name,
-                    ex_tags=ex_tags,
-                    ex_network=ex_network)
-
-class FakeAwsDriver(FakeDriver):
-
-    def create_node(self, name=None,
-                    size=None,
-                    image=None,
-                    auth=None,
-                    ex_userdata=None,
-                    ex_metadata=None,
-                    ex_blockdevicemappings=None):
-        n = super(FakeAwsDriver, self).create_node(name=name,
-                                                      size=size,
-                                                      image=image,
-                                                      auth=auth,
-                                                      ex_metadata=ex_metadata,
-                                                      ex_userdata=ex_userdata)
-        n.extra = {
-            "launch_time": time.strftime(ARVADOS_TIMEFMT, time.gmtime())[:-1],
-            "tags" : {
-                "arvados_node_size": size.id
-            }
-        }
-        return n
-
-    def list_sizes(self, **kwargs):
-        return [NodeSize("m3.xlarge", "Extra Large Instance", 3500, 80, 0, 0, self),
-                NodeSize("m4.xlarge", "Extra Large Instance", 3500, 0, 0, 0, self),
-                NodeSize("m4.2xlarge", "Double Extra Large Instance", 7000, 0, 0, 0, self)]
-
-
-class FakeGceDriver(FakeDriver):
-
-    def create_node(self, name=None,
-                    size=None,
-                    image=None,
-                    auth=None,
-                    external_ip=None,
-                    ex_metadata=None,
-                    ex_tags=None,
-                    ex_disks_gce_struct=None):
-        n = super(FakeGceDriver, self).create_node(name=name,
-                                                   size=size,
-                                                   image=image,
-                                                   auth=auth,
-                                                   ex_metadata=ex_metadata)
-        n.extra = {
-            "metadata": {
-                "items": [{"key": k, "value": v} for k,v in ex_metadata.iteritems()],
-                "arvados_node_size": size.id
-            },
-            "zone": "fake"
-        }
-        return n
-
-    def list_images(self, ex_project=None):
-        return [NodeImage("fake_image_id", "fake_image_id", self)]
-
-    def list_sizes(self, **kwargs):
-        return [NodeSize("n1-standard-1", "Standard", 3750, None, 0, 0, self),
-                NodeSize("n1-standard-2", "Double standard", 7500, None, 0, 0, self)]
-
-    def ex_list_disktypes(self, zone=None):
-        return [GCEDiskType("pd-standard", "pd-standard", zone, self,
-                            extra={"selfLink": "pd-standard"}),
-                GCEDiskType("local-ssd", "local-ssd", zone, self,
-                            extra={"selfLink": "local-ssd"})]
-
-    def ex_get_node(self, name, zone=None):
-        global all_nodes
-        for n in all_nodes:
-            if n.id == name:
-                return n
-        return None
-
-    def ex_set_node_metadata(self, n, items):
-        n.extra["metadata"]["items"] = items
diff --git a/services/nodemanager/arvnodeman/timedcallback.py b/services/nodemanager/arvnodeman/timedcallback.py
deleted file mode 100644 (file)
index e7e3f25..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import heapq
-import time
-
-import pykka
-
-from .config import actor_class
-
-class TimedCallBackActor(actor_class):
-    """Send messages to other actors on a schedule.
-
-    Other actors can call the schedule() method to schedule delivery of a
-    message at a later time.  This actor runs the necessary event loop for
-    delivery.
-    """
-    def __init__(self, max_sleep=1, timefunc=None):
-        super(TimedCallBackActor, self).__init__()
-        self._proxy = self.actor_ref.tell_proxy()
-        self.messages = []
-        self.max_sleep = max_sleep
-        if timefunc is None:
-            self._timefunc = time.time
-        else:
-            self._timefunc = timefunc
-
-    def schedule(self, delivery_time, receiver, *args, **kwargs):
-        if not self.messages:
-            self._proxy.deliver()
-        heapq.heappush(self.messages, (delivery_time, receiver, args, kwargs))
-
-    def deliver(self):
-        if not self.messages:
-            return
-        til_next = self.messages[0][0] - self._timefunc()
-        if til_next <= 0:
-            t, receiver, args, kwargs = heapq.heappop(self.messages)
-            try:
-                receiver(*args, **kwargs)
-            except pykka.ActorDeadError:
-                pass
-        else:
-            time.sleep(min(til_next, self.max_sleep))
-        self._proxy.deliver()
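
The schedule() method above is how other actors arrange delayed delivery; the daemon, for example, uses it to re-check await_shutdown one second later. A minimal usage sketch, assuming the arvnodeman package is importable and a pykka actor environment is running; the callback and delay are illustrative:

    import time
    from arvnodeman.timedcallback import TimedCallBackActor

    # Start the timer actor and get a fire-and-forget proxy, as launcher.py does.
    timer = TimedCallBackActor.start(max_sleep=1).tell_proxy()

    def report():
        print("five seconds elapsed")

    # Ask the timer actor to deliver the callback roughly five seconds from now.
    timer.schedule(time.time() + 5, report)
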
diff --git a/services/nodemanager/bin/arvados-node-manager b/services/nodemanager/bin/arvados-node-manager
deleted file mode 100755 (executable)
index 72e0831..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-from arvnodeman.launcher import main
-main()
diff --git a/services/nodemanager/doc/azure.example.cfg b/services/nodemanager/doc/azure.example.cfg
deleted file mode 100644 (file)
index 8ba6801..0000000
+++ /dev/null
@@ -1,202 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-# Azure configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-
-[Manage]
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-
-# Management server listening address (default 127.0.0.1)
-#address = 0.0.0.0
-
-# Management server port number (default -1, server is disabled)
-#port = 8989
-
-[Daemon]
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the SLURM dispatcher drains nodes
-# through SLURM before shutting them down.
-#dispatcher = slurm
-
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-
-# Upper limit on rate of spending (in $/hr), will not boot additional nodes
-# if total price of already running nodes meets or exceeds this threshold.
-# default 0 means no limit.
-max_total_price = 0
-
-# Poll Azure nodes and Arvados for new information every N seconds.
-poll_time = 60
-
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 300
-
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 600
-
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 1800
-
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-
-# Number of consecutive times a node must report as "idle" before it
-# will be considered eligible for shutdown.  Node status is checked
-# each poll period, and a node can go idle at any point during a poll
-# period (meaning a node that has only been idle for 1 second could be
-# reported as idle).  With a 60 second poll period, three consecutive
-# status updates of "idle" suggest the node has been idle at least
-# 121 seconds.
-consecutive_idle_count = 3
-
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-
-
-[Logging]
-# Log file path
-file = /var/log/arvados/node-manager.log
-
-# Log level for most Node Manager messages.
-# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = INFO
-
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-
-[Arvados]
-host = zyxwv.arvadosapi.com
-token = ARVADOS_TOKEN
-timeout = 15
-jobs_queue = yes   # Get work requests from the Arvados jobs queue (jobs API)
-slurm_queue = yes  # Get work requests from squeue (containers API)
-
-# Accept an untrusted SSL certificate from the API server?
-insecure = no
-
-[Cloud]
-provider = azure
-
-# Shutdown windows define periods of time when a node may and may not be shut
-# down.  These are windows in full minutes, separated by commas.  Counting from
-# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
-# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
-# and so on.  For example, "20, 999999" means the node may shut down between
-# the 20th and 999999th minutes of uptime.
-# Azure bills by the minute, so it makes sense to aggressively shut down idle
-# nodes.  Specify at least two windows.  You can add as many as you need beyond
-# that.
-shutdown_windows = 20, 999999
-
-[Cloud Credentials]
-# Use "azure account list" with the azure CLI to get these values.
-tenant_id = 00000000-0000-0000-0000-000000000000
-subscription_id = 00000000-0000-0000-0000-000000000000
-
-# The following directions are based on
-# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
-#
-# azure config mode arm
-# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
-# azure ad sp create "<Application_Id>"
-# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/{subscriptionId}/
-#
-# Use <Application_Id> for "key" and the <Your_Password> for "secret"
-#
-key = 00000000-0000-0000-0000-000000000000
-secret = PASSWORD
-timeout = 60
-region = East US
-
-[Cloud List]
-# The resource group in which the compute node virtual machines will be created
-# and listed.
-ex_resource_group = ArvadosResourceGroup
-
-[Cloud Create]
-# The image id, in the form "Publisher:Offer:SKU:Version"
-image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
-
-# Path to a local ssh key file that will be used to provision new nodes.
-ssh_key = /home/arvadosuser/.ssh/id_rsa.pub
-
-# The account name for the admin user that will be provisioned on new nodes.
-ex_user_name = arvadosuser
-
-# The Azure storage account that will be used to store the node OS disk images.
-ex_storage_account = arvadosstorage
-
-# The virtual network the VMs will be associated with.
-ex_network = ArvadosNetwork
-
-# Optional subnet of the virtual network.
-#ex_subnet = default
-
-# Node tags
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-
-# the API server to ping
-ping_host = hostname:port
-
-# You can define any number of Size sections to list Azure sizes you're willing
-# to use.  The Node Manager should boot the cheapest size(s) that can run jobs
-# in the queue.  You must also provide the price per hour, as the Azure
-# compute driver currently does not report prices.
-#
-# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
-# for a list of known machine types that may be used as a Size parameter.
-#
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.  You can also override Microsoft's provided
-# data fields by setting them here.
-
-[Size Standard_D3]
-cores = 4
-price = 0.56
-
-[Size Standard_D4]
-cores = 8
-price = 1.12
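
The shutdown_windows setting above describes alternating WILL-NOT / MAY shut-down
periods counted in minutes from boot, repeating once the last window ends (the
EC2 example below spells this out as "each hour of uptime").  A minimal Python
sketch of that interpretation; the function name and sample calls are
illustrative assumptions, not Node Manager's actual implementation.

    # Sketch: decide whether a node's uptime falls in a MAY-shut-down window.
    def may_shut_down(uptime_minutes, windows):
        t = uptime_minutes % sum(windows)
        in_may_window = False            # the first window is always WILL NOT
        for w in windows:
            if t < w:
                return in_may_window
            t -= w
            in_may_window = not in_may_window
        return in_may_window

    print(may_shut_down(19, [20, 999999]))   # False: still in the first window
    print(may_shut_down(25, [20, 999999]))   # True: eligible for shutdown
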
diff --git a/services/nodemanager/doc/ec2.example.cfg b/services/nodemanager/doc/ec2.example.cfg
deleted file mode 100644 (file)
index 3bc905b..0000000
+++ /dev/null
@@ -1,205 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-# EC2 configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-
-[Manage]
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-
-# Management server listening address (default 127.0.0.1)
-#address = 0.0.0.0
-
-# Management server port number (default -1, server is disabled)
-#port = 8989
-
-[Daemon]
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the SLURM dispatcher drains nodes
-# through SLURM before shutting them down.
-#dispatcher = slurm
-
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes.
-min_nodes = 0
-
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-
-# Upper limit on the rate of spending (in $/hr); Node Manager will not boot
-# additional nodes if the total price of the already running nodes meets or
-# exceeds this threshold.  The default of 0 means no limit.
-max_total_price = 0
-
-# Poll EC2 nodes and Arvados for new information every N seconds.
-poll_time = 60
-
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 300
-
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 600
-
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 1800
-
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-
-# Number of consecutive times a node must report as "idle" before it
-# will be considered eligible for shutdown.  Node status is checked
-# each poll period, and a node can go idle at any point during a poll
-# period (meaning a node could be reported as idle when it has only been
-# idle for 1 second).  With a 60 second poll period, three consecutive
-# status updates of "idle" suggest the node has been idle for at least
-# 121 seconds.
-consecutive_idle_count = 3
-
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-
-[Logging]
-# Log file path
-file = /var/log/arvados/node-manager.log
-
-# Log level for most Node Manager messages.
-# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = INFO
-
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-
-[Arvados]
-host = zyxwv.arvadosapi.com
-token = ARVADOS_TOKEN
-timeout = 15
-jobs_queue = yes   # Get work request from Arvados jobs queue (jobs API)
-slurm_queue = yes  # Get work request from squeue (containers API)
-
-# Accept an untrusted SSL certificate from the API server?
-insecure = no
-
-[Cloud]
-provider = ec2
-
-# It's usually most cost-effective to shut down compute nodes during narrow
-# windows of time.  For example, EC2 bills each node by the hour, so the best
-# time to shut down a node is right before a new hour of uptime starts.
-# Shutdown windows define these periods of time.  These are windows in
-# full minutes, separated by commas.  Counting from the time the node is
-# booted, the node WILL NOT shut down for N1 minutes; then it MAY shut down
-# for N2 minutes; then it WILL NOT shut down for N3 minutes; and so on.
-# For example, "54, 5, 1" means the node may shut down from the 54th to the
-# 59th minute of each hour of uptime.
-# Specify at least two windows.  You can add as many as you need beyond that.
-shutdown_windows = 54, 5, 1
-
-[Cloud Credentials]
-key = KEY
-secret = SECRET_KEY
-region = us-east-1
-timeout = 60
-
-[Cloud List]
-# This section defines filters that find compute nodes.
-# Tags that you specify here will automatically be added to nodes you create.
-# Replace colons in Amazon filters with underscores
-# (e.g., write "tag:mytag" as "tag_mytag").
-instance-state-name = running
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-
-[Cloud Create]
-# New compute nodes will send pings to Arvados at this host.
-# You may specify a port, and use brackets to disambiguate IPv6 addresses.
-ping_host = hostname:port
-
-# Give the name of an SSH key on AWS...
-ex_keyname = string
-
-# ... or a file path for an SSH key that can log in to the compute node.
-# (One or the other, not both.)
-# ssh_key = path
-
-# The EC2 IDs of the image and subnet compute nodes should use.
-image_id = idstring
-subnet_id = idstring
-
-# Comma-separated EC2 IDs for the security group(s) assigned to each
-# compute node.
-security_groups = idstring1, idstring2
-
-# Apply an Instance Profile ARN to the newly created compute nodes
-# For more info, see:
-# https://aws.amazon.com/premiumsupport/knowledge-center/iam-policy-restrict-vpc/
-# ex_iamprofile = arn:aws:iam::ACCOUNTNUMBER:instance-profile/ROLENAME
-
-
-# You can define any number of Size sections to list EC2 sizes you're
-# willing to use.  The Node Manager should boot the cheapest size(s) that
-# can run jobs in the queue.
-#
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in MB) for Crunch jobs.  You can also override Amazon's provided
-# data fields (such as price per hour) by setting them here.
-#
-# Additionally, you can ask for a preemptible instance (AWS's spot instance)
-# by adding the appropriate boolean configuration flag. If you want to have
-# both spot & reserved versions of the same size, you can do so by renaming
-# the Size section and specifying the instance type inside it.
-
-# 100 GB scratch space
-[Size m4.large]
-cores = 2
-price = 0.126
-scratch = 100000
-
-# 10 GB scratch space
-[Size m4.large.spot]
-instance_type = m4.large
-preemptible = true
-cores = 2
-price = 0.126
-scratch = 10000
-
-# 200 GB scratch space
-[Size m4.xlarge]
-cores = 4
-price = 0.252
-scratch = 200000
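
The Size sections above drive a "boot the cheapest size(s) that can run the
queued jobs" policy, with .spot sections marking preemptible variants.  A
minimal sketch of such a selection rule using the example m4 sizes; the
dictionary layout, function name, and the exclusion of preemptible sizes by
default are assumptions made for illustration, not Node Manager's actual code.

    # Sketch: pick the cheapest size whose cores and scratch satisfy a request.
    SIZES = {
        "m4.large":      {"cores": 2, "price": 0.126, "scratch": 100000},
        "m4.large.spot": {"cores": 2, "price": 0.126, "scratch": 10000,
                          "preemptible": True},
        "m4.xlarge":     {"cores": 4, "price": 0.252, "scratch": 200000},
    }

    def cheapest_size(min_cores, min_scratch_mb, allow_preemptible=False):
        candidates = [
            (spec["price"], name) for name, spec in SIZES.items()
            if spec["cores"] >= min_cores
            and spec["scratch"] >= min_scratch_mb
            and (allow_preemptible or not spec.get("preemptible"))
        ]
        return min(candidates)[1] if candidates else None

    print(cheapest_size(2, 50000))    # m4.large
    print(cheapest_size(4, 150000))   # m4.xlarge
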
diff --git a/services/nodemanager/doc/gce.example.cfg b/services/nodemanager/doc/gce.example.cfg
deleted file mode 100644 (file)
index acd3fd1..0000000
+++ /dev/null
@@ -1,187 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-# Google Compute Engine configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-
-[Manage]
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-
-# Management server listening address (default 127.0.0.1)
-#address = 0.0.0.0
-
-# Management server port number (default -1, server is disabled)
-#port = 8989
-
-[Daemon]
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes.
-min_nodes = 0
-
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-
-# Poll compute nodes and Arvados for new information every N seconds.
-poll_time = 60
-
-# Upper limit on the rate of spending (in $/hr); Node Manager will not boot
-# additional nodes if the total price of the already running nodes meets or
-# exceeds this threshold.  The default of 0 means no limit.
-max_total_price = 0
-
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 300
-
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 600
-
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-
-# Number of consecutive times a node must report as "idle" before it
-# will be considered eligible for shutdown.  Node status is checked
-# each poll period, and a node can go idle at any point during a poll
-# period (meaning a node could be reported as idle when it has only been
-# idle for 1 second).  With a 60 second poll period, three consecutive
-# status updates of "idle" suggest the node has been idle for at least
-# 121 seconds.
-consecutive_idle_count = 3
-
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-
-[Logging]
-# Log file path
-file = /var/log/arvados/node-manager.log
-
-# Log level for most Node Manager messages.
-# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = INFO
-
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-
-[Arvados]
-host = zyxwv.arvadosapi.com
-token = ARVADOS_TOKEN
-timeout = 15
-jobs_queue = yes   # Get work request from Arvados jobs queue (jobs API)
-slurm_queue = yes  # Get work request from squeue (containers API)
-
-# Accept an untrusted SSL certificate from the API server?
-insecure = no
-
-[Cloud]
-provider = gce
-
-# Shutdown windows define periods of time when a node may and may not
-# be shut down.  These are windows in full minutes, separated by
-# commas.  Counting from the time the node is booted, the node WILL
-# NOT shut down for N1 minutes; then it MAY shut down for N2 minutes;
-# then it WILL NOT shut down for N3 minutes; and so on.  For example,
-# "54, 5, 1" means the node may shut down from the 54th to the 59th
-# minute of each hour of uptime.
-# GCE bills by the minute, and does not provide information about when
-# a node booted.  Node Manager will store this information in metadata
-# when it boots a node; if that information is not available, it will
-# assume the node booted at the epoch.  These shutdown settings are
-# very aggressive.  You may want to adjust this if you want more
-# continuity of service from a single node.
-shutdown_windows = 20, 999999
-
-[Cloud Credentials]
-user_id = client_email_address@developer.gserviceaccount.com
-key = path_to_certificate.pem
-project = project-id-from-google-cloud-dashboard
-timeout = 60
-
-# Valid location (zone) names: https://cloud.google.com/compute/docs/zones
-datacenter = us-central1-a
-
-# Optional settings. For full documentation see
-# http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#libcloud.compute.drivers.gce.GCENodeDriver
-#
-# auth_type = SA               # SA, IA or GCE
-# scopes = https://www.googleapis.com/auth/compute
-# credential_file =
-
-[Cloud List]
-# A comma-separated list of tags that must be applied to a node for it to
-# be considered a compute node.
-# The driver will automatically apply these tags to nodes it creates.
-tags = zyxwv, compute
-
-[Cloud Create]
-# New compute nodes will send pings to Arvados at this host.
-# You may specify a port, and use brackets to disambiguate IPv6 addresses.
-ping_host = hostname:port
-
-# A file path for an SSH key that can log in to the compute node.
-# ssh_key = path
-
-# The GCE image name and network zone name to use when creating new nodes.
-image = debian-7
-# network = your_network_name
-
-# JSON string of service account authorizations for this cluster.
-# See http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#specifying-service-account-scopes
-# service_accounts = [{'email':'account@example.com', 'scopes':['storage-ro']}]
-
-
-# You can define any number of Size sections to list node sizes you're
-# willing to use.  The Node Manager should boot the cheapest size(s) that
-# can run jobs in the queue.
-#
-# The Size fields are interpreted the same way as with a libcloud NodeSize:
-# http://libcloud.readthedocs.org/en/latest/compute/api.html#libcloud.compute.base.NodeSize
-#
-# See https://cloud.google.com/compute/docs/machine-types for a list
-# of known machine types that may be used as a Size parameter.
-#
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.
-# You can also override Google's provided data fields (such as price per hour)
-# by setting them here.
-
-[Size n1-standard-2]
-cores = 2
-price = 0.076
-scratch = 100
-
-[Size n1-standard-4]
-cores = 4
-price = 0.152
-scratch = 200
\ No newline at end of file
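
The consecutive_idle_count comments in these configs rest on a small piece of
arithmetic: the first "idle" report may arrive when a node has been idle for as
little as one second, and each further report adds one full poll period.  A
one-function sketch of that calculation (the function name is hypothetical):

    # Sketch: minimum idle time implied by consecutive_idle_count.
    def min_idle_seconds(consecutive_idle_count, poll_time):
        return 1 + (consecutive_idle_count - 1) * poll_time

    print(min_idle_seconds(3, 60))   # 121, matching the "121 seconds" comment
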
diff --git a/services/nodemanager/doc/local.example.cfg b/services/nodemanager/doc/local.example.cfg
deleted file mode 100644 (file)
index 1221775..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-# You can use this configuration to run a development Node Manager for
-# testing.  It uses libcloud's dummy driver and your own development API server.
-# When new cloud nodes are created, you'll need to simulate the ping that
-# they send to the Arvados API server.  The easiest way I've found to do that
-# is through the API server Rails console: load the Node object, set its
-# IP address to 10.10.0.N (where N is the cloud node's ID), and save.
-
-[Manage]
-address = 0.0.0.0
-port = 8989
-
-[Daemon]
-min_nodes = 0
-max_nodes = 8
-poll_time = 15
-max_poll_time = 60
-poll_stale_after = 600
-node_stale_after = 300
-certs_file = /etc/ssl/certs/ca-certificates.crt
-
-[Logging]
-level = DEBUG
-pykka = DEBUG
-apiclient = WARNING
-
-[Arvados]
-host = localhost:3030
-# This is the token for the text fixture's admin user.
-token = 4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h
-insecure = yes
-timeout = 15
-
-[Cloud]
-provider = dummy
-shutdown_windows = 1, 1
-timeout = 15
-
-[Cloud Credentials]
-creds = dummycreds
-
-[Cloud List]
-[Cloud Create]
-
-[Size 2]
-cores = 4
-scratch = 1234
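
local.example.cfg above suggests simulating a new dummy node's ping from the
API server Rails console.  The same idea can be sketched with the Arvados
Python SDK; the exact attribute layout accepted by nodes().update() is an
assumption here, so treat this as an illustration rather than a verified
recipe.

    # Sketch: mark dummy cloud node N as "pinged" by giving its Arvados node
    # record the IP address 10.10.0.N (here N = 1).
    import arvados

    api = arvados.api('v1')
    node = api.nodes().list(limit=1).execute()['items'][0]
    api.nodes().update(uuid=node['uuid'],
                       body={'ip_address': '10.10.0.1'}).execute()
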
diff --git a/services/nodemanager/fpm-info.sh b/services/nodemanager/fpm-info.sh
deleted file mode 100644 (file)
index c4a9dbb..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-case "$TARGET" in
-    debian* | ubuntu*)
-        fpm_depends+=(libcurl3-gnutls libpython2.7)
-        ;;
-esac
diff --git a/services/nodemanager/gittaggers.py b/services/nodemanager/gittaggers.py
deleted file mode 120000 (symlink)
index a9ad861..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../sdk/python/gittaggers.py
\ No newline at end of file
diff --git a/services/nodemanager/setup.py b/services/nodemanager/setup.py
deleted file mode 100644 (file)
index 75e8f85..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import
-import os
-import sys
-import re
-
-from setuptools import setup, find_packages
-
-SETUP_DIR = os.path.dirname(__file__) or '.'
-README = os.path.join(SETUP_DIR, 'README.rst')
-
-import arvados_version
-version = arvados_version.get_version(SETUP_DIR, "arvnodeman")
-if os.environ.get('ARVADOS_BUILDING_VERSION', False):
-    pysdk_dep = "=={}".format(version)
-else:
-    # On dev releases, arvados-python-client may have a different timestamp
-    pysdk_dep = "<={}".format(version)
-
-short_tests_only = False
-if '--short-tests-only' in sys.argv:
-    short_tests_only = True
-    sys.argv.remove('--short-tests-only')
-
-setup(name='arvados-node-manager',
-      version=version,
-      description='Arvados compute node manager',
-      long_description=open(README).read(),
-      author='Arvados',
-      author_email='info@arvados.org',
-      url="https://arvados.org",
-      license='GNU Affero General Public License, version 3.0',
-      packages=find_packages(),
-      scripts=['bin/arvados-node-manager'],
-      data_files=[
-          ('share/doc/arvados-node-manager', ['agpl-3.0.txt', 'README.rst', 'arvados-node-manager.service']),
-      ],
-      install_requires=[
-          'apache-libcloud==2.5.0', # 2.6.0 cannot create azure nodes, #15649
-          'arvados-python-client{}'.format(pysdk_dep),
-          'future',
-          'pykka < 2',
-          'python-daemon',
-          'setuptools',
-          'subprocess32>=3.5.1',
-      ],
-      test_suite='tests',
-      tests_require=[
-          'requests',
-          'pbr<1.7.0',
-          'mock>=1.0',
-          'apache-libcloud==2.5.0',
-          'subprocess32>=3.5.1',
-      ],
-      zip_safe=False,
-)
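
The pysdk_dep logic in setup.py above switches between an exact pin and an
upper bound on arvados-python-client depending on ARVADOS_BUILDING_VERSION.  A
minimal sketch of the requirement string it produces; the helper function and
version value are hypothetical.

    # Sketch: requirement string produced by the pysdk_dep logic.
    def pysdk_requirement(version, building_official_version):
        op = "==" if building_official_version else "<="
        return "arvados-python-client{}{}".format(op, version)

    print(pysdk_requirement("1.4.1", True))    # arvados-python-client==1.4.1
    print(pysdk_requirement("1.4.1", False))   # arvados-python-client<=1.4.1
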
diff --git a/services/nodemanager/tests/__init__.py b/services/nodemanager/tests/__init__.py
deleted file mode 100644 (file)
index 20e02f9..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-import logging
-import os
-
-# Set the ANMTEST_LOGLEVEL environment variable to enable logging at that level.
-loglevel = os.environ.get('ANMTEST_LOGLEVEL', 'CRITICAL')
-logging.basicConfig(level=getattr(logging, loglevel.upper()))
-
-# Set the ANMTEST_TIMEOUT environment variable to the maximum amount of time to
-# wait for tested actors to respond to important messages.  The default value
-# is very conservative, because a small value may produce false negatives on
-# slower systems.  If you're debugging a known timeout issue, however, you may
-# want to set this lower to speed up tests.
-pykka_timeout = int(os.environ.get('ANMTEST_TIMEOUT', '10'))
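
The pykka_timeout defined above bounds how long the tests wait on actor
replies.  A minimal sketch of that pattern with a stand-in actor; the Echo
class is written for this example and is not one of Node Manager's actors.

    # Sketch: wait on a pykka actor reply with an upper bound taken from
    # ANMTEST_TIMEOUT (default 10 seconds).
    import os
    import pykka

    pykka_timeout = int(os.environ.get('ANMTEST_TIMEOUT', '10'))

    class Echo(pykka.ThreadingActor):
        def on_receive(self, message):
            return message

    actor = Echo.start()
    try:
        print(actor.ask({'ping': True}, timeout=pykka_timeout))
    finally:
        actor.stop()
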
diff --git a/services/nodemanager/tests/fake_azure.cfg.template b/services/nodemanager/tests/fake_azure.cfg.template
deleted file mode 100644 (file)
index e5deac8..0000000
+++ /dev/null
@@ -1,194 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-# Azure configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-
-[Manage]
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-
-# Management server listening address (default 127.0.0.1)
-address = 0.0.0.0
-
-# Management server port number (default -1, server is disabled)
-port = 8989
-
-ManagementToken = xxx
-
-[Daemon]
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the SLURM dispatcher drains nodes
-# through SLURM before shutting them down.
-#dispatcher = slurm
-
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes.
-min_nodes = 0
-
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-
-# Upper limit on the rate of spending (in $/hr); Node Manager will not boot
-# additional nodes if the total price of the already running nodes meets or
-# exceeds this threshold.  The default of 0 means no limit.
-max_total_price = 0
-
-# Poll Azure nodes and Arvados for new information every N seconds.
-poll_time = 0.5
-
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 1
-
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 1
-
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 45
-
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-
-[Logging]
-# Log file path
-#file = node-manager.log
-
-# Log level for most Node Manager messages.
-# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = DEBUG
-
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-
-[Arvados]
-host = {host}
-token = {token}
-timeout = 15
-jobs_queue = no
-
-# Accept an untrusted SSL certificate from the API server?
-insecure = yes
-
-[Cloud]
-provider = azure
-driver_class = {driver_class}
-
-# Shutdown windows define periods of time when a node may and may not be shut
-# down.  These are windows in full minutes, separated by commas.  Counting from
-# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
-# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
-# and so on.  For example, "20, 999999" means the node may shut down between
-# the 20th and 999999th minutes of uptime.
-# Azure bills by the minute, so it makes sense to aggressively shut down idle
-# nodes.  Specify at least two windows.  You can add as many as you need beyond
-# that.
-shutdown_windows = 0.05, 999999
-
-[Cloud Credentials]
-# Use "azure account list" with the azure CLI to get these values.
-tenant_id = 00000000-0000-0000-0000-000000000000
-subscription_id = 00000000-0000-0000-0000-000000000000
-
-# The following directions are based on
-# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
-#
-# azure config mode arm
-# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YourApplicationUri>" --password <Your_Password>
-# azure ad sp create "<Application_Id>"
-# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/<subscriptionId>/
-#
-# Use <Application_Id> for "key" and the <Your_Password> for "secret"
-#
-key = 00000000-0000-0000-0000-000000000000
-secret = PASSWORD
-timeout = 60
-region = East US
-
-[Cloud List]
-# The resource group in which the compute node virtual machines will be created
-# and listed.
-ex_resource_group = ArvadosResourceGroup
-
-[Cloud Create]
-# The image id, in the form "Publisher:Offer:SKU:Version"
-image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
-
-# Path to a local ssh key file that will be used to provision new nodes.
-ssh_key = {ssh_key}
-
-# The account name for the admin user that will be provisioned on new nodes.
-ex_user_name = arvadosuser
-
-# The Azure storage account that will be used to store the node OS disk images.
-ex_storage_account = arvadosstorage
-
-# The virtual network the VMs will be associated with.
-ex_network = ArvadosNetwork
-
-# Optional subnet of the virtual network.
-#ex_subnet = default
-
-# Node tags
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-
-# the API server to ping
-ping_host = {host}
-
-# You can define any number of Size sections to list Azure sizes you're willing
-# to use.  The Node Manager should boot the cheapest size(s) that can run jobs
-# in the queue.  You must also provide the price per hour, as the Azure
-# compute driver currently does not report prices.
-#
-# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
-# for a list of known machine types that may be used as a Size parameter.
-#
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.  You can also override Microsoft's provided
-# data fields by setting them here.
-
-[Size Standard_D3]
-cores = 4
-price = 0.56
-
-[Size Standard_D4]
-cores = 8
-price = 1.12
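
The {host}, {token}, {driver_class} and {ssh_key} placeholders in this template
are filled in by the integration test later in this diff, which runs
str.format() over the whole file.  A minimal illustration with made-up values:

    # Sketch: how the integration test renders a *.cfg.template file.
    template = "[Arvados]\nhost = {host}\ntoken = {token}\n"
    print(template.format(host="zzzzz.arvadosapi.com", token="exampletoken"))
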
diff --git a/services/nodemanager/tests/fake_ec2.cfg.template b/services/nodemanager/tests/fake_ec2.cfg.template
deleted file mode 100644 (file)
index 2bb7d0e..0000000
+++ /dev/null
@@ -1,162 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-# EC2 configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-
-[Manage]
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-
-# Management server listening address (default 127.0.0.1)
-#address = 0.0.0.0
-
-# Management server port number (default -1, server is disabled)
-#port = 8989
-
-[Daemon]
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the SLURM dispatcher drains nodes
-# through SLURM before shutting them down.
-#dispatcher = slurm
-
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-
-# Upper limit on the rate of spending (in $/hr); Node Manager will not boot
-# additional nodes if the total price of the already running nodes meets or
-# exceeds this threshold.  The default of 0 means no limit.
-max_total_price = 0
-
-# Poll EC2 nodes and Arvados for new information every N seconds.
-poll_time = 0.5
-
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 1
-
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 1
-
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 45
-
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-
-[Logging]
-# Log file path
-#file = node-manager.log
-
-# Log level for most Node Manager messages.
-# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = DEBUG
-
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-
-[Arvados]
-host = {host}
-token = {token}
-timeout = 15
-jobs_queue = no
-
-# Accept an untrusted SSL certificate from the API server?
-insecure = yes
-
-[Cloud]
-provider = ec2
-driver_class = {driver_class}
-
-# Shutdown windows define periods of time when a node may and may not be shut
-# down.  These are windows in full minutes, separated by commas.  Counting from
-# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
-# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
-# and so on.  For example, "20, 999999" means the node may shut down between
-# the 20th and 999999th minutes of uptime.
-# Azure bills by the minute, so it makes sense to aggressively shut down idle
-# nodes.  Specify at least two windows.  You can add as many as you need beyond
-# that.
-shutdown_windows = 0.05, 999999
-
-[Cloud Credentials]
-
-key = 00000000-0000-0000-0000-000000000000
-secret = PASSWORD
-timeout = 60
-region = East US
-
-[Cloud List]
-
-[Cloud Create]
-# The image id
-image = fake_image_id
-
-# Path to a local ssh key file that will be used to provision new nodes.
-ssh_key = {ssh_key}
-
-# the API server to ping
-ping_host = {host}
-
-# You can define any number of Size sections to list Azure sizes you're willing
-# to use.  The Node Manager should boot the cheapest size(s) that can run jobs
-# in the queue.  You must also provide the price per hour, as the Azure
-# compute driver currently does not report prices.
-#
-# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
-# for a list of known machine types that may be used as a Size parameter.
-#
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.  You can also override Microsoft's provided
-# data fields by setting them here.
-
-[Size m4.xlarge]
-cores = 4
-price = 0.56
-scratch = 250
-
-[Size m4.2xlarge]
-cores = 8
-price = 1.12
-scratch = 500
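
The poll settings in these templates describe exponential backoff capped at
max_poll_time when a polled service stops responding.  A minimal sketch of one
such backoff rule, using the values from the example configs earlier in this
diff (poll_time=60, max_poll_time=300); the doubling factor is an assumption,
only the cap comes from the config comments.

    # Sketch: delay before the next poll after N consecutive failures.
    def next_poll_delay(poll_time, max_poll_time, consecutive_failures):
        return min(poll_time * 2 ** consecutive_failures, max_poll_time)

    print([next_poll_delay(60, 300, n) for n in range(5)])
    # [60, 120, 240, 300, 300]
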
diff --git a/services/nodemanager/tests/fake_gce.cfg.template b/services/nodemanager/tests/fake_gce.cfg.template
deleted file mode 100644 (file)
index 11131ef..0000000
+++ /dev/null
@@ -1,159 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-# Google Compute Engine configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-
-[Manage]
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-
-# Management server listening address (default 127.0.0.1)
-#address = 0.0.0.0
-
-# Management server port number (default -1, server is disabled)
-#port = 8989
-
-[Daemon]
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the SLURM dispatcher drains nodes
-# through SLURM before shutting them down.
-#dispatcher = slurm
-
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-
-# Upper limit on the rate of spending (in $/hr); Node Manager will not boot
-# additional nodes if the total price of the already running nodes meets or
-# exceeds this threshold.  The default of 0 means no limit.
-max_total_price = 0
-
-# Poll GCE nodes and Arvados for new information every N seconds.
-poll_time = 0.5
-
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 1
-
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 1
-
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 45
-
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-
-[Logging]
-# Log file path
-#file = node-manager.log
-
-# Log level for most Node Manager messages.
-# Choose one of DEBUG, INFO, WARNING, ERROR, or CRITICAL.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = DEBUG
-
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-
-[Arvados]
-host = {host}
-token = {token}
-timeout = 15
-jobs_queue = no
-
-# Accept an untrusted SSL certificate from the API server?
-insecure = yes
-
-[Cloud]
-provider = gce
-driver_class = {driver_class}
-
-# Shutdown windows define periods of time when a node may and may not be shut
-# down.  These are windows in full minutes, separated by commas.  Counting from
-# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
-# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
-# and so on.  For example, "20, 999999" means the node may shut down between
-# the 20th and 999999th minutes of uptime.
-# Azure bills by the minute, so it makes sense to aggressively shut down idle
-# nodes.  Specify at least two windows.  You can add as many as you need beyond
-# that.
-shutdown_windows = 0.05, 999999
-
-[Cloud Credentials]
-key = 00000000-0000-0000-0000-000000000000
-secret = PASSWORD
-timeout = 60
-region = East US
-
-[Cloud List]
-
-[Cloud Create]
-# The image id
-image = fake_image_id
-
-# Path to a local ssh key file that will be used to provision new nodes.
-ssh_key = {ssh_key}
-
-# the API server to ping
-ping_host = {host}
-
-# You can define any number of Size sections to list Azure sizes you're willing
-# to use.  The Node Manager should boot the cheapest size(s) that can run jobs
-# in the queue.  You must also provide the price per hour, as the Azure
-# compute driver currently does not report prices.
-#
-# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
-# for a list of known machine types that may be used as a Size parameter.
-#
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.  You can also override Microsoft's provided
-# data fields by setting them here.
-
-[Size n1-standard-1]
-cores = 1
-price = 0.56
-
-[Size n1-standard-2]
-cores = 2
-price = 1.12
\ No newline at end of file
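
max_total_price appears throughout these configs as a spending cap: no new
nodes are booted once the combined hourly price of running nodes meets or
exceeds it, and 0 means no limit.  A minimal sketch of that gate (the function
is hypothetical, not Node Manager's actual accounting):

    # Sketch: decide whether another node may be booted under max_total_price.
    def can_boot(running_node_prices, max_total_price):
        if not max_total_price:          # default 0 means no limit
            return True
        return sum(running_node_prices) < max_total_price

    print(can_boot([0.56, 0.56], 0))     # True: no limit configured
    print(can_boot([0.56, 0.56], 1.0))   # False: already spending $1.12/hr
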
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
deleted file mode 100755 (executable)
index 1ba2957..0000000
+++ /dev/null
@@ -1,494 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-"""Integration test framework for node manager.
-
-Runs full node manager with an API server (needs ARVADOS_API_HOST and
-ARVADOS_API_TOKEN).  Stubs out the cloud driver and slurm commands to mock
-specific behaviors.  Monitors the log output to verify an expected sequence of
-events or behaviors for each test.
-
-"""
-
-import subprocess32 as subprocess
-import os
-import sys
-import re
-import time
-import logging
-import stat
-import tempfile
-import shutil
-import errno
-from functools import partial
-import arvados
-import StringIO
-
-formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
-
-handler = logging.StreamHandler(sys.stderr)
-handler.setFormatter(formatter)
-logger = logging.getLogger("logger")
-logger.setLevel(logging.INFO)
-logger.addHandler(handler)
-
-detail = logging.getLogger("detail")
-detail.setLevel(logging.INFO)
-if os.environ.get("ANMTEST_LOGLEVEL"):
-    detail_content = sys.stderr
-else:
-    detail_content = StringIO.StringIO()
-handler = logging.StreamHandler(detail_content)
-handler.setFormatter(formatter)
-detail.addHandler(handler)
-
-fake_slurm = None
-compute_nodes = None
-all_jobs = None
-unsatisfiable_job_scancelled = None
-
-def update_script(path, val):
-    with open(path+"_", "w") as f:
-        f.write(val)
-    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
-    os.rename(path+"_", path)
-    detail.info("Update script %s: %s", path, val)
-
-def set_squeue(g):
-    global all_jobs
-    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
-    return 0
-
-def set_queue_unsatisfiable(g):
-    global all_jobs, unsatisfiable_job_scancelled
-    # Simulate a job requesting a 99 core node.
-    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
-    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
-                  "\ntouch %s" % unsatisfiable_job_scancelled)
-    return 0
-
-def job_cancelled(g):
-    global unsatisfiable_job_scancelled
-    cancelled_job = g.group(1)
-    api = arvados.api('v1')
-    # Check that 'scancel' was called
-    if not os.path.isfile(unsatisfiable_job_scancelled):
-        return 1
-    # Check for the log entry
-    log_entry = api.logs().list(
-        filters=[
-            ['object_uuid', '=', cancelled_job],
-            ['event_type', '=', 'stderr'],
-        ]).execute()['items'][0]
-    if not re.match(
-            r"Constraints cannot be satisfied",
-            log_entry['properties']['text']):
-        return 1
-    return 0
-
-def node_paired(g):
-    global compute_nodes
-    compute_nodes[g.group(1)] = g.group(3)
-
-    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))
-
-    for k,v in all_jobs.items():
-        if v == "ReqNodeNotAvail":
-            all_jobs[k] = "Running"
-            break
-
-    set_squeue(g)
-
-    return 0
-
-def node_busy(g):
-    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
-    return 0
-
-def node_shutdown(g):
-    global compute_nodes
-    if g.group(1) in compute_nodes:
-        del compute_nodes[g.group(1)]
-        return 0
-    else:
-        return 1
-
-
-def jobs_req(g):
-    global all_jobs
-    for k,v in all_jobs.items():
-        all_jobs[k] = "ReqNodeNotAvail"
-    set_squeue(g)
-    return 0
-
-def noop(g):
-    return 0
-
-def fail(checks, pattern, g):
-    return 1
-
-def expect_count(count, checks, pattern, g):
-    if count == 0:
-        return 1
-    else:
-        checks[pattern] = partial(expect_count, count-1)
-        return 0
-
-def run_test(name, actions, checks, driver_class, jobs, provider):
-    code = 0
-    global unsatisfiable_job_scancelled
-    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
-                                                "scancel_called")
-
-    # Delete any stale node records
-    api = arvados.api('v1')
-    for n in api.nodes().list().execute()['items']:
-        api.nodes().delete(uuid=n["uuid"]).execute()
-
-    logger.info("Start %s", name)
-
-    global fake_slurm
-    fake_slurm = tempfile.mkdtemp()
-    detail.info("fake_slurm is %s", fake_slurm)
-
-    global compute_nodes
-    compute_nodes = {}
-
-    global all_jobs
-    all_jobs = jobs
-
-    env = os.environ.copy()
-    env["PATH"] = fake_slurm + ":" + env["PATH"]
-
-    # Reset fake squeue/sinfo to empty
-    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
-    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
-
-    # Write configuration file for test
-    with open("tests/fake_%s.cfg.template" % provider) as f:
-        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
-        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
-            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
-                                      token=os.environ["ARVADOS_API_TOKEN"],
-                                      driver_class=driver_class,
-                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
-
-    # Tests must complete in less than 30 seconds.
-    timeout = time.time() + 30
-    terminated = False
-
-    # Now start node manager
-    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
-                         bufsize=0, stderr=subprocess.PIPE, env=env)
-
-    # Test main loop:
-    # - Read line
-    # - Apply negative checks (things that are not supposed to happen)
-    # - Check timeout
-    # - Check if the next action should trigger
-    # - If all actions are exhausted, terminate with test success
-    # - If it hits timeout with actions remaining, terminate with test failed
-    try:
-        # naive line iteration over pipes gets buffered, which isn't what we want,
-        # see https://bugs.python.org/issue3907
-        for line in iter(p.stderr.readline, ""):
-            detail_content.write(line)
-
-            for k,v in checks.items():
-                g = re.match(k, line)
-                if g:
-                    detail.info("Matched check %s", k)
-                    code += v(checks, k, g)
-                    if code != 0:
-                        detail.error("Check failed")
-                        if not terminated:
-                            p.kill()
-                            terminated = True
-
-            if terminated:
-                continue
-
-            if time.time() > timeout:
-                detail.error("Exceeded timeout with actions remaining: %s", actions)
-                code += 1
-                if not terminated:
-                    p.kill()
-                    terminated = True
-
-            k, v = actions[0]
-            g = re.match(k, line)
-            if g:
-                detail.info("Matched action %s", k)
-                actions.pop(0)
-                code += v(g)
-                if code != 0:
-                    detail.error("Action failed")
-                    p.kill()
-                    terminated = True
-
-            if not actions:
-                p.kill()
-                terminated = True
-    except KeyboardInterrupt:
-        p.kill()
-
-    if actions:
-        detail.error("Ended with remaining actions: %s", actions)
-        code = 1
-
-    shutil.rmtree(fake_slurm)
-    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
-
-    if code == 0:
-        logger.info("%s passed", name)
-    else:
-        if isinstance(detail_content, StringIO.StringIO):
-            detail_content.seek(0)
-            chunk = detail_content.read(4096)
-            while chunk:
-                try:
-                    sys.stderr.write(chunk)
-                    chunk = detail_content.read(4096)
-                except IOError as e:
-                    if e.errno == errno.EAGAIN:
-                        # try again (probably pipe buffer full)
-                        pass
-                    else:
-                        raise
-        logger.info("%s failed", name)
-
-    return code
-
-
-def main():
-    # Test lifecycle.
-
-    tests = {
-        "test_unsatisfiable_jobs" : (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_queue_unsatisfiable),
-                (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
-                r".*Trying to cancel job '(\S+)'": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_single_node_azure": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
-                r".*Setting node quota.*": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_multiple_nodes": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
-                r".*Setting node quota.*": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_hit_quota": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
-                r".*Sending create_node request.*": partial(expect_count, 5)
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.QuotaDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_probe_quota": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*sending request", jobs_req),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
-                r".*Sending create_node request.*": partial(expect_count, 9)
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.QuotaDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_no_hang_failing_node_create": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Client error: nope", noop),
-                (r".*Client error: nope", noop),
-                (r".*Client error: nope", noop),
-                (r".*Client error: nope", noop),
-            ],
-            # Checks (things that shouldn't happen)
-            {},
-            # Driver class
-            "arvnodeman.test.fake_driver.FailingDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_retry_create": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
-                (r".*Rate limit exceeded - scheduling retry in 1 seconds", noop),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
-            ],
-            # Checks (things that shouldn't happen)
-            {},
-            # Driver class
-            "arvnodeman.test.fake_driver.RetryDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_single_node_aws": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
-                r".*Setting node quota.*": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeAwsDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
-            # Provider
-            "ec2"),
-        "test_single_node_gce": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
-                r".*Setting node quota.*": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeGceDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
-            # Provider
-            "gce")
-    }
-
-    code = 0
-    if len(sys.argv) > 1:
-        code = run_test(sys.argv[1], *tests[sys.argv[1]])
-    else:
-        for t in sorted(tests.keys()):
-            code += run_test(t, *tests[t])
-
-    if code == 0:
-        logger.info("Tests passed")
-    else:
-        logger.info("Tests failed")
-
-    exit(code)
-
-if __name__ == '__main__':
-    main()
diff --git a/services/nodemanager/tests/stress_test.cwl b/services/nodemanager/tests/stress_test.cwl
deleted file mode 100644 (file)
index 082df64..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-#
-#
-# Usage: arvados-cwl-runner stress_test.cwl
-#
-# Submits 100 jobs or containers, creating load on node manager and
-# scheduler.
-
-class: Workflow
-cwlVersion: v1.0
-requirements:
-  ScatterFeatureRequirement: {}
-  InlineJavascriptRequirement: {}
-inputs: []
-outputs: []
-steps:
-  step1:
-    in: []
-    out: [out]
-    run:
-      class: ExpressionTool
-      inputs: []
-      outputs:
-        out: int[]
-      expression: |
-        ${
-          var r = [];
-          for (var i = 1; i <= 100; i++) {
-            r.push(i);
-          }
-          return {out: r};
-        }
-  step2:
-    in:
-      num: step1/out
-    out: []
-    scatter: num
-    run:
-      class: CommandLineTool
-      requirements:
-        ShellCommandRequirement: {}
-      inputs:
-        num: int
-      outputs: []
-      arguments: [echo, "starting",
-        {shellQuote: false, valueFrom: "&&"},
-        sleep, $((101-inputs.num)*2),
-        {shellQuote: false, valueFrom: "&&"},
-        echo, "the number of the day is", $(inputs.num)]
diff --git a/services/nodemanager/tests/test_arguments.py b/services/nodemanager/tests/test_arguments.py
deleted file mode 100644 (file)
index e325e52..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-import io
-import os
-import sys
-import tempfile
-import unittest
-
-import arvnodeman.launcher as nodeman
-from . import testutil
-
-class ArvNodemArgumentsTestCase(unittest.TestCase):
-    def run_nodeman(self, args):
-        return nodeman.main(args)
-
-    def test_unsupported_arg(self):
-        with self.assertRaises(SystemExit):
-            self.run_nodeman(['-x=unknown'])
-
-    def test_version_argument(self):
-        err = io.BytesIO()
-        out = io.BytesIO()
-        with testutil.redirected_streams(stdout=out, stderr=err):
-            with self.assertRaises(SystemExit):
-                self.run_nodeman(['--version'])
-        self.assertEqual(out.getvalue(), '')
-        self.assertRegexpMatches(err.getvalue(), "[0-9]+\.[0-9]+\.[0-9]+")
diff --git a/services/nodemanager/tests/test_clientactor.py b/services/nodemanager/tests/test_clientactor.py
deleted file mode 100644 (file)
index 19e094d..0000000
+++ /dev/null
@@ -1,152 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import unittest
-
-import mock
-import pykka
-
-import arvnodeman.clientactor as clientactor
-from . import testutil
-
-class RemotePollLoopActorTestCase(testutil.RemotePollLoopActorTestMixin,
-                                  unittest.TestCase):
-    class MockClientError(Exception):
-        pass
-
-    class TestActor(clientactor.RemotePollLoopActor):
-        LOGGER_NAME = 'arvnodeman.testpoll'
-
-        def _send_request(self):
-            return self._client()
-    TestActor.CLIENT_ERRORS = (MockClientError,)
-    TEST_CLASS = TestActor
-
-
-    def build_monitor(self, side_effect, *args, **kwargs):
-        super(RemotePollLoopActorTestCase, self).build_monitor(*args, **kwargs)
-        self.client.side_effect = side_effect
-
-    def test_poll_loop_starts_after_subscription(self):
-        self.build_monitor(['test1'])
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with('test1')
-        self.assertTrue(self.timer.schedule.called)
-
-    def test_poll_loop_continues_after_failure(self):
-        self.build_monitor(self.MockClientError)
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.assertTrue(self.stop_proxy(self.monitor),
-                        "poll loop died after error")
-        self.assertTrue(self.timer.schedule.called,
-                        "poll loop did not reschedule after error")
-        self.assertFalse(self.subscriber.called,
-                         "poll loop notified subscribers after error")
-
-    def test_late_subscribers_get_responses(self):
-        self.build_monitor(['pre_late_test', 'late_test'])
-        mock_subscriber = mock.Mock(name='mock_subscriber')
-        self.monitor.subscribe(mock_subscriber).get(self.TIMEOUT)
-        self.monitor.subscribe(self.subscriber)
-        self.monitor.poll().get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with('late_test')
-
-    def test_survive_dead_subscriptions(self):
-        self.build_monitor(['survive1', 'survive2'])
-        dead_subscriber = mock.Mock(name='dead_subscriber')
-        dead_subscriber.side_effect = pykka.ActorDeadError
-        self.monitor.subscribe(dead_subscriber)
-        self.monitor.subscribe(self.subscriber)
-        self.monitor.poll().get(self.TIMEOUT)
-        self.assertTrue(self.stop_proxy(self.monitor),
-                        "poll loop died from dead subscriber")
-        self.subscriber.assert_called_with('survive2')
-
-    def check_poll_timers(self, *test_times):
-        schedule_mock = self.timer.schedule
-        last_expect = None
-        with mock.patch('time.time') as time_mock:
-            for fake_time, expect_next in test_times:
-                time_mock.return_value = fake_time
-                self.monitor.poll(last_expect).get(self.TIMEOUT)
-                self.assertTrue(schedule_mock.called)
-                self.assertEqual(expect_next, schedule_mock.call_args[0][0])
-                schedule_mock.reset_mock()
-                last_expect = expect_next
-
-    def test_poll_timing_on_consecutive_successes_with_drift(self):
-        self.build_monitor(['1', '2'], poll_wait=3, max_poll_wait=14)
-        self.check_poll_timers((0, 3), (4, 6))
-
-    def test_poll_backoff_on_failures(self):
-        self.build_monitor(self.MockClientError, poll_wait=3, max_poll_wait=14)
-        self.check_poll_timers((0, 6), (6, 18), (18, 32))
-
-    def test_poll_timing_after_error_recovery(self):
-        self.build_monitor(['a', self.MockClientError(), 'b'],
-                           poll_wait=3, max_poll_wait=14)
-        self.check_poll_timers((0, 3), (4, 10), (10, 13))
-
-    def test_no_subscriptions_by_key_without_support(self):
-        self.build_monitor([])
-        with self.assertRaises(AttributeError):
-            self.monitor.subscribe_to('key')
-
-
-class RemotePollLoopActorWithKeysTestCase(testutil.RemotePollLoopActorTestMixin,
-                                          unittest.TestCase):
-    class TestActor(RemotePollLoopActorTestCase.TestActor):
-        def _item_key(self, item):
-            return item['key']
-    TEST_CLASS = TestActor
-
-
-    def build_monitor(self, side_effect, *args, **kwargs):
-        super(RemotePollLoopActorWithKeysTestCase, self).build_monitor(
-            *args, **kwargs)
-        self.client.side_effect = side_effect
-
-    def test_key_subscription(self):
-        self.build_monitor([[{'key': 1}, {'key': 2}]])
-        self.monitor.subscribe_to(2, self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with({'key': 2})
-
-    def test_survive_dead_key_subscriptions(self):
-        item = {'key': 3}
-        self.build_monitor([[item], [item]])
-        dead_subscriber = mock.Mock(name='dead_subscriber')
-        dead_subscriber.side_effect = pykka.ActorDeadError
-        self.monitor.subscribe_to(3, dead_subscriber)
-        self.monitor.subscribe_to(3, self.subscriber)
-        self.monitor.poll().get(self.TIMEOUT)
-        self.assertTrue(self.stop_proxy(self.monitor),
-                        "poll loop died from dead key subscriber")
-        self.subscriber.assert_called_with(item)
-
-    def test_mixed_subscriptions(self):
-        item = {'key': 4}
-        self.build_monitor([[item], [item]])
-        key_subscriber = mock.Mock(name='key_subscriber')
-        self.monitor.subscribe(self.subscriber)
-        self.monitor.subscribe_to(4, key_subscriber)
-        self.monitor.poll().get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with([item])
-        key_subscriber.assert_called_with(item)
-
-    def test_subscription_to_missing_key(self):
-        self.build_monitor([[]])
-        self.monitor.subscribe_to('nonesuch', self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with(None)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/services/nodemanager/tests/test_computenode.py b/services/nodemanager/tests/test_computenode.py
deleted file mode 100644 (file)
index 898112b..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import time
-import unittest
-
-import arvados.errors as arverror
-import mock
-
-import arvnodeman.computenode as cnode
-from . import testutil
-
-@mock.patch('time.time', return_value=1)
-class ShutdownTimerTestCase(unittest.TestCase):
-    def test_two_length_window(self, time_mock):
-        timer = cnode.ShutdownTimer(time_mock.return_value, [8, 2])
-        self.assertEqual(481, timer.next_opening())
-        self.assertFalse(timer.window_open())
-        time_mock.return_value += 500
-        self.assertEqual(1081, timer.next_opening())
-        self.assertTrue(timer.window_open())
-        time_mock.return_value += 200
-        self.assertEqual(1081, timer.next_opening())
-        self.assertFalse(timer.window_open())
-
-    def test_three_length_window(self, time_mock):
-        timer = cnode.ShutdownTimer(time_mock.return_value, [6, 3, 1])
-        self.assertEqual(361, timer.next_opening())
-        self.assertFalse(timer.window_open())
-        time_mock.return_value += 400
-        self.assertEqual(961, timer.next_opening())
-        self.assertTrue(timer.window_open())
-        time_mock.return_value += 200
-        self.assertEqual(961, timer.next_opening())
-        self.assertFalse(timer.window_open())
-
-
-class ArvadosTimestamp(unittest.TestCase):
-    def test_arvados_timestamp(self):
-        self.assertEqual(1527710178, cnode.arvados_timestamp('2018-05-30T19:56:18Z'))
-        self.assertEqual(1527710178.999371, cnode.arvados_timestamp('2018-05-30T19:56:18.999371Z'))
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
deleted file mode 100644 (file)
index aee3cbd..0000000
+++ /dev/null
@@ -1,562 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import time
-import unittest
-
-import arvados.errors as arverror
-import httplib2
-import mock
-import pykka
-import threading
-
-from libcloud.common.exceptions import BaseHTTPError
-
-import arvnodeman.computenode.dispatch as dispatch
-import arvnodeman.status as status
-from arvnodeman.computenode.driver import BaseComputeNodeDriver
-from . import testutil
-
-class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
-    ACTOR_CLASS = dispatch.ComputeNodeSetupActor
-
-    def make_mocks(self, arvados_effect=None):
-        if arvados_effect is None:
-            arvados_effect = [testutil.arvados_node_mock(
-                slot_number=None,
-                hostname=None,
-                first_ping_at=None,
-                last_ping_at=None,
-            )]
-        self.arvados_effect = arvados_effect
-        self.timer = testutil.MockTimer()
-        self.api_client = mock.MagicMock(name='api_client')
-        self.api_client.nodes().create().execute.side_effect = arvados_effect
-        self.api_client.nodes().update().execute.side_effect = arvados_effect
-        self.cloud_client = mock.MagicMock(name='cloud_client')
-        self.cloud_client.create_node.return_value = testutil.cloud_node_mock(1)
-
-    def make_actor(self, arv_node=None):
-        if not hasattr(self, 'timer'):
-            self.make_mocks(arvados_effect=[arv_node] if arv_node else None)
-        self.setup_actor = self.ACTOR_CLASS.start(
-            self.timer, self.api_client, self.cloud_client,
-            testutil.MockSize(1), arv_node).proxy()
-
-    def assert_node_properties_updated(self, uuid=None,
-                                       size=testutil.MockSize(1)):
-        self.api_client.nodes().update.assert_any_call(
-            uuid=(uuid or self.arvados_effect[-1]['uuid']),
-            body={
-                'properties': {
-                    'cloud_node': {
-                        'size': size.id,
-                        'price': size.price}}})
-
-    def test_creation_without_arvados_node(self):
-        self.make_actor()
-        finished = threading.Event()
-        self.setup_actor.subscribe(lambda _: finished.set())
-        self.assertEqual(self.arvados_effect[-1],
-                         self.setup_actor.arvados_node.get(self.TIMEOUT))
-        assert(finished.wait(self.TIMEOUT))
-        self.api_client.nodes().create.called_with(body={}, assign_slot=True)
-        self.assertEqual(1, self.api_client.nodes().create().execute.call_count)
-        self.assertEqual(1, self.api_client.nodes().update().execute.call_count)
-        self.assert_node_properties_updated()
-        self.assertEqual(self.cloud_client.create_node(),
-                         self.setup_actor.cloud_node.get(self.TIMEOUT))
-
-    def test_creation_with_arvados_node(self):
-        self.make_mocks(arvados_effect=[testutil.arvados_node_mock()]*2)
-        self.make_actor(testutil.arvados_node_mock())
-        finished = threading.Event()
-        self.setup_actor.subscribe(lambda _: finished.set())
-        self.assertEqual(self.arvados_effect[-1],
-                         self.setup_actor.arvados_node.get(self.TIMEOUT))
-        assert(finished.wait(self.TIMEOUT))
-        self.assert_node_properties_updated()
-        self.api_client.nodes().create.called_with(body={}, assign_slot=True)
-        self.assertEqual(3, self.api_client.nodes().update().execute.call_count)
-        self.assertEqual(self.cloud_client.create_node(),
-                         self.setup_actor.cloud_node.get(self.TIMEOUT))
-
-    def test_failed_arvados_calls_retried(self):
-        self.make_mocks([
-                arverror.ApiError(httplib2.Response({'status': '500'}), ""),
-                testutil.arvados_node_mock(),
-                ])
-        self.make_actor()
-        self.wait_for_assignment(self.setup_actor, 'arvados_node')
-
-    def test_failed_cloud_calls_retried(self):
-        self.make_mocks()
-        self.cloud_client.create_node.side_effect = [
-            Exception("test cloud creation error"),
-            self.cloud_client.create_node.return_value,
-            ]
-        self.make_actor()
-        self.wait_for_assignment(self.setup_actor, 'cloud_node')
-
-    def test_basehttperror_retried(self):
-        self.make_mocks()
-        self.cloud_client.create_node.side_effect = [
-            BaseHTTPError(500, "Try again"),
-            self.cloud_client.create_node.return_value,
-            ]
-        self.make_actor()
-        self.wait_for_assignment(self.setup_actor, 'cloud_node')
-        self.setup_actor.ping().get(self.TIMEOUT)
-        self.assertEqual(1, self.cloud_client.post_create_node.call_count)
-
-    def test_instance_exceeded_not_retried(self):
-        self.make_mocks()
-        self.cloud_client.create_node.side_effect = [
-            BaseHTTPError(400, "InstanceLimitExceeded"),
-            self.cloud_client.create_node.return_value,
-            ]
-        self.make_actor()
-        done = self.FUTURE_CLASS()
-        self.setup_actor.subscribe(done.set)
-        done.get(self.TIMEOUT)
-        self.assertEqual(0, self.cloud_client.post_create_node.call_count)
-
-    def test_failed_post_create_retried(self):
-        self.make_mocks()
-        self.cloud_client.post_create_node.side_effect = [
-            Exception("test cloud post-create error"), None]
-        self.make_actor()
-        done = self.FUTURE_CLASS()
-        self.setup_actor.subscribe(done.set)
-        done.get(self.TIMEOUT)
-        self.assertEqual(2, self.cloud_client.post_create_node.call_count)
-
-    def test_stop_when_no_cloud_node(self):
-        self.make_mocks(
-            arverror.ApiError(httplib2.Response({'status': '500'}), ""))
-        self.make_actor()
-        self.assertTrue(
-            self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
-        self.assertTrue(
-            self.setup_actor.actor_ref.actor_stopped.wait(self.TIMEOUT))
-
-    def test_no_stop_when_cloud_node(self):
-        self.make_actor()
-        self.wait_for_assignment(self.setup_actor, 'cloud_node')
-        self.assertFalse(
-            self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
-        self.assertTrue(self.stop_proxy(self.setup_actor),
-                        "actor was stopped by stop_if_no_cloud_node")
-
-    def test_subscribe(self):
-        self.make_mocks(
-            arverror.ApiError(httplib2.Response({'status': '500'}), ""))
-        self.make_actor()
-        subscriber = mock.Mock(name='subscriber_mock')
-        self.setup_actor.subscribe(subscriber)
-        retry_resp = [testutil.arvados_node_mock()]
-        self.api_client.nodes().create().execute.side_effect = retry_resp
-        self.api_client.nodes().update().execute.side_effect = retry_resp
-        self.wait_for_assignment(self.setup_actor, 'cloud_node')
-        self.setup_actor.ping().get(self.TIMEOUT)
-        self.assertEqual(self.setup_actor.actor_ref.actor_urn,
-                         subscriber.call_args[0][0].actor_ref.actor_urn)
-
-    def test_late_subscribe(self):
-        self.make_actor()
-        subscriber = mock.Mock(name='subscriber_mock')
-        self.wait_for_assignment(self.setup_actor, 'cloud_node')
-        self.setup_actor.subscribe(subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.setup_actor)
-        self.assertEqual(self.setup_actor.actor_ref.actor_urn,
-                         subscriber.call_args[0][0].actor_ref.actor_urn)
-
-
-class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
-    def make_mocks(self, cloud_node=None, arvados_node=None,
-                   shutdown_open=True, node_broken=False):
-        self.timer = testutil.MockTimer()
-        self.shutdowns = testutil.MockShutdownTimer()
-        self.shutdowns._set_state(shutdown_open, 300)
-        self.cloud_client = mock.MagicMock(name='cloud_client')
-        self.cloud_client.broken.return_value = node_broken
-        self.arvados_client = mock.MagicMock(name='arvados_client')
-        self.updates = mock.MagicMock(name='update_mock')
-        if cloud_node is None:
-            cloud_node = testutil.cloud_node_mock()
-        self.cloud_node = cloud_node
-        self.arvados_node = arvados_node
-
-    def make_actor(self, cancellable=True, start_time=None):
-        if not hasattr(self, 'timer'):
-            self.make_mocks()
-        if start_time is None:
-            start_time = time.time()
-        monitor_actor = dispatch.ComputeNodeMonitorActor.start(
-            self.cloud_node, start_time, self.shutdowns,
-            self.timer, self.updates, self.cloud_client,
-            self.arvados_node)
-        self.shutdown_actor = self.ACTOR_CLASS.start(
-            self.timer, self.cloud_client, self.arvados_client, monitor_actor,
-            cancellable).proxy()
-        self.monitor_actor = monitor_actor.proxy()
-
-    def check_success_flag(self, expected, allow_msg_count=1):
-        # allow_msg_count is the number of internal messages that may
-        # need to be handled for shutdown to finish.
-        for _ in range(1 + allow_msg_count):
-            last_flag = self.shutdown_actor.success.get(self.TIMEOUT)
-            if last_flag is expected:
-                break
-        else:
-            self.fail("success flag {} is not {}".format(last_flag, expected))
-
-    def test_boot_failure_counting(self, *mocks):
-        # A boot failure happens when a node transitions from unpaired to shutdown
-        status.tracker.update({'boot_failures': 0})
-        self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="unpaired"))
-        self.cloud_client.destroy_node.return_value = True
-        self.make_actor(cancellable=False)
-        self.check_success_flag(True, 2)
-        self.assertTrue(self.cloud_client.destroy_node.called)
-        self.assertEqual(1, status.tracker.get('boot_failures'))
-
-    def test_cancellable_shutdown(self, *mocks):
-        self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy"))
-        self.cloud_client.destroy_node.return_value = True
-        self.make_actor(cancellable=True)
-        self.check_success_flag(False, 2)
-        self.assertFalse(self.cloud_client.destroy_node.called)
-
-    def test_uncancellable_shutdown(self, *mocks):
-        status.tracker.update({'boot_failures': 0})
-        self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy"))
-        self.cloud_client.destroy_node.return_value = True
-        self.make_actor(cancellable=False)
-        self.check_success_flag(True, 4)
-        self.assertTrue(self.cloud_client.destroy_node.called)
-        # A normal shutdown shouldn't be counted as boot failure
-        self.assertEqual(0, status.tracker.get('boot_failures'))
-
-    def test_arvados_node_cleaned_after_shutdown(self, *mocks):
-        if len(mocks) == 1:
-            mocks[0].return_value = "drain\n"
-        cloud_node = testutil.cloud_node_mock(62)
-        arv_node = testutil.arvados_node_mock(62)
-        self.make_mocks(cloud_node, arv_node)
-        self.make_actor()
-        self.check_success_flag(True, 3)
-        update_mock = self.arvados_client.nodes().update
-        self.assertTrue(update_mock.called)
-        update_kwargs = update_mock.call_args_list[0][1]
-        self.assertEqual(arv_node['uuid'], update_kwargs.get('uuid'))
-        self.assertIn('body', update_kwargs)
-        for clear_key in ['slot_number', 'hostname', 'ip_address',
-                          'first_ping_at', 'last_ping_at']:
-            self.assertIn(clear_key, update_kwargs['body'])
-            self.assertIsNone(update_kwargs['body'][clear_key])
-        self.assertTrue(update_mock().execute.called)
-
-    def test_arvados_node_not_cleaned_after_shutdown_cancelled(self, *mocks):
-        if len(mocks) == 1:
-            mocks[0].return_value = "idle\n"
-        cloud_node = testutil.cloud_node_mock(61)
-        arv_node = testutil.arvados_node_mock(61)
-        self.make_mocks(cloud_node, arv_node, shutdown_open=False)
-        self.cloud_client.destroy_node.return_value = False
-        self.make_actor(cancellable=True)
-        self.shutdown_actor.cancel_shutdown("test")
-        self.shutdown_actor.ping().get(self.TIMEOUT)
-        self.check_success_flag(False, 2)
-        self.assertFalse(self.arvados_client.nodes().update.called)
-
-
-class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
-                                       unittest.TestCase):
-    ACTOR_CLASS = dispatch.ComputeNodeShutdownActor
-
-    def test_easy_shutdown(self):
-        self.make_actor(start_time=0)
-        self.check_success_flag(True)
-        self.assertTrue(self.cloud_client.destroy_node.called)
-
-    def test_shutdown_cancelled_when_destroy_node_fails(self):
-        self.make_mocks(node_broken=True)
-        self.cloud_client.destroy_node.return_value = False
-        self.make_actor(start_time=0)
-        self.check_success_flag(False, 2)
-        self.assertEqual(1, self.cloud_client.destroy_node.call_count)
-        self.assertEqual(self.ACTOR_CLASS.DESTROY_FAILED,
-                         self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
-
-    def test_late_subscribe(self):
-        self.make_actor()
-        subscriber = mock.Mock(name='subscriber_mock')
-        self.shutdown_actor.subscribe(subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.shutdown_actor)
-        self.assertTrue(subscriber.called)
-        self.assertEqual(self.shutdown_actor.actor_ref.actor_urn,
-                         subscriber.call_args[0][0].actor_ref.actor_urn)
-
-
-class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
-                                     unittest.TestCase):
-    ACTOR_CLASS = dispatch.ComputeNodeUpdateActor
-
-    def make_actor(self):
-        self.driver = mock.MagicMock(name='driver_mock')
-        self.timer = mock.MagicMock(name='timer_mock')
-        self.updater = self.ACTOR_CLASS.start(self.driver, self.timer).proxy()
-
-    def test_node_sync(self, *args):
-        self.make_actor()
-        cloud_node = testutil.cloud_node_mock()
-        arv_node = testutil.arvados_node_mock()
-        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
-        self.driver().sync_node.assert_called_with(cloud_node, arv_node)
-
-    @testutil.no_sleep
-    def test_node_sync_error(self, *args):
-        self.make_actor()
-        cloud_node = testutil.cloud_node_mock()
-        arv_node = testutil.arvados_node_mock()
-        self.driver().sync_node.side_effect = (IOError, Exception, True)
-        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
-        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
-        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
-        self.driver().sync_node.assert_called_with(cloud_node, arv_node)
-
-class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
-                                      unittest.TestCase):
-    def make_mocks(self, node_num):
-        self.shutdowns = testutil.MockShutdownTimer()
-        self.shutdowns._set_state(False, 300)
-        self.timer = mock.MagicMock(name='timer_mock')
-        self.updates = mock.MagicMock(name='update_mock')
-        self.cloud_mock = testutil.cloud_node_mock(node_num)
-        self.subscriber = mock.Mock(name='subscriber_mock')
-        self.cloud_client = mock.MagicMock(name='cloud_client')
-        self.cloud_client.broken.return_value = False
-
-    def make_actor(self, node_num=1, arv_node=None, start_time=None):
-        if not hasattr(self, 'cloud_mock'):
-            self.make_mocks(node_num)
-        if start_time is None:
-            start_time = time.time()
-        self.node_actor = dispatch.ComputeNodeMonitorActor.start(
-            self.cloud_mock, start_time, self.shutdowns,
-            self.timer, self.updates, self.cloud_client,
-            arv_node, boot_fail_after=300).proxy()
-        self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
-
-    def node_state(self, *states):
-        return self.node_actor.in_state(*states).get(self.TIMEOUT)
-
-    def test_in_state_when_unpaired(self):
-        self.make_actor()
-        self.assertTrue(self.node_state('unpaired'))
-
-    def test_in_state_when_pairing_stale(self):
-        self.make_actor(arv_node=testutil.arvados_node_mock(
-                job_uuid=None, age=90000))
-        self.assertTrue(self.node_state('down'))
-
-    def test_in_state_when_no_state_available(self):
-        self.make_actor(arv_node=testutil.arvados_node_mock(
-                crunch_worker_state=None))
-        self.assertTrue(self.node_state('idle'))
-
-    def test_in_state_when_no_state_available_old(self):
-        self.make_actor(arv_node=testutil.arvados_node_mock(
-                crunch_worker_state=None, age=90000))
-        self.assertTrue(self.node_state('down'))
-
-    def test_in_idle_state(self):
-        idle_nodes_before = status.tracker._idle_nodes.keys()
-        self.make_actor(2, arv_node=testutil.arvados_node_mock(job_uuid=None))
-        self.assertTrue(self.node_state('idle'))
-        self.assertFalse(self.node_state('busy'))
-        self.assertTrue(self.node_state('idle', 'busy'))
-        idle_nodes_after = status.tracker._idle_nodes.keys()
-        new_idle_nodes = [n for n in idle_nodes_after if n not in idle_nodes_before]
-        # There should be 1 additional idle node
-        self.assertEqual(1, len(new_idle_nodes))
-
-    def test_in_busy_state(self):
-        idle_nodes_before = status.tracker._idle_nodes.keys()
-        self.make_actor(3, arv_node=testutil.arvados_node_mock(job_uuid=True))
-        self.assertFalse(self.node_state('idle'))
-        self.assertTrue(self.node_state('busy'))
-        self.assertTrue(self.node_state('idle', 'busy'))
-        idle_nodes_after = status.tracker._idle_nodes.keys()
-        new_idle_nodes = [n for n in idle_nodes_after if n not in idle_nodes_before]
-        # There shouldn't be any additional idle node
-        self.assertEqual(0, len(new_idle_nodes))
-
-    def test_init_shutdown_scheduling(self):
-        self.make_actor()
-        self.assertTrue(self.timer.schedule.called)
-        self.assertEqual(300, self.timer.schedule.call_args[0][0])
-
-    def test_shutdown_window_close_scheduling(self):
-        self.make_actor()
-        self.shutdowns._set_state(False, 600)
-        self.timer.schedule.reset_mock()
-        self.node_actor.consider_shutdown().get(self.TIMEOUT)
-        self.stop_proxy(self.node_actor)
-        self.assertTrue(self.timer.schedule.called)
-        self.assertEqual(600, self.timer.schedule.call_args[0][0])
-        self.assertFalse(self.subscriber.called)
-
-    def test_shutdown_subscription(self):
-        self.make_actor(start_time=0)
-        self.shutdowns._set_state(True, 600)
-        self.node_actor.consider_shutdown().get(self.TIMEOUT)
-        self.assertTrue(self.subscriber.called)
-        self.assertEqual(self.node_actor.actor_ref.actor_urn,
-                         self.subscriber.call_args[0][0].actor_ref.actor_urn)
-
-    def test_no_shutdown_booting(self):
-        self.make_actor()
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
-                          (False, "node state is ('unpaired', 'open', 'boot wait', 'not idle')"))
-
-    def test_shutdown_when_invalid_cloud_node_size(self):
-        self.make_mocks(1)
-        self.cloud_mock.size.id = 'invalid'
-        self.cloud_mock.extra['arvados_node_size'] = 'stale.type'
-        self.make_actor()
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node's size tag 'stale.type' not recognizable"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-
-    def test_shutdown_without_arvados_node(self):
-        self.make_actor(start_time=0)
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'not idle')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-
-    def test_shutdown_missing(self):
-        arv_node = testutil.arvados_node_mock(10, job_uuid=None,
-                                              crunch_worker_state="down",
-                                              last_ping_at='1970-01-01T01:02:03.04050607Z')
-        self.make_actor(10, arv_node)
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-
-    def test_shutdown_running_broken(self):
-        arv_node = testutil.arvados_node_mock(12, job_uuid=None,
-                                              crunch_worker_state="down")
-        self.make_actor(12, arv_node)
-        self.shutdowns._set_state(True, 600)
-        self.cloud_client.broken.return_value = True
-        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-
-    def test_shutdown_missing_broken(self):
-        arv_node = testutil.arvados_node_mock(11, job_uuid=None,
-                                              crunch_worker_state="down",
-                                              last_ping_at='1970-01-01T01:02:03.04050607Z')
-        self.make_actor(11, arv_node)
-        self.shutdowns._set_state(True, 600)
-        self.cloud_client.broken.return_value = True
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'not idle')"))
-
-    def test_no_shutdown_when_window_closed(self):
-        self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
-        self.assertEquals((False, "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-
-    def test_no_shutdown_when_node_running_job(self):
-        self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'not idle')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-
-    def test_shutdown_when_node_state_unknown(self):
-        self.make_actor(5, testutil.arvados_node_mock(
-            5, crunch_worker_state=None))
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('idle', 'open', 'boot wait', 'idle exceeded')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-
-    def test_shutdown_when_node_state_fail(self):
-        self.make_actor(5, testutil.arvados_node_mock(
-            5, crunch_worker_state='fail'))
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'not idle')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-
-    def test_no_shutdown_when_node_state_stale(self):
-        self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((False, "node state is stale"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-
-    def test_arvados_node_match(self):
-        self.make_actor(2)
-        arv_node = testutil.arvados_node_mock(
-            2, hostname='compute-two.zzzzz.arvadosapi.com')
-        self.cloud_client.node_id.return_value = '2'
-        pair_id = self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT)
-        self.assertEqual(self.cloud_mock.id, pair_id)
-        self.stop_proxy(self.node_actor)
-        self.updates.sync_node.assert_called_with(self.cloud_mock, arv_node)
-
-    def test_arvados_node_mismatch(self):
-        self.make_actor(3)
-        arv_node = testutil.arvados_node_mock(1)
-        self.assertIsNone(
-            self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
-
-    def test_arvados_node_mismatch_first_ping_too_early(self):
-        self.make_actor(4)
-        arv_node = testutil.arvados_node_mock(
-            4, first_ping_at='1971-03-02T14:15:16.1717282Z')
-        self.assertIsNone(
-            self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
-
-    def test_update_cloud_node(self):
-        self.make_actor(1)
-        self.make_mocks(2)
-        self.cloud_mock.id = '1'
-        self.node_actor.update_cloud_node(self.cloud_mock)
-        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
-        self.assertEqual([testutil.ip_address_mock(2)],
-                         current_cloud.private_ips)
-
-    def test_missing_cloud_node_update(self):
-        self.make_actor(1)
-        self.node_actor.update_cloud_node(None)
-        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
-        self.assertEqual([testutil.ip_address_mock(1)],
-                         current_cloud.private_ips)
-
-    def test_update_arvados_node(self):
-        self.make_actor(3)
-        job_uuid = 'zzzzz-jjjjj-updatejobnode00'
-        new_arvados = testutil.arvados_node_mock(3, job_uuid)
-        self.node_actor.update_arvados_node(new_arvados)
-        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
-        self.assertEqual(job_uuid, current_arvados['job_uuid'])
-
-    def test_missing_arvados_node_update(self):
-        self.make_actor(4, testutil.arvados_node_mock(4))
-        self.node_actor.update_arvados_node(None)
-        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
-        self.assertEqual(testutil.ip_address_mock(4),
-                         current_arvados['ip_address'])
-
-    def test_update_arvados_node_calls_sync_node(self):
-        self.make_mocks(5)
-        self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
-        self.make_actor()
-        arv_node = testutil.arvados_node_mock(5)
-        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
-        self.assertEqual(1, self.updates.sync_node.call_count)
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
deleted file mode 100644 (file)
index 02d8fb6..0000000
+++ /dev/null
@@ -1,155 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import subprocess32 as subprocess
-import time
-import unittest
-
-import mock
-
-import arvnodeman.computenode.dispatch.slurm as slurm_dispatch
-from . import testutil
-from .test_computenode_dispatch import \
-    ComputeNodeShutdownActorMixin, \
-    ComputeNodeSetupActorTestCase, \
-    ComputeNodeUpdateActorTestCase
-
-@mock.patch('subprocess32.check_output')
-class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
-                                            unittest.TestCase):
-    ACTOR_CLASS = slurm_dispatch.ComputeNodeShutdownActor
-
-    def check_slurm_got_args(self, proc_mock, *args):
-        self.assertTrue(proc_mock.called)
-        slurm_cmd = proc_mock.call_args[0][0]
-        for s in args:
-            self.assertIn(s, slurm_cmd)
-
-    def check_success_after_reset(self, proc_mock, end_state='drain\n', timer=False):
-        self.make_mocks(arvados_node=testutil.arvados_node_mock(63))
-        if not timer:
-            self.timer = testutil.MockTimer(False)
-        self.make_actor()
-        self.check_success_flag(None, 0)
-        # At this point, 1st try should have happened.
-
-        self.timer.deliver()
-        self.check_success_flag(None, 0)
-        # At this point, 2nd try should have happened.
-
-        # Order is critical here: if the mock gets called when no return value
-        # or side effect is set, we may invoke a real subprocess.
-        proc_mock.return_value = end_state
-        proc_mock.side_effect = None
-
-        # 3rd try
-        self.timer.deliver()
-
-        self.check_success_flag(True, 3)
-        self.check_slurm_got_args(proc_mock, 'NodeName=compute63')
-
-    def make_wait_state_test(start_state='drng\n', end_state='drain\n'):
-        def test(self, proc_mock):
-            proc_mock.return_value = start_state
-            self.check_success_after_reset(proc_mock, end_state)
-        return test
-
-    for wait_state in ['alloc\n', 'drng\n']:
-        locals()['test_wait_while_' + wait_state.strip()
-                 ] = make_wait_state_test(start_state=wait_state)
-
-    for end_state in ['idle*\n', 'down\n', 'down*\n', 'drain\n', 'fail\n']:
-        locals()['test_wait_until_' + end_state.strip()
-                 ] = make_wait_state_test(end_state=end_state)
-
-    def test_retry_failed_slurm_calls(self, proc_mock):
-        proc_mock.side_effect = subprocess.CalledProcessError(1, ["mock"])
-        self.check_success_after_reset(proc_mock)
-
-    def test_slurm_bypassed_when_no_arvados_node(self, proc_mock):
-        # Test we correctly handle a node that failed to bootstrap.
-        proc_mock.return_value = 'down\n'
-        self.make_actor(start_time=0)
-        self.check_success_flag(True)
-        self.assertFalse(proc_mock.called)
-
-    def test_node_resumed_when_shutdown_cancelled(self, proc_mock):
-        try:
-            proc_mock.side_effect = iter(['', 'drng\n', 'drng\n', ''])
-            self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
-            self.timer = testutil.MockTimer(False)
-            self.make_actor()
-            self.busywait(lambda: proc_mock.call_args is not None)
-            self.shutdown_actor.cancel_shutdown("test")
-            self.check_success_flag(False, 2)
-            self.assertEqual(proc_mock.call_args_list[0], mock.call(['scontrol', 'update', 'NodeName=compute99', 'State=DRAIN', 'Reason=Node Manager shutdown']))
-            self.assertEqual(proc_mock.call_args_list[-1], mock.call(['scontrol', 'update', 'NodeName=compute99', 'State=RESUME']))
-
-        finally:
-            self.shutdown_actor.actor_ref.stop()
-
-    def test_cancel_shutdown_retry(self, proc_mock):
-        proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n', 'idle\n'])
-        self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
-        self.make_actor()
-        self.check_success_flag(False, 5)
-
-    def test_issue_slurm_drain_retry(self, proc_mock):
-        proc_mock.side_effect = iter([OSError, OSError, 'drng\n', 'drain\n'])
-        self.check_success_after_reset(proc_mock, timer=False)
-
-    def test_arvados_node_cleaned_after_shutdown(self, proc_mock):
-        proc_mock.return_value = 'drain\n'
-        super(SLURMComputeNodeShutdownActorTestCase,
-              self).test_arvados_node_cleaned_after_shutdown()
-
-    def test_cancellable_shutdown(self, proc_mock):
-        proc_mock.return_value = 'other\n'
-        super(SLURMComputeNodeShutdownActorTestCase,
-              self).test_cancellable_shutdown()
-
-    def test_uncancellable_shutdown(self, proc_mock):
-        proc_mock.return_value = 'other\n'
-        super(SLURMComputeNodeShutdownActorTestCase,
-              self).test_uncancellable_shutdown()
-
-@mock.patch('subprocess32.check_output')
-class SLURMComputeNodeUpdateActorTestCase(ComputeNodeUpdateActorTestCase):
-    ACTOR_CLASS = slurm_dispatch.ComputeNodeUpdateActor
-
-    def test_update_node_weight(self, check_output):
-        self.make_actor()
-        cloud_node = testutil.cloud_node_mock()
-        arv_node = testutil.arvados_node_mock()
-        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
-        check_output.assert_called_with(['scontrol', 'update', 'NodeName=compute99', 'Weight=99000', 'Features=instancetype=z99.test'])
-
-class SLURMComputeNodeSetupActorTestCase(ComputeNodeSetupActorTestCase):
-    ACTOR_CLASS = slurm_dispatch.ComputeNodeSetupActor
-
-    @mock.patch('subprocess32.check_output')
-    def test_update_node_features(self, check_output):
-        # `scontrol update` happens only if the Arvados node record
-        # has a hostname. ComputeNodeSetupActorTestCase.make_mocks
-        # uses mocks with scrubbed hostnames, so we override with the
-        # default testutil.arvados_node_mock.
-        self.make_mocks(arvados_effect=[testutil.arvados_node_mock()])
-        self.make_actor()
-        self.wait_for_assignment(self.setup_actor, 'cloud_node')
-        check_output.assert_called_with(['scontrol', 'update', 'NodeName=compute99', 'Weight=1000', 'Features=instancetype=z1.test'])
-
-    @mock.patch('subprocess32.check_output')
-    def test_failed_arvados_calls_retried(self, check_output):
-        super(SLURMComputeNodeSetupActorTestCase, self).test_failed_arvados_calls_retried()
-
-    @mock.patch('subprocess32.check_output')
-    def test_subscribe(self, check_output):
-        super(SLURMComputeNodeSetupActorTestCase, self).test_subscribe()
-
-    @mock.patch('subprocess32.check_output')
-    def test_creation_with_arvados_node(self, check_output):
-        super(SLURMComputeNodeSetupActorTestCase, self).test_creation_with_arvados_node()
diff --git a/services/nodemanager/tests/test_computenode_driver.py b/services/nodemanager/tests/test_computenode_driver.py
deleted file mode 100644 (file)
index 4bf4c39..0000000
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import unittest
-
-import libcloud.common.types as cloud_types
-import mock
-
-import arvnodeman.computenode.driver as driver_base
-import arvnodeman.status as status
-import arvnodeman.config as config
-from . import testutil
-
-class ComputeNodeDriverTestCase(unittest.TestCase):
-    def setUp(self):
-        self.driver_mock = mock.MagicMock(name='driver_mock')
-        driver_base.BaseComputeNodeDriver.SEARCH_CACHE = {}
-
-    def test_search_for_now_uses_public_method(self):
-        image = testutil.cloud_object_mock(1)
-        self.driver_mock().list_images.return_value = [image]
-        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        self.assertIs(image, driver.search_for_now('id_1', 'list_images'))
-        self.assertEqual(1, self.driver_mock().list_images.call_count)
-
-    def test_search_for_now_uses_private_method(self):
-        net = testutil.cloud_object_mock(1)
-        self.driver_mock().ex_list_networks.return_value = [net]
-        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        self.assertIs(net, driver.search_for_now('id_1', 'ex_list_networks'))
-        self.assertEqual(1, self.driver_mock().ex_list_networks.call_count)
-
-    def test_search_for_now_raises_ValueError_on_zero_results(self):
-        self.driver_mock().list_images.return_value = []
-        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        with self.assertRaises(ValueError) as test:
-            driver.search_for_now('id_1', 'list_images')
-
-    def test_search_for_now_raises_ValueError_on_extra_results(self):
-        image = testutil.cloud_object_mock(1)
-        self.driver_mock().list_images.return_value = [image, image]
-        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        with self.assertRaises(ValueError) as test:
-            driver.search_for_now('id_1', 'list_images')
-
-    def test_search_for_now_does_not_cache_results(self):
-        image1 = testutil.cloud_object_mock(1)
-        image2 = testutil.cloud_object_mock(1)
-        self.driver_mock().list_images.side_effect = [[image1], [image2]]
-        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        self.assertIsNot(driver.search_for_now('id_1', 'list_images'),
-                         driver.search_for_now('id_1', 'list_images'))
-        self.assertEqual(2, self.driver_mock().list_images.call_count)
-
-    def test_search_for_returns_cached_results(self):
-        image1 = testutil.cloud_object_mock(1)
-        image2 = testutil.cloud_object_mock(1)
-        self.driver_mock().list_images.side_effect = [[image1], [image2]]
-        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        self.assertIs(driver.search_for('id_1', 'list_images'),
-                      driver.search_for('id_1', 'list_images'))
-        self.assertEqual(1, self.driver_mock().list_images.call_count)
-
-
-    class TestBaseComputeNodeDriver(driver_base.BaseComputeNodeDriver):
-        def arvados_create_kwargs(self, size, arvados_node):
-            return {'name': arvados_node}
-
-
-    def test_create_node_only_cloud_errors_are_counted(self):
-        status.tracker.update({'create_node_errors': 0})
-        errors = [(config.CLOUD_ERRORS[0], True), (KeyError, False)]
-        self.driver_mock().list_images.return_value = []
-        driver = self.TestBaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        error_count = 0
-        for an_error, is_cloud_error in errors:
-            self.driver_mock().create_node.side_effect = an_error
-            with self.assertRaises(an_error):
-                driver.create_node(testutil.MockSize(1), 'id_1')
-            if is_cloud_error:
-                error_count += 1
-            self.assertEqual(error_count, status.tracker.get('create_node_errors'))
-
-    def test_list_nodes_only_cloud_errors_are_counted(self):
-        status.tracker.update({'list_nodes_errors': 0})
-        errors = [(config.CLOUD_ERRORS[0], True), (KeyError, False)]
-        driver = self.TestBaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        error_count = 0
-        for an_error, is_cloud_error in errors:
-            self.driver_mock().list_nodes.side_effect = an_error
-            with self.assertRaises(an_error):
-                driver.list_nodes()
-            if is_cloud_error:
-                error_count += 1
-            self.assertEqual(error_count, status.tracker.get('list_nodes_errors'))
-
-    def test_destroy_node_only_cloud_errors_are_counted(self):
-        status.tracker.update({'destroy_node_errors': 0})
-        errors = [(config.CLOUD_ERRORS[0], True), (KeyError, False)]
-        self.driver_mock().list_nodes.return_value = [testutil.MockSize(1)]
-        driver = self.TestBaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        error_count = 0
-        for an_error, is_cloud_error in errors:
-            self.driver_mock().destroy_node.side_effect = an_error
-            with self.assertRaises(an_error):
-                driver.destroy_node(testutil.MockSize(1))
-            if is_cloud_error:
-                error_count += 1
-            self.assertEqual(error_count, status.tracker.get('destroy_node_errors'))
diff --git a/services/nodemanager/tests/test_computenode_driver_azure.py b/services/nodemanager/tests/test_computenode_driver_azure.py
deleted file mode 100644 (file)
index ea7a033..0000000
+++ /dev/null
@@ -1,145 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import ssl
-import time
-import unittest
-
-import libcloud.common.types as cloud_types
-import mock
-
-import arvnodeman.computenode.driver.azure as azure
-from . import testutil
-
-class AzureComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
-    TEST_CLASS = azure.ComputeNodeDriver
-
-    def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}):
-        list_kwargs.setdefault("ex_resource_group", "TestResourceGroup")
-        return super(AzureComputeNodeDriverTestCase, self).new_driver(auth_kwargs, list_kwargs, create_kwargs)
-
-    def test_driver_instantiation(self):
-        kwargs = {'key': 'testkey'}
-        driver = self.new_driver(auth_kwargs=kwargs)
-        self.assertTrue(self.driver_mock.called)
-        self.assertEqual(kwargs, self.driver_mock.call_args[1])
-
-    def test_create_image_loaded_at_initialization(self):
-        get_method = self.driver_mock().get_image
-        get_method.return_value = testutil.cloud_object_mock('id_b')
-        driver = self.new_driver(create_kwargs={'image': 'id_b'})
-        self.assertEqual(1, get_method.call_count)
-
-    def test_create_includes_ping(self):
-        arv_node = testutil.arvados_node_mock(info={'ping_secret': 'ssshh'})
-        arv_node["hostname"] = None
-        driver = self.new_driver()
-        driver.create_node(testutil.MockSize(1), arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertIn('ping_secret=ssshh',
-                      create_method.call_args[1].get('ex_tags', {}).get('arv-ping-url', ""))
-
-    def test_create_includes_arvados_node_size(self):
-        arv_node = testutil.arvados_node_mock()
-        arv_node["hostname"] = None
-        size = testutil.MockSize(1)
-        driver = self.new_driver()
-        driver.create_node(size, arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertIn(
-            ('arvados_node_size', size.id),
-            create_method.call_args[1].get('ex_tags', {'tags': 'missing'}).items()
-        )
-
-    def test_name_from_new_arvados_node(self):
-        arv_node = testutil.arvados_node_mock(hostname=None)
-        driver = self.new_driver()
-        self.assertEqual('compute-000000000000063-zzzzz',
-                         driver.arvados_create_kwargs(testutil.MockSize(1), arv_node)['name'])
-
-    def check_node_tagged(self, cloud_node, expected_tags):
-        tag_mock = self.driver_mock().ex_create_tags
-        self.assertTrue(tag_mock.called)
-        self.assertIs(cloud_node, tag_mock.call_args[0][0])
-        self.assertEqual(expected_tags, tag_mock.call_args[0][1])
-
-    def test_node_create_time(self):
-        refsecs = int(time.time())
-        reftuple = time.gmtime(refsecs)
-        node = testutil.cloud_node_mock()
-        node.extra = {'tags': {'booted_at': time.strftime('%Y-%m-%dT%H:%M:%S.000Z',
-                                                   reftuple)}}
-        self.assertEqual(refsecs, azure.ComputeNodeDriver.node_start_time(node))
-
-    def test_node_fqdn(self):
-        name = 'fqdntest.zzzzz.arvadosapi.com'
-        node = testutil.cloud_node_mock()
-        node.extra = {'tags': {"hostname": name}}
-        self.assertEqual(name, azure.ComputeNodeDriver.node_fqdn(node))
-
-    def test_sync_node(self):
-        arv_node = testutil.arvados_node_mock(1)
-        cloud_node = testutil.cloud_node_mock(2)
-        driver = self.new_driver()
-        driver.sync_node(cloud_node, arv_node)
-        self.check_node_tagged(cloud_node,
-                               {'hostname': 'compute1.zzzzz.arvadosapi.com'})
-
-    def test_custom_data(self):
-        arv_node = testutil.arvados_node_mock(hostname=None)
-        driver = self.new_driver()
-        self.assertEqual("""#!/bin/sh
-mkdir -p    /var/tmp/arv-node-data/meta-data
-echo 'https://100::/arvados/v1/nodes/zzzzz-yyyyy-000000000000063/ping?ping_secret=defaulttestsecret' > /var/tmp/arv-node-data/arv-ping-url
-echo compute-000000000000063-zzzzz > /var/tmp/arv-node-data/meta-data/instance-id
-echo z1.test > /var/tmp/arv-node-data/meta-data/instance-type
-""",
-                         driver.arvados_create_kwargs(testutil.MockSize(1), arv_node)['ex_customdata'])
-
-    def test_list_nodes_ignores_nodes_without_tags(self):
-        driver = self.new_driver(create_kwargs={"tag_arvados-class": "dynamic-compute"})
-        # Mock cloud node without tags
-        nodelist = [testutil.cloud_node_mock(1)]
-        self.driver_mock().list_nodes.return_value = nodelist
-        n = driver.list_nodes()
-        self.assertEqual([], n)
-
-    def test_create_raises_but_actually_succeeded(self):
-        arv_node = testutil.arvados_node_mock(1, hostname=None)
-        driver = self.new_driver(create_kwargs={"tag_arvados-class": "dynamic-compute"})
-        nodelist = [testutil.cloud_node_mock(1, tags={"arvados-class": "dynamic-compute"})]
-        nodelist[0].name = 'compute-000000000000001-zzzzz'
-        self.driver_mock().list_nodes.return_value = nodelist
-        self.driver_mock().create_node.side_effect = IOError
-        n = driver.create_node(testutil.MockSize(1), arv_node)
-        self.assertEqual('compute-000000000000001-zzzzz', n.name)
-
-    def test_ex_fetch_nic_false(self):
-        arv_node = testutil.arvados_node_mock(1, hostname=None)
-        driver = self.new_driver(create_kwargs={"tag_arvados-class": "dynamic-compute"})
-        nodelist = [testutil.cloud_node_mock(1, tags={"arvados-class": "dynamic-compute"})]
-        nodelist[0].name = 'compute-000000000000001-zzzzz'
-        self.driver_mock().list_nodes.return_value = nodelist
-        n = driver.list_nodes()
-        self.assertEqual(nodelist, n)
-        self.driver_mock().list_nodes.assert_called_with(ex_fetch_nic=False, ex_fetch_power_state=False, ex_resource_group='TestResourceGroup')
-
-    def test_create_can_find_node_after_timeout(self):
-        super(AzureComputeNodeDriverTestCase,
-              self).test_create_can_find_node_after_timeout(
-                  create_kwargs={'tag_arvados-class': 'test'},
-                  node_extra={'tags': {'arvados-class': 'test'}})
-
-    def test_node_found_after_timeout_has_fixed_size(self):
-        size = testutil.MockSize(4)
-        node_props = {'hardwareProfile': {'vmSize': size.id}}
-        cloud_node = testutil.cloud_node_mock(tags={'arvados-class': 'test'}, properties=node_props)
-        cloud_node.size = None
-        self.check_node_found_after_timeout_has_fixed_size(
-            size, cloud_node, {'tag_arvados-class': 'test'})
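The test_node_create_time cases in the Azure diff above, and their EC2/GCE counterparts below, depend on a lossless round trip between epoch seconds and the '%Y-%m-%dT%H:%M:%S.000Z' format stored in the booted_at / launch_time tags. A standalone illustration of that round trip, using only the standard library:

# Round trip between epoch seconds and the tag timestamp format checked
# by the deleted node_start_time tests; '.000Z' is matched literally.
import calendar
import time

refsecs = int(time.time())
tag = time.strftime('%Y-%m-%dT%H:%M:%S.000Z', time.gmtime(refsecs))
parsed = calendar.timegm(time.strptime(tag, '%Y-%m-%dT%H:%M:%S.000Z'))
assert parsed == refsecs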
diff --git a/services/nodemanager/tests/test_computenode_driver_ec2.py b/services/nodemanager/tests/test_computenode_driver_ec2.py
deleted file mode 100644 (file)
index 520c0dc..0000000
+++ /dev/null
@@ -1,175 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import ssl
-import time
-import unittest
-
-import libcloud.common.types as cloud_types
-import mock
-
-import arvnodeman.computenode.driver.ec2 as ec2
-from . import testutil
-
-class EC2ComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
-    TEST_CLASS = ec2.ComputeNodeDriver
-
-    def test_driver_instantiation(self):
-        kwargs = {'key': 'testkey'}
-        driver = self.new_driver(auth_kwargs=kwargs)
-        self.assertTrue(self.driver_mock.called)
-        self.assertEqual(kwargs, self.driver_mock.call_args[1])
-
-    def test_list_kwargs_become_filters(self):
-        # We're also testing tag name translation.
-        driver = self.new_driver(list_kwargs={'tag_test': 'true'})
-        driver.list_nodes()
-        list_method = self.driver_mock().list_nodes
-        self.assertTrue(list_method.called)
-        self.assertEqual({'tag:test': 'true'},
-                          list_method.call_args[1].get('ex_filters'))
-
-    def test_create_image_loaded_at_initialization(self):
-        list_method = self.driver_mock().list_images
-        list_method.return_value = [testutil.cloud_object_mock(c)
-                                    for c in 'abc']
-        driver = self.new_driver(create_kwargs={'image_id': 'id_b'})
-        self.assertEqual(1, list_method.call_count)
-
-    def test_create_includes_ping_secret(self):
-        arv_node = testutil.arvados_node_mock(info={'ping_secret': 'ssshh'})
-        driver = self.new_driver()
-        driver.create_node(testutil.MockSize(1), arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertIn('ping_secret=ssshh',
-                      create_method.call_args[1].get('ex_userdata',
-                                                     'arg missing'))
-
-    def test_create_includes_metadata(self):
-        arv_node = testutil.arvados_node_mock()
-        driver = self.new_driver(list_kwargs={'tag_test': 'testvalue'})
-        driver.create_node(testutil.MockSize(1), arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertIn(
-            ('test', 'testvalue'),
-            create_method.call_args[1].get('ex_metadata', {'arg': 'missing'}).items()
-        )
-
-    def test_create_includes_arvados_node_size(self):
-        arv_node = testutil.arvados_node_mock()
-        size = testutil.MockSize(1)
-        driver = self.new_driver()
-        driver.create_node(size, arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertIn(
-            ('arvados_node_size', size.id),
-            create_method.call_args[1].get('ex_metadata', {'arg': 'missing'}).items()
-        )
-
-    def test_create_preemptible_instance(self):
-        arv_node = testutil.arvados_node_mock()
-        driver = self.new_driver()
-        driver.create_node(testutil.MockSize(1, preemptible=True), arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertEqual(
-            True,
-            create_method.call_args[1].get('ex_spot_market', 'arg missing')
-        )
-
-    def test_hostname_from_arvados_node(self):
-        arv_node = testutil.arvados_node_mock(8)
-        driver = self.new_driver()
-        self.assertEqual('compute8.zzzzz.arvadosapi.com',
-                         driver.arvados_create_kwargs(testutil.MockSize(1), arv_node)['name'])
-
-    def test_default_hostname_from_new_arvados_node(self):
-        arv_node = testutil.arvados_node_mock(hostname=None)
-        driver = self.new_driver()
-        self.assertEqual('dynamic.compute.zzzzz.arvadosapi.com',
-                         driver.arvados_create_kwargs(testutil.MockSize(1), arv_node)['name'])
-
-    def check_node_tagged(self, cloud_node, expected_tags):
-        tag_mock = self.driver_mock().ex_create_tags
-        self.assertTrue(tag_mock.called)
-        self.assertIs(cloud_node, tag_mock.call_args[0][0])
-        self.assertEqual(expected_tags, tag_mock.call_args[0][1])
-
-    def test_sync_node(self):
-        arv_node = testutil.arvados_node_mock(1)
-        cloud_node = testutil.cloud_node_mock(2)
-        driver = self.new_driver()
-        driver.sync_node(cloud_node, arv_node)
-        self.check_node_tagged(cloud_node,
-                               {'Name': 'compute1.zzzzz.arvadosapi.com'})
-
-    def test_node_create_time(self):
-        refsecs = int(time.time())
-        reftuple = time.gmtime(refsecs)
-        node = testutil.cloud_node_mock()
-        node.extra = {'launch_time': time.strftime('%Y-%m-%dT%H:%M:%S.000Z',
-                                                   reftuple)}
-        self.assertEqual(refsecs, ec2.ComputeNodeDriver.node_start_time(node))
-
-    def test_node_fqdn(self):
-        name = 'fqdntest.zzzzz.arvadosapi.com'
-        node = testutil.cloud_node_mock()
-        node.name = name
-        self.assertEqual(name, ec2.ComputeNodeDriver.node_fqdn(node))
-
-    def test_create_ebs_volume(self):
-        arv_node = testutil.arvados_node_mock()
-        driver = self.new_driver()
-        # libcloud/ec2 "disk" sizes are in GB, Arvados/SLURM "scratch" value is in MB
-        size = testutil.MockSize(1)
-        size.disk=5
-        size.scratch=20000
-        driver.create_node(size, arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertEqual([{
-            "DeviceName": "/dev/xvdt",
-            "Ebs": {
-                "DeleteOnTermination": True,
-                "VolumeSize": 16,
-                "VolumeType": "gp2"
-            }}],
-                         create_method.call_args[1].get('ex_blockdevicemappings'))
-
-    def test_ebs_volume_not_needed(self):
-        arv_node = testutil.arvados_node_mock()
-        driver = self.new_driver()
-        # libcloud/ec2 "disk" sizes are in GB, Arvados/SLURM "scratch" value is in MB
-        size = testutil.MockSize(1)
-        size.disk=80
-        size.scratch=20000
-        driver.create_node(size, arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertIsNone(create_method.call_args[1].get('ex_blockdevicemappings'))
-
-    def test_ebs_volume_too_big(self):
-        arv_node = testutil.arvados_node_mock()
-        driver = self.new_driver()
-        # libcloud/ec2 "disk" sizes are in GB, Arvados/SLURM "scratch" value is in MB
-        size = testutil.MockSize(1)
-        size.disk=80
-        size.scratch=20000000
-        driver.create_node(size, arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertEqual([{
-            "DeviceName": "/dev/xvdt",
-            "Ebs": {
-                "DeleteOnTermination": True,
-                "VolumeSize": 16384,
-                "VolumeType": "gp2"
-            }}],
-                         create_method.call_args[1].get('ex_blockdevicemappings'))
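test_list_kwargs_become_filters above checks that configuration keys written as tag_<name> are passed to the EC2 API as tag:<name> filters. A hedged re-implementation of just that translation (not the removed driver code) would look like this:

# Illustrative translation of 'tag_*' configuration keys into EC2
# 'tag:*' filter names, as exercised by test_list_kwargs_become_filters.
def list_kwargs_to_filters(list_kwargs):
    return {('tag:' + key[4:]) if key.startswith('tag_') else key: value
            for key, value in list_kwargs.items()}

assert list_kwargs_to_filters({'tag_test': 'true'}) == {'tag:test': 'true'}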
diff --git a/services/nodemanager/tests/test_computenode_driver_gce.py b/services/nodemanager/tests/test_computenode_driver_gce.py
deleted file mode 100644 (file)
index 1446cd2..0000000
+++ /dev/null
@@ -1,252 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import json
-import time
-import unittest
-
-import mock
-
-import arvnodeman.computenode.driver.gce as gce
-from . import testutil
-
-class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
-    TEST_CLASS = gce.ComputeNodeDriver
-
-    def setUp(self):
-        super(GCEComputeNodeDriverTestCase, self).setUp()
-        self.driver_mock().list_images.return_value = [
-            testutil.cloud_object_mock('testimage', selfLink='image-link')]
-        self.driver_mock().ex_list_disktypes.return_value = [
-            testutil.cloud_object_mock(name, selfLink=name + '-link')
-            for name in ['pd-standard', 'pd-ssd', 'local-ssd']]
-        self.driver_mock.reset_mock()
-
-    def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}):
-        create_kwargs.setdefault('image', 'testimage')
-        return super(GCEComputeNodeDriverTestCase, self).new_driver(
-            auth_kwargs, list_kwargs, create_kwargs)
-
-    def test_driver_instantiation(self):
-        kwargs = {'user_id': 'foo'}
-        driver = self.new_driver(auth_kwargs=kwargs)
-        self.assertTrue(self.driver_mock.called)
-        self.assertEqual(kwargs, self.driver_mock.call_args[1])
-
-    def test_create_image_loaded_at_initialization_by_name(self):
-        image_mocks = [testutil.cloud_object_mock(c) for c in 'abc']
-        list_method = self.driver_mock().list_images
-        list_method.return_value = image_mocks
-        driver = self.new_driver(create_kwargs={'image': 'b'})
-        self.assertEqual(1, list_method.call_count)
-
-    def test_create_includes_ping_secret(self):
-        arv_node = testutil.arvados_node_mock(info={'ping_secret': 'ssshh'})
-        driver = self.new_driver()
-        driver.create_node(testutil.MockSize(1), arv_node)
-        metadata = self.driver_mock().create_node.call_args[1]['ex_metadata']
-        self.assertIn('ping_secret=ssshh', metadata.get('arv-ping-url'))
-
-    def test_create_includes_arvados_node_size(self):
-        arv_node = testutil.arvados_node_mock()
-        size = testutil.MockSize(1)
-        driver = self.new_driver()
-        driver.create_node(size, arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertIn(
-            ('arvados_node_size', size.id),
-            create_method.call_args[1].get('ex_metadata', {'metadata':'missing'}).items()
-        )
-
-    def test_create_raises_but_actually_succeeded(self):
-        arv_node = testutil.arvados_node_mock(1, hostname=None)
-        driver = self.new_driver()
-        nodelist = [testutil.cloud_node_mock(1)]
-        nodelist[0].name = 'compute-000000000000001-zzzzz'
-        self.driver_mock().list_nodes.return_value = nodelist
-        self.driver_mock().create_node.side_effect = IOError
-        n = driver.create_node(testutil.MockSize(1), arv_node)
-        self.assertEqual('compute-000000000000001-zzzzz', n.name)
-
-    def test_create_sets_default_hostname(self):
-        driver = self.new_driver()
-        driver.create_node(testutil.MockSize(1),
-                           testutil.arvados_node_mock(254, hostname=None))
-        create_kwargs = self.driver_mock().create_node.call_args[1]
-        self.assertEqual('compute-0000000000000fe-zzzzz',
-                         create_kwargs.get('name'))
-        self.assertEqual('dynamic.compute.zzzzz.arvadosapi.com',
-                         create_kwargs.get('ex_metadata', {}).get('hostname'))
-
-    def test_create_tags_from_list_tags(self):
-        driver = self.new_driver(list_kwargs={'tags': 'testA, testB'})
-        driver.create_node(testutil.MockSize(1), testutil.arvados_node_mock())
-        self.assertEqual(['testA', 'testB'],
-                         self.driver_mock().create_node.call_args[1]['ex_tags'])
-
-    def test_create_with_two_disks_attached(self):
-        driver = self.new_driver(create_kwargs={'image': 'testimage'})
-        driver.create_node(testutil.MockSize(1), testutil.arvados_node_mock())
-        create_disks = self.driver_mock().create_node.call_args[1].get(
-            'ex_disks_gce_struct', [])
-        self.assertEqual(2, len(create_disks))
-        self.assertTrue(create_disks[0].get('autoDelete'))
-        self.assertTrue(create_disks[0].get('boot'))
-        self.assertEqual('PERSISTENT', create_disks[0].get('type'))
-        init_params = create_disks[0].get('initializeParams', {})
-        self.assertEqual('pd-standard-link', init_params.get('diskType'))
-        self.assertEqual('image-link', init_params.get('sourceImage'))
-        # Our node images expect the SSD to be named `tmp` to find and mount it.
-        self.assertEqual('tmp', create_disks[1].get('deviceName'))
-        self.assertTrue(create_disks[1].get('autoDelete'))
-        self.assertFalse(create_disks[1].get('boot', 'unset'))
-        self.assertEqual('SCRATCH', create_disks[1].get('type'))
-        init_params = create_disks[1].get('initializeParams', {})
-        self.assertEqual('local-ssd-link', init_params.get('diskType'))
-
-    def test_list_nodes_requires_tags_match(self):
-        # A node matches if our list tags are a subset of the node's tags.
-        # Test behavior with no tags, no match, partial matches, different
-        # order, and strict supersets.
-        cloud_mocks = [
-            testutil.cloud_node_mock(node_num, tags=tag_set)
-            for node_num, tag_set in enumerate(
-                [[], ['bad'], ['good'], ['great'], ['great', 'ok'],
-                 ['great', 'good'], ['good', 'fantastic', 'great']])]
-        cloud_mocks.append(testutil.cloud_node_mock())
-        self.driver_mock().list_nodes.return_value = cloud_mocks
-        driver = self.new_driver(list_kwargs={'tags': 'good, great'})
-        self.assertItemsEqual(['5', '6'], [n.id for n in driver.list_nodes()])
-
-    def build_gce_metadata(self, metadata_dict):
-        # Convert a plain metadata dictionary to the GCE data structure.
-        return {
-            'kind': 'compute#metadata',
-            'fingerprint': 'testprint',
-            'items': [{'key': key, 'value': metadata_dict[key]}
-                      for key in metadata_dict],
-            }
-
-    def check_sync_node_updates_hostname_tag(self, plain_metadata):
-        start_metadata = self.build_gce_metadata(plain_metadata)
-        arv_node = testutil.arvados_node_mock(1)
-        cloud_node = testutil.cloud_node_mock(
-            2, metadata=start_metadata.copy(),
-            zone=testutil.cloud_object_mock('testzone'))
-        self.driver_mock().ex_get_node.return_value = cloud_node
-        driver = self.new_driver()
-        driver.sync_node(cloud_node, arv_node)
-        args, kwargs = self.driver_mock().ex_set_node_metadata.call_args
-        self.assertEqual(cloud_node, args[0])
-        plain_metadata['hostname'] = 'compute1.zzzzz.arvadosapi.com'
-        self.assertEqual(
-            plain_metadata,
-            {item['key']: item['value'] for item in args[1]})
-
-    def test_sync_node_updates_hostname_tag(self):
-        self.check_sync_node_updates_hostname_tag(
-            {'testkey': 'testvalue', 'hostname': 'startvalue'})
-
-    def test_sync_node_adds_hostname_tag(self):
-        self.check_sync_node_updates_hostname_tag({'testkey': 'testval'})
-
-    def test_sync_node_raises_exception_on_failure(self):
-        arv_node = testutil.arvados_node_mock(8)
-        cloud_node = testutil.cloud_node_mock(
-            9, metadata={}, zone=testutil.cloud_object_mock('failzone'))
-        mock_response = self.driver_mock().ex_set_node_metadata.side_effect = (Exception('sync error test'),)
-        driver = self.new_driver()
-        with self.assertRaises(Exception) as err_check:
-            driver.sync_node(cloud_node, arv_node)
-        self.assertIs(err_check.exception.__class__, Exception)
-        self.assertIn('sync error test', str(err_check.exception))
-
-    def test_node_create_time_zero_for_unknown_nodes(self):
-        node = testutil.cloud_node_mock()
-        self.assertEqual(0, gce.ComputeNodeDriver.node_start_time(node))
-
-    def test_node_create_time_for_known_node(self):
-        node = testutil.cloud_node_mock(metadata=self.build_gce_metadata(
-                {'booted_at': '1970-01-01T00:01:05Z'}))
-        self.assertEqual(65, gce.ComputeNodeDriver.node_start_time(node))
-
-    def test_node_create_time_recorded_when_node_boots(self):
-        start_time = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
-        arv_node = testutil.arvados_node_mock()
-        driver = self.new_driver()
-        driver.create_node(testutil.MockSize(1), arv_node)
-        metadata = self.driver_mock().create_node.call_args[1]['ex_metadata']
-        self.assertLessEqual(start_time, metadata.get('booted_at'))
-
-    def test_known_node_fqdn(self):
-        name = 'fqdntest.zzzzz.arvadosapi.com'
-        node = testutil.cloud_node_mock(metadata=self.build_gce_metadata(
-                {'hostname': name}))
-        self.assertEqual(name, gce.ComputeNodeDriver.node_fqdn(node))
-
-    def test_unknown_node_fqdn(self):
-        # Return an empty string.  This lets fqdn be safely compared
-        # against an expected value, and ComputeNodeMonitorActor
-        # should try to update it.
-        node = testutil.cloud_node_mock(metadata=self.build_gce_metadata({}))
-        self.assertEqual('', gce.ComputeNodeDriver.node_fqdn(node))
-
-    def test_deliver_ssh_key_in_metadata(self):
-        test_ssh_key = 'ssh-rsa-foo'
-        arv_node = testutil.arvados_node_mock(1)
-        with mock.patch('__builtin__.open',
-                        mock.mock_open(read_data=test_ssh_key)) as mock_file:
-            driver = self.new_driver(create_kwargs={'ssh_key': 'ssh-key-file'})
-        mock_file.assert_called_once_with('ssh-key-file')
-        driver.create_node(testutil.MockSize(1), arv_node)
-        metadata = self.driver_mock().create_node.call_args[1]['ex_metadata']
-        self.assertEqual('root:ssh-rsa-foo', metadata.get('sshKeys'))
-
-    def test_create_driver_with_service_accounts(self):
-        service_accounts = {'email': 'foo@bar', 'scopes': ['storage-full']}
-        srv_acct_config = {'service_accounts': json.dumps(service_accounts)}
-        arv_node = testutil.arvados_node_mock(1)
-        driver = self.new_driver(create_kwargs=srv_acct_config)
-        driver.create_node(testutil.MockSize(1), arv_node)
-        self.assertEqual(
-            service_accounts,
-            self.driver_mock().create_node.call_args[1]['ex_service_accounts'])
-
-    def test_fix_string_size(self):
-        # As of 0.18, the libcloud GCE driver sets node.size to the size's name.
-        # It's supposed to be the actual size object.  Make sure our driver
-        # patches that up in listings.
-        size = testutil.MockSize(2)
-        node = testutil.cloud_node_mock(size=size)
-        node.size = size.id
-        self.driver_mock().list_sizes.return_value = [size]
-        self.driver_mock().list_nodes.return_value = [node]
-        driver = self.new_driver()
-        nodelist = driver.list_nodes()
-        self.assertEqual(1, len(nodelist))
-        self.assertIs(node, nodelist[0])
-        self.assertIs(size, nodelist[0].size)
-
-    def test_skip_fix_when_size_not_string(self):
-        # Ensure we don't monkeypatch node sizes unless we need to.
-        size = testutil.MockSize(3)
-        node = testutil.cloud_node_mock(size=size)
-        self.driver_mock().list_nodes.return_value = [node]
-        driver = self.new_driver()
-        nodelist = driver.list_nodes()
-        self.assertEqual(1, len(nodelist))
-        self.assertIs(node, nodelist[0])
-        self.assertIs(size, nodelist[0].size)
-
-    def test_node_found_after_timeout_has_fixed_size(self):
-        size = testutil.MockSize(4)
-        cloud_node = testutil.cloud_node_mock(size=size.id)
-        self.check_node_found_after_timeout_has_fixed_size(size, cloud_node)
-
-    def test_list_empty_nodes(self):
-        self.driver_mock().list_nodes.return_value = []
-        self.assertEqual([], self.new_driver().list_nodes())
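test_list_nodes_requires_tags_match above encodes the rule that a GCE node is listed only when the configured tags are a subset of the node's own tags, in any order. The expectation reduces to a set comparison, sketched here with the same tag sets the deleted test used:

# Subset rule behind test_list_nodes_requires_tags_match (illustrative).
def node_matches(list_tags, node_tags):
    return set(list_tags).issubset(node_tags)

assert node_matches(['good', 'great'], ['great', 'good'])
assert node_matches(['good', 'great'], ['good', 'fantastic', 'great'])
assert not node_matches(['good', 'great'], ['great', 'ok'])
assert not node_matches(['good', 'great'], [])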
diff --git a/services/nodemanager/tests/test_config.py b/services/nodemanager/tests/test_config.py
deleted file mode 100644 (file)
index 8002b3b..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import io
-import logging
-import unittest
-
-import arvnodeman.computenode.dispatch as dispatch
-import arvnodeman.computenode.dispatch.slurm as slurm_dispatch
-import arvnodeman.config as nmconfig
-
-class NodeManagerConfigTestCase(unittest.TestCase):
-    TEST_CONFIG = u"""
-[Cloud]
-provider = dummy
-shutdown_windows = 52, 6, 2
-
-[Cloud Credentials]
-creds = dummy_creds
-
-[Cloud List]
-[Cloud Create]
-
-[Size 1]
-cores = 1
-price = 0.8
-
-[Size 1.preemptible]
-instance_type = 1
-preemptible = true
-cores = 1
-price = 0.8
-
-[Logging]
-file = /dev/null
-level = DEBUG
-testlogger = INFO
-"""
-
-    def load_config(self, config=None, config_str=None):
-        if config is None:
-            config = nmconfig.NodeManagerConfig()
-        if config_str is None:
-            config_str = self.TEST_CONFIG
-        with io.StringIO(config_str) as config_fp:
-            config.readfp(config_fp)
-        return config
-
-    def test_seeded_defaults(self):
-        config = nmconfig.NodeManagerConfig()
-        sec_names = set(config.sections())
-        self.assertIn('Arvados', sec_names)
-        self.assertIn('Daemon', sec_names)
-        self.assertFalse(any(name.startswith('Size ') for name in sec_names))
-
-    def test_list_sizes(self):
-        config = self.load_config()
-        sizes = config.node_sizes()
-        self.assertEqual(2, len(sizes))
-        size, kwargs = sizes[0]
-        self.assertEqual('Small', size.name)
-        self.assertEqual(1, kwargs['cores'])
-        self.assertEqual(0.8, kwargs['price'])
-        # preemptible is False by default
-        self.assertEqual(False, kwargs['preemptible'])
-        # instance_type == arvados node size id by default
-        self.assertEqual(kwargs['id'], kwargs['instance_type'])
-        # Now retrieve the preemptible version
-        size, kwargs = sizes[1]
-        self.assertEqual('Small', size.name)
-        self.assertEqual('1.preemptible', kwargs['id'])
-        self.assertEqual(1, kwargs['cores'])
-        self.assertEqual(0.8, kwargs['price'])
-        self.assertEqual(True, kwargs['preemptible'])
-        self.assertEqual('1', kwargs['instance_type'])
-
-
-    def test_default_node_mem_scaling(self):
-        config = self.load_config()
-        self.assertEqual(0.95, config.getfloat('Daemon', 'node_mem_scaling'))
-
-    def test_shutdown_windows(self):
-        config = self.load_config()
-        self.assertEqual([52, 6, 2], config.shutdown_windows())
-
-    def test_log_levels(self):
-        config = self.load_config()
-        self.assertEqual({'level': logging.DEBUG,
-                          'testlogger': logging.INFO},
-                         config.log_levels())
-
-    def check_dispatch_classes(self, config, module):
-        setup, shutdown, update, monitor = config.dispatch_classes()
-        self.assertIs(setup, module.ComputeNodeSetupActor)
-        self.assertIs(shutdown, module.ComputeNodeShutdownActor)
-        self.assertIs(update, module.ComputeNodeUpdateActor)
-        self.assertIs(monitor, module.ComputeNodeMonitorActor)
-
-    def test_default_dispatch(self):
-        config = self.load_config()
-        self.check_dispatch_classes(config, dispatch)
-
-    def test_custom_dispatch(self):
-        config = self.load_config(
-            config_str=self.TEST_CONFIG + "[Daemon]\ndispatcher=slurm\n")
-        self.check_dispatch_classes(config, slurm_dispatch)
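The removed test_config.py builds a NodeManagerConfig from an inline INI string via the Python 2 readfp() API. For reference, the equivalent standalone pattern with the modern configparser module (an assumption, not part of the deleted code) is:

# Loading INI text from a string; read_string() replaces the readfp()
# call used in the deleted test_config.py.
import configparser

TEST_CONFIG = """
[Cloud]
provider = dummy
shutdown_windows = 52, 6, 2
"""

config = configparser.ConfigParser()
config.read_string(TEST_CONFIG)
assert config.get('Cloud', 'provider') == 'dummy'
assert [int(w) for w in config.get('Cloud', 'shutdown_windows').split(',')] == [52, 6, 2]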
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
deleted file mode 100644 (file)
index 1b6e4ca..0000000
+++ /dev/null
@@ -1,858 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import time
-import unittest
-
-import mock
-import pykka
-
-import arvnodeman.daemon as nmdaemon
-import arvnodeman.status as status
-from arvnodeman.jobqueue import ServerCalculator
-from arvnodeman.computenode.dispatch import ComputeNodeMonitorActor
-from . import testutil
-from . import test_status
-from . import pykka_timeout
-import logging
-
-class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
-                                     unittest.TestCase):
-
-    def assertwait(self, f, timeout=pykka_timeout*2):
-        deadline = time.time() + timeout
-        while True:
-            try:
-                return f()
-            except AssertionError:
-                if time.time() > deadline:
-                    raise
-                pass
-            time.sleep(.1)
-            self.daemon.ping().get(self.TIMEOUT)
-
-    def busywait(self, f):
-        for n in xrange(200):
-            ok = f()
-            if ok:
-                return
-            time.sleep(.1)
-            self.daemon.ping().get(self.TIMEOUT)
-        self.assertTrue(ok) # always falsy, but not necessarily False
-
-    def mock_node_start(self, **kwargs):
-        # Make sure that every time the daemon starts a setup actor,
-        # it gets a new mock object back.
-        get_cloud_size = mock.MagicMock()
-        get_cloud_size.get.return_value = kwargs["cloud_size"]
-        mock_actor = mock.MagicMock()
-        mock_proxy = mock.NonCallableMock(name='setup_mock_proxy',
-                                          cloud_size=get_cloud_size,
-                                          actor_ref=mock_actor)
-        mock_actor.proxy.return_value = mock_proxy
-        mock_actor.tell_proxy.return_value = mock_proxy
-
-        self.last_setup = mock_proxy
-        return mock_actor
-
-    def mock_node_shutdown(self, **kwargs):
-        # Make sure that every time the daemon starts a shutdown actor,
-        # it gets a new mock object back.
-        get_cloud_node = mock.MagicMock()
-        if "node_monitor" in kwargs:
-            get_cloud_node.get.return_value = kwargs["node_monitor"].proxy().cloud_node.get()
-        mock_actor = mock.MagicMock()
-        mock_proxy = mock.NonCallableMock(name='shutdown_mock_proxy',
-                                          cloud_node=get_cloud_node,
-                                          actor_ref=mock_actor)
-
-        mock_actor.proxy.return_value = mock_proxy
-        self.last_shutdown = mock_proxy
-
-        return mock_actor
-
-    def make_daemon(self, cloud_nodes=[], arvados_nodes=[], want_sizes=[],
-                    avail_sizes=None,
-                    min_nodes=0, max_nodes=8,
-                    shutdown_windows=[54, 5, 1],
-                    max_total_price=None):
-        for name in ['cloud_nodes', 'arvados_nodes', 'server_wishlist']:
-            setattr(self, name + '_poller', mock.MagicMock(name=name + '_mock'))
-
-        if not avail_sizes:
-            if cloud_nodes or want_sizes:
-                avail_sizes=[(c.size, {"cores": int(c.id)}) for c in cloud_nodes] + [(s, {"cores": 1}) for s in want_sizes]
-            else:
-                avail_sizes=[(testutil.MockSize(1), {"cores": 1})]
-
-        self.arv_factory = mock.MagicMock(name='arvados_mock')
-        api_client = mock.MagicMock(name='api_client')
-        api_client.nodes().create().execute.side_effect = \
-            [testutil.arvados_node_mock(1),
-             testutil.arvados_node_mock(2)]
-        self.arv_factory.return_value = api_client
-
-        self.cloud_factory = mock.MagicMock(name='cloud_mock')
-        self.cloud_factory().node_start_time.return_value = time.time()
-        self.cloud_updates = mock.MagicMock(name='updates_mock')
-        self.timer = testutil.MockTimer(deliver_immediately=False)
-        self.cloud_factory().node_id.side_effect = lambda node: node.id
-        self.cloud_factory().broken.return_value = False
-
-        self.node_setup = mock.MagicMock(name='setup_mock')
-        self.node_setup.start.side_effect = self.mock_node_start
-        self.node_setup.reset_mock()
-
-        self.node_shutdown = mock.MagicMock(name='shutdown_mock')
-        self.node_shutdown.start.side_effect = self.mock_node_shutdown
-
-        self.daemon = nmdaemon.NodeManagerDaemonActor.start(
-            self.server_wishlist_poller, self.arvados_nodes_poller,
-            self.cloud_nodes_poller, self.cloud_updates, self.timer,
-            self.arv_factory, self.cloud_factory,
-            shutdown_windows, ServerCalculator(avail_sizes),
-            min_nodes, max_nodes, 600, 1800, 3600,
-            self.node_setup, self.node_shutdown,
-            max_total_price=max_total_price).proxy()
-        if arvados_nodes is not None:
-            self.daemon.update_arvados_nodes(arvados_nodes).get(self.TIMEOUT)
-        if cloud_nodes is not None:
-            self.daemon.update_cloud_nodes(cloud_nodes).get(self.TIMEOUT)
-        if want_sizes is not None:
-            self.daemon.update_server_wishlist(want_sizes).get(self.TIMEOUT)
-
-    def monitor_list(self):
-        return [c.actor.actor_ref for c in self.daemon.cloud_nodes.get(self.TIMEOUT).nodes.values() if c.actor]
-
-    def monitored_arvados_nodes(self, include_unpaired=True):
-        pairings = []
-        for future in [actor.proxy().arvados_node
-                       for actor in self.monitor_list()]:
-            try:
-                g = future.get(self.TIMEOUT)
-                if g or include_unpaired:
-                    pairings.append(g)
-            except pykka.ActorDeadError:
-                pass
-        return pairings
-
-    def alive_monitor_count(self):
-        return len(self.monitored_arvados_nodes())
-
-    def paired_monitor_count(self):
-        return len(self.monitored_arvados_nodes(False))
-
-    def assertShutdownCancellable(self, expected=True):
-        self.assertTrue(self.node_shutdown.start.called)
-        self.assertIs(expected,
-                      self.node_shutdown.start.call_args[1]['cancellable'],
-                      "ComputeNodeShutdownActor incorrectly cancellable")
-
-    def test_easy_node_creation(self):
-        size = testutil.MockSize(1)
-        self.make_daemon(want_sizes=[size])
-        self.busywait(lambda: self.node_setup.start.called)
-        self.assertIn('node_quota', status.tracker._latest)
-
-    def check_monitors_arvados_nodes(self, *arv_nodes):
-        self.assertwait(lambda: self.assertItemsEqual(arv_nodes, self.monitored_arvados_nodes()))
-
-    def test_node_pairing(self):
-        cloud_node = testutil.cloud_node_mock(1)
-        arv_node = testutil.arvados_node_mock(1)
-        self.make_daemon([cloud_node], [arv_node])
-        self.check_monitors_arvados_nodes(arv_node)
-
-    def test_node_pairing_after_arvados_update(self):
-        cloud_node = testutil.cloud_node_mock(2)
-        self.make_daemon([cloud_node],
-                         [testutil.arvados_node_mock(1, ip_address=None)])
-        arv_node = testutil.arvados_node_mock(2)
-        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
-        self.check_monitors_arvados_nodes(arv_node)
-
-    def test_arvados_node_un_and_re_paired(self):
-        # We need to create the Arvados node mock after spinning up the daemon
-        # to make sure it's new enough to pair with the cloud node.
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(3)],
-                         arvados_nodes=None)
-        arv_node = testutil.arvados_node_mock(3)
-        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
-        self.check_monitors_arvados_nodes(arv_node)
-        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
-        self.busywait(lambda: 0 == self.alive_monitor_count())
-        self.daemon.update_cloud_nodes([testutil.cloud_node_mock(3)])
-        self.check_monitors_arvados_nodes(arv_node)
-
-    def test_old_arvados_node_not_double_assigned(self):
-        arv_node = testutil.arvados_node_mock(3, age=9000)
-        size = testutil.MockSize(3)
-        self.make_daemon(arvados_nodes=[arv_node],
-                         avail_sizes=[(size, {"cores":1})])
-        self.daemon.update_server_wishlist([size]).get(self.TIMEOUT)
-        self.daemon.update_server_wishlist([size, size]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        used_nodes = [call[1].get('arvados_node')
-                      for call in self.node_setup.start.call_args_list]
-        self.assertEqual(2, len(used_nodes))
-        self.assertIn(arv_node, used_nodes)
-        self.assertIn(None, used_nodes)
-
-    def test_node_count_satisfied(self):
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1)],
-                         want_sizes=[testutil.MockSize(1)])
-        self.busywait(lambda: not self.node_setup.start.called)
-
-    def test_select_stale_node_records_with_slot_numbers_first(self):
-        """
-        Stale node records with slot_number assigned can exist when
-        clean_arvados_node() isn't executed after a node shutdown, for
-        various reasons.
-        NodeManagerDaemonActor should use these stale node records first, so
-        that they don't accumulate unused, reducing the slots available.
-        """
-        size = testutil.MockSize(1)
-        a_long_time_ago = '1970-01-01T01:02:03.04050607Z'
-        arvados_nodes = []
-        for n in range(9):
-            # Add several stale node records without slot_number assigned
-            arvados_nodes.append(
-                testutil.arvados_node_mock(
-                    n+1,
-                    slot_number=None,
-                    modified_at=a_long_time_ago))
-        # Add one record with slot_number assigned; it should be the
-        # first one selected
-        arv_node = testutil.arvados_node_mock(
-            123,
-            modified_at=a_long_time_ago)
-        arvados_nodes.append(arv_node)
-        cloud_node = testutil.cloud_node_mock(125, size=size)
-        self.make_daemon(cloud_nodes=[cloud_node],
-                         arvados_nodes=arvados_nodes)
-        arvados_nodes_tracker = self.daemon.arvados_nodes.get()
-        # Here, find_stale_node() should return the node record with
-        # the slot_number assigned.
-        self.assertEqual(arv_node,
-                         arvados_nodes_tracker.find_stale_node(3601))
-
-    def test_dont_count_missing_as_busy(self):
-        size = testutil.MockSize(1)
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1, size=size),
-                                      testutil.cloud_node_mock(2, size=size)],
-                         arvados_nodes=[testutil.arvados_node_mock(1),
-                                        testutil.arvados_node_mock(
-                                            2,
-                                            last_ping_at='1970-01-01T01:02:03.04050607Z')],
-                         want_sizes=[size, size])
-        self.busywait(lambda: 2 == self.alive_monitor_count())
-        self.busywait(lambda: self.node_setup.start.called)
-
-    def test_missing_counts_towards_max(self):
-        size = testutil.MockSize(1)
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1, size=size),
-                                      testutil.cloud_node_mock(2, size=size)],
-                         arvados_nodes=[testutil.arvados_node_mock(1),
-                                        testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
-                         want_sizes=[size, size],
-                         max_nodes=2)
-        self.busywait(lambda: not self.node_setup.start.called)
-
-    def test_excess_counts_missing(self):
-        size = testutil.MockSize(1)
-        cloud_nodes = [testutil.cloud_node_mock(1, size=size), testutil.cloud_node_mock(2, size=size)]
-        self.make_daemon(cloud_nodes=cloud_nodes,
-                         arvados_nodes=[testutil.arvados_node_mock(1),
-                                        testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
-                         want_sizes=[size])
-        self.assertwait(lambda: self.assertEqual(2, self.paired_monitor_count()))
-        for mon_ref in self.monitor_list():
-            self.daemon.node_can_shutdown(mon_ref.proxy()).get(self.TIMEOUT)
-        self.assertEqual(1, self.node_shutdown.start.call_count)
-
-    def test_missing_shutdown_not_excess(self):
-        size = testutil.MockSize(1)
-        cloud_nodes = [testutil.cloud_node_mock(1, size=size), testutil.cloud_node_mock(2, size=size)]
-        self.make_daemon(cloud_nodes=cloud_nodes,
-                         arvados_nodes=[testutil.arvados_node_mock(1),
-                                        testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
-                         want_sizes=[size])
-        self.assertwait(lambda: self.assertEqual(2, self.paired_monitor_count()))
-        get_cloud_node = mock.MagicMock(name="get_cloud_node")
-        get_cloud_node.get.return_value = cloud_nodes[1]
-        mock_node_monitor = mock.MagicMock()
-        mock_node_monitor.proxy.return_value = mock.NonCallableMock(cloud_node=get_cloud_node)
-        mock_shutdown = self.node_shutdown.start(node_monitor=mock_node_monitor)
-
-        self.daemon.cloud_nodes.get()[cloud_nodes[1].id].shutdown_actor = mock_shutdown.proxy()
-
-        self.assertwait(lambda: self.assertEqual(2, self.alive_monitor_count()))
-        for mon_ref in self.monitor_list():
-            self.daemon.node_can_shutdown(mon_ref.proxy()).get(self.TIMEOUT)
-        self.busywait(lambda: 1 == self.node_shutdown.start.call_count)
-
-    def test_booting_nodes_counted(self):
-        cloud_node = testutil.cloud_node_mock(1)
-        arv_node = testutil.arvados_node_mock(1)
-        server_wishlist = [testutil.MockSize(1)] * 2
-        self.make_daemon([cloud_node], [arv_node], server_wishlist)
-        self.daemon.max_nodes.get(self.TIMEOUT)
-        self.assertTrue(self.node_setup.start.called)
-        self.daemon.update_server_wishlist(server_wishlist).get(self.TIMEOUT)
-        self.busywait(lambda: 1 == self.node_setup.start.call_count)
-
-    def test_boot_new_node_when_all_nodes_busy(self):
-        size = testutil.MockSize(2)
-        arv_node = testutil.arvados_node_mock(2, job_uuid=True)
-        self.make_daemon([testutil.cloud_node_mock(2, size=size)], [arv_node],
-                         [size], avail_sizes=[(size, {"cores":1})])
-        self.assertwait(lambda: self.assertEqual(1, self.paired_monitor_count()))
-        self.assertwait(lambda: self.assertEqual(1, self.node_setup.start.called))
-
-    def test_boot_new_node_below_min_nodes(self):
-        min_size = testutil.MockSize(1)
-        wish_size = testutil.MockSize(3)
-        avail_sizes = [(min_size, {"cores": 1}),
-                       (wish_size, {"cores": 3})]
-        self.make_daemon([], [], None, avail_sizes=avail_sizes, min_nodes=2)
-        self.daemon.update_server_wishlist([wish_size]).get(self.TIMEOUT)
-        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
-        self.daemon.update_server_wishlist([wish_size]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertEqual([wish_size, min_size],
-                         [call[1].get('cloud_size')
-                          for call in self.node_setup.start.call_args_list])
-
-    def test_no_new_node_when_ge_min_nodes_busy(self):
-        size = testutil.MockSize(2)
-        cloud_nodes = [testutil.cloud_node_mock(n, size=size) for n in range(1, 4)]
-        arv_nodes = [testutil.arvados_node_mock(n, job_uuid=True)
-                     for n in range(1, 4)]
-        self.make_daemon(cloud_nodes, arv_nodes, [], min_nodes=2)
-        self.stop_proxy(self.daemon)
-        self.assertEqual(0, self.node_setup.start.call_count)
-
-    def test_no_new_node_when_max_nodes_busy(self):
-        size = testutil.MockSize(3)
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(3)],
-                         arvados_nodes=[testutil.arvados_node_mock(3, job_uuid=True)],
-                         want_sizes=[size],
-                         max_nodes=1)
-        self.stop_proxy(self.daemon)
-        self.assertFalse(self.node_setup.start.called)
-
-    def start_node_boot(self, cloud_node=None, arv_node=None, id_num=1):
-        if cloud_node is None:
-            cloud_node = testutil.cloud_node_mock(id_num)
-        id_num = int(cloud_node.id)
-        if arv_node is None:
-            arv_node = testutil.arvados_node_mock(id_num)
-        self.make_daemon(want_sizes=[testutil.MockSize(id_num)],
-                         avail_sizes=[(testutil.MockSize(id_num), {"cores":1})])
-        self.daemon.max_nodes.get(self.TIMEOUT)
-        self.assertEqual(1, self.node_setup.start.call_count)
-        self.last_setup.cloud_node.get.return_value = cloud_node
-        self.last_setup.arvados_node.get.return_value = arv_node
-        return self.last_setup
-
-    def test_new_node_when_booted_node_not_usable(self):
-        cloud_node = testutil.cloud_node_mock(4)
-        arv_node = testutil.arvados_node_mock(4, crunch_worker_state='down')
-        setup = self.start_node_boot(cloud_node, arv_node)
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.daemon.update_arvados_nodes([arv_node])
-        self.daemon.update_cloud_nodes([cloud_node])
-        self.monitor_list()[0].proxy().cloud_node_start_time = time.time()-1801
-        self.daemon.update_server_wishlist(
-            [testutil.MockSize(4)]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertEqual(2, self.node_setup.start.call_count)
-
-    def test_no_duplication_when_booting_node_listed_fast(self):
-        # Test that we don't start two ComputeNodeMonitorActors when
-        # we learn about a booting node through a listing before we
-        # get the "node up" message from CloudNodeSetupActor.
-        cloud_node = testutil.cloud_node_mock(1)
-        setup = self.start_node_boot(cloud_node)
-        self.daemon.update_cloud_nodes([cloud_node])
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-
-    def test_no_duplication_when_booted_node_listed(self):
-        cloud_node = testutil.cloud_node_mock(2)
-        setup = self.start_node_boot(cloud_node, id_num=2)
-        self.daemon.node_setup_finished(setup)
-        self.daemon.update_cloud_nodes([cloud_node]).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-
-    def test_node_counted_after_boot_with_slow_listing(self):
-        # Test that, after we boot a compute node, we assume it exists
-        # even if it doesn't appear in the listing (e.g., because of delays
-        # propagating tags).
-        setup = self.start_node_boot()
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-
-    def test_booted_unlisted_node_counted(self):
-        setup = self.start_node_boot(id_num=1)
-        self.daemon.node_setup_finished(setup)
-        self.daemon.update_server_wishlist(
-            [testutil.MockSize(1)]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertEqual(1, self.node_setup.start.call_count)
-
-    def test_booted_node_can_shutdown(self):
-        setup = self.start_node_boot()
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.update_server_wishlist([])
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertTrue(self.node_shutdown.start.called,
-                        "daemon did not shut down booted node on offer")
-
-        with test_status.TestServer() as srv:
-            self.assertEqual(0, srv.get_status().get('nodes_unpaired', None))
-            self.assertEqual(1, srv.get_status().get('nodes_shutdown', None))
-            self.assertEqual(0, srv.get_status().get('nodes_wish', None))
-
-    def test_booted_node_lifecycle(self):
-        cloud_node = testutil.cloud_node_mock(6)
-        setup = self.start_node_boot(cloud_node, id_num=6)
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.update_server_wishlist([])
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.assertShutdownCancellable(True)
-        shutdown = self.node_shutdown.start().proxy()
-        shutdown.cloud_node.get.return_value = cloud_node
-        self.daemon.node_finished_shutdown(shutdown).get(self.TIMEOUT)
-        self.daemon.update_cloud_nodes([])
-        self.assertTrue(shutdown.stop.called,
-                        "shutdown actor not stopped after finishing")
-        self.assertTrue(monitor.actor_ref.actor_stopped.wait(self.TIMEOUT),
-                        "monitor for booted node not stopped after shutdown")
-        self.daemon.update_server_wishlist(
-            [testutil.MockSize(2)]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertTrue(self.node_setup.start.called,
-                        "second node not started after booted node stopped")
-
-    def test_node_disappearing_during_shutdown(self):
-        cloud_node = testutil.cloud_node_mock(6)
-        setup = self.start_node_boot(cloud_node, id_num=6)
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.update_server_wishlist([])
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.assertShutdownCancellable(True)
-        shutdown = self.node_shutdown.start().proxy()
-        shutdown.cloud_node.get.return_value = cloud_node
-        # Simulate a successful but slow node destroy call: the cloud node
-        # list gets updated before the ShutdownActor finishes.
-        record = self.daemon.cloud_nodes.get().nodes.values()[0]
-        self.assertTrue(record.shutdown_actor is not None)
-        self.daemon.cloud_nodes.get().nodes.clear()
-        self.daemon.node_finished_shutdown(shutdown).get(self.TIMEOUT)
-        self.assertTrue(
-            record.shutdown_actor is not None,
-            "test was ineffective -- failed to simulate the race condition")
-
-    def test_booted_node_shut_down_when_never_listed(self):
-        setup = self.start_node_boot()
-        self.cloud_factory().node_start_time.return_value = time.time() - 3601
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.assertFalse(self.node_shutdown.start.called)
-        now = time.time()
-        self.monitor_list()[0].tell_proxy().consider_shutdown()
-        self.busywait(lambda: self.node_shutdown.start.called)
-        self.assertShutdownCancellable(False)
-
-    def test_booted_node_shut_down_when_never_paired(self):
-        cloud_node = testutil.cloud_node_mock(2)
-        setup = self.start_node_boot(cloud_node)
-        self.cloud_factory().node_start_time.return_value = time.time() - 3601
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.daemon.update_cloud_nodes([cloud_node])
-        self.monitor_list()[0].tell_proxy().consider_shutdown()
-        self.busywait(lambda: self.node_shutdown.start.called)
-        self.assertShutdownCancellable(False)
-
-    def test_booted_node_shut_down_when_never_working(self):
-        cloud_node = testutil.cloud_node_mock(4)
-        arv_node = testutil.arvados_node_mock(4, crunch_worker_state='down')
-        setup = self.start_node_boot(cloud_node, arv_node)
-        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.monitor_list()[0].proxy().cloud_node_start_time = time.time()-3601
-        self.daemon.update_cloud_nodes([cloud_node])
-        self.busywait(lambda: self.node_shutdown.start.called)
-        self.assertShutdownCancellable(False)
-
-    def test_node_that_pairs_not_considered_failed_boot(self):
-        cloud_node = testutil.cloud_node_mock(3)
-        arv_node = testutil.arvados_node_mock(3)
-        setup = self.start_node_boot(cloud_node, arv_node)
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.daemon.update_cloud_nodes([cloud_node])
-        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
-        self.timer.deliver()
-        self.stop_proxy(self.daemon)
-        self.assertFalse(self.node_shutdown.start.called)
-
-    def test_node_that_pairs_busy_not_considered_failed_boot(self):
-        cloud_node = testutil.cloud_node_mock(5)
-        arv_node = testutil.arvados_node_mock(5, job_uuid=True)
-        setup = self.start_node_boot(cloud_node, arv_node)
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.daemon.update_cloud_nodes([cloud_node])
-        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
-        self.timer.deliver()
-        self.stop_proxy(self.daemon)
-        self.assertFalse(self.node_shutdown.start.called)
-
-    def test_booting_nodes_shut_down(self):
-        self.make_daemon(want_sizes=[testutil.MockSize(1)])
-        self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
-        self.busywait(lambda: self.last_setup.stop_if_no_cloud_node.called)
-
-    def test_all_booting_nodes_tried_to_shut_down(self):
-        size = testutil.MockSize(2)
-        self.make_daemon(want_sizes=[size], avail_sizes=[(size, {"cores":1})])
-        self.daemon.max_nodes.get(self.TIMEOUT)
-        setup1 = self.last_setup
-        setup1.stop_if_no_cloud_node().get.return_value = False
-        setup1.stop_if_no_cloud_node.reset_mock()
-        self.daemon.update_server_wishlist([size, size]).get(self.TIMEOUT)
-        self.daemon.max_nodes.get(self.TIMEOUT)
-        self.assertIsNot(setup1, self.last_setup)
-        self.last_setup.stop_if_no_cloud_node().get.return_value = True
-        self.last_setup.stop_if_no_cloud_node.reset_mock()
-        self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
-        self.daemon.max_nodes.get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertEqual(1, self.last_setup.stop_if_no_cloud_node.call_count)
-        self.assertTrue(setup1.stop_if_no_cloud_node.called)
-
-    def test_shutdown_declined_at_wishlist_capacity(self):
-        cloud_node = testutil.cloud_node_mock(1)
-        arv_node = testutil.arvados_node_mock(1)
-        size = testutil.MockSize(1)
-        self.make_daemon(cloud_nodes=[cloud_node], arvados_nodes=[arv_node], want_sizes=[size])
-        self.assertwait(lambda: self.assertEqual(1, self.paired_monitor_count()))
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertFalse(self.node_shutdown.start.called)
-
-    def test_shutdown_declined_below_min_nodes(self):
-        cloud_node = testutil.cloud_node_mock(1)
-        arv_node = testutil.arvados_node_mock(1)
-        self.make_daemon(cloud_nodes=[cloud_node], arvados_nodes=[arv_node], min_nodes=1)
-        self.assertwait(lambda: self.assertEqual(1, self.paired_monitor_count()))
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertFalse(self.node_shutdown.start.called)
-
-    def test_shutdown_accepted_below_capacity(self):
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock()])
-        self.busywait(lambda: 1 == self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.busywait(lambda: self.node_shutdown.start.called)
-
-    def test_shutdown_declined_when_idle_and_job_queued(self):
-        size = testutil.MockSize(1)
-        cloud_nodes = [testutil.cloud_node_mock(n, size=size) for n in [3, 4]]
-        arv_nodes = [testutil.arvados_node_mock(3, job_uuid=True),
-                     testutil.arvados_node_mock(4, job_uuid=None)]
-        self.make_daemon(cloud_nodes, arv_nodes, [size])
-        self.assertwait(lambda: self.assertEqual(2, self.paired_monitor_count()))
-        for mon_ref in self.monitor_list():
-            monitor = mon_ref.proxy()
-            if monitor.cloud_node.get(self.TIMEOUT) is cloud_nodes[-1]:
-                break
-        else:
-            self.fail("monitor for idle node not found")
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertFalse(self.node_shutdown.start.called)
-
-    def test_node_shutdown_after_cancelled_shutdown(self):
-        cloud_node = testutil.cloud_node_mock(5)
-        self.make_daemon([cloud_node], [testutil.arvados_node_mock(5)])
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.last_shutdown.success.get.return_value = False
-        self.daemon.node_finished_shutdown(self.last_shutdown).get(self.TIMEOUT)
-        self.assertwait(lambda: self.assertEqual(1, self.paired_monitor_count()))
-
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.last_shutdown.success.get.return_value = True
-        self.last_shutdown.stop.side_effect = lambda: monitor.stop()
-        self.daemon.node_finished_shutdown(self.last_shutdown).get(self.TIMEOUT)
-        self.assertwait(lambda: self.assertEqual(0, self.paired_monitor_count()))
-
-    def test_nodes_shutting_down_replaced_below_max_nodes(self):
-        size = testutil.MockSize(6)
-        cloud_node = testutil.cloud_node_mock(6, size=size)
-        self.make_daemon([cloud_node], [testutil.arvados_node_mock(6, crunch_worker_state='down')],
-                         avail_sizes=[(size, {"cores":1})])
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.assertTrue(self.node_shutdown.start.called)
-        getmock = mock.MagicMock()
-        getmock.get.return_value = False
-        self.last_shutdown.cancel_shutdown.return_value = getmock
-        self.daemon.update_server_wishlist(
-            [testutil.MockSize(6)]).get(self.TIMEOUT)
-        self.busywait(lambda: self.node_setup.start.called)
-
-    def test_nodes_shutting_down_cancelled(self):
-        size = testutil.MockSize(6)
-        cloud_node = testutil.cloud_node_mock(6, size=size)
-        self.make_daemon([cloud_node], [testutil.arvados_node_mock(6, crunch_worker_state='down')],
-                         avail_sizes=[(size, {"cores":1})])
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.assertTrue(self.node_shutdown.start.called)
-        self.daemon.update_server_wishlist(
-            [testutil.MockSize(6)]).get(self.TIMEOUT)
-        self.busywait(lambda: self.last_shutdown.cancel_shutdown.called)
-
-    def test_nodes_shutting_down_not_replaced_at_max_nodes(self):
-        cloud_node = testutil.cloud_node_mock(7)
-        self.make_daemon([cloud_node], [testutil.arvados_node_mock(7)],
-                         max_nodes=1)
-        self.assertwait(lambda: self.assertEqual(1, self.paired_monitor_count()))
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.assertTrue(self.node_shutdown.start.called)
-        self.daemon.update_server_wishlist(
-            [testutil.MockSize(7)]).get(self.TIMEOUT)
-        self.busywait(lambda: not self.node_setup.start.called)
-
-    def test_nodes_shutting_down_count_against_excess(self):
-        size = testutil.MockSize(8)
-        cloud_nodes = [testutil.cloud_node_mock(n, size=size) for n in [8, 9]]
-        arv_nodes = [testutil.arvados_node_mock(n, size=size) for n in [8, 9]]
-        self.make_daemon(cloud_nodes, arv_nodes, [size],
-                         avail_sizes=[(size, {"cores":1})])
-        self.assertwait(lambda: self.assertEqual(2, self.paired_monitor_count()))
-        for mon_ref in self.monitor_list():
-            self.daemon.node_can_shutdown(mon_ref.proxy()).get(self.TIMEOUT)
-        self.assertEqual(1, self.node_shutdown.start.call_count)
-
-    def test_clean_shutdown_waits_for_node_setup_finish(self):
-        new_node = self.start_node_boot()
-        new_node.stop_if_no_cloud_node().get.return_value = False
-        new_node.stop_if_no_cloud_node.reset_mock()
-        self.daemon.shutdown().get(self.TIMEOUT)
-        self.assertTrue(new_node.stop_if_no_cloud_node.called)
-        self.daemon.node_setup_finished(new_node).get(self.TIMEOUT)
-        self.assertTrue(new_node.stop.called)
-        self.timer.deliver()
-        self.assertTrue(
-            self.daemon.actor_ref.actor_stopped.wait(self.TIMEOUT))
-
-    def test_wishlist_ignored_after_shutdown(self):
-        new_node = self.start_node_boot()
-        new_node.stop_if_no_cloud_node().get.return_value = False
-        new_node.stop_if_no_cloud_node.reset_mock()
-        self.daemon.shutdown().get(self.TIMEOUT)
-        size = testutil.MockSize(2)
-        self.daemon.update_server_wishlist([size] * 2).get(self.TIMEOUT)
-        self.timer.deliver()
-        self.busywait(lambda: 1 == self.node_setup.start.call_count)
-
-    def test_shutdown_actor_stopped_when_cloud_node_delisted(self):
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock()])
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
-        self.busywait(lambda: 1 == self.last_shutdown.stop.call_count)
-
-    def test_idle_node_disappearing_clears_status_idle_time_counter(self):
-        size = testutil.MockSize(1)
-        status.tracker._idle_nodes = {}
-        cloud_nodes = [testutil.cloud_node_mock(1, size=size)]
-        arv_nodes = [testutil.arvados_node_mock(1, job_uuid=None)]
-        self.make_daemon(cloud_nodes, arv_nodes, [size])
-        self.assertwait(lambda: self.assertEqual(1, self.paired_monitor_count()))
-        for mon_ref in self.monitor_list():
-            monitor = mon_ref.proxy()
-            if monitor.cloud_node.get(self.TIMEOUT) is cloud_nodes[-1]:
-                break
-        else:
-            self.fail("monitor for idle node not found")
-        self.assertEqual(1, status.tracker.get('nodes_idle'))
-        hostname = monitor.arvados_node.get()['hostname']
-        self.assertIn(hostname, status.tracker._idle_nodes)
-        # Simulate the node disappearing from the cloud node list
-        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
-        self.busywait(lambda: 0 == self.alive_monitor_count())
-        self.assertNotIn(hostname, status.tracker._idle_nodes)
-
-    def test_shutdown_actor_cleanup_copes_with_dead_actors(self):
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock()])
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        # We're mainly testing that update_cloud_nodes catches and handles
-        # the ActorDeadError.
-        self.last_shutdown.stop.side_effect = pykka.ActorDeadError
-        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
-        self.busywait(lambda: 1 == self.last_shutdown.stop.call_count)
-
-    def test_node_create_two_sizes(self):
-        small = testutil.MockSize(1)
-        big = testutil.MockSize(2)
-        avail_sizes = [(testutil.MockSize(1), {"cores":1}),
-                        (testutil.MockSize(2), {"cores":2})]
-        self.make_daemon(want_sizes=[small, small, small, big],
-                         avail_sizes=avail_sizes, max_nodes=4)
-
-        # the daemon runs in another thread, so we need to wait and see
-        # if it does all the work we're expecting it to do before stopping it.
-        self.busywait(lambda: self.node_setup.start.call_count == 4)
-        booting = self.daemon.booting.get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        sizecounts = {a[0].id: 0 for a in avail_sizes}
-        for b in booting.itervalues():
-            sizecounts[b.cloud_size.get().id] += 1
-        logging.info(sizecounts)
-        self.assertEqual(3, sizecounts[small.id])
-        self.assertEqual(1, sizecounts[big.id])
-
-    def test_node_max_nodes_two_sizes(self):
-        small = testutil.MockSize(1)
-        big = testutil.MockSize(2)
-        avail_sizes = [(testutil.MockSize(1), {"cores":1}),
-                        (testutil.MockSize(2), {"cores":2})]
-        self.make_daemon(want_sizes=[small, small, big, small],
-                         avail_sizes=avail_sizes, max_nodes=3)
-
-        # the daemon runs in another thread, so we need to wait and see
-        # if it does all the work we're expecting it to do before stopping it.
-        self.busywait(lambda: self.node_setup.start.call_count == 3)
-        booting = self.daemon.booting.get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        sizecounts = {a[0].id: 0 for a in avail_sizes}
-        for b in booting.itervalues():
-            sizecounts[b.cloud_size.get().id] += 1
-        self.assertEqual(2, sizecounts[small.id])
-        self.assertEqual(1, sizecounts[big.id])
-
-    def test_wishlist_ordering(self):
-        # Check that big nodes aren't prioritized: since #12199, containers
-        # are scheduled on specific node sizes.
-        small = testutil.MockSize(1)
-        big = testutil.MockSize(2)
-        avail_sizes = [(testutil.MockSize(1), {"cores":1}),
-                        (testutil.MockSize(2), {"cores":2})]
-        self.make_daemon(want_sizes=[small, small, small, big],
-                         avail_sizes=avail_sizes, max_nodes=3)
-
-        # the daemon runs in another thread, so we need to wait and see
-        # if it does all the work we're expecting it to do before stopping it.
-        self.busywait(lambda: self.node_setup.start.call_count == 3)
-        booting = self.daemon.booting.get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        sizecounts = {a[0].id: 0 for a in avail_sizes}
-        for b in booting.itervalues():
-            sizecounts[b.cloud_size.get().id] += 1
-        self.assertEqual(3, sizecounts[small.id])
-        self.assertEqual(0, sizecounts[big.id])
-
-    def test_wishlist_reconfigure(self):
-        small = testutil.MockSize(1)
-        big = testutil.MockSize(2)
-        avail_sizes = [(small, {"cores":1}), (big, {"cores":2})]
-
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1, small),
-                                      testutil.cloud_node_mock(2, small),
-                                      testutil.cloud_node_mock(3, big)],
-                         arvados_nodes=[testutil.arvados_node_mock(1),
-                                        testutil.arvados_node_mock(2),
-                                        testutil.arvados_node_mock(3)],
-                         want_sizes=[small, small, big],
-                         avail_sizes=avail_sizes)
-        self.assertwait(lambda: self.assertEqual(3, self.paired_monitor_count()))
-        self.daemon.update_server_wishlist([small, big, big]).get(self.TIMEOUT)
-
-        self.assertEqual(0, self.node_shutdown.start.call_count)
-
-        for c in self.daemon.cloud_nodes.get().nodes.itervalues():
-            self.daemon.node_can_shutdown(c.actor)
-
-        booting = self.daemon.booting.get()
-        cloud_nodes = self.daemon.cloud_nodes.get()
-
-        self.busywait(lambda: 1 == self.node_setup.start.call_count)
-        self.busywait(lambda: 1 == self.node_shutdown.start.call_count)
-
-        self.stop_proxy(self.daemon)
-
-        # booting a new big node
-        sizecounts = {a[0].id: 0 for a in avail_sizes}
-        for b in booting.itervalues():
-            sizecounts[b.cloud_size.get().id] += 1
-        self.assertEqual(0, sizecounts[small.id])
-        self.assertEqual(1, sizecounts[big.id])
-
-        # shutting down a small node
-        sizecounts = {a[0].id: 0 for a in avail_sizes}
-        for b in cloud_nodes.nodes.itervalues():
-            if b.shutdown_actor is not None:
-                sizecounts[b.cloud_node.size.id] += 1
-        self.assertEqual(1, sizecounts[small.id])
-        self.assertEqual(0, sizecounts[big.id])
-
-    def test_node_max_price(self):
-        small = testutil.MockSize(1)
-        big = testutil.MockSize(2)
-        avail_sizes = [(testutil.MockSize(1), {"cores":1, "price":1}),
-                        (testutil.MockSize(2), {"cores":2, "price":2})]
-        self.make_daemon(want_sizes=[small, small, small, big],
-                         avail_sizes=avail_sizes,
-                         max_nodes=4,
-                         max_total_price=4)
-        # the daemon runs in another thread, so we need to wait and see
-        # if it does all the work we're expecting it to do before stopping it.
-        self.busywait(lambda: self.node_setup.start.call_count == 3)
-        booting = self.daemon.booting.get()
-        self.stop_proxy(self.daemon)
-
-        sizecounts = {a[0].id: 0 for a in avail_sizes}
-        for b in booting.itervalues():
-            sizecounts[b.cloud_size.get().id] += 1
-        logging.info(sizecounts)
-
-        # Booting 3 small nodes and no big node would also partially satisfy
-        # the wishlist and stay under the price cap. However, the way
-        # update_server_wishlist() currently works effectively results in a
-        # round-robin creation of one node of each size in the wishlist, so
-        # test for that behavior instead.
-        self.assertEqual(2, sizecounts[small.id])
-        self.assertEqual(1, sizecounts[big.id])
diff --git a/services/nodemanager/tests/test_failure.py b/services/nodemanager/tests/test_failure.py
deleted file mode 100644 (file)
index 8bf3ea8..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import errno
-import logging
-import time
-import threading
-import unittest
-
-import mock
-import pykka
-
-from . import testutil
-
-import arvnodeman.baseactor
-import arvnodeman.status as status
-
-class BogusActor(arvnodeman.baseactor.BaseNodeManagerActor):
-    def __init__(self, e, killfunc=None):
-        super(BogusActor, self).__init__(killfunc=killfunc)
-        self.exp = e
-
-    def doStuff(self):
-        raise self.exp
-
-    def ping(self):
-        # Called by WatchdogActorTest, this delay is longer than the test timeout
-        # of 1 second, which should cause the watchdog ping to fail.
-        time.sleep(2)
-        return True
-
-class ActorUnhandledExceptionTest(testutil.ActorTestMixin, unittest.TestCase):
-    def test_fatal_error(self):
-        for e in (MemoryError(), threading.ThreadError(), OSError(errno.ENOMEM, "")):
-            kill_mock = mock.Mock('os.kill')
-            bgact = BogusActor.start(e, killfunc=kill_mock)
-            act_thread = bgact.proxy().get_thread().get()
-            act = bgact.tell_proxy()
-            act.doStuff()
-            act.actor_ref.stop(block=True)
-            act_thread.join()
-            self.assertTrue(kill_mock.called)
-
-    def test_nonfatal_error(self):
-        status.tracker.update({'actor_exceptions': 0})
-        kill_mock = mock.Mock('os.kill')
-        bgact = BogusActor.start(OSError(errno.ENOENT, ""), killfunc=kill_mock)
-        act_thread = bgact.proxy().get_thread().get()
-        act = bgact.tell_proxy()
-        act.doStuff()
-        act.actor_ref.stop(block=True)
-        act_thread.join()
-        self.assertFalse(kill_mock.called)
-        self.assertEqual(1, status.tracker.get('actor_exceptions'))
-
-class WatchdogActorTest(testutil.ActorTestMixin, unittest.TestCase):
-
-    def test_time_timeout(self):
-        kill_mock = mock.Mock('os.kill')
-        act = BogusActor.start(OSError(errno.ENOENT, ""))
-        watch = arvnodeman.baseactor.WatchdogActor.start(1, act, killfunc=kill_mock)
-        time.sleep(1)
-        watch.stop(block=True)
-        act.stop(block=True)
-        self.assertTrue(kill_mock.called)
diff --git a/services/nodemanager/tests/test_jobqueue.py b/services/nodemanager/tests/test_jobqueue.py
deleted file mode 100644 (file)
index de83b68..0000000
+++ /dev/null
@@ -1,239 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import unittest
-import mock
-
-import arvnodeman.jobqueue as jobqueue
-from . import testutil
-
-class ServerCalculatorTestCase(unittest.TestCase):
-    def make_calculator(self, factors, **kwargs):
-        return jobqueue.ServerCalculator(
-            [(testutil.MockSize(n), {'cores': n}) for n in factors], **kwargs)
-
-    def calculate(self, servcalc, *constraints):
-        return servcalc.servers_for_queue(
-            [{'uuid': 'zzzzz-jjjjj-{:015x}'.format(index),
-              'runtime_constraints': cdict}
-             for index, cdict in enumerate(constraints)])
-
-    def test_empty_queue_needs_no_servers(self):
-        servcalc = self.make_calculator([1])
-        self.assertEqual(([], {}), servcalc.servers_for_queue([]))
-
-    def test_easy_server_count(self):
-        servcalc = self.make_calculator([1])
-        servlist, _ = self.calculate(servcalc, {'min_nodes': 3})
-        self.assertEqual(3, len(servlist))
-
-    def test_default_5pct_ram_value_decrease(self):
-        servcalc = self.make_calculator([1])
-        servlist, _ = self.calculate(servcalc, {'min_ram_mb_per_node': 128})
-        self.assertEqual(0, len(servlist))
-        servlist, _ = self.calculate(servcalc, {'min_ram_mb_per_node': 121})
-        self.assertEqual(1, len(servlist))
-
-    def test_custom_node_mem_scaling_factor(self):
-        # Simulate a custom 'node_mem_scaling' config parameter by passing
-        # the value to ServerCalculator
-        servcalc = self.make_calculator([1], node_mem_scaling=0.5)
-        servlist, _ = self.calculate(servcalc, {'min_ram_mb_per_node': 128})
-        self.assertEqual(0, len(servlist))
-        servlist, _ = self.calculate(servcalc, {'min_ram_mb_per_node': 64})
-        self.assertEqual(1, len(servlist))
-
-    def test_implicit_server_count(self):
-        servcalc = self.make_calculator([1])
-        servlist, _ = self.calculate(servcalc, {}, {'min_nodes': 3})
-        self.assertEqual(4, len(servlist))
-
-    def test_bad_min_nodes_override(self):
-        servcalc = self.make_calculator([1])
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_nodes': -2}, {'min_nodes': 'foo'})
-        self.assertEqual(2, len(servlist))
-
-    def test_ignore_and_return_unsatisfiable_jobs(self):
-        servcalc = self.make_calculator([1], max_nodes=9)
-        servlist, u_jobs = self.calculate(servcalc,
-                                          {'min_cores_per_node': 2},
-                                          {'min_ram_mb_per_node': 256},
-                                          {'min_nodes': 6},
-                                          {'min_nodes': 12},
-                                          {'min_scratch_mb_per_node': 300000})
-        self.assertEqual(6, len(servlist))
-        # Only unsatisfiable jobs are returned in u_jobs
-        self.assertIn('zzzzz-jjjjj-000000000000000', u_jobs.keys())
-        self.assertIn('zzzzz-jjjjj-000000000000001', u_jobs.keys())
-        self.assertNotIn('zzzzz-jjjjj-000000000000002', u_jobs.keys())
-        self.assertIn('zzzzz-jjjjj-000000000000003', u_jobs.keys())
-        self.assertIn('zzzzz-jjjjj-000000000000004', u_jobs.keys())
-
-    def test_ignore_too_expensive_jobs(self):
-        servcalc = self.make_calculator([1, 2], max_nodes=12, max_price=6)
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_cores_per_node': 1, 'min_nodes': 6})
-        self.assertEqual(6, len(servlist))
-
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_cores_per_node': 2, 'min_nodes': 6})
-        self.assertEqual(0, len(servlist))
-
-    def test_job_requesting_max_nodes_accepted(self):
-        servcalc = self.make_calculator([1], max_nodes=4)
-        servlist, _ = self.calculate(servcalc, {'min_nodes': 4})
-        self.assertEqual(4, len(servlist))
-
-    def test_cheapest_size(self):
-        servcalc = self.make_calculator([2, 4, 1, 3])
-        self.assertEqual(testutil.MockSize(1), servcalc.cheapest_size())
-
-    def test_next_biggest(self):
-        servcalc = self.make_calculator([1, 2, 4, 8])
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_cores_per_node': 3},
-                                     {'min_cores_per_node': 6})
-        self.assertEqual([servcalc.cloud_sizes[2].id,
-                          servcalc.cloud_sizes[3].id],
-                         [s.id for s in servlist])
-
-    def test_multiple_sizes(self):
-        servcalc = self.make_calculator([1, 2])
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_cores_per_node': 2},
-                                     {'min_cores_per_node': 1},
-                                     {'min_cores_per_node': 1})
-        self.assertEqual([servcalc.cloud_sizes[1].id,
-                          servcalc.cloud_sizes[0].id,
-                          servcalc.cloud_sizes[0].id],
-                         [s.id for s in servlist])
-
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_cores_per_node': 1},
-                                     {'min_cores_per_node': 2},
-                                     {'min_cores_per_node': 1})
-        self.assertEqual([servcalc.cloud_sizes[0].id,
-                          servcalc.cloud_sizes[1].id,
-                          servcalc.cloud_sizes[0].id],
-                         [s.id for s in servlist])
-
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_cores_per_node': 1},
-                                     {'min_cores_per_node': 1},
-                                     {'min_cores_per_node': 2})
-        self.assertEqual([servcalc.cloud_sizes[0].id,
-                          servcalc.cloud_sizes[0].id,
-                          servcalc.cloud_sizes[1].id],
-                         [s.id for s in servlist])
-
-
-
-class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
-                                   unittest.TestCase):
-    TEST_CLASS = jobqueue.JobQueueMonitorActor
-
-
-    class MockCalculator(object):
-        @staticmethod
-        def servers_for_queue(queue):
-            return ([testutil.MockSize(n) for n in queue], {})
-
-
-    class MockCalculatorUnsatisfiableJobs(object):
-        @staticmethod
-        def servers_for_queue(queue):
-            return ([], {k["uuid"]: "Unsatisfiable job mock" for k in queue})
-
-
-    def build_monitor(self, side_effect, *args, **kwargs):
-        super(JobQueueMonitorActorTestCase, self).build_monitor(*args, **kwargs)
-        self.client.jobs().queue().execute.side_effect = side_effect
-
-    @mock.patch("subprocess32.check_call")
-    @mock.patch("subprocess32.check_output")
-    def test_unsatisfiable_jobs(self, mock_squeue, mock_scancel):
-        job_uuid = 'zzzzz-8i9sb-zzzzzzzzzzzzzzz'
-        container_uuid = 'yyyyy-dz642-yyyyyyyyyyyyyyy'
-        mock_squeue.return_value = "1|1024|0|(Resources)|" + container_uuid + "||1234567890\n"
-
-        self.build_monitor([{'items': [{'uuid': job_uuid}]}],
-                           self.MockCalculatorUnsatisfiableJobs(), True, True)
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.monitor.ping().get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.client.jobs().cancel.assert_called_with(uuid=job_uuid)
-        mock_scancel.assert_called_with(['scancel', '--name='+container_uuid])
-
-    @mock.patch("subprocess32.check_output")
-    def test_subscribers_get_server_lists(self, mock_squeue):
-        mock_squeue.return_value = ""
-
-        self.build_monitor([{'items': [1, 2]}], self.MockCalculator(), True, True)
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with([testutil.MockSize(1),
-                                            testutil.MockSize(2)])
-
-    @mock.patch("subprocess32.check_output")
-    def test_squeue_server_list(self, mock_squeue):
-        mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)|1234567890
-2|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)|1234567890
-"""
-
-        super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator(
-            [(testutil.MockSize(n), {'cores': n, 'ram': n*1024, 'scratch': n}) for n in range(1, 3)]),
-                                                                True, True)
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with([testutil.MockSize(1),
-                                            testutil.MockSize(2)])
-
-    @mock.patch("subprocess32.check_output")
-    def test_squeue_server_list_suffix(self, mock_squeue):
-        mock_squeue.return_value = """1|1024M|0|(ReqNodeNotAvail, UnavailableNodes:compute123)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)|1234567890
-1|2G|0|(ReqNodeNotAvail)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)|1234567890
-"""
-
-        super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator(
-            [(testutil.MockSize(n), {'cores': n, 'ram': n*1024, 'scratch': n}) for n in range(1, 3)]),
-                                                                True, True)
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with([testutil.MockSize(1),
-                                            testutil.MockSize(2)])
-
-    @mock.patch("subprocess32.check_output")
-    def test_squeue_server_list_instancetype_constraint(self, mock_squeue):
-        mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|instancetype=z2.test|1234567890\n"""
-        super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator(
-            [(testutil.MockSize(n), {'cores': n, 'ram': n*1024, 'scratch': n}) for n in range(1, 3)]),
-                                                                True, True)
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with([testutil.MockSize(2)])
-
-    def test_coerce_to_mb(self):
-        self.assertEqual(1, jobqueue.JobQueueMonitorActor.coerce_to_mb("1"))
-        self.assertEqual(512, jobqueue.JobQueueMonitorActor.coerce_to_mb("512"))
-        self.assertEqual(512, jobqueue.JobQueueMonitorActor.coerce_to_mb("512M"))
-        self.assertEqual(1024, jobqueue.JobQueueMonitorActor.coerce_to_mb("1024M"))
-        self.assertEqual(1024, jobqueue.JobQueueMonitorActor.coerce_to_mb("1G"))
-        self.assertEqual(1536, jobqueue.JobQueueMonitorActor.coerce_to_mb("1.5G"))
-        self.assertEqual(2048, jobqueue.JobQueueMonitorActor.coerce_to_mb("2G"))
-        self.assertEqual(1025, jobqueue.JobQueueMonitorActor.coerce_to_mb("1025M"))
-        self.assertEqual(1048576, jobqueue.JobQueueMonitorActor.coerce_to_mb("1T"))
-        self.assertEqual(1572864, jobqueue.JobQueueMonitorActor.coerce_to_mb("1.5T"))
-        self.assertEqual(1073741824, jobqueue.JobQueueMonitorActor.coerce_to_mb("1P"))
-        self.assertEqual(1610612736, jobqueue.JobQueueMonitorActor.coerce_to_mb("1.5P"))
-        self.assertEqual(0, jobqueue.JobQueueMonitorActor.coerce_to_mb("0"))
-        self.assertEqual(0, jobqueue.JobQueueMonitorActor.coerce_to_mb("0M"))
-        self.assertEqual(0, jobqueue.JobQueueMonitorActor.coerce_to_mb("0G"))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/services/nodemanager/tests/test_nodelist.py b/services/nodemanager/tests/test_nodelist.py
deleted file mode 100644 (file)
index df31a12..0000000
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import unittest
-import mock
-
-import arvnodeman.nodelist as nodelist
-from libcloud.compute.base import NodeSize
-from . import testutil
-
-class ArvadosNodeListMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
-                                          unittest.TestCase):
-    TEST_CLASS = nodelist.ArvadosNodeListMonitorActor
-
-    def build_monitor(self, side_effect, *args, **kwargs):
-        super(ArvadosNodeListMonitorActorTestCase, self).build_monitor(
-            *args, **kwargs)
-        self.client.nodes().list().execute.side_effect = side_effect
-
-    @mock.patch("subprocess32.check_output")
-    def test_uuid_is_subscription_key(self, sinfo_mock):
-        sinfo_mock.return_value = ""
-        node = testutil.arvados_node_mock()
-        self.build_monitor([{
-            'items': [node],
-            'items_available': 1,
-            'offset': 0
-        }, {
-            'items': [],
-            'items_available': 1,
-            'offset': 1
-        }])
-        self.monitor.subscribe_to(node['uuid'],
-                                  self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with(node)
-        self.assertEqual("down", node["crunch_worker_state"])
-
-    @mock.patch("subprocess32.check_output")
-    def test_update_from_sinfo(self, sinfo_mock):
-        sinfo_mock.return_value = """compute1|idle|instancetype=a1.test
-compute2|alloc|(null)
-notarvados12345|idle|(null)
-"""
-        nodeIdle = testutil.arvados_node_mock(node_num=1)
-        nodeBusy = testutil.arvados_node_mock(node_num=2)
-        nodeMissing = testutil.arvados_node_mock(node_num=99)
-        self.build_monitor([{
-            'items': [nodeIdle, nodeBusy, nodeMissing],
-            'items_available': 1,
-            'offset': 0
-        }, {
-            'items': [],
-            'items_available': 1,
-            'offset': 1
-        }])
-        self.monitor.subscribe_to(nodeMissing['uuid'],
-                                  self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with(nodeMissing)
-
-        self.assertEqual("idle", nodeIdle["crunch_worker_state"])
-        self.assertEqual("busy", nodeBusy["crunch_worker_state"])
-        self.assertEqual("down", nodeMissing["crunch_worker_state"])
-
-        self.assertEqual("instancetype=a1.test", nodeIdle["slurm_node_features"])
-        self.assertEqual("", nodeBusy["slurm_node_features"])
-        self.assertEqual("", nodeMissing["slurm_node_features"])
-
-
-class CloudNodeListMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
-                                        unittest.TestCase):
-    TEST_CLASS = nodelist.CloudNodeListMonitorActor
-
-    class MockNode(object):
-        def __init__(self, count):
-            self.id = str(count)
-            self.name = 'test{}.example.com'.format(count)
-            self.private_ips = ['10.0.0.{}'.format(count)]
-            self.public_ips = []
-            self.size = testutil.MockSize(1)
-            self.state = 0
-            self.extra = {'arvados_node_size': self.size.id}
-
-
-    def build_monitor(self, side_effect, *args, **kwargs):
-        super(CloudNodeListMonitorActorTestCase, self).build_monitor(
-            *args, **kwargs)
-        self.client.list_nodes.side_effect = side_effect
-
-    def test_id_is_subscription_key(self):
-        node = self.MockNode(1)
-        mock_calc = mock.MagicMock()
-        mock_calc.find_size.return_value = testutil.MockSize(2)
-        self.build_monitor([[node]], mock_calc)
-        self.monitor.subscribe_to('1', self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with(node)
-        self.assertEqual(testutil.MockSize(2), node.size)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/services/nodemanager/tests/test_status.py b/services/nodemanager/tests/test_status.py
deleted file mode 100644 (file)
index 2a1c0fc..0000000
+++ /dev/null
@@ -1,139 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-from future import standard_library
-
-import json
-import mock
-import random
-import requests
-import unittest
-
-import arvnodeman.status as status
-import arvnodeman.config as config
-
-
-class TestServer(object):
-    def __init__(self, management_token=None):
-        self.mgmt_token = management_token
-
-    def __enter__(self):
-        cfg = config.NodeManagerConfig()
-        cfg.set('Manage', 'port', '0')
-        cfg.set('Manage', 'address', '127.0.0.1')
-        if self.mgmt_token is not None:
-            cfg.set('Manage', 'ManagementToken', self.mgmt_token)
-        self.srv = status.Server(cfg)
-        self.srv.start()
-        addr, port = self.srv.server_address
-        self.srv_base = 'http://127.0.0.1:'+str(port)
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.srv.shutdown()
-
-    def get_status_response(self):
-        return requests.get(self.srv_base+'/status.json')
-
-    def get_status(self):
-        return self.get_status_response().json()
-
-    def get_healthcheck_ping(self, auth_header=None):
-        headers = {}
-        if auth_header is not None:
-            headers['Authorization'] = auth_header
-        return requests.get(self.srv_base+'/_health/ping', headers=headers)
-
-class StatusServerUpdates(unittest.TestCase):
-    def test_updates(self):
-        with TestServer() as srv:
-            for n in [1, 2, 3]:
-                status.tracker.update({'nodes_'+str(n): n})
-                r = srv.get_status_response()
-                self.assertEqual(200, r.status_code)
-                self.assertEqual('application/json', r.headers['content-type'])
-                resp = r.json()
-                self.assertEqual(n, resp['nodes_'+str(n)])
-            self.assertEqual(1, resp['nodes_1'])
-            self.assertIn('Version', resp)
-            self.assertIn('config_max_nodes', resp)
-
-    def test_counters(self):
-        with TestServer() as srv:
-            resp = srv.get_status()
-            # Test that the counters exist
-            for counter in ['list_nodes_errors', 'create_node_errors',
-                'destroy_node_errors', 'boot_failures', 'actor_exceptions']:
-                self.assertIn(counter, resp)
-            # Test counter increment
-            for count in range(1, 3):
-                status.tracker.counter_add('a_counter')
-                resp = srv.get_status()
-                self.assertEqual(count, resp['a_counter'])
-
-    @mock.patch('time.time')
-    def test_idle_times(self, time_mock):
-        with TestServer() as srv:
-            resp = srv.get_status()
-            node_name = 'idle_compute{}'.format(random.randint(1, 1024))
-            self.assertIn('idle_times', resp)
-            # Test adding an idle node
-            time_mock.return_value = 10
-            status.tracker.idle_in(node_name)
-            time_mock.return_value += 10
-            resp = srv.get_status()
-            self.assertEqual(10, resp['idle_times'][node_name])
-            # Test adding the same idle node a 2nd time
-            time_mock.return_value += 10
-            status.tracker.idle_in(node_name)
-            time_mock.return_value += 10
-            resp = srv.get_status()
-            # Idle timestamp doesn't get reset if it already exists
-            self.assertEqual(30, resp['idle_times'][node_name])
-            # Test remove idle node
-            status.tracker.idle_out(node_name)
-            resp = srv.get_status()
-            self.assertNotIn(node_name, resp['idle_times'])
-
-
-class StatusServerDisabled(unittest.TestCase):
-    def test_config_disabled(self):
-        cfg = config.NodeManagerConfig()
-        cfg.set('Manage', 'port', '-1')
-        cfg.set('Manage', 'address', '127.0.0.1')
-        self.srv = status.Server(cfg)
-        self.srv.start()
-        self.assertFalse(self.srv.enabled)
-        self.assertFalse(getattr(self.srv, '_thread', False))
-
-class HealthcheckPing(unittest.TestCase):
-    def test_ping_disabled(self):
-        with TestServer() as srv:
-            r = srv.get_healthcheck_ping()
-            self.assertEqual(404, r.status_code)
-
-    def test_ping_no_auth(self):
-        with TestServer('configuredmanagementtoken') as srv:
-            r = srv.get_healthcheck_ping()
-            self.assertEqual(401, r.status_code)
-
-    def test_ping_bad_auth_format(self):
-        with TestServer('configuredmanagementtoken') as srv:
-            r = srv.get_healthcheck_ping('noBearer')
-            self.assertEqual(403, r.status_code)
-
-    def test_ping_bad_auth_token(self):
-        with TestServer('configuredmanagementtoken') as srv:
-            r = srv.get_healthcheck_ping('Bearer badtoken')
-            self.assertEqual(403, r.status_code)
-
-    def test_ping_success(self):
-        with TestServer('configuredmanagementtoken') as srv:
-            r = srv.get_healthcheck_ping('Bearer configuredmanagementtoken')
-            self.assertEqual(200, r.status_code)
-            self.assertEqual('application/json', r.headers['content-type'])
-            resp = r.json()
-            self.assertEqual('{"health": "OK"}', json.dumps(resp))
diff --git a/services/nodemanager/tests/test_timedcallback.py b/services/nodemanager/tests/test_timedcallback.py
deleted file mode 100644 (file)
index 21a9b5a..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import time
-import unittest
-
-import mock
-import pykka
-
-import arvnodeman.timedcallback as timedcallback
-from . import testutil
-
-@testutil.no_sleep
-class TimedCallBackActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
-    def test_immediate_turnaround(self):
-        receiver = mock.Mock()
-        deliverer = timedcallback.TimedCallBackActor.start().proxy()
-        deliverer.schedule(time.time() - 1, receiver,
-                           'immediate').get(self.TIMEOUT)
-        self.stop_proxy(deliverer)
-        receiver.assert_called_with('immediate')
-
-    def test_delayed_turnaround(self):
-        receiver = mock.Mock()
-        mock_now = mock.Mock()
-        mock_now.return_value = 0
-        deliverer = timedcallback.TimedCallBackActor.start(timefunc=mock_now).proxy()
-        deliverer.schedule(1, receiver, 'delayed')
-        deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
-        self.assertFalse(receiver.called)
-        mock_now.return_value = 2
-        deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
-        self.stop_proxy(deliverer)
-        receiver.assert_called_with('delayed')
-
-    def test_out_of_order_scheduling(self):
-        receiver = mock.Mock()
-        mock_now = mock.Mock()
-        mock_now.return_value = 1.5
-        deliverer = timedcallback.TimedCallBackActor.start(timefunc=mock_now).proxy()
-        deliverer.schedule(2, receiver, 'second')
-        deliverer.schedule(1, receiver, 'first')
-        deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
-        receiver.assert_called_with('first')
-        mock_now.return_value = 2.5
-        deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
-        self.stop_proxy(deliverer)
-        receiver.assert_called_with('second')
-
-    def test_dead_actors_ignored(self):
-        receiver = mock.Mock(name='dead_actor', spec=pykka.ActorRef)
-        receiver.tell.side_effect = pykka.ActorDeadError
-        deliverer = timedcallback.TimedCallBackActor.start().proxy()
-        deliverer.schedule(time.time() - 1, receiver.tell,
-                           'error').get(self.TIMEOUT)
-        self.assertTrue(self.stop_proxy(deliverer), "deliverer died")
-        receiver.tell.assert_called_with('error')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
deleted file mode 100644 (file)
index ee475ef..0000000
+++ /dev/null
@@ -1,236 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-#
-# SPDX-License-Identifier: AGPL-3.0
-
-from __future__ import absolute_import, print_function
-
-import contextlib
-import datetime
-import mock
-import pykka
-import sys
-import threading
-import time
-
-import libcloud.common.types as cloud_types
-
-from . import pykka_timeout
-
-no_sleep = mock.patch('time.sleep', lambda n: None)
-
-def arvados_node_mock(node_num=99, job_uuid=None, age=-1, **kwargs):
-    mod_time = datetime.datetime.utcnow() - datetime.timedelta(seconds=age)
-    mod_time_s = mod_time.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
-    if job_uuid is True:
-        job_uuid = 'zzzzz-jjjjj-jobjobjobjobjob'
-    crunch_worker_state = 'idle' if (job_uuid is None) else 'busy'
-    node = {'uuid': 'zzzzz-yyyyy-{:015x}'.format(node_num),
-            'created_at': '2014-01-01T01:02:03.04050607Z',
-            'modified_at': mod_time_s,
-            'first_ping_at': kwargs.pop('first_ping_at', mod_time_s),
-            'last_ping_at': mod_time_s,
-            'slot_number': node_num,
-            'hostname': 'compute{}'.format(node_num),
-            'domain': 'zzzzz.arvadosapi.com',
-            'ip_address': ip_address_mock(node_num),
-            'job_uuid': job_uuid,
-            'crunch_worker_state': crunch_worker_state,
-            'properties': {},
-            'info': {'ping_secret': 'defaulttestsecret', 'ec2_instance_id': str(node_num)}}
-    node.update(kwargs)
-    return node
-
-def cloud_object_mock(name_id, **extra):
-    # A very generic mock, useful for stubbing libcloud objects we
-    # only search for and pass around, like locations, subnets, etc.
-    cloud_object = mock.NonCallableMagicMock(['id', 'name'],
-                                             name='cloud_object')
-    cloud_object.name = str(name_id)
-    cloud_object.id = 'id_' + cloud_object.name
-    cloud_object.extra = extra
-    return cloud_object
-
-
-def cloud_node_fqdn(node):
-    # We intentionally put the FQDN somewhere goofy to make sure tested code is
-    # using this function for lookups.
-    return node.extra.get('testname', node.name+'.NoTestName.invalid')
-
-def ip_address_mock(last_octet):
-    return '10.20.30.{}'.format(last_octet)
-
-@contextlib.contextmanager
-def redirected_streams(stdout=None, stderr=None):
-    orig_stdout, sys.stdout = sys.stdout, stdout or sys.stdout
-    orig_stderr, sys.stderr = sys.stderr, stderr or sys.stderr
-    try:
-        yield
-    finally:
-        sys.stdout = orig_stdout
-        sys.stderr = orig_stderr
-
-
-class MockShutdownTimer(object):
-    def _set_state(self, is_open, next_opening):
-        self.window_open = lambda: is_open
-        self.next_opening = lambda: next_opening
-
-
-class MockSize(object):
-    def __init__(self, factor, preemptible=False):
-        self.id = 'z{}.test'.format(factor)
-        self.name = 'test size '+self.id
-        self.ram = 128 * factor
-        self.disk = factor   # GB
-        self.scratch = 1000 * factor # MB
-        self.bandwidth = 16 * factor
-        self.price = float(factor)
-        self.extra = {}
-        self.real = self
-        self.preemptible = preemptible
-
-    def __eq__(self, other):
-        return self.id == other.id
-
-
-class MockTimer(object):
-    def __init__(self, deliver_immediately=True):
-        self.deliver_immediately = deliver_immediately
-        self.messages = []
-        self.lock = threading.Lock()
-
-    def deliver(self):
-        with self.lock:
-            to_deliver = self.messages
-            self.messages = []
-        for callback, args, kwargs in to_deliver:
-            try:
-                callback(*args, **kwargs)
-            except pykka.ActorDeadError:
-                pass
-
-    def schedule(self, want_time, callback, *args, **kwargs):
-        with self.lock:
-            self.messages.append((callback, args, kwargs))
-        if self.deliver_immediately:
-            self.deliver()
-
-
-class ActorTestMixin(object):
-    FUTURE_CLASS = pykka.ThreadingFuture
-    TIMEOUT = pykka_timeout
-
-    def tearDown(self):
-        pykka.ActorRegistry.stop_all()
-
-    def stop_proxy(self, proxy):
-        th = proxy.get_thread().get()
-        t = proxy.actor_ref.stop(timeout=self.TIMEOUT)
-        th.join()
-        return t
-
-    def wait_for_assignment(self, proxy, attr_name, unassigned=None,
-                            timeout=TIMEOUT):
-        deadline = time.time() + timeout
-        while True:
-            loop_timeout = deadline - time.time()
-            if loop_timeout <= 0:
-                self.fail("actor did not assign {} in time".format(attr_name))
-            result = getattr(proxy, attr_name).get(loop_timeout)
-            if result is not unassigned:
-                return result
-
-    def busywait(self, f, finalize=None):
-        n = 0
-        while not f() and n < 20:
-            time.sleep(.1)
-            n += 1
-        if finalize is not None:
-            finalize()
-        self.assertTrue(f())
-
-
-class DriverTestMixin(object):
-    def setUp(self):
-        self.driver_mock = mock.MagicMock(name='driver_mock')
-        super(DriverTestMixin, self).setUp()
-
-    def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}):
-        create_kwargs.setdefault('ping_host', '100::')
-        return self.TEST_CLASS(
-            auth_kwargs, list_kwargs, create_kwargs,
-            driver_class=self.driver_mock)
-
-    def driver_method_args(self, method_name):
-        return getattr(self.driver_mock(), method_name).call_args
-
-    def test_driver_create_retry(self):
-        with mock.patch('time.sleep'):
-            driver_mock2 = mock.MagicMock(name='driver_mock2')
-            self.driver_mock.side_effect = (Exception("oops"), driver_mock2)
-            kwargs = {'user_id': 'foo'}
-            driver = self.new_driver(auth_kwargs=kwargs)
-            self.assertTrue(self.driver_mock.called)
-            self.assertIs(driver.real, driver_mock2)
-
-    def test_create_can_find_node_after_timeout(self, create_kwargs={}, node_extra={}):
-        driver = self.new_driver(create_kwargs=create_kwargs)
-        arv_node = arvados_node_mock()
-        cloud_node = cloud_node_mock(**node_extra)
-        cloud_node.name = driver.create_cloud_name(arv_node)
-        create_method = self.driver_mock().create_node
-        create_method.side_effect = cloud_types.LibcloudError("fake timeout")
-        list_method = self.driver_mock().list_nodes
-        list_method.return_value = [cloud_node]
-        actual = driver.create_node(MockSize(1), arv_node)
-        self.assertIs(cloud_node, actual)
-
-    def test_create_can_raise_exception_after_timeout(self):
-        driver = self.new_driver()
-        arv_node = arvados_node_mock()
-        create_method = self.driver_mock().create_node
-        create_method.side_effect = cloud_types.LibcloudError("fake timeout")
-        list_method = self.driver_mock().list_nodes
-        list_method.return_value = []
-        with self.assertRaises(cloud_types.LibcloudError) as exc_test:
-            driver.create_node(MockSize(1), arv_node)
-        self.assertIs(create_method.side_effect, exc_test.exception)
-
-    def check_node_found_after_timeout_has_fixed_size(self, size, cloud_node,
-                                                      create_kwargs={}):
-        # This method needs to be called explicitly by driver test suites
-        # that need it.
-        self.driver_mock().list_sizes.return_value = [size]
-        driver = self.new_driver(create_kwargs=create_kwargs)
-        arv_node = arvados_node_mock()
-        cloud_node.name = driver.create_cloud_name(arv_node)
-        create_method = self.driver_mock().create_node
-        create_method.side_effect = cloud_types.LibcloudError("fake timeout")
-        self.driver_mock().list_nodes.return_value = [cloud_node]
-        actual = driver.create_node(size, arv_node)
-        self.assertIs(size, actual.size)
-
-
-class RemotePollLoopActorTestMixin(ActorTestMixin):
-    def build_monitor(self, *args, **kwargs):
-        self.timer = mock.MagicMock(name='timer_mock')
-        self.client = mock.MagicMock(name='client_mock')
-        self.subscriber = mock.Mock(name='subscriber_mock')
-        self.monitor = self.TEST_CLASS.start(
-            self.client, self.timer, *args, **kwargs).proxy()
-
-def cloud_node_mock(node_num=99, size=None, **extra):
-    if size is None:
-        size = MockSize(node_num)
-    node = mock.NonCallableMagicMock(
-        ['id', 'name', 'state', 'public_ips', 'private_ips', 'driver', 'size',
-         'image', 'extra'],
-        name='cloud_node')
-    node.id = str(node_num)
-    node.name = node.id
-    node.size = size
-    node.public_ips = []
-    node.private_ips = [ip_address_mock(node_num)]
-    node.extra = extra
-    return node
index 59aca1e5b4cabbc4f1f20117d9e1d76f474dc826..292a4fd746a6697fafa0bda5155e766bab79618d 100755 (executable)
@@ -194,7 +194,7 @@ run() {
             localip=$(ip addr show $defaultdev | grep 'inet ' | sed 's/ *inet \(.*\)\/.*/\1/')
         fi
        echo "Public arvbox will use address $localip"
-        iptemp=$(tempfile)
+        iptemp=$(mktemp)
         echo $localip > $iptemp
         chmod og+r $iptemp
         PUBLIC="--volume=$iptemp:/var/run/localip_override
diff --git a/tools/compute-images/.gitignore b/tools/compute-images/.gitignore
new file mode 100644 (file)
index 0000000..68fc775
--- /dev/null
@@ -0,0 +1,3 @@
+*pem
+secrets/*
+keypairs/*
similarity index 72%
rename from sdk/pam/.dockerignore
rename to tools/compute-images/.licenseignore
index 922b80e4843504a59363f6f428bdd2e71cc93fbc..6288dbbc872251d49f6f4d34615393b1260e685f 100644 (file)
@@ -1,10 +1,5 @@
 # Copyright (C) The Arvados Authors. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
-
-*~
-*.pyc
-.eggs
-*.egg_info
-build
-tmp
+*.json
+1078ECD7.asc
diff --git a/tools/compute-images/1078ECD7.asc b/tools/compute-images/1078ECD7.asc
new file mode 100644 (file)
index 0000000..edc62f4
--- /dev/null
@@ -0,0 +1,30 @@
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mQENBEzhgeoBCAChhoK1dqpWzNyDWqRGEvdFdkJaA9D2HRwKPfBfjAoePX6ZyrpA
+ItlUsvt/8s/DRiTiPEFQR4S7VqocmU6whJc3gDEGyOM6b1NF873lIfSVwUoE42QE
+a76dO8woOYgLUyxu2mKG+bJgGMumjBJt6ZOndYVjTYB/7sEeVxwmMVulfZe0s6zg
+ut0+SoTYg2R36qIqeIcWllYt97sEYnyy1qXMis4/3IZnuWkS/frsPR3aeUI4W+o2
+NDN1kj49+LMe7Fb5b7jZY08rZbAWXi1rU1hQx4jC9RvYqlT4HNld4Bn7os1IvOOA
+wNiR0oiVdiuDbBxcMvRPktxMrFVjowusRLq/ABEBAAG0PUN1cm92ZXJzZSwgSW5j
+IEF1dG9tYXRpYyBTaWduaW5nIEtleSA8c3lzYWRtaW5AY3Vyb3ZlcnNlLmNvbT6J
+ATgEEwECACIFAlNgYIECGwMGCwkIBwMCBhUIAgkKCwQWAgMBAh4BAheAAAoJEFcW
+WREQeOzXPkEH/jQJDIYI1dxWcYiA+hczmpaZvN2/pc/kwIW/6a03+6zqmSNkebOE
+TgoDILacSYc17hy20R1/rWyUstOMKcEgFDBlSehhHyl0f7q/w7d8Ais6MabzsPfx
+IceJpsjUg87+BR7qWhgQ0sxmtIF2TKuTFLs+nkGsgSsiBOEF4NvHxuj3HD4y8F27
+HNqrkqwjLS8xJwwH5Gp2uMEVr1AXIH3iSRjJ8X124s8iEP97Q/3IazoYRf9/MCSm
+QEx8KzxwDX6t4bW6O4D01K+e9gdkTY70dcMgJoqm5IsX7yxjEubiOunphtlJnZ9d
+Oi1yBN5UM3pWKAdcfRj4rcfV9Simvpx9av+5AQ0ETOGB6gEIAMAA0HVMG0BbdnU7
+wWgl5eFdT0AUSrXK/WdcKqVEGGv+c68NETSHWZOJX7O46Eao4gY4cTYprVMBzxpY
+/BtQSYLpE0HLvBc1fcFd61Yz4H/9rGSNY0GcIQEbOjbJY5mr8qFsQ1K/mAf3aUL3
+b6ni4sHVicRiRr0Gl4Ihorlskpfu1SHs/C5tvTSVNF9p4vtl5892y1yILQeVpcBs
+NCR7MUpdS49xCpvnAWsDZX+ij6LTR3lzCm/ZLCg4gNuZkjgU9oqVfGkqysW7WZ8S
+OLvzAwUw7i1EIFX8q6QdudGoezxz8m8OgZM1v8AFpYEKlhEPf1W0MSfaRDwrj866
+8nCLruEAEQEAAYkBHwQYAQIACQUCTOGB6gIbDAAKCRBXFlkREHjs199EB/4+p0G1
+3PHxt6rLWSCGXobDOu4ZOA/qnv0D/JhOLroFds5TzQv6vnS8eAkhCTjHVA+b58cm
+kXpI0oYcD4ZP+KK1CHKq2rGfwou7HfAF+icnNqYkeBOkjjbCgkvBlcCInuAuU8JX
+DZMkfFk52+eBKwTjS/J/fQp0vDru8bHLp98WgdRHWfJQ3mc3gz4A5sR6zhrGPW6/
+ssnROS4dC2Ohp35GpgN1KjD3EmEw5RoSBYlyrARCaMsivgIKMxGUEyFZWhuJt3N1
+2MTddRwz28hbmYCi+MzHYDbRv+cSyUDmvXaWhfkNKBepClBA1rTWBcldit5vvlqr
+yPet6wIKrtLGhAqZ
+=CLkG
+-----END PGP PUBLIC KEY BLOCK-----
diff --git a/tools/compute-images/arvados-images-aws.json b/tools/compute-images/arvados-images-aws.json
new file mode 100644 (file)
index 0000000..b83207b
--- /dev/null
@@ -0,0 +1,80 @@
+{
+  "variables": {
+    "aws_access_key": "",
+    "aws_secret_key": "",
+    "aws_profile": "",
+    "build_environment": "aws",
+    "arvados_cluster": "",
+    "aws_source_ami": "ami-04d70e069399af2e9",
+    "fqdn": "",
+    "ssh_user": "admin",
+    "vpc_id": "",
+    "subnet_id": "",
+    "public_key_file": "",
+    "associate_public_ip_address": "true"
+  },
+  "builders": [{
+    "type": "amazon-ebs",
+    "profile": "{{ user `aws_profile`}}",
+    "access_key": "{{user `aws_access_key`}}",
+    "secret_key": "{{user `aws_secret_key`}}",
+    "region": "{{user `aws_default_region`}}",
+    "ena_support": "true",
+    "source_ami": "{{user `aws_source_ami`}}",
+    "instance_type": "m4.large",
+    "vpc_id": "{{user `vpc_id`}}",
+    "subnet_id": "{{user `subnet_id`}}",
+    "associate_public_ip_address": "{{user `associate_public_ip_address`}}",
+    "ssh_username": "{{user `ssh_user`}}",
+    "ami_name": "arvados-{{user `arvados_cluster`}}-compute-{{isotime \"20060102150405\"}}",
+    "ami_block_device_mappings": [
+      {
+        "device_name": "/dev/xvdb",
+        "encrypted": true,
+        "virtual_name": "ephemeral0"
+      },
+      {
+        "device_name": "/dev/xvdc",
+        "encrypted": true,
+        "virtual_name": "ephemeral1"
+      }
+    ],
+    "tags": {
+      "Name": "arvados-{{user `arvados_cluster`}}-compute",
+      "creation_date": "{{isotime \"20060102150405\"}}",
+      "packer": "true"
+    },
+    "run_tags": {
+      "Name": "packer-arvados-{{user `arvados_cluster`}}-compute-builder",
+      "creation_date": "{{isotime \"20060102150405\"}}",
+      "environment": "development"
+    },
+    "run_volume_tags": {
+      "Name": "packer-arvados-{{user `arvados_cluster`}}-compute-builder",
+      "creation_date": "{{isotime \"20060102150405\"}}",
+      "environment": "development"
+    }
+  }],
+  "provisioners": [{
+    "type": "file",
+    "source": "1078ECD7.asc",
+    "destination": "/tmp/1078ECD7.asc"
+  },{
+    "type": "file",
+    "source": "scripts/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg",
+    "destination": "/tmp/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg"
+  },{
+    "type": "file",
+    "source": "scripts/usr-local-bin-ensure-encrypted-partitions.sh",
+    "destination": "/tmp/usr-local-bin-ensure-encrypted-partitions.sh"
+  },{
+    "type": "file",
+    "source": "{{user `public_key_file`}}",
+    "destination": "/tmp/crunch-authorized_keys"
+  },{
+    "type": "shell",
+    "execute_command": "sudo -S env {{ .Vars }} /bin/bash '{{ .Path }}'",
+    "script": "scripts/base.sh",
+    "environment_vars": ["ROLE=compute","RESOLVER={{user `resolver`}}","REPOSUFFIX={{user `reposuffix`}}"]
+  }]
+}
diff --git a/tools/compute-images/arvados-images-azure.json b/tools/compute-images/arvados-images-azure.json
new file mode 100644 (file)
index 0000000..f7fc1a0
--- /dev/null
@@ -0,0 +1,75 @@
+{
+  "variables": {
+    "storage_account": null,
+    "resource_group": null,
+    "client_id": "{{env `ARM_CLIENT_ID`}}",
+    "client_secret": "{{env `ARM_CLIENT_SECRET`}}",
+    "subscription_id": "{{env `ARM_SUBSCRIPTION_ID`}}",
+    "tenant_id": "{{env `ARM_TENANT_ID`}}",
+    "build_environment": "azure-arm",
+    "cloud_environment_name": "Public",
+    "location": "centralus",
+    "ssh_user": "packer",
+    "ssh_private_key_file": "{{env `PACKERPRIVKEY`}}",
+    "image_sku": "",
+    "arvados_cluster": "",
+    "project_id": "",
+    "account_file": "",
+    "fqdn": "",
+    "resolver": "",
+    "reposuffix": "",
+    "public_key_file": ""
+  },
+  "builders": [
+    {
+      "type": "azure-arm",
+      "cloud_environment_name": "{{user `cloud_environment_name`}}",
+
+      "client_id": "{{user `client_id`}}",
+      "client_secret": "{{user `client_secret`}}",
+      "subscription_id": "{{user `subscription_id`}}",
+      "tenant_id": "{{user `tenant_id`}}",
+
+      "resource_group_name": "{{user `resource_group`}}",
+      "storage_account": "{{user `storage_account`}}",
+
+      "capture_container_name": "images",
+      "capture_name_prefix": "{{user `arvados_cluster`}}-compute",
+
+      "ssh_username": "{{user `ssh_user`}}",
+      "ssh_private_key_file": "{{user `ssh_private_key_file`}}",
+
+      "image_publisher": "Canonical",
+      "image_offer": "UbuntuServer",
+      "image_sku": "{{user `image_sku`}}",
+
+      "os_type": "Linux",
+
+      "location": "{{user `location`}}",
+      "vm_size": "Standard_D1_v2"
+    }
+  ],
+
+  "provisioners": [{
+    "type": "file",
+    "source": "1078ECD7.asc",
+    "destination": "/tmp/1078ECD7.asc"
+  },{
+    "type": "file",
+    "source": "scripts/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg",
+    "destination": "/tmp/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg"
+  },{
+    "type": "file",
+    "source": "scripts/usr-local-bin-ensure-encrypted-partitions.sh",
+    "destination": "/tmp/usr-local-bin-ensure-encrypted-partitions.sh"
+  },{
+    "type": "file",
+    "source": "{{user `public_key_file`}}",
+    "destination": "/tmp/crunch-authorized_keys"
+  },{
+    "type": "shell",
+    "execute_command": "sudo -S env {{ .Vars }} /bin/bash '{{ .Path }}'",
+    "script": "scripts/base.sh",
+    "environment_vars": ["ROLE=compute","RESOLVER={{user `resolver`}}","REPOSUFFIX={{user `reposuffix`}}"]
+  }]
+}
diff --git a/tools/compute-images/build.sh b/tools/compute-images/build.sh
new file mode 100755 (executable)
index 0000000..e8265ae
--- /dev/null
@@ -0,0 +1,277 @@
+#!/bin/bash
+
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+JSON_FILE=$1
+ARVADOS_CLUSTER=$2
+PROJECT_ID=$3
+ACCOUNT_FILE=$4
+
+read -rd "\000" helpmessage <<EOF
+$(basename $0): Build cloud images for arvados-dispatch-cloud
+
+Syntax:
+        $(basename $0) [options]
+
+Options:
+
+  --json-file (required)
+      Path to the packer json file
+  --arvados-cluster-id (required)
+      The ID of the Arvados cluster, e.g. zzzzz
+  --aws-profile (default: false)
+      AWS profile to use (a valid profile from ~/.aws/config)
+  --aws-secrets-file (default: false, required if building for AWS)
+      AWS secrets file which will be sourced by this script
+  --aws-source-ami (default: false, required if building for AWS)
+      The AMI to use as base for building the images
+  --aws-region (default: us-east-1)
+      The AWS region to use for building the images
+  --aws-vpc-id (optional)
+      VPC id for AWS, otherwise packer will pick the default one
+  --aws-subnet-id
+      Subnet id for AWS, otherwise packer will pick the default one for the VPC
+  --gcp-project-id (default: false, required if building for GCP)
+      GCP project id
+  --gcp-account-file (default: false, required if building for GCP)
+      GCP account file
+  --gcp-zone (default: us-central1-f)
+      GCP zone
+  --azure-secrets-file (default: false, required if building for Azure)
+      Azure secrets file which will be sourced by this script
+  --azure-resource-group (default: false, required if building for Azure)
+      Azure resource group
+  --azure-storage-account (default: false, required if building for Azure)
+      Azure storage account
+  --azure-location (default: false, required if building for Azure)
+      Azure location, e.g. centralus, eastus, westeurope
+  --azure-sku (default: unset, required if building for Azure, e.g. 16.04-LTS)
+      Azure SKU image to use
+  --ssh_user (default: packer)
+      The user packer will use to log into the image
+  --domain  (default: arvadosapi.com)
+      The domain part of the FQDN for the cluster
+  --resolver (default: 8.8.8.8)
+      The DNS resolver for the machine
+  --reposuffix (default: unset)
+      Set this to "-dev" to track the unstable/dev Arvados repositories
+  --public-key-file (required)
+      Path to the public key file that a-d-c will use to log into the compute node
+  --debug
+      Output debug information (default: false)
+
+EOF
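+
+# A minimal AWS invocation might look like the following; the cluster id, key
+# path and region are placeholders, and the AMI is simply the default from
+# arvados-images-aws.json:
+#
+#   ./build.sh --json-file arvados-images-aws.json \
+#              --arvados-cluster-id zzzzz \
+#              --aws-source-ami ami-04d70e069399af2e9 \
+#              --aws-region us-east-1 \
+#              --public-key-file ~/.ssh/id_dispatcher.pub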
+
+JSON_FILE=
+ARVADOS_CLUSTER_ID=
+AWS_PROFILE=
+AWS_SECRETS_FILE=
+AWS_SOURCE_AMI=
+AWS_VPC_ID=
+AWS_SUBNET_ID=
+GCP_PROJECT_ID=
+GCP_ACCOUNT_FILE=
+GCP_ZONE=
+AZURE_SECRETS_FILE=
+AZURE_RESOURCE_GROUP=
+AZURE_STORAGE_ACCOUNT=
+AZURE_LOCATION=
+AZURE_CLOUD_ENVIRONMENT=
+DEBUG=
+SSH_USER=
+DOMAIN="arvadosapi.com"
+AWS_DEFAULT_REGION=us-east-1
+PUBLIC_KEY_FILE=
+
+PARSEDOPTS=$(getopt --name "$0" --longoptions \
+    help,json-file:,arvados-cluster-id:,aws-source-ami:,aws-profile:,aws-secrets-file:,aws-region:,aws-vpc-id:,aws-subnet-id:,gcp-project-id:,gcp-account-file:,gcp-zone:,azure-secrets-file:,azure-resource-group:,azure-storage-account:,azure-location:,azure-sku:,azure-cloud-environment:,ssh_user:,domain:,resolver:,reposuffix:,public-key-file:,debug \
+    -- "" "$@")
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+
+eval set -- "$PARSEDOPTS"
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --help)
+            echo >&2 "$helpmessage"
+            echo >&2
+            exit 1
+            ;;
+        --json-file)
+            JSON_FILE="$2"; shift
+            ;;
+        --arvados-cluster-id)
+            ARVADOS_CLUSTER_ID="$2"; shift
+            ;;
+        --aws-source-ami)
+            AWS_SOURCE_AMI="$2"; shift
+            ;;
+        --aws-profile)
+            AWS_PROFILE="$2"; shift
+            ;;
+        --aws-secrets-file)
+            AWS_SECRETS_FILE="$2"; shift
+            ;;
+        --aws-region)
+            AWS_DEFAULT_REGION="$2"; shift
+            ;;
+        --aws-vpc-id)
+            AWS_VPC_ID="$2"; shift
+            ;;
+        --aws-subnet-id)
+            AWS_SUBNET_ID="$2"; shift
+            ;;
+        --gcp-project-id)
+            GCP_PROJECT_ID="$2"; shift
+            ;;
+        --gcp-account-file)
+            GCP_ACCOUNT_FILE="$2"; shift
+            ;;
+        --gcp-zone)
+            GCP_ZONE="$2"; shift
+            ;;
+        --azure-secrets-file)
+            AZURE_SECRETS_FILE="$2"; shift
+            ;;
+        --azure-resource-group)
+            AZURE_RESOURCE_GROUP="$2"; shift
+            ;;
+        --azure-storage-account)
+            AZURE_STORAGE_ACCOUNT="$2"; shift
+            ;;
+        --azure-location)
+            AZURE_LOCATION="$2"; shift
+            ;;
+        --azure-sku)
+            AZURE_SKU="$2"; shift
+            ;;
+        --azure-cloud-environment)
+            AZURE_CLOUD_ENVIRONMENT="$2"; shift
+            ;;
+        --ssh_user)
+            SSH_USER="$2"; shift
+            ;;
+        --domain)
+            DOMAIN="$2"; shift
+            ;;
+        --resolver)
+            RESOLVER="$2"; shift
+            ;;
+        --reposuffix)
+            REPOSUFFIX="$2"; shift
+            ;;
+        --public-key-file)
+            PUBLIC_KEY_FILE="$2"; shift
+            ;;
+        --debug)
+            # Run packer in debug mode. Packer then pauses between build steps
+            # and writes an ssh key (.pem file) to this directory, which you
+            # can use to ssh into the build instance. The base image uses the
+            # 'admin' user and ssh port 22.
+            EXTRA=" -debug"
+            ;;
+        --)
+            if [ $# -gt 1 ]; then
+                echo >&2 "$0: unrecognized argument '$2'. Try: $0 --help"
+                exit 1
+            fi
+            ;;
+    esac
+    shift
+done
+
+
+if [[ "$JSON_FILE" == "" ]] || [[ ! -f "$JSON_FILE" ]]; then
+  echo >&2 "$helpmessage"
+  echo >&2
+  echo >&2 "ERROR: packer json file not found"
+  echo >&2
+  exit 1
+fi
+
+if [[ -z "$ARVADOS_CLUSTER_ID" ]]; then
+  echo >&2 "$helpmessage"
+  echo >&2
+  echo >&2 "ERROR: arvados cluster id not specified"
+  echo >&2
+  exit 1
+fi
+
+if [[ "$PUBLIC_KEY_FILE" == "" ]] || [[ ! -f "$PUBLIC_KEY_FILE" ]]; then
+  echo >&2 "$helpmessage"
+  echo >&2
+  echo >&2 "ERROR: public key file file not found"
+  echo >&2
+  exit 1
+fi
+
+if [[ ! -z "$AWS_SECRETS_FILE" ]]; then
+  source $AWS_SECRETS_FILE
+fi
+
+if [[ ! -z "$AZURE_SECRETS_FILE" ]]; then
+  source $AZURE_SECRETS_FILE
+fi
+
+FQDN=" -var fqdn=compute.$ARVADOS_CLUSTER_ID.$DOMAIN ";
+
+EXTRA2=""
+
+if [[ "$AWS_SOURCE_AMI" != "" ]]; then
+  EXTRA2+=" -var aws_source_ami=$AWS_SOURCE_AMI"
+fi
+if [[ "$AWS_PROFILE" != "" ]]; then
+  EXTRA2+=" -var aws_profile=$AWS_PROFILE"
+fi
+if [[ "$AWS_VPC_ID" != "" ]]; then
+  EXTRA2+=" -var vpc_id=$AWS_VPC_ID -var associate_public_ip_address=true "
+fi
+if [[ "$AWS_SUBNET_ID" != "" ]]; then
+  EXTRA2+=" -var subnet_id=$AWS_SUBNET_ID -var associate_public_ip_address=true "
+fi
+if [[ "$AWS_DEFAULT_REGION" != "" ]]; then
+  EXTRA2+=" -var aws_default_region=$AWS_DEFAULT_REGION"
+fi
+if [[ "$GCP_PROJECT_ID" != "" ]]; then
+  EXTRA2+=" -var project_id=$GCP_PROJECT_ID"
+fi
+if [[ "$GCP_ACCOUNT_FILE" != "" ]]; then
+  EXTRA2+=" -var account_file=$GCP_ACCOUNT_FILE"
+fi
+if [[ "$GCP_ZONE" != "" ]]; then
+  EXTRA2+=" -var zone=$GCP_ZONE"
+fi
+if [[ "$AZURE_RESOURCE_GROUP" != "" ]]; then
+  EXTRA2+=" -var resource_group=$AZURE_RESOURCE_GROUP"
+fi
+if [[ "$AZURE_STORAGE_ACCOUNT" != "" ]]; then
+  EXTRA2+=" -var storage_account=$AZURE_STORAGE_ACCOUNT"
+fi
+if [[ "$AZURE_LOCATION" != "" ]]; then
+  EXTRA2+=" -var location=$AZURE_LOCATION"
+fi
+if [[ "$AZURE_SKU" != "" ]]; then
+  EXTRA2+=" -var image_sku=$AZURE_SKU"
+fi
+if [[ "$AZURE_CLOUD_ENVIRONMENT" != "" ]]; then
+  EXTRA2+=" -var cloud_environment_name=$AZURE_CLOUD_ENVIRONMENT"
+fi
+if [[ "$SSH_USER" != "" ]]; then
+  EXTRA2+=" -var ssh_user=$SSH_USER"
+fi
+if [[ "$RESOLVER" != "" ]]; then
+  EXTRA2+=" -var resolver=$RESOLVER"
+fi
+if [[ "$REPOSUFFIX" != "" ]]; then
+  EXTRA2+=" -var reposuffix=$REPOSUFFIX"
+fi
+if [[ "$PUBLIC_KEY_FILE" != "" ]]; then
+  EXTRA2+=" -var public_key_file=$PUBLIC_KEY_FILE"
+fi
+
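+# For reference, with the AWS options above the assembled command comes out
+# roughly like this (values are illustrative):
+#   packer build -var fqdn=compute.zzzzz.arvadosapi.com \
+#     -var "arvados_cluster=zzzzz" -var aws_default_region=us-east-1 \
+#     -var public_key_file=/home/me/.ssh/id_dispatcher.pub arvados-images-aws.json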
+echo packer build$EXTRA$FQDN -var "role=$role" -var "arvados_cluster=$ARVADOS_CLUSTER_ID"$EXTRA2 $JSON_FILE
+packer build$EXTRA$FQDN -var "role=$role" -var "arvados_cluster=$ARVADOS_CLUSTER_ID"$EXTRA2 $JSON_FILE
diff --git a/tools/compute-images/scripts/base.sh b/tools/compute-images/scripts/base.sh
new file mode 100644 (file)
index 0000000..73c7b9d
--- /dev/null
@@ -0,0 +1,110 @@
+#!/bin/bash
+
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Packer invokes this script with "/bin/bash {{ .Path }}", so options on the
+# shebang line would be ignored; set them explicitly instead.
+set -euxo pipefail
+
+SUDO=sudo
+
+# Run apt-get update
+$SUDO DEBIAN_FRONTEND=noninteractive apt-get --yes update
+
+# Install gnupg and dirmgr or gpg key checks will fail
+$SUDO DEBIAN_FRONTEND=noninteractive apt-get -qq --yes install \
+  gnupg \
+  dirmngr \
+  lsb-release
+
+# For good measure, apt-get upgrade
+$SUDO DEBIAN_FRONTEND=noninteractive apt-get -qq --yes upgrade
+
+# Make sure cloud-init is installed
+$SUDO DEBIAN_FRONTEND=noninteractive apt-get -qq --yes install cloud-init
+if [[ ! -d /var/lib/cloud/scripts/per-boot ]]; then
+  mkdir -p /var/lib/cloud/scripts/per-boot
+fi
+
+TMP_LSB=`/usr/bin/lsb_release -c -s`
+LSB_RELEASE_CODENAME=${TMP_LSB//[$'\t\r\n ']}
+
+# Add the arvados apt repository
+echo "# apt.arvados.org" |$SUDO tee --append /etc/apt/sources.list.d/apt.arvados.org.list
+echo "deb http://apt.arvados.org/ $LSB_RELEASE_CODENAME${REPOSUFFIX} main" |$SUDO tee --append /etc/apt/sources.list.d/apt.arvados.org.list
+
+# Add the arvados signing key
+cat /tmp/1078ECD7.asc | $SUDO apt-key add -
+# Add the debian keys
+$SUDO DEBIAN_FRONTEND=noninteractive apt-get install --yes debian-keyring debian-archive-keyring
+
+# Fix locale
+$SUDO /bin/sed -ri 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen
+$SUDO /usr/sbin/locale-gen
+
+# Install some packages we always need
+$SUDO DEBIAN_FRONTEND=noninteractive apt-get --yes update
+$SUDO DEBIAN_FRONTEND=noninteractive apt-get -qq --yes install \
+  openssh-server \
+  apt-utils \
+  git \
+  curl \
+  libcurl3-gnutls \
+  libcurl4-openssl-dev \
+  lvm2 \
+  cryptsetup \
+  xfsprogs
+
+# See if python3-distutils is installable, and if so install it. This is a
+# temporary workaround for an Arvados packaging bug and should be removed once
+# Arvados 2.0.4 or 2.1.0 is released, whichever comes first.
+# See https://dev.arvados.org/issues/16611 for more information
+if apt-cache -qq show python3-distutils >/dev/null 2>&1; then
+  $SUDO DEBIAN_FRONTEND=noninteractive apt-get -qq --yes install python3-distutils
+fi
+
+# Install the Arvados packages we need
+$SUDO DEBIAN_FRONTEND=noninteractive apt-get -qq --yes install \
+  python-arvados-fuse \
+  crunch-run \
+  arvados-docker-cleaner \
+  docker.io
+
+# Remove unattended-upgrades if it is installed
+$SUDO DEBIAN_FRONTEND=noninteractive apt-get -qq --yes remove unattended-upgrades --purge
+
+# Configure arvados-docker-cleaner
+$SUDO mkdir -p /etc/arvados/docker-cleaner
+echo -e "{\n  \"Quota\": \"10G\",\n  \"RemoveStoppedContainers\": \"always\"\n}" | $SUDO tee /etc/arvados/docker-cleaner/docker-cleaner.json >/dev/null
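+# The resulting /etc/arvados/docker-cleaner/docker-cleaner.json reads:
+#   {
+#     "Quota": "10G",
+#     "RemoveStoppedContainers": "always"
+#   }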
+
+# Enable cgroup accounting
+$SUDO sed -i 's/GRUB_CMDLINE_LINUX=""/GRUB_CMDLINE_LINUX="cgroup_enable=memory swapaccount=1"/g' /etc/default/grub
+$SUDO update-grub
+
+# Set a higher ulimit for docker
+$SUDO sed -i "s/ExecStart=\(.*\)/ExecStart=\1 --default-ulimit nofile=10000:10000 --dns ${RESOLVER}/g" /lib/systemd/system/docker.service
+$SUDO systemctl daemon-reload
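+# Assuming RESOLVER=10.1.0.5, the ExecStart line in docker.service now ends
+# with: ... --default-ulimit nofile=10000:10000 --dns 10.1.0.5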
+
+# Make sure user_allow_other is set in fuse.conf
+$SUDO sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf
+
+# Add crunch user with sudo powers
+$SUDO adduser --disabled-password --gecos "Crunch user,,,," crunch
+# Do not require a password to sudo
+echo -e "# for the crunch user\ncrunch ALL=(ALL) NOPASSWD:ALL" | $SUDO tee /etc/sudoers.d/91-crunch
+
+# Set up the ssh public key for the crunch user
+$SUDO mkdir /home/crunch/.ssh
+$SUDO mv /tmp/crunch-authorized_keys /home/crunch/.ssh/authorized_keys
+$SUDO chown -R crunch:crunch /home/crunch/.ssh
+$SUDO chmod 600 /home/crunch/.ssh/authorized_keys
+$SUDO chmod 700 /home/crunch/.ssh/
+
+# Make sure we resolve via the provided resolver IP. Prepending is good enough because
+# unless 'rotate' is set, the nameservers are queried in order (cf. man resolv.conf)
+$SUDO sed -i "s/#prepend domain-name-servers 127.0.0.1;/prepend domain-name-servers ${RESOLVER};/" /etc/dhcp/dhclient.conf
+
+# Set up the cloud-init script that will ensure encrypted disks
+$SUDO mv /tmp/usr-local-bin-ensure-encrypted-partitions.sh /usr/local/bin/ensure-encrypted-partitions.sh
+$SUDO chmod 755 /usr/local/bin/ensure-encrypted-partitions.sh
+$SUDO chown root:root /usr/local/bin/ensure-encrypted-partitions.sh
+$SUDO mv /tmp/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg /etc/cloud/cloud.cfg.d/07_compute_arvados_dispatch_cloud.cfg
+$SUDO chown root:root /etc/cloud/cloud.cfg.d/07_compute_arvados_dispatch_cloud.cfg
diff --git a/tools/compute-images/scripts/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg b/tools/compute-images/scripts/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg
new file mode 100644 (file)
index 0000000..febeda3
--- /dev/null
@@ -0,0 +1,9 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+runcmd:
+ - /bin/echo "cloudinit runcmd starting" | /usr/bin/logger
+ - /usr/local/bin/ensure-encrypted-partitions.sh
+ - /bin/echo "cloudinit runcmd ensure-encrypted-partitions.sh done" | /usr/bin/logger
+ - /bin/echo "cloudinit runcmd finished" | /usr/bin/logger
+ - /bin/touch /arvados-compute-node-boot.complete
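+# /arvados-compute-node-boot.complete is only a marker file; a boot probe
+# (for example an arvados-dispatch-cloud BootProbeCommand) can test for its
+# existence to tell when the node has finished booting.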
diff --git a/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions.sh b/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions.sh
new file mode 100644 (file)
index 0000000..08579bf
--- /dev/null
@@ -0,0 +1,159 @@
+#!/bin/bash
+
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+set -e
+set -x
+
+VGNAME=compute
+LVNAME=tmp
+LVPATH="/dev/mapper/${VGNAME}-${LVNAME}"
+CRYPTPATH=/dev/mapper/tmp
+MOUNTPATH=/tmp
+
+findmntq() {
+    findmnt "$@" >/dev/null
+}
+
+ensure_umount() {
+    if findmntq "$1"; then
+        umount "$1"
+    fi
+}
+
+if findmntq --source "$CRYPTPATH" --target "$MOUNTPATH"; then
+    exit 0
+fi
+
+CLOUD_SERVER=""
+while [[ ! "$CLOUD_SERVER" ]]; do
+    CLOUD_SERVER="$(curl --silent --head http://169.254.169.254/ \
+                    | awk '($1 == "Server:"){sub("\\r+$", ""); print substr($0, 9)}')"
+done
+
+DISK_PATTERN=""
+case "$CLOUD_SERVER" in
+    # EC2
+    EC2ws) DISK_PATTERN=/dev/xvd ;;
+    # GCP
+    "Metadata Server for VM") DISK_PATTERN=/dev/sd ;;
+    # Azure
+    Microsoft-IIS/*) DISK_PATTERN=/dev/sd ;;
+esac
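+# Example: on EC2 the metadata service answers "Server: EC2ws", so the extra
+# disks are expected at /dev/xvd*; on GCP and Azure they appear as /dev/sd*.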
+
+if [[ -z "$DISK_PATTERN" ]]; then
+    echo "ensure-encrypted-partitions: Unknown disk configuration; can't run." >&2
+    exit 3
+fi
+
+declare -a LVM_DEVS=()
+
+ROOT_PARTITION=`findmnt / -f -o source -n`
+if [[ "$ROOT_PARTITION" =~ ^\/dev\/nvme ]]; then
+  # e.g. /dev/nvme0n1p1, strip last 4 characters
+  ROOT_DEVICE_STRING=${ROOT_PARTITION%????}
+else
+  # e.g. /dev/xvda1, strip the partition digits
+  ROOT_DEVICE_STRING=${ROOT_PARTITION//[0-9]/}
+fi
+
+# Newer AWS node types use another pattern, /dev/nvmeXn1 for fast instance SSD disks
+if [[ "$CLOUD_SERVER" == "EC2ws" ]]; then
+  for dev in `ls /dev/nvme* 2>/dev/null`; do
+    if [[ "$dev" == "$ROOT_PARTITION" ]] || [[ "$dev" =~ ^$ROOT_DEVICE_STRING ]]; then
+      continue
+    fi
+    if [[ -e ${dev}n1 ]]; then
+      ensure_umount "${dev}n1"
+      # ${dev}n1 is a whole disk here, so wipe any existing partition table
+      dd if=/dev/zero of="${dev}n1" bs=512 count=1
+      LVM_DEVS+=("${dev}n1")
+    fi
+  done
+fi
+
+# Look for traditional disks but only if we're not on AWS or if we haven't found
+# a fast instance /dev/nvmeXn1 disk
+if [[ "$CLOUD_SERVER" != "EC2ws" ]] || [[ ${#LVM_DEVS[@]} -eq 0 ]]; then
+  for dev in `ls $DISK_PATTERN* 2>/dev/null`; do
+    # On Azure, we are dealing with /dev/sdb1, on GCP, /dev/sdb, on AWS, /dev/xvdb
+    if [[ "$dev" == "$ROOT_PARTITION" ]] || [[ "$dev" =~ ^$ROOT_DEVICE_STRING ]]; then
+      continue
+    fi
+    if [[ ! "$dev" =~ [a-z]$ ]]; then
+      continue
+    fi
+    if [[ -e ${dev}1 ]]; then
+        dev=${dev}1
+        devtype=partition
+    else
+        devtype=disk
+    fi
+    ensure_umount "$dev"
+    if [[ "$devtype" = disk ]]; then
+        dd if=/dev/zero of="$dev" bs=512 count=1
+    fi
+    LVM_DEVS+=("$dev")
+  done
+fi
+
+if [[ "${#LVM_DEVS[@]}" -eq 0 ]]; then
+    echo "ensure-encrypted-partitions: No extra disks found." >&2
+    exit 4
+fi
+
+vgcreate --force --yes "$VGNAME" "${LVM_DEVS[@]}"
+lvcreate --extents 100%FREE --name "$LVNAME" "$VGNAME"
+
+KEYPATH="$(mktemp -p /var/tmp key-XXXXXXXX.tmp)"
+modprobe -a dm_mod aes sha256
+head -c321 /dev/urandom >"$KEYPATH"
+echo YES | cryptsetup luksFormat "$LVPATH" "$KEYPATH"
+cryptsetup --key-file "$KEYPATH" luksOpen "$LVPATH" "$(basename "$CRYPTPATH")"
+shred -u "$KEYPATH"
+mkfs.xfs "$CRYPTPATH"
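+# The random key above exists only long enough for luksFormat/luksOpen and is
+# then shredded, so the encrypted scratch volume cannot be reopened after the
+# node shuts down; its contents are lost by design.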
+
+# First make sure docker is not using /tmp, then unmount everything under it.
+if [ -d /etc/sv/docker.io ]
+then
+  sv stop docker.io || service docker.io stop || true
+else
+  service docker stop || true
+fi
+
+ensure_umount "$MOUNTPATH/docker/aufs"
+
+MOUNTOPTIONS="async"
+mount -o ${MOUNTOPTIONS} "$CRYPTPATH" "$MOUNTPATH"
+chmod a+w,+t "$MOUNTPATH"
+
+# Make sure docker uses the big partition
+cat <<EOF > /etc/docker/daemon.json
+{
+    "data-root": "$MOUNTPATH/docker-data"
+}
+EOF
+
+# restart docker
+if [ -d /etc/sv/docker.io ]
+then
+  ## runit
+  sv up docker.io
+else
+  service docker start
+fi
+
+end=$((SECONDS+60))
+
+while [ $SECONDS -lt $end ]; do
+  if /usr/bin/docker ps -q >/dev/null; then
+    exit 0
+  fi
+  sleep 1
+done
+
+# Docker didn't start within a minute, abort
+exit 1
index 2c49dcae62f1a30be179735e15d42a6b1e148934..463c552c4f1eb5caf0868337858197a747bc8fa8 100644 (file)
@@ -364,7 +364,7 @@ class Summarizer(object):
                 constraint_key,
                 int(used_cores))
 
-    # FIXME: This needs to be updated to account for current nodemanager algorithms
+    # FIXME: This needs to be updated to account for current a-d-c algorithms
     def _recommend_ram(self):
         """Recommend an economical RAM constraint for this job.
 
index 163291c238773c257c831f25691cdb9be8cb777e..84e1a6ce8e26f93ecf70088e7b79f5142095ce65 100644 (file)
@@ -19,6 +19,8 @@
 package main
 
 import (
+       "bufio"
+       "context"
        "crypto/rand"
        "encoding/binary"
        "flag"
@@ -26,12 +28,17 @@ import (
        "io"
        "io/ioutil"
        "log"
+       mathRand "math/rand"
        "net/http"
        "os"
        "os/signal"
+       "strings"
+       "sync/atomic"
        "syscall"
        "time"
 
+       "git.arvados.org/arvados.git/lib/config"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/arvadosclient"
        "git.arvados.org/arvados.git/sdk/go/keepclient"
 )
@@ -51,8 +58,36 @@ var (
        ServiceUUID   = flag.String("uuid", "", "specify UUID of a single advertised keep service to exercise")
        getVersion    = flag.Bool("version", false, "Print version information and exit.")
        RunTime       = flag.Duration("run-time", 0, "time to run (e.g. 60s), or 0 to run indefinitely (default)")
+       Repeat        = flag.Int("repeat", 1, "number of times to repeat the experiment (default 1)")
+       UseIndex      = flag.Bool("use-index", false, "use the GetIndex call to get a list of blocks to read. Requires the SystemRoot token. Use this to rule out caching effects when reading.")
 )
 
+func createKeepClient(lgr *log.Logger) (kc *keepclient.KeepClient) {
+       arv, err := arvadosclient.MakeArvadosClient()
+       if err != nil {
+               lgr.Fatal(err)
+       }
+       kc, err = keepclient.MakeKeepClient(arv)
+       if err != nil {
+               lgr.Fatal(err)
+       }
+       kc.Want_replicas = *Replicas
+
+       kc.HTTPClient = &http.Client{
+               Timeout: 10 * time.Minute,
+               // It's not safe to copy *http.DefaultTransport
+               // because it has a mutex (which might be locked)
+               // protecting a private map (which might not be nil).
+               // So we build our own, using the Go 1.12 default
+               // values.
+               Transport: &http.Transport{
+                       TLSClientConfig: arvadosclient.MakeTLSConfig(arv.ApiInsecure),
+               },
+       }
+       overrideServices(kc, lgr)
+       return kc
+}
+
 func main() {
        flag.Parse()
 
@@ -62,79 +97,153 @@ func main() {
                os.Exit(0)
        }
 
-       stderr := log.New(os.Stderr, "", log.LstdFlags)
+       lgr := log.New(os.Stderr, "", log.LstdFlags)
 
-       arv, err := arvadosclient.MakeArvadosClient()
-       if err != nil {
-               stderr.Fatal(err)
+       if *ReadThreads > 0 && *WriteThreads == 0 && !*UseIndex {
+               lgr.Fatal("At least one write thread is required if rthreads is non-zero and -use-index is not enabled")
        }
-       kc, err := keepclient.MakeKeepClient(arv)
-       if err != nil {
-               stderr.Fatal(err)
+
+       if *ReadThreads == 0 && *WriteThreads == 0 {
+               lgr.Fatal("Nothing to do!")
        }
-       kc.Want_replicas = *Replicas
 
-       transport := *(http.DefaultTransport.(*http.Transport))
-       transport.TLSClientConfig = arvadosclient.MakeTLSConfig(arv.ApiInsecure)
-       kc.HTTPClient = &http.Client{
-               Timeout:   10 * time.Minute,
-               Transport: &transport,
+       kc := createKeepClient(lgr)
+
+       // When UseIndex is set, we need a KeepClient with SystemRoot powers to get
+       // the block index from the Keepstore. We use the SystemRootToken from
+       // the Arvados config.yml for that.
+       var cluster *arvados.Cluster
+       if *ReadThreads > 0 && *UseIndex {
+               cluster = loadConfig(lgr)
+               kc.Arvados.ApiToken = cluster.SystemRootToken
        }
 
-       overrideServices(kc, stderr)
+       ctx, cancel := context.WithCancel(context.Background())
+       defer cancel()
+       sigChan := make(chan os.Signal, 1)
+       signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
+       go func() {
+               <-sigChan
+               // FIXME
+               //fmt.Print("\r") // Suppress the ^C print
+               cancel()
+       }()
 
-       nextLocator := make(chan string, *ReadThreads+*WriteThreads)
+       csvHeader := "Timestamp,Elapsed,Read (bytes),Avg Read Speed (MiB/s),Peak Read Speed (MiB/s),Written (bytes),Avg Write Speed (MiB/s),Peak Write Speed (MiB/s),Errors,ReadThreads,WriteThreads,VaryRequest,VaryThread,BlockSize,Replicas,StatsInterval,ServiceURL,ServiceUUID,UseIndex,RunTime,Repeat"
+       var summary string
 
-       go countBeans(nextLocator, stderr)
+       var nextBufs []chan []byte
        for i := 0; i < *WriteThreads; i++ {
                nextBuf := make(chan []byte, 1)
-               go makeBufs(nextBuf, i, stderr)
-               go doWrites(kc, nextBuf, nextLocator, stderr)
+               nextBufs = append(nextBufs, nextBuf)
+               go makeBufs(nextBuf, i, lgr)
        }
-       for i := 0; i < *ReadThreads; i++ {
-               go doReads(kc, nextLocator, stderr)
+
+       for i := 0; i < *Repeat && ctx.Err() == nil; i++ {
+               summary = runExperiment(ctx, cluster, kc, nextBufs, summary, csvHeader, lgr)
+               lgr.Printf("*************************** experiment %d complete ******************************\n", i)
+               summary += fmt.Sprintf(",%d\n", i)
        }
-       <-make(chan struct{})
+
+       lgr.Println("Summary:")
+       lgr.Println()
+       fmt.Println()
+       fmt.Println(csvHeader + ",Experiment")
+       fmt.Println(summary)
 }
 
-// Send 1234 to bytesInChan when we receive 1234 bytes from keepstore.
-var bytesInChan = make(chan uint64)
-var bytesOutChan = make(chan uint64)
+func runExperiment(ctx context.Context, cluster *arvados.Cluster, kc *keepclient.KeepClient, nextBufs []chan []byte, summary string, csvHeader string, lgr *log.Logger) (newSummary string) {
+       // Send 1234 to bytesInChan when we receive 1234 bytes from keepstore.
+       var bytesInChan = make(chan uint64)
+       var bytesOutChan = make(chan uint64)
+       // Send struct{}{} to errorsChan when an error happens.
+       var errorsChan = make(chan struct{})
+
+       var nextLocator atomic.Value
+       // when UseIndex is set, this channel is used instead of nextLocator
+       var indexLocatorChan = make(chan string, 2)
+
+       newSummary = summary
 
-// Send struct{}{} to errorsChan when an error happens.
-var errorsChan = make(chan struct{})
+       // Start warmup
+       ready := make(chan struct{})
+       var warmup bool
+       if *ReadThreads > 0 {
+               warmup = true
+               if !*UseIndex {
+                       lgr.Printf("Start warmup phase, waiting for 1 available block before reading starts\n")
+               } else {
+                       lgr.Printf("Start warmup phase, waiting for block index before reading starts\n")
+               }
+       }
+       if warmup && !*UseIndex {
+               go func() {
+                       locator, _, err := kc.PutB(<-nextBufs[0])
+                       if err != nil {
+                               lgr.Print(err)
+                               errorsChan <- struct{}{}
+                       }
+                       nextLocator.Store(locator)
+                       lgr.Println("Warmup complete!")
+                       close(ready)
+               }()
+       } else if warmup && *UseIndex {
+               // Get list of blocks to read
+               go getIndexLocators(ctx, cluster, kc, indexLocatorChan, lgr)
+               select {
+               case <-ctx.Done():
+                       return
+               case <-indexLocatorChan:
+                       lgr.Println("Warmup complete!")
+                       close(ready)
+               }
+       } else {
+               close(ready)
+       }
+       select {
+       case <-ctx.Done():
+               return
+       case <-ready:
+       }
+
+       // Warmup complete
+       ctx, cancel := context.WithDeadline(ctx, time.Now().Add(*RunTime))
+       defer cancel()
+
+       for i := 0; i < *WriteThreads; i++ {
+               go doWrites(ctx, kc, nextBufs[i], &nextLocator, bytesOutChan, errorsChan, lgr)
+       }
+       if *UseIndex {
+               for i := 0; i < *ReadThreads; i++ {
+                       go doReads(ctx, kc, nil, indexLocatorChan, bytesInChan, errorsChan, lgr)
+               }
+       } else {
+               for i := 0; i < *ReadThreads; i++ {
+                       go doReads(ctx, kc, &nextLocator, nil, bytesInChan, errorsChan, lgr)
+               }
+       }
 
-func countBeans(nextLocator chan string, stderr *log.Logger) {
        t0 := time.Now()
        var tickChan <-chan time.Time
-       var endChan <-chan time.Time
-       c := make(chan os.Signal)
-       signal.Notify(c, os.Interrupt, syscall.SIGTERM)
        if *StatsInterval > 0 {
                tickChan = time.NewTicker(*StatsInterval).C
        }
-       if *RunTime > 0 {
-               endChan = time.NewTicker(*RunTime).C
-       }
        var bytesIn uint64
        var bytesOut uint64
        var errors uint64
        var rateIn, rateOut float64
        var maxRateIn, maxRateOut float64
-       var abort, printCsv bool
+       var exit, printCsv bool
        csv := log.New(os.Stdout, "", 0)
-       csv.Println("Timestamp,Elapsed,Read (bytes),Avg Read Speed (MiB/s),Peak Read Speed (MiB/s),Written (bytes),Avg Write Speed (MiB/s),Peak Write Speed (MiB/s),Errors,ReadThreads,WriteThreads,VaryRequest,VaryThread,BlockSize,Replicas,StatsInterval,ServiceURL,ServiceUUID,RunTime")
+       csv.Println()
+       csv.Println(csvHeader)
        for {
                select {
-               case <-tickChan:
+               case <-ctx.Done():
                        printCsv = true
-               case <-endChan:
-                       printCsv = true
-                       abort = true
-               case <-c:
+                       exit = true
+               case <-tickChan:
                        printCsv = true
-                       abort = true
-                       fmt.Print("\r") // Suppress the ^C print
                case i := <-bytesInChan:
                        bytesIn += i
                case o := <-bytesOutChan:
@@ -152,8 +261,8 @@ func countBeans(nextLocator chan string, stderr *log.Logger) {
                        if rateOut > maxRateOut {
                                maxRateOut = rateOut
                        }
-                       csv.Printf("%v,%v,%v,%.1f,%.1f,%v,%.1f,%.1f,%d,%d,%d,%t,%t,%d,%d,%s,%s,%s,%s",
-                               time.Now().Format("2006-01-02 15:04:05"),
+                       line := fmt.Sprintf("%v,%v,%v,%.1f,%.1f,%v,%.1f,%.1f,%d,%d,%d,%t,%t,%d,%d,%s,%s,%s,%t,%s,%d",
+                               time.Now().Format("2006/01/02 15:04:05"),
                                elapsed,
                                bytesIn, rateIn, maxRateIn,
                                bytesOut, rateOut, maxRateOut,
@@ -167,17 +276,21 @@ func countBeans(nextLocator chan string, stderr *log.Logger) {
                                *StatsInterval,
                                *ServiceURL,
                                *ServiceUUID,
+                               *UseIndex,
                                *RunTime,
+                               *Repeat,
                        )
+                       csv.Println(line)
+                       if exit {
+                               newSummary += line
+                               return
+                       }
                        printCsv = false
                }
-               if abort {
-                       os.Exit(0)
-               }
        }
 }
 
-func makeBufs(nextBuf chan<- []byte, threadID int, stderr *log.Logger) {
+func makeBufs(nextBuf chan<- []byte, threadID int, lgr *log.Logger) {
        buf := make([]byte, *BlockSize)
        if *VaryThread {
                binary.PutVarint(buf, int64(threadID))
@@ -190,7 +303,7 @@ func makeBufs(nextBuf chan<- []byte, threadID int, stderr *log.Logger) {
                if *VaryRequest {
                        rnd := make([]byte, randSize)
                        if _, err := io.ReadFull(rand.Reader, rnd); err != nil {
-                               stderr.Fatal(err)
+                               lgr.Fatal(err)
                        }
                        buf = append(rnd, buf[randSize:]...)
                }
@@ -198,35 +311,101 @@ func makeBufs(nextBuf chan<- []byte, threadID int, stderr *log.Logger) {
        }
 }
 
-func doWrites(kc *keepclient.KeepClient, nextBuf <-chan []byte, nextLocator chan<- string, stderr *log.Logger) {
-       for buf := range nextBuf {
+func doWrites(ctx context.Context, kc *keepclient.KeepClient, nextBuf <-chan []byte, nextLocator *atomic.Value, bytesOutChan chan<- uint64, errorsChan chan<- struct{}, lgr *log.Logger) {
+       for ctx.Err() == nil {
+               //lgr.Printf("%s nextbuf %s, waiting for nextBuf\n",nextBuf,time.Now())
+               buf := <-nextBuf
+               //lgr.Printf("%s nextbuf %s, done waiting for nextBuf\n",nextBuf,time.Now())
                locator, _, err := kc.PutB(buf)
                if err != nil {
-                       stderr.Print(err)
+                       lgr.Print(err)
                        errorsChan <- struct{}{}
                        continue
                }
                bytesOutChan <- uint64(len(buf))
-               for cap(nextLocator) > len(nextLocator)+*WriteThreads {
-                       // Give the readers something to do, unless
-                       // they have lots queued up already.
-                       nextLocator <- locator
+               nextLocator.Store(locator)
+       }
+}
+
+func getIndexLocators(ctx context.Context, cluster *arvados.Cluster, kc *keepclient.KeepClient, indexLocatorChan chan<- string, lgr *log.Logger) {
+       if ctx.Err() != nil {
+               return
+       }
+       locatorsMap := make(map[string]bool)
+       var locators []string
+       var count int64
+       for uuid := range kc.LocalRoots() {
+               reader, err := kc.GetIndex(uuid, "")
+               if err != nil {
+                       lgr.Fatalf("Error getting index: %s\n", err)
                }
+               scanner := bufio.NewScanner(reader)
+               for scanner.Scan() {
+                       locatorsMap[strings.Split(scanner.Text(), " ")[0]] = true
+                       count++
+               }
+       }
+       for l := range locatorsMap {
+               locators = append(locators, l)
+       }
+       lgr.Printf("Found %d locators\n", count)
+       lgr.Printf("Found %d locators (deduplicated)\n", len(locators))
+       if len(locators) < 1 {
+               lgr.Fatal("Error: no locators found. The keepstores do not seem to contain any data. Remove the -use-index cli argument.")
        }
+
+       mathRand.Seed(time.Now().UnixNano())
+       mathRand.Shuffle(len(locators), func(i, j int) { locators[i], locators[j] = locators[j], locators[i] })
+
+       for _, locator := range locators {
+               // We need the Collections.BlobSigningKey to sign our block requests. This requires access to /etc/arvados/config.yml
+               signedLocator := arvados.SignLocator(locator, kc.Arvados.ApiToken, time.Now().Local().Add(1*time.Hour), cluster.Collections.BlobSigningTTL.Duration(), []byte(cluster.Collections.BlobSigningKey))
+               select {
+               case <-ctx.Done():
+                       return
+               case indexLocatorChan <- signedLocator:
+               }
+       }
+       lgr.Fatal("Error: ran out of locators to read!")
 }
 
-func doReads(kc *keepclient.KeepClient, nextLocator <-chan string, stderr *log.Logger) {
-       for locator := range nextLocator {
+func loadConfig(lgr *log.Logger) (cluster *arvados.Cluster) {
+       loader := config.NewLoader(os.Stdin, nil)
+       loader.SkipLegacy = true
+
+       cfg, err := loader.Load()
+       if err != nil {
+               lgr.Fatal(err)
+       }
+       cluster, err = cfg.GetCluster("")
+       if err != nil {
+               lgr.Fatal(err)
+       }
+       return
+}
+
+func doReads(ctx context.Context, kc *keepclient.KeepClient, nextLocator *atomic.Value, indexLocatorChan <-chan string, bytesInChan chan<- uint64, errorsChan chan<- struct{}, lgr *log.Logger) {
+       for ctx.Err() == nil {
+               var locator string
+               if indexLocatorChan != nil {
+                       select {
+                       case <-ctx.Done():
+                               return
+                       case locator = <-indexLocatorChan:
+                       }
+               } else {
+                       locator = nextLocator.Load().(string)
+               }
                rdr, size, url, err := kc.Get(locator)
                if err != nil {
-                       stderr.Print(err)
+                       lgr.Print(err)
                        errorsChan <- struct{}{}
                        continue
                }
                n, err := io.Copy(ioutil.Discard, rdr)
                rdr.Close()
                if n != size || err != nil {
-                       stderr.Printf("Got %d bytes (expected %d) from %s: %v", n, size, url, err)
+                       lgr.Printf("Got %d bytes (expected %d) from %s: %v", n, size, url, err)
                        errorsChan <- struct{}{}
                        continue
                        // Note we don't count the bytes received in
@@ -237,7 +416,7 @@ func doReads(kc *keepclient.KeepClient, nextLocator <-chan string, stderr *log.L
        }
 }
 
-func overrideServices(kc *keepclient.KeepClient, stderr *log.Logger) {
+func overrideServices(kc *keepclient.KeepClient, lgr *log.Logger) {
        roots := make(map[string]string)
        if *ServiceURL != "" {
                roots["zzzzz-bi6l4-000000000000000"] = *ServiceURL
@@ -249,7 +428,7 @@ func overrideServices(kc *keepclient.KeepClient, stderr *log.Logger) {
                        }
                }
                if len(roots) == 0 {
-                       stderr.Fatalf("Service %q was not in list advertised by API %+q", *ServiceUUID, kc.GatewayRoots())
+                       lgr.Fatalf("Service %q was not in list advertised by API %+q", *ServiceUUID, kc.GatewayRoots())
                }
        } else {
                return