16580: Merge branch 'master' into 16580-remove-python2-packages
authorWard Vandewege <ward@curii.com>
Tue, 18 Aug 2020 18:22:52 +0000 (14:22 -0400)
committerWard Vandewege <ward@curii.com>
Tue, 18 Aug 2020 18:23:18 +0000 (14:23 -0400)
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>

120 files changed:
build/package-testing/test-package-arvados-docker-cleaner.sh [new file with mode: 0755]
build/package-testing/test-package-arvados-node-manager.sh [deleted file]
build/package-testing/test-package-python-arvados-cwl-runner.sh [deleted symlink]
build/package-testing/test-package-python-arvados-fuse.sh [deleted symlink]
build/package-testing/test-package-python-arvados-python-client.sh [deleted symlink]
build/package-testing/test-package-python-cwltest.sh [deleted symlink]
build/package-testing/test-package-python27-python-arvados-python-client.sh [deleted file]
build/package-testing/test-package-python3-crunchstat-summary.sh [moved from services/nodemanager/arvnodeman/test/__init__.py with 66% similarity, mode: 0755]
build/package-testing/test-package-python3-cwltest.sh [moved from build/package-testing/test-package-python27-python-arvados-cwl-runner.sh with 79% similarity]
build/package-testing/test-package-python3-python-arvados-fuse.sh [changed from symlink to file mode: 0755]
build/package-testing/test-package-rh-python36-python-arvados-cwl-runner.sh [new file with mode: 0755]
build/package-testing/test-package-rh-python36-python-arvados-fuse.sh [moved from build/package-testing/test-package-python27-python-arvados-fuse.sh with 100% similarity]
build/package-testing/test-package-rh-python36-python-crunchstat-summary.sh [new file with mode: 0755]
build/package-testing/test-package-rh-python36-python-cwltest.sh [moved from build/package-testing/test-package-python27-python-cwltest.sh with 74% similarity]
doc/install/install-nodemanager.html.textile.liquid [deleted file]
sdk/pam/.dockerignore [deleted file]
sdk/pam/.gitignore [deleted symlink]
sdk/pam/Dockerfile [deleted file]
sdk/pam/LICENSE-2.0.txt [deleted file]
sdk/pam/MANIFEST.in [deleted file]
sdk/pam/README.rst [deleted file]
sdk/pam/arvados_pam/__init__.py [deleted file]
sdk/pam/arvados_pam/auth_event.py [deleted file]
sdk/pam/arvados_version.py [deleted file]
sdk/pam/examples/shellinabox [deleted file]
sdk/pam/fpm-info.sh [deleted file]
sdk/pam/gittaggers.py [deleted symlink]
sdk/pam/integration_tests/__init__.py [deleted file]
sdk/pam/integration_tests/test_pam.py [deleted file]
sdk/pam/lib/libpam_arvados.py [deleted file]
sdk/pam/pam-configs/arvados [deleted file]
sdk/pam/setup.py [deleted file]
sdk/pam/tests/__init__.py [deleted file]
sdk/pam/tests/integration_test.pl [deleted file]
sdk/pam/tests/mocker.py [deleted file]
sdk/pam/tests/test_auth_event.py [deleted file]
sdk/pam/tests/test_pam_sm.py [deleted file]
services/nodemanager/.gitignore [deleted symlink]
services/nodemanager/MANIFEST.in [deleted file]
services/nodemanager/README.rst [deleted file]
services/nodemanager/agpl-3.0.txt [deleted file]
services/nodemanager/arvados-node-manager.service [deleted file]
services/nodemanager/arvados_version.py [deleted file]
services/nodemanager/arvnodeman/__init__.py [deleted file]
services/nodemanager/arvnodeman/baseactor.py [deleted file]
services/nodemanager/arvnodeman/clientactor.py [deleted file]
services/nodemanager/arvnodeman/computenode/__init__.py [deleted file]
services/nodemanager/arvnodeman/computenode/dispatch/__init__.py [deleted file]
services/nodemanager/arvnodeman/computenode/dispatch/slurm.py [deleted file]
services/nodemanager/arvnodeman/computenode/dispatch/transitions.py [deleted file]
services/nodemanager/arvnodeman/computenode/driver/__init__.py [deleted file]
services/nodemanager/arvnodeman/computenode/driver/azure.py [deleted file]
services/nodemanager/arvnodeman/computenode/driver/dummy.py [deleted file]
services/nodemanager/arvnodeman/computenode/driver/ec2.py [deleted file]
services/nodemanager/arvnodeman/computenode/driver/gce.py [deleted file]
services/nodemanager/arvnodeman/config.py [deleted file]
services/nodemanager/arvnodeman/daemon.py [deleted file]
services/nodemanager/arvnodeman/jobqueue.py [deleted file]
services/nodemanager/arvnodeman/launcher.py [deleted file]
services/nodemanager/arvnodeman/nodelist.py [deleted file]
services/nodemanager/arvnodeman/status.py [deleted file]
services/nodemanager/arvnodeman/test/fake_driver.py [deleted file]
services/nodemanager/arvnodeman/timedcallback.py [deleted file]
services/nodemanager/bin/arvados-node-manager [deleted file]
services/nodemanager/doc/azure.example.cfg [deleted file]
services/nodemanager/doc/ec2.example.cfg [deleted file]
services/nodemanager/doc/gce.example.cfg [deleted file]
services/nodemanager/doc/local.example.cfg [deleted file]
services/nodemanager/fpm-info.sh [deleted file]
services/nodemanager/gittaggers.py [deleted symlink]
services/nodemanager/setup.py [deleted file]
services/nodemanager/tests/__init__.py [deleted file]
services/nodemanager/tests/fake_azure.cfg.template [deleted file]
services/nodemanager/tests/fake_ec2.cfg.template [deleted file]
services/nodemanager/tests/fake_gce.cfg.template [deleted file]
services/nodemanager/tests/integration_test.py [deleted file]
services/nodemanager/tests/stress_test.cwl [deleted file]
services/nodemanager/tests/test_arguments.py [deleted file]
services/nodemanager/tests/test_clientactor.py [deleted file]
services/nodemanager/tests/test_computenode.py [deleted file]
services/nodemanager/tests/test_computenode_dispatch.py [deleted file]
services/nodemanager/tests/test_computenode_dispatch_slurm.py [deleted file]
services/nodemanager/tests/test_computenode_driver.py [deleted file]
services/nodemanager/tests/test_computenode_driver_azure.py [deleted file]
services/nodemanager/tests/test_computenode_driver_ec2.py [deleted file]
services/nodemanager/tests/test_computenode_driver_gce.py [deleted file]
services/nodemanager/tests/test_config.py [deleted file]
services/nodemanager/tests/test_daemon.py [deleted file]
services/nodemanager/tests/test_failure.py [deleted file]
services/nodemanager/tests/test_jobqueue.py [deleted file]
services/nodemanager/tests/test_nodelist.py [deleted file]
services/nodemanager/tests/test_status.py [deleted file]
services/nodemanager/tests/test_timedcallback.py [deleted file]
services/nodemanager/tests/testutil.py [deleted file]

index ba08f34bcd46ebffd64adb7d387714f4b97d189b..857a9c8ebca0787801da8260d90bd0f0bcfaada2 100755 (executable)
@@ -86,15 +86,12 @@ do
             | *.py \
             | sdk/python/bin/arv-* \
             | sdk/cwl/bin/* \
-            | services/nodemanager/bin/* \
             | services/fuse/bin/* \
             | tools/crunchstat-summary/bin/* \
             | crunch_scripts/* \
             | *.yaml | *.yml | *.yml.example | *.cwl \
             | *.sh | *.service \
             | */run | */run-service | */restart-dns-server \
-            | */nodemanager/doc/*.cfg \
-            | */nodemanager/tests/fake*.cfg.template \
             | */nginx.conf \
             | build/build.list | *.R)
index 8ccab49e1e7d3d9e7c557c48758b8b146386db35..5d204464cff89c27b0e21158fb42bbb77adc12cc 100644 (file)
@@ -6,7 +6,7 @@ FROM centos:7
 MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 # Install dependencies.
-RUN yum -q -y install make automake gcc gcc-c++ libyaml-devel patch readline-devel zlib-devel libffi-devel openssl-devel bzip2 libtool bison sqlite-devel rpm-build git perl-ExtUtils-MakeMaker libattr-devel nss-devel libcurl-devel which tar unzip scl-utils centos-release-scl postgresql-devel python-devel python-setuptools fuse-devel xz-libs git python-virtualenv wget pam-devel
+RUN yum -q -y install make automake gcc gcc-c++ libyaml-devel patch readline-devel zlib-devel libffi-devel openssl-devel bzip2 libtool bison sqlite-devel rpm-build git perl-ExtUtils-MakeMaker libattr-devel nss-devel libcurl-devel which tar unzip scl-utils centos-release-scl postgresql-devel fuse-devel xz-libs git wget pam-devel
 # Install RVM
 ADD generated/mpapis.asc /tmp/
@@ -41,16 +41,16 @@ RUN ln -s /usr/local/node-v6.11.2-linux-x64/bin/* /usr/local/bin/
 # Need to "touch" RPM database to workaround bug in interaction between
 # overlayfs and yum (https://bugzilla.redhat.com/show_bug.cgi?id=1213602)
 RUN touch /var/lib/rpm/* && yum -q -y install rh-python36
-RUN scl enable rh-python36 "easy_install-3.6 pip" && easy_install-2.7 pip
+RUN scl enable rh-python36 "easy_install-3.6 pip"
 # Add epel, we need it for the python-pam dependency
-RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
-RUN rpm -ivh epel-release-latest-7.noarch.rpm
+#RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+#RUN rpm -ivh epel-release-latest-7.noarch.rpm
 RUN git clone --depth 1 git://git.arvados.org/arvados.git /tmp/arvados && cd /tmp/arvados/services/api && /usr/local/rvm/bin/rvm-exec default bundle && cd /tmp/arvados/apps/workbench && /usr/local/rvm/bin/rvm-exec default bundle
 # The version of setuptools that comes with CentOS is way too old
-RUN pip install --upgrade 'setuptools<45'
+RUN scl enable rh-python36 "easy_install-3.6 pip install 'setuptools<45'"
 CMD ["scl", "enable", "rh-python36", "/usr/local/rvm/bin/rvm-exec default bash /jenkins/run-build-packages.sh --target centos7"]
index 90dfd36b52f66afb6f49c946df761fcd1651ac53..4f306c6aa4e8ca4241e39f87fcbf403b401ab431 100644 (file)
@@ -4,15 +4,15 @@
 ## dont use debian:10 here since the word 'buster' is used for rvm precompiled binaries
 FROM debian:buster
-MAINTAINER Ward Vandewege <wvandewege@veritasgenetics.com>
+MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 ENV DEBIAN_FRONTEND noninteractive
 # Install dependencies.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools python3-pip libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev python-pip unzip python3-venv python3-dev libpam-dev
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python3 python3-setuptools python3-pip libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev unzip python3-venv python3-dev libpam-dev
 # Install virtualenv
-RUN /usr/bin/pip install 'virtualenv<20'
+RUN /usr/bin/pip3 install 'virtualenv<20'
 # Install RVM
 ADD generated/mpapis.asc /tmp/
index 1a84da280898d3010ea6c8bf5978bc0da648f891..5294997f054658d5f3fb5b7366af0d69eab663a8 100644 (file)
@@ -4,15 +4,15 @@
 ## dont use debian:9 here since the word 'stretch' is used for rvm precompiled binaries
 FROM debian:stretch
-MAINTAINER Nico Cesar <nico@curoverse.com>
+MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 ENV DEBIAN_FRONTEND noninteractive
 # Install dependencies.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools python3-pip libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev python-pip unzip python3-venv python3-dev libpam-dev
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python3 python3-setuptools python3-pip libcurl4-gnutls-dev curl git procps libattr1-dev libfuse-dev libgnutls28-dev libpq-dev unzip python3-venv python3-dev libpam-dev
 # Install virtualenv
-RUN /usr/bin/pip install 'virtualenv<20'
+RUN /usr/bin/pip3 install 'virtualenv<20'
 # Install RVM
 ADD generated/mpapis.asc /tmp/
index 87f7712d50be68aceb65612b33154bc267b0a10c..202bab651322dd9d91cd8ea415a7146b5931f9ce 100644 (file)
@@ -8,10 +8,10 @@ MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 ENV DEBIAN_FRONTEND noninteractive
 # Install dependencies.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-setuptools python3-pip libcurl4-gnutls-dev libgnutls-dev curl git libattr1-dev libfuse-dev libpq-dev python-pip unzip tzdata python3-venv python3-dev libpam-dev
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python3 python-setuptools python3-setuptools python3-pip libcurl4-gnutls-dev libgnutls-dev curl git libattr1-dev libfuse-dev libpq-dev unzip tzdata python3-venv python3-dev libpam-dev
 # Install virtualenv
-RUN /usr/bin/pip install 'virtualenv<20'
+RUN /usr/bin/pip3 install 'virtualenv<20'
 # Install RVM
 ADD generated/mpapis.asc /tmp/
index a2ec29da1cf3932134b3f524608fbcb0c0b72691..05023aa09af50e5384e69db80ed5b253c91d72bb 100644 (file)
@@ -8,10 +8,10 @@ MAINTAINER Arvados Package Maintainers <packaging@arvados.org>
 ENV DEBIAN_FRONTEND noninteractive
 # Install dependencies.
-RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python2.7-dev python3 python-setuptools python3-pip libcurl4-gnutls-dev libgnutls28-dev curl git libattr1-dev libfuse-dev libpq-dev python-pip unzip tzdata python3-venv python3-dev libpam-dev
+RUN /usr/bin/apt-get update && /usr/bin/apt-get install -q -y python3 python3-pip libcurl4-gnutls-dev libgnutls28-dev curl git libattr1-dev libfuse-dev libpq-dev unzip tzdata python3-venv python3-dev libpam-dev
 # Install virtualenv
-RUN /usr/bin/pip install 'virtualenv<20'
+RUN /usr/bin/pip3 install 'virtualenv<20'
 # Install RVM
 ADD generated/mpapis.asc /tmp/
diff --git a/build/package-testing/test-package-arvados-docker-cleaner.sh b/build/package-testing/test-package-arvados-docker-cleaner.sh
new file mode 100755 (executable)
index 0000000..6b344de
--- /dev/null
@@ -0,0 +1,8 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+# SPDX-License-Identifier: AGPL-3.0
+set -e
+arvados-docker-cleaner -h >/dev/null
diff --git a/build/package-testing/test-package-arvados-node-manager.sh b/build/package-testing/test-package-arvados-node-manager.sh
deleted file mode 100755 (executable)
index 9300f4c..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-set -e
-arvados-node-manager --version
-exec /usr/share/python2.7/dist/arvados-node-manager/bin/python2.7 <<EOF
-import libcloud.compute.types
-import libcloud.compute.providers
-print "Successfully imported compatible libcloud library"
diff --git a/build/package-testing/test-package-python-arvados-cwl-runner.sh b/build/package-testing/test-package-python-arvados-cwl-runner.sh
deleted file mode 120000 (symlink)
index 61e61b1..0000000
+++ /dev/null
@@ -1 +0,0 @@
\ No newline at end of file
diff --git a/build/package-testing/test-package-python-arvados-fuse.sh b/build/package-testing/test-package-python-arvados-fuse.sh
deleted file mode 120000 (symlink)
index 3b9232c..0000000
+++ /dev/null
@@ -1 +0,0 @@
\ No newline at end of file
diff --git a/build/package-testing/test-package-python-arvados-python-client.sh b/build/package-testing/test-package-python-arvados-python-client.sh
deleted file mode 120000 (symlink)
index 8a4d0ea..0000000
+++ /dev/null
@@ -1 +0,0 @@
\ No newline at end of file
diff --git a/build/package-testing/test-package-python-cwltest.sh b/build/package-testing/test-package-python-cwltest.sh
deleted file mode 120000 (symlink)
index 9b6545b..0000000
+++ /dev/null
@@ -1 +0,0 @@
\ No newline at end of file
diff --git a/build/package-testing/test-package-python27-python-arvados-python-client.sh b/build/package-testing/test-package-python27-python-arvados-python-client.sh
deleted file mode 100755 (executable)
index 2c92a3e..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-set -e
-arv-put --version
-/usr/share/python2.7/dist/python-arvados-python-client/bin/python2.7 << EOF
-import arvados
-print "Successfully imported arvados"
index 99327c016ad618dbf69971a0960e19def60469e9..ebf7b5becda9d94e371c462ce7bc5de278c82c1b 100755 (executable)
@@ -5,4 +5,4 @@
 set -e
-arvados-cwl-runner --version
+arvados-cwl-runner --version >/dev/null
index d4e66a27b9510ca06b50c2704c8e2bdee70a17d9..69f728c10e5c335967fac801c9f131726bce18a6 100755 (executable)
@@ -5,7 +5,7 @@
 set -e
-arv-put --version
+arv-put --version >/dev/null
 /usr/share/python3/dist/python3-arvados-python-client/bin/python3 << EOF
 import arvados
old mode 100644 (file)
new mode 100755 (executable)
similarity index 66%
rename from services/nodemanager/arvnodeman/test/__init__.py
rename to build/package-testing/test-package-python3-crunchstat-summary.sh
index d3ac1c2..02b6e0d
@@ -1,5 +1,8 @@
 # Copyright (C) The Arvados Authors. All rights reserved.
 # SPDX-License-Identifier: AGPL-3.0
+set -e
+crunchstat-summary -h >/dev/null
similarity index 79%
rename from build/package-testing/test-package-python27-python-arvados-cwl-runner.sh
rename to build/package-testing/test-package-python3-cwltest.sh
index 99327c016ad618dbf69971a0960e19def60469e9..77f1f44016d80bfe3e19c33cde150e8da65e1778 100755 (executable)
@@ -5,4 +5,4 @@
 set -e
-arvados-cwl-runner --version
+cwltest -h >/dev/null
deleted file mode 120000 (symlink)
index 3b9232c5fa6ccac4a9f1fdaf3e8b1703934959ed..0000000000000000000000000000000000000000
+++ /dev/null
@@ -1 +0,0 @@
\ No newline at end of file
new file mode 100755 (executable)
index 0000000000000000000000000000000000000000..81929857b8eaa6791a3e47e196f578de6f17b9a0
--- /dev/null
@@ -0,0 +1,8 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+# SPDX-License-Identifier: AGPL-3.0
+set -e
+arv-mount --version
diff --git a/build/package-testing/test-package-rh-python36-python-arvados-cwl-runner.sh b/build/package-testing/test-package-rh-python36-python-arvados-cwl-runner.sh
new file mode 100755 (executable)
index 0000000..ebf7b5b
--- /dev/null
@@ -0,0 +1,8 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+# SPDX-License-Identifier: AGPL-3.0
+set -e
+arvados-cwl-runner --version >/dev/null
diff --git a/build/package-testing/test-package-rh-python36-python-crunchstat-summary.sh b/build/package-testing/test-package-rh-python36-python-crunchstat-summary.sh
new file mode 100755 (executable)
index 0000000..02b6e0d
--- /dev/null
@@ -0,0 +1,8 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+# SPDX-License-Identifier: AGPL-3.0
+set -e
+crunchstat-summary -h >/dev/null
similarity index 74%
rename from build/package-testing/test-package-python27-python-cwltest.sh
rename to build/package-testing/test-package-rh-python36-python-cwltest.sh
index 395cefc5138ceba7647ad35995c1c8860466e424..77f1f44016d80bfe3e19c33cde150e8da65e1778 100755 (executable)
@@ -3,6 +3,6 @@
 # SPDX-License-Identifier: AGPL-3.0
-exec python <<EOF
-import cwltest
+set -e
+cwltest -h >/dev/null
index f8816dbe4873c3fad3773d47590393d1e62b5550..d0a79ad3dfa2fdf04cab380f321602fac66df618 100755 (executable)
@@ -192,27 +192,47 @@ popd
 if test -z "$packages" ; then
+        arvados-controller
+        arvados-dispatch-cloud
-        arvados-node-manager
+        arvados-health
+        arvados-server
+        arvados-sync-groups
+        arvados-workbench2
+        arvados-ws
+        keepproxy
+        keepstore
-        keepproxy
-        keepstore
+        keep-exercise
+        keep-rsync
+        keep-block-check
-        libpam-arvados
-        libpam-arvados-go
-        python-arvados-fuse
-        python-arvados-python-client
-        python-arvados-cwl-runner"
+        libpam-arvados-go"
+    if [[ "$TARGET" =~ "centos" ]]; then
+      packages="$packages
+        rh-python36-python-cwltest
+        rh-python36-python-arvados-fuse
+        rh-python36-python-arvados-python-client
+        rh-python36-python-arvados-cwl-runner
+        rh-python36-python-crunchstat-summary"
+    else
+      packages="$packages
+        python3-cwltest
+        python3-arvados-fuse
+        python3-arvados-python-client
+        python3-arvados-cwl-runner
+        python3-crunchstat-summary"
+    fi
index ba44218c4e8f076a8ab7d0a8917b5cd40cecb547..66201b3b4d0b577b66b956730c67b5b2e20d3913 100755 (executable)
@@ -195,7 +195,6 @@ if [ $PYTHON -eq 1 ]; then
   python_wrapper arvados-pam "$WORKSPACE/sdk/pam"
   python_wrapper arvados-cwl-runner "$WORKSPACE/sdk/cwl"
   python_wrapper arvados_fuse "$WORKSPACE/services/fuse"
-  python_wrapper arvados-node-manager "$WORKSPACE/services/nodemanager"
   if [ $((${#failures[@]} - $GEM_BUILD_FAILURES)) -ne 0 ]; then
index 5aa0b7e6f8e363642cf3aebfa6bff44d28926d2d..0e74ac6f2570761d34cfc91d58b36d16c1fa812d 100755 (executable)
@@ -102,18 +102,12 @@ if [[ "$DEBUG" != 0 ]]; then
 PYTHON3_VERSION=$(python3 -c 'import sys; print("{v.major}.{v.minor}".format(v=sys.version_info))')
 ## These defaults are suitable for any Debian-based distribution.
 # You can customize them as needed in distro sections below.
@@ -129,9 +123,6 @@ case "$TARGET" in
-        PYTHON2_PACKAGE=$(rpm -qf "$(which python$PYTHON2_VERSION)" --queryformat '%{NAME}\n')
-        PYTHON2_INSTALL_LIB=lib/python$PYTHON2_VERSION/site-packages
         PYTHON3_PACKAGE=$(rpm -qf "$(which python$PYTHON3_VERSION)" --queryformat '%{NAME}\n')
@@ -321,29 +312,17 @@ package_go_binary tools/keep-exercise keep-exercise \
 package_go_so lib/pam pam_arvados.so libpam-arvados-go \
     "Arvados PAM authentication module"
-# The Python SDK - Should be built first because it's needed by others
-fpm_build_virtualenv "arvados-python-client" "sdk/python"
 # The Python SDK - Python3 package
 fpm_build_virtualenv "arvados-python-client" "sdk/python" "python3"
-# Arvados cwl runner - Only supports Python3 now
+# Arvados cwl runner - Python3 package
 fpm_build_virtualenv "arvados-cwl-runner" "sdk/cwl" "python3"
-# The PAM module
-fpm_build_virtualenv "libpam-arvados" "sdk/pam"
-# The FUSE driver
-fpm_build_virtualenv "arvados-fuse" "services/fuse"
 # The FUSE driver - Python3 package
 fpm_build_virtualenv "arvados-fuse" "services/fuse" "python3"
-# The node manager
-fpm_build_virtualenv "arvados-node-manager" "services/nodemanager"
 # The Arvados crunchstat-summary tool
-fpm_build_virtualenv "crunchstat-summary" "tools/crunchstat-summary"
+fpm_build_virtualenv "crunchstat-summary" "tools/crunchstat-summary" "python3"
 # The Docker image cleaner
 fpm_build_virtualenv "arvados-docker-cleaner" "services/dockercleaner" "python3"
@@ -354,11 +333,9 @@ if [[ -e "$WORKSPACE/cwltest" ]]; then
        rm -rf "$WORKSPACE/cwltest"
 git clone https://github.com/common-workflow-language/cwltest.git
-# last release to support python 2.7
-(cd cwltest && git checkout 1.0.20190906212748)
 # signal to our build script that we want a cwltest executable installed in /usr/bin/
 mkdir cwltest/bin && touch cwltest/bin/cwltest
-fpm_build_virtualenv "cwltest" "cwltest"
+fpm_build_virtualenv "cwltest" "cwltest" "python3"
 rm -rf "$WORKSPACE/cwltest"
 calculate_go_package_version arvados_server_version cmd/arvados-server
index 3e6c9f85841d55be0e7d9794c4e86a693e5500c3..528d69d9982eac69e561a3ab7078488a94093d61 100755 (executable)
@@ -231,10 +231,6 @@ default_iteration() {
            [[ ${BASH_REMATCH[1]} -le $LICENSE_PACKAGE_TS ]]; then
-    if [[ $package_type =~ ^python ]]; then
-      # Fix --iteration for #9242.
-      iteration=2
-    fi
     echo $iteration
@@ -487,18 +483,9 @@ fpm_build_virtualenv () {
-    python)
-        # All Arvados Python2 packages depend on Python 2.7.
-        # Make sure we build with that for consistency.
-        python=python2.7
-        pip=pip
-        ;;
-  if [[ "$PKG" != "libpam-arvados" ]] &&
-     [[ "$PKG" != "arvados-node-manager" ]] &&
-     [[ "$PKG" != "arvados-docker-cleaner" ]]; then
+  if [[ "$PKG" != "arvados-docker-cleaner" ]]; then
     # Exception to our package naming convention
@@ -651,25 +638,6 @@ fpm_build_virtualenv () {
   LICENSE_STRING=`grep license $WORKSPACE/$PKG_DIR/setup.py|cut -f2 -d=|sed -e "s/[',\\"]//g"`
   COMMAND_ARR+=('--license' "$LICENSE_STRING")
-  # 12271 - As FPM-generated packages don't include scripts by default, the
-  # packages cleanup on upgrade depends on files being listed on the %files
-  # section in the generated SPEC files. To remove DIRECTORIES, they need to
-  # be listed in that section too, so we need to add this parameter to properly
-  # remove lingering dirs. But this only works for python2: if used on
-  # python33, it includes dirs like /opt/rh/python33 that belong to
-  # other packages.
-  if [[ "$FORMAT" == "rpm" ]] && [[ "$python" == "python2.7" ]]; then
-    COMMAND_ARR+=('--rpm-auto-add-directories')
-  fi
-  if [[ "$PKG" == "arvados-python-client" ]] || [[ "$PKG" == "arvados-fuse" ]]; then
-    if [[ "$python" == "python2.7" ]]; then
-      COMMAND_ARR+=('--conflicts' "$PYTHON3_PKG_PREFIX-$PKG")
-    else
-      COMMAND_ARR+=('--conflicts' "$PYTHON2_PKG_PREFIX-$PKG")
-    fi
-  fi
   if [[ "$DEBUG" != "0" ]]; then
     COMMAND_ARR+=('--verbose' '--log' 'info')
@@ -685,11 +653,7 @@ fpm_build_virtualenv () {
     COMMAND_ARR+=('--before-remove' "${WORKSPACE}/build/go-python-package-scripts/prerm")
-  if [[ "$python" == "python2.7" ]]; then
-    COMMAND_ARR+=('--depends' "$PYTHON2_PACKAGE")
-  else
-    COMMAND_ARR+=('--depends' "$PYTHON3_PACKAGE")
-  fi
+  COMMAND_ARR+=('--depends' "$PYTHON3_PACKAGE")
   # avoid warning
@@ -714,7 +678,7 @@ fpm_build_virtualenv () {
   # make sure the systemd service file ends up in the right place
-  # used by arvados-docker-cleaner and arvados-node-manager
+  # used by arvados-docker-cleaner
   if [[ -e "${systemd_unit}" ]]; then
@@ -733,15 +697,6 @@ fpm_build_virtualenv () {
-  # the libpam module should place a few files in the correct place for the pam
-  # subsystem
-  if [[ -e "$WORKSPACE/$PKG_DIR/dist/build/usr/share/$python/dist/$PYTHON_PKG/lib/security/libpam_arvados.py" ]]; then
-    COMMAND_ARR+=("usr/share/$python/dist/$PYTHON_PKG/lib/security/libpam_arvados.py=/usr/lib/security/")
-  fi
-  if [[ -e "$WORKSPACE/$PKG_DIR/dist/build/usr/share/$python/dist/$PYTHON_PKG/share/pam-configs/arvados" ]]; then
-    COMMAND_ARR+=("usr/share/$python/dist/$PYTHON_PKG/share/pam-configs/arvados=/usr/share/pam-configs/")
-  fi
   # the python-arvados-cwl-runner package comes with cwltool, expose that version
   if [[ -e "$WORKSPACE/$PKG_DIR/dist/build/usr/share/python2.7/dist/python-arvados-cwl-runner/bin/cwltool" ]]; then
@@ -802,17 +757,6 @@ fpm_build () {
-  # 12271 - As FPM-generated packages don't include scripts by default, the
-  # packages cleanup on upgrade depends on files being listed on the %files
-  # section in the generated SPEC files. To remove DIRECTORIES, they need to
-  # be listed in that section too, so we need to add this parameter to properly
-  # remove lingering dirs. But this only works for python2: if used on
-  # python33, it includes dirs like /opt/rh/python33 that belong to
-  # other packages.
-  if [[ "$FORMAT" = rpm ]] && [[ "$python" = python2.7 ]]; then
-    COMMAND_ARR+=('--rpm-auto-add-directories')
-  fi
   if [[ "$DEBUG" != "0" ]]; then
     COMMAND_ARR+=('--verbose' '--log' 'info')
index ff6ead0facc26bbb0e1141d118b4cd81a70ec4c0..2742540b16b44efe57fa113d23d3e967915e5c2f 100755 (executable)
@@ -91,6 +91,7 @@ lib/dispatchcloud/scheduler
@@ -104,14 +105,10 @@ services/keepproxy
@@ -262,7 +259,7 @@ sanity_checks() {
         || fatal "No libpq libpq-fe.h. Try: apt-get install libpq-dev"
     echo -n 'libpam pam_appl.h: '
     find /usr/include -path '*/security/pam_appl.h' | egrep --max-count=1 . \
-        || fatal "No libpam pam_appl.h. Try: apt-get install libpam-dev"
+        || fatal "No libpam pam_appl.h. Try: apt-get install libpam0g-dev"
     echo -n 'postgresql: '
     psql --version || fatal "No postgresql. Try: apt-get install postgresql postgresql-client-common"
     echo -n 'phantomjs: '
@@ -306,8 +303,6 @@ declare -A skip
 declare -A only
 declare -A testargs
-# nodemanager_integration tests are not reliable, see #12061.
 while [[ -n "$1" ]]
@@ -668,14 +663,6 @@ install_env() {
         python setup.py install
     ) || fatal "installing PyYAML and sdk/python failed"
-    # Preinstall libcloud if using a fork; otherwise nodemanager "pip
-    # install" won't pick it up by default.
-    if [[ -n "$LIBCLOUD_PIN_SRC" ]]; then
-        pip freeze 2>/dev/null | egrep ^apache-libcloud==$LIBCLOUD_PIN \
-            || pip install --pre --ignore-installed --no-cache-dir "$LIBCLOUD_PIN_SRC" >/dev/null \
-            || fatal "pip install apache-libcloud failed"
-    fi
     # Deactivate Python 2 virtualenv
@@ -722,9 +709,6 @@ do_test() {
         apps/workbench_units | apps/workbench_functionals | apps/workbench_integration)
-        services/nodemanager | services/nodemanager_integration)
-            suite=services/nodemanager_suite
-            ;;
@@ -1004,14 +988,12 @@ install_services/api() {
 declare -a pythonstuff
-    sdk/pam
-    services/nodemanager
@@ -1076,11 +1058,6 @@ test_services/login-sync() {
         && "$bundle" exec rake test TESTOPTS=-v ${testargs[services/login-sync]}
-test_services/nodemanager_integration() {
-    cd "$WORKSPACE/services/nodemanager" \
-        && tests/integration_test.py ${testargs[services/nodemanager_integration]}
 test_apps/workbench_units() {
     local TASK="test:units"
     cd "$WORKSPACE/apps/workbench" \
@@ -1175,7 +1152,6 @@ test_all() {
     do_test sdk/cli
     do_test services/login-sync
     do_test sdk/java-v2
-    do_test services/nodemanager_integration
     for p in "${pythonstuff[@]}"
index 881227b3fa9a84ce084f107de771aa862c1949c5..abdd8db734e7522f61acbcfbf0610ace401d38fe 100644 (file)
@@ -16,24 +16,6 @@ Services must have ManagementToken configured.  This is used to authorize access
 To access a monitoring endpoint, the requester must provide the HTTP header @Authorization: Bearer (ManagementToken)@.
-h2. Node Manager
-Set @port@ (the listen port) and @ManagementToken@ in the @Manage@ section of @node-manager.ini@.
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-# Management server listening address (default
-#address =
-# Management server port number (default -1, server is disabled)
-#port = 8989
-ManagementToken = xxx
 h2. API server and other services
 The following services also support monitoring.
@@ -45,7 +27,7 @@ The following services also support monitoring.
 * keepproxy
 * keepstore
 * keep-web
-* websockets
+* arvados-ws 
 Set @ManagementToken@ in the appropriate section of @/etc/arvados/config.yml@.
index 1d6b87da62116027a96788c8fe7b73c44a269133..0cfa0a2e604cc0ee40bcbe3cc1a44836b3247b72 100644 (file)
@@ -35,7 +35,6 @@ table(table table-bordered table-condensed table-hover).
@@ -44,48 +43,3 @@ table(table table-bordered table-condensed table-hover).
-h2. Node manager
-The node manager does not export prometheus-style metrics, but its @/status.json@ endpoint provides a snapshot of internal status at the time of the most recent wishlist update.
-<pre>curl -sfH "Authorization: Bearer your_management_token_goes_here" ""
-table(table table-bordered table-condensed).
-|_. Attribute|_. Type|_. Description|
-|nodes_booting|int|Number of nodes in booting state|
-|nodes_unpaired|int|Number of nodes in unpaired state|
-|nodes_busy|int|Number of nodes in busy state|
-|nodes_idle|int|Number of nodes in idle state|
-|nodes_fail|int|Number of nodes in fail state|
-|nodes_down|int|Number of nodes in down state|
-|nodes_shutdown|int|Number of nodes in shutdown state|
-|nodes_wish|int|Number of nodes in the current wishlist|
-|node_quota|int|Current node count ceiling due to cloud quota limits|
-|config_max_nodes|int|Configured max node count|
-h3. Example
-  "actor_exceptions": 0,
-  "idle_times": {
-    "compute1": 0,
-    "compute3": 0,
-    "compute2": 0,
-    "compute4": 0
-  },
-  "create_node_errors": 0,
-  "destroy_node_errors": 0,
-  "nodes_idle": 0,
-  "config_max_nodes": 8,
-  "list_nodes_errors": 0,
-  "node_quota": 8,
-  "Version": "",
-  "nodes_wish": 0,
-  "nodes_unpaired": 0,
-  "nodes_busy": 4,
-  "boot_failures": 0
index bc0600e22333e6e8bc6f45a927797128253a6b29..7f49d6961292f7371436cb04cbe3892a1a0efadb 100644 (file)
@@ -25,14 +25,14 @@ Clusters:
       UsePreemptibleInstances: true
-       Preemptible: false
+        Preemptible: false
         ProviderType: m4.large
         VCPUs: 2
         RAM: 8GiB
         AddedScratch: 32GB
         Price: 0.1
-       Preemptible: true
+        Preemptible: true
         ProviderType: m4.large
         VCPUs: 2
         RAM: 8GiB
@@ -44,8 +44,6 @@ When @UsePreemptibleInstances@ is enabled, child containers (workflow steps) wil
 If you are using "arvados-dispatch-cloud":{{site.baseurl}}/install/crunch2-cloud/install-dispatch-cloud.html no additional configuration is required.
-If you are using the legacy Nodemanager, "see below":#nodemanager .
 h2. Preemptible instances on AWS
 For general information, see "using Amazon EC2 spot instances":https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-instances.html .
@@ -62,22 +60,7 @@ The account needs to have a service linked role created. This can be done by log
 h3. Cost Tracking
-Amazon's Spot instances prices are declared at instance request time and defined by the maximum price that the user is willing to pay per hour. By default, this price is the same amount as the on-demand version of each instance type, and this setting is the one that nodemanager uses for now, as it doesn't include any pricing data to the spot instance request.
+Amazon's Spot instance prices are declared at instance request time and defined by the maximum price that the user is willing to pay per hour. By default, this price is the same amount as the on-demand version of each instance type, and this setting is the one that @arvados-dispatch-cloud@ uses for now, as it doesn't include any pricing data in the spot instance request.
 The real price that a spot instance has at any point in time is discovered at the end of each usage hour, depending on instance demand. For this reason, AWS provides a data feed subscription to get hourly logs, as described on "Amazon's User Guide":https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-data-feeds.html.
-h2(#nodemanager). Nodemanager
-If you are using the legacy Nodemanager, its config file must also declare preemptible instance sizes, which must match the API server's @InstanceTypes@:
-[Size m4.large]
-cores = 2
-scratch = 32000
-[Size m4.large.spot]
-cores = 2
-instance_type = m4.large
-preemptible = true
-scratch = 32000
index 84ef780faa09354037151f054fdb15daf9ba937f..061b68fa5d27b766e7d45bd0c08750fed210f5dd 100644 (file)
@@ -38,6 +38,10 @@ h2(#master). development master (as of 2020-06-17)
 "Upgrading from 2.0.0":#v2_0_0
+h3. Removing libpam-arvados, replaced with libpam-arvados-go
+The Python-based PAM package has been replaced with a version written in Go. See "using PAM for authentication":{{site.baseurl}}/install/setup-login.html#pam for details.
 h3. Removing sso-provider
 The SSO (single sign-on) component is deprecated and will not be supported in future releases. Existing configurations will continue to work in this release, but you should switch to one of the built-in authentication mechanisms as soon as possible. See "setting up web based login":{{site.baseurl}}/install/setup-login.html for details.
@@ -569,7 +573,7 @@ As part of story "#11349":https://dev.arvados.org/issues/11349, commit "2c094e2"
 * To enable it, add to your configuration file: <pre>[Manage]
   address =
-  port = 8989</pre> (see example configuration files in source:services/nodemanager/doc or https://doc.arvados.org/install/install-nodemanager.html for more info)
+  port = 8989</pre>
 * The server responds to @http://{address}:{port}/status.json@ with a summary of how many nodes are in each state (booting, busy, shutdown, etc.)
 h3. New websockets component (2017-03-23)
index 705048cd620cf566ad5ece5722e311262642d623..dddcd050731eaf712d1485458ba25791262a5615 100644 (file)
@@ -20,6 +20,7 @@ table(table table-bordered table-condensed).
 |_. Component|_. Description|
 |api|The API server is the core of Arvados.  It is backed by a Postgres database and manages information such as metadata for storage, a record of submitted compute jobs, users, groups, and associated permissions.|
 |arv-git-httpd|Provides a git+http interface to Arvados-managed git repositories, with permissions and authentication based on an Arvados API token.|
+|arvados-dispatch-cloud|Provide elastic computing by creating and destroying cloud based virtual machines on compute demand.|
 |crunch-dispatch-local|Get compute requests submitted to the API server and execute them locally.|
 |crunch-dispatch-slurm|Get compute requests submitted to the API server and submit them to slurm.|
 |crunch-run|Dispatched by crunch-dispatch, executes a single compute run: setting up a Docker container, running it, and collecting the output.|
@@ -31,8 +32,7 @@ table(table table-bordered table-condensed).
 |keepstore|Provides access to underlying storage (filesystem or object storage such as Amazon S3 or Azure Blob) with Arvados permissions.|
 |keep-web|Provides high-level WebDAV access to collections (file-level data access).|
 |login-sync|Synchronize virtual machine users with Arvados users and permissions.|
-|nodemanager|Provide elastic computing by creating and destroying cloud based virtual machines on compute demand.|
-|ws|Publishes API server change events over websockets.|
+|arvados-ws|Publishes API server change events over websockets.|
 |workbench|Web application providing user interface to Arvados services.|
 h3. Tools
index a94de2a601b663047869d2a1bc84869b008b3214..55095b1f20f05cb21e203a9ba6a39fa3f069a2dd 100644 (file)
@@ -60,8 +60,8 @@ table(table table-bordered table-condensed).
 |"Shell server":install-shell-server.html |Synchronize (create/delete/configure) Unix shell accounts with Arvados users.|Optional.|
 |"Git server":install-arv-git-httpd.html |Arvados-hosted git repositories, with Arvados-token based authentication.|Optional, but required by Workflow Composer.|
 |\3=. *Crunch (running containers)*|
-|"crunch-dispatch-slurm":crunch2-slurm/install-prerequisites.html |Run analysis workflows using Docker containers distributed across a Slurm cluster.|Optional if you wish to use Arvados for data management only.|
-|"Node Manager":install-nodemanager.html, "arvados-dispatch-cloud":crunch2-cloud/install-dispatch-cloud.html |Allocate and free cloud VM instances on demand based on workload.|Optional, not needed for a static Slurm cluster (such as on-premises HPC).|
+|"arvados-dispatch-cloud":crunch2-cloud/install-dispatch-cloud.html |Allocate and free cloud VM instances on demand based on workload.|Optional, not needed for a static Slurm cluster such as on-premises HPC.|
+|"crunch-dispatch-slurm":crunch2-slurm/install-prerequisites.html |Run analysis workflows using Docker containers distributed across a Slurm cluster.|Optional, not needed for a Cloud installation, or if you wish to use Arvados for data management only.|
 h2(#identity). Identity provider
diff --git a/doc/install/install-nodemanager.html.textile.liquid b/doc/install/install-nodemanager.html.textile.liquid
deleted file mode 100644 (file)
index 75e4b25..0000000
+++ /dev/null
@@ -1,629 +0,0 @@
-layout: default
-navsection: installguide
-title: Install Node Manager
-{% comment %}
-Copyright (C) The Arvados Authors. All rights reserved.
-SPDX-License-Identifier: CC-BY-SA-3.0
-{% endcomment %}
-Arvados Node Manager provides elastic computing for Arvados and Slurm by creating and destroying virtual machines on demand.  Node Manager currently supports Amazon Web Services (AWS), Google Cloud Platform (GCP) and Microsoft Azure.
-Note: node manager is only required for elastic computing cloud environments.  Fixed size clusters (such as on-premise HPC) do not require node manager.
-h2. Install
-Node manager may run anywhere, however it must be able to communicate with the cloud provider's APIs, and use the command line tools @sinfo@, @squeue@ and @scontrol@ to communicate with the cluster's Slurm controller.
-On Debian-based systems:
-<pre><code>~$ <span class="userinput">sudo apt-get install arvados-node-manager</span>
-On Red Hat-based systems:
-<pre><code>~$ <span class="userinput">sudo yum install arvados-node-manager</span>
-h2. Create compute image
-Configure a virtual machine following the "instructions to set up a compute node.":{{site.baseurl}}/install/crunch2-slurm/install-compute-node.html and set it up to run a "ping script":{{site.baseurl}}/install/install-compute-ping.html at boot.
-Create a virtual machine image using the commands provided by your cloud provider.  We recommend using a tool such as "Packer":https://www.packer.io/ to automate this process.
-Configure node manager to use the image with the @image@ or @image_id@ parameter.
-h2. Configure node manager
-The configuration file is located at @/etc/arvados-node-manager/config.ini@.  Some configuration details are specific to the cloud provider you are using:
-* "Amazon Web Services":#aws
-* "Google Cloud Platform":#gcp
-* "Microsoft Azure":#azure
-h3(#aws). Amazon Web Services
-# EC2 configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-# Management server listening address (default
-#address =
-# Management server port number (default -1, server is disabled)
-#port = 8989
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the Slurm dispatcher drains nodes
-# through Slurm before shutting them down.
-dispatcher = slurm
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-# Upper limit on rate of spending (in $/hr), will not boot additional nodes
-# if total price of already running nodes meets or exceeds this threshold.
-# default 0 means no limit.
-max_total_price = 0
-# Poll EC2 nodes and Arvados for new information every N seconds.
-poll_time = 60
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 300
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 600
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 1800
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-# Number of consecutive times a node must report as "idle" before it
-# will be considered eligible for shutdown.  Node status is checked
-# each poll period, and node can go idle at any point during a poll
-# period (meaning a node could be reported as idle that has only been
-# idle for 1 second).  With a 60 second poll period, three consecutive
-# status updates of "idle" suggests the node has been idle at least
-# 121 seconds.
-consecutive_idle_count = 3
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-# Log file path
-file = /var/log/arvados/node-manager.log
-# Log level for most Node Manager messages.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = INFO
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-host = zyxwv.arvadosapi.com
-timeout = 15
-# Accept an untrusted SSL certificate from the API server?
-insecure = no
-provider = ec2
-# It's usually most cost-effective to shut down compute nodes during narrow
-# windows of time.  For example, EC2 bills each node by the hour, so the best
-# time to shut down a node is right before a new hour of uptime starts.
-# Shutdown windows define these periods of time.  These are windows in
-# full minutes, separated by commas.  Counting from the time the node is
-# booted, the node WILL NOT shut down for N1 minutes; then it MAY shut down
-# for N2 minutes; then it WILL NOT shut down for N3 minutes; and so on.
-# For example, "54, 5, 1" means the node may shut down from the 54th to the
-# 59th minute of each hour of uptime.
-# Specify at least two windows.  You can add as many as you need beyond that.
-shutdown_windows = 54, 5, 1
-[Cloud Credentials]
-key = KEY
-secret = SECRET_KEY
-region = us-east-1
-timeout = 60
-[Cloud List]
-# This section defines filters that find compute nodes.
-# Tags that you specify here will automatically be added to nodes you create.
-# Replace colons in Amazon filters with underscores
-# (e.g., write "tag:mytag" as "tag_mytag").
-instance-state-name = running
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-[Cloud Create]
-# New compute nodes will send pings to Arvados at this host.
-# You may specify a port, and use brackets to disambiguate IPv6 addresses.
-ping_host = hostname:port
-# Give the name of an SSH key on AWS...
-ex_keyname = string
-# ... or a file path for an SSH key that can log in to the compute node.
-# (One or the other, not both.)
-# ssh_key = path
-# The EC2 IDs of the image and subnet compute nodes should use.
-image_id = idstring
-subnet_id = idstring
-# Comma-separated EC2 IDs for the security group(s) assigned to each
-# compute node.
-security_groups = idstring1, idstring2
-# Apply an Instance Profile ARN to the newly created compute nodes
-# For more info, see:
-# https://aws.amazon.com/premiumsupport/knowledge-center/iam-policy-restrict-vpc/
-# ex_iamprofile = arn:aws:iam::ACCOUNTNUMBER:instance-profile/ROLENAME
-# You can define any number of Size sections to list EC2 sizes you're
-# willing to use.  The Node Manager should boot the cheapest size(s) that
-# can run jobs in the queue.
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.  You can also override Amazon's provided
-# data fields (such as price per hour) by setting them here.
-[Size m4.large]
-cores = 2
-price = 0.126
-scratch = 100
-[Size m4.xlarge]
-cores = 4
-price = 0.252
-scratch = 100
-h3(#gcp). Google Cloud Platform
-# Google Compute Engine configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-# Management server listening address (default
-#address =
-# Management server port number (default -1, server is disabled)
-#port = 8989
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-# Node Manager will not start any compute nodes when at least this
-# running at all times.  By default, these will be the cheapest node size.
-max_nodes = 8
-# Poll compute nodes and Arvados for new information every N seconds.
-poll_time = 60
-# Upper limit on rate of spending (in $/hr), will not boot additional nodes
-# if total price of already running nodes meets or exceeds this threshold.
-# default 0 means no limit.
-max_total_price = 0
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 300
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 600
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-# Number of consecutive times a node must report as "idle" before it
-# will be considered eligible for shutdown.  Node status is checked
-# each poll period, and node can go idle at any point during a poll
-# period (meaning a node could be reported as idle that has only been
-# idle for 1 second).  With a 60 second poll period, three consecutive
-# status updates of "idle" suggests the node has been idle at least
-# 121 seconds.
-consecutive_idle_count = 3
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-# Log file path
-file = /var/log/arvados/node-manager.log
-# Log level for most Node Manager messages.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = INFO
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-host = zyxwv.arvadosapi.com
-timeout = 15
-# Accept an untrusted SSL certificate from the API server?
-insecure = no
-provider = gce
-# Shutdown windows define periods of time when a node may and may not
-# be shut down.  These are windows in full minutes, separated by
-# commas.  Counting from the time the node is booted, the node WILL
-# NOT shut down for N1 minutes; then it MAY shut down for N2 minutes;
-# then it WILL NOT shut down for N3 minutes; and so on.  For example,
-# "54, 5, 1" means the node may shut down from the 54th to the 59th
-# minute of each hour of uptime.
-# GCE bills by the minute, and does not provide information about when
-# a node booted.  Node Manager will store this information in metadata
-# when it boots a node; if that information is not available, it will
-# assume the node booted at the epoch.  These shutdown settings are
-# very aggressive.  You may want to adjust this if you want more
-# continuity of service from a single node.
-shutdown_windows = 20, 999999
-[Cloud Credentials]
-user_id = client_email_address@developer.gserviceaccount.com
-key = path_to_certificate.pem
-project = project-id-from-google-cloud-dashboard
-timeout = 60
-# Valid location (zone) names: https://cloud.google.com/compute/docs/zones
-datacenter = us-central1-a
-# Optional settings. For full documentation see
-# http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#libcloud.compute.drivers.gce.GCENodeDriver
-# auth_type = SA               # SA, IA or GCE
-# scopes = https://www.googleapis.com/auth/compute
-# credential_file =
-[Cloud List]
-# A comma-separated list of tags that must be applied to a node for it to
-# be considered a compute node.
-# The driver will automatically apply these tags to nodes it creates.
-tags = zyxwv, compute
-[Cloud Create]
-# New compute nodes will send pings to Arvados at this host.
-# You may specify a port, and use brackets to disambiguate IPv6 addresses.
-ping_host = hostname:port
-# A file path for an SSH key that can log in to the compute node.
-# ssh_key = path
-# The GCE image name and network zone name to use when creating new nodes.
-image = debian
-# network = your_network_name
-# JSON string of service account authorizations for this cluster.
-# See http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#specifying-service-account-scopes
-# service_accounts = [{'email':'account@example.com', 'scopes':['storage-ro']}]
-# You can define any number of Size sections to list node sizes you're
-# willing to use.  The Node Manager should boot the cheapest size(s) that
-# can run jobs in the queue.
-# The Size fields are interpreted the same way as with a libcloud NodeSize:
-# http://libcloud.readthedocs.org/en/latest/compute/api.html#libcloud.compute.base.NodeSize
-# See https://cloud.google.com/compute/docs/machine-types for a list
-# of known machine types that may be used as a Size parameter.
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.
-# You can also override Google's provided data fields (such as price per hour)
-# by setting them here.
-[Size n1-standard-2]
-cores = 2
-price = 0.076
-scratch = 100
-[Size n1-standard-4]
-cores = 4
-price = 0.152
-scratch = 200
-h3(#azure). Microsoft Azure
-# Azure configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-# Management server listening address (default
-#address =
-# Management server port number (default -1, server is disabled)
-#port = 8989
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the Slurm dispatcher drains nodes
-# through Slurm before shutting them down.
-dispatcher = slurm
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-# Upper limit on rate of spending (in $/hr), will not boot additional nodes
-# if total price of already running nodes meets or exceeds this threshold.
-# default 0 means no limit.
-max_total_price = 0
-# Poll Azure nodes and Arvados for new information every N seconds.
-poll_time = 60
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 300
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 600
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 1800
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-# Number of consecutive times a node must report as "idle" before it
-# will be considered eligible for shutdown.  Node status is checked
-# each poll period, and node can go idle at any point during a poll
-# period (meaning a node could be reported as idle that has only been
-# idle for 1 second).  With a 60 second poll period, three consecutive
-# status updates of "idle" suggests the node has been idle at least
-# 121 seconds.
-consecutive_idle_count = 3
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-# Log file path
-file = /var/log/arvados/node-manager.log
-# Log level for most Node Manager messages.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = INFO
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-host = zyxwv.arvadosapi.com
-timeout = 15
-# Accept an untrusted SSL certificate from the API server?
-insecure = no
-provider = azure
-# Shutdown windows define periods of time when a node may and may not be shut
-# down.  These are windows in full minutes, separated by commas.  Counting from
-# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
-# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
-# and so on.  For example, "20, 999999" means the node may shut down between
-# the 20th and 999999th minutes of uptime.
-# Azure bills by the minute, so it makes sense to aggressively shut down idle
-# nodes.  Specify at least two windows.  You can add as many as you need beyond
-# that.
-shutdown_windows = 20, 999999
-[Cloud Credentials]
-# Use "azure account list" with the azure CLI to get these values.
-tenant_id = 00000000-0000-0000-0000-000000000000
-subscription_id = 00000000-0000-0000-0000-000000000000
-# The following directions are based on
-# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
-# and updated for v2 of the Azure cli tool.
-# az ad app create --display-name "Node Manager" --homepage "https://arvados.org" --identifier-uris "https://<Your_Application_Uri>" --password <Your_Password> --end-date <Desired_credential_expiry_date>
-# az ad sp create "<Application_Id>"
-# az role assignment create --assignee "<Application_Id>" --role Owner --resource-group "<Your_Azure_Arvados_Resource_Group>"
-# Use <Application_Id> for "key" and the <Your_Password> for "secret"
-key = 00000000-0000-0000-0000-000000000000
-secret = PASSWORD
-timeout = 60
-region = East US
-[Cloud List]
-# The resource group in which the compute node virtual machines will be created
-# and listed.
-ex_resource_group = ArvadosResourceGroup
-[Cloud Create]
-# The compute node image, as a link to a VHD in Azure blob store.
-image = https://example.blob.core.windows.net/system/Microsoft.Compute/Images/images/zyxwv-compute-osDisk.vhd
-# Path to a local ssh key file that will be used to provision new nodes.
-ssh_key = /home/arvadosuser/.ssh/id_rsa.pub
-# The account name for the admin user that will be provisioned on new nodes.
-ex_user_name = arvadosuser
-# The Azure storage account that will be used to store the node OS disk images.
-ex_storage_account = arvadosstorage
-# The virtual network the VMs will be associated with.
-ex_network = ArvadosNetwork
-# Optional subnet of the virtual network.
-#ex_subnet = default
-# Node tags
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-# the API server to ping
-ping_host = hostname:port
-# You can define any number of Size sections to list Azure sizes you're willing
-# to use.  The Node Manager should boot the cheapest size(s) that can run jobs
-# in the queue.  You must also provide price per hour as the Azure driver
-# compute currently does not report prices.
-# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
-# for a list of known machine types that may be used as a Size parameter.
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.  You can also override Microsoft's provided
-# data fields by setting them here.
-[Size Standard_D3]
-cores = 4
-price = 0.56
-[Size Standard_D4]
-cores = 8
-price = 1.12
-h2. Running
-$ arvados-node-manager --config /etc/arvados-node-manager/config.ini
index a2a34448f11cced6b01a6343eb56acc314e5556c..270d4045b5aaa395ee9b5c749763a601f35e1b6b 100644 (file)
@@ -139,9 +139,6 @@ Clusters:
         InternalURLs: {}
         ExternalURL: ""
-      Nodemanager:
-        InternalURLs: {}
-        ExternalURL: "-"
         InternalURLs: {}
         ExternalURL: "-"
index 1be7208ee38facce00e71f2cfdf07885ccffde08..0552b66adb80bed162ee3f2518a1c649c0c89ec2 100644 (file)
@@ -43,7 +43,6 @@ type nodeProfile struct {
        Keepproxy     systemServiceInstance `json:"keepproxy"`
        Keepstore     systemServiceInstance `json:"keepstore"`
        Keepweb       systemServiceInstance `json:"keep-web"`
-       Nodemanager   systemServiceInstance `json:"arvados-node-manager"`
        DispatchCloud systemServiceInstance `json:"arvados-dispatch-cloud"`
        RailsAPI      systemServiceInstance `json:"arvados-api-server"`
        Websocket     systemServiceInstance `json:"arvados-ws"`
index bddb5cedb1df8428024f6461ed52ede12d8b9607..0241673aa550e5c783a04d875e92bd588eab091f 100644 (file)
@@ -145,9 +145,6 @@ Clusters:
         InternalURLs: {}
         ExternalURL: ""
-      Nodemanager:
-        InternalURLs: {}
-        ExternalURL: "-"
         InternalURLs: {}
         ExternalURL: "-"
index 9cf1ed3cd182ba8f8659b38dee81bcf0a52ab976..c21addbba99284e5ad3e634e24e75e5deac9558e 100644 (file)
@@ -314,7 +314,6 @@ type Services struct {
        Keepbalance    Service
        Keepproxy      Service
        Keepstore      Service
-       Nodemanager    Service
        RailsAPI       Service
        SSO            Service
        WebDAVDownload Service
@@ -567,7 +566,6 @@ const (
        ServiceNameController    ServiceName = "arvados-controller"
        ServiceNameDispatchCloud ServiceName = "arvados-dispatch-cloud"
        ServiceNameHealth        ServiceName = "arvados-health"
-       ServiceNameNodemanager   ServiceName = "arvados-node-manager"
        ServiceNameWorkbench1    ServiceName = "arvados-workbench1"
        ServiceNameWorkbench2    ServiceName = "arvados-workbench2"
        ServiceNameWebsocket     ServiceName = "arvados-ws"
@@ -585,7 +583,6 @@ func (svcs Services) Map() map[ServiceName]Service {
                ServiceNameController:    svcs.Controller,
                ServiceNameDispatchCloud: svcs.DispatchCloud,
                ServiceNameHealth:        svcs.Health,
-               ServiceNameNodemanager:   svcs.Nodemanager,
                ServiceNameWorkbench1:    svcs.Workbench1,
                ServiceNameWorkbench2:    svcs.Workbench2,
                ServiceNameWebsocket:     svcs.Websocket,
index f4b0a994366db603ba0284cb76f4a6573ff266c7..2acf3e59ab81ae10ff816577f5f33fdaea8b9922 100644 (file)
@@ -157,7 +157,6 @@ func (s *AggregatorSuite) setAllServiceURLs(listen string) {
-               &svcs.Nodemanager,
diff --git a/sdk/pam/.dockerignore b/sdk/pam/.dockerignore
deleted file mode 100644 (file)
index 922b80e..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
diff --git a/sdk/pam/.gitignore b/sdk/pam/.gitignore
deleted file mode 120000 (symlink)
index 1399fd4..0000000
+++ /dev/null
@@ -1 +0,0 @@
\ No newline at end of file
diff --git a/sdk/pam/Dockerfile b/sdk/pam/Dockerfile
deleted file mode 100644 (file)
index ff450d8..0000000
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-# These tests assume you have a real API server running on the docker host.
-# Build the test container:
-#   First, replace 3000 below with your api server's port number if necessary.
-#   host$ python setup.py sdist rotate --keep=1 --match .tar.gz
-#   host$ docker build --tag=arvados/pam_test .
-# Automated integration test:
-#   host$ docker run -it --add-host zzzzz.arvadosapi.com:"$(hostname -I |awk '{print $1}')" arvados/pam_test
-# You should see "=== OK ===", followed by a Perl stack trace due to a
-# yet-unidentified pam_python.so bug.
-# Manual integration test:
-#   host$ docker run -it --add-host zzzzz.arvadosapi.com:"$(hostname -I |awk '{print $1}')" arvados/pam_test bash -c 'rsyslogd & tail -F /var/log/auth.log & sleep 1 & bash'
-#   container# login
-#   login: active
-#   Arvados API token: 3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi
-# You should now be logged in to the "active" shell account in the
-# container. You should also see arvados_pam log entries in
-# /var/log/auth.log (and in your terminal, thanks to "tail -F").
-FROM debian:wheezy
-RUN apt-get update
-RUN apt-get -qy dist-upgrade
-RUN apt-get -qy install python python-virtualenv libpam-python rsyslog
-# Packages required by pycurl, ciso8601
-RUN apt-get -qy install libcurl4-gnutls-dev python2.7-dev
-# for jessie (which also has other snags)
-# RUN apt-get -qy install python-pip libgnutls28-dev
-RUN pip install --upgrade setuptools
-RUN pip install python-pam
-ADD dist /dist
-RUN pip install /dist/arvados-pam-*.tar.gz
-# Configure and enable the module (hopefully vendor packages will offer a neater way)
-RUN perl -pi -e 's{api.example}{zzzzz.arvadosapi.com:3000}; s{shell\.example}{testvm2.shell insecure};' /usr/share/pam-configs/arvados
-RUN DEBIAN_FRONTEND=noninteractive pam-auth-update arvados --remove unix
-# Add a user account matching the fixture
-RUN useradd -ms /bin/bash active
-# Test with python (SIGSEGV during tests)
-#ADD . /pam
-#WORKDIR /pam
-#CMD rsyslogd & tail -F /var/log/auth.log & python setup.py test --test-suite integration_tests
-# Test with perl (SIGSEGV when program exits)
-RUN apt-get install -qy libauthen-pam-perl
-ADD tests/integration_test.pl /integration_test.pl
-CMD rsyslogd & tail -F /var/log/auth.log & sleep 1 && /integration_test.pl
diff --git a/sdk/pam/LICENSE-2.0.txt b/sdk/pam/LICENSE-2.0.txt
deleted file mode 100644 (file)
index d645695..0000000
+++ /dev/null
@@ -1,202 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-   1. Definitions.
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      implied, including, without limitation, any warranties or conditions
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-   APPENDIX: How to apply the Apache License to your work.
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-   Copyright [yyyy] [name of copyright owner]
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-       http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/sdk/pam/MANIFEST.in b/sdk/pam/MANIFEST.in
deleted file mode 100644 (file)
index 48892fa..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-include LICENSE-2.0.txt
-include README.rst
-include examples/shellinabox
-include lib/libpam_arvados.py
-include pam-configs/arvados
-include arvados_version.py
\ No newline at end of file
diff --git a/sdk/pam/README.rst b/sdk/pam/README.rst
deleted file mode 100644 (file)
index 81be331..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-.. Copyright (C) The Arvados Authors. All rights reserved.
-.. SPDX-License-Identifier: Apache-2.0
-Arvados PAM Module
-Accept Arvados API tokens to authenticate to shell accounts.
-.. _Arvados: https://arvados.org
-See http://doc.arvados.org
-Testing and Development
-describes how to set up a development environment and run tests.
diff --git a/sdk/pam/arvados_pam/__init__.py b/sdk/pam/arvados_pam/__init__.py
deleted file mode 100644 (file)
index dd78d41..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-import sys
-from . import auth_event
-def pam_sm_authenticate(pamh, flags, argv):
-    config = {}
-    config['arvados_api_host'] = argv[1]
-    config['virtual_machine_hostname'] = argv[2]
-    if len(argv) > 3:
-        for k in argv[3:]:
-            config[k] = True
-    try:
-        username = pamh.get_user(None)
-    except pamh.exception as e:
-        return e.pam_result
-    if not username:
-        return pamh.PAM_USER_UNKNOWN
-    try:
-        prompt = '' if config.get('noprompt') else 'Arvados API token: '
-        token = pamh.conversation(pamh.Message(pamh.PAM_PROMPT_ECHO_OFF, prompt)).resp
-    except pamh.exception as e:
-        return e.pam_result
-    if auth_event.AuthEvent(
-            config=config,
-            service=pamh.service,
-            client_host=pamh.rhost,
-            username=username,
-            token=token).can_login():
-        return pamh.PAM_SUCCESS
-    else:
-        return pamh.PAM_AUTH_ERR
-def pam_sm_setcred(pamh, flags, argv):
-    return pamh.PAM_SUCCESS
-def pam_sm_acct_mgmt(pamh, flags, argv):
-    return pamh.PAM_SUCCESS
-def pam_sm_open_session(pamh, flags, argv):
-    return pamh.PAM_SUCCESS
-def pam_sm_close_session(pamh, flags, argv):
-    return pamh.PAM_SUCCESS
-def pam_sm_chauthtok(pamh, flags, argv):
-    return pamh.PAM_SUCCESS
diff --git a/sdk/pam/arvados_pam/auth_event.py b/sdk/pam/arvados_pam/auth_event.py
deleted file mode 100644 (file)
index 4f2663c..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-import arvados
-import syslog
-def auth_log(msg):
-    """Log an authentication result to syslogd"""
-    syslog.openlog(facility=syslog.LOG_AUTH)
-    syslog.syslog('arvados_pam: ' + msg)
-    syslog.closelog()
-class AuthEvent(object):
-    def __init__(self, config, service, client_host, username, token):
-        self.config = config
-        self.service = service
-        self.client_host = client_host
-        self.username = username
-        self.token = token
-        self.api_host = None
-        self.vm_uuid = None
-        self.user = None
-    def can_login(self):
-        """Return truthy IFF credentials should be accepted."""
-        ok = False
-        try:
-            self.api_host = self.config['arvados_api_host']
-            self.arv = arvados.api('v1', host=self.api_host, token=self.token,
-                                   insecure=self.config.get('insecure', False),
-                                   cache=False)
-            vmname = self.config['virtual_machine_hostname']
-            vms = self.arv.virtual_machines().list(filters=[['hostname','=',vmname]]).execute()
-            if vms['items_available'] > 1:
-                raise Exception("lookup hostname %s returned %d records" % (vmname, vms['items_available']))
-            if vms['items_available'] == 0:
-                raise Exception("lookup hostname %s not found" % vmname)
-            vm = vms['items'][0]
-            if vm['hostname'] != vmname:
-                raise Exception("lookup hostname %s returned hostname %s" % (vmname, vm['hostname']))
-            self.vm_uuid = vm['uuid']
-            self.user = self.arv.users().current().execute()
-            filters = [
-                ['link_class','=','permission'],
-                ['name','=','can_login'],
-                ['head_uuid','=',self.vm_uuid],
-                ['tail_uuid','=',self.user['uuid']]]
-            for l in self.arv.links().list(filters=filters, limit=10000).execute()['items']:
-                if (l['properties']['username'] == self.username and
-                    l['tail_uuid'] == self.user['uuid'] and
-                    l['head_uuid'] == self.vm_uuid and
-                    l['link_class'] == 'permission' and
-                    l['name'] == 'can_login'):
-                    return self._report(True)
-            return self._report(False)
-        except Exception as e:
-            return self._report(e)
-    def _report(self, result):
-        """Log the result. Return truthy IFF result is True.
-        result must be True, False, or an exception.
-        """
-        self.result = result
-        auth_log(self.message())
-        return result == True
-    def message(self):
-        """Return a log message describing the event and its outcome."""
-        if isinstance(self.result, Exception):
-            outcome = 'Error: ' + repr(self.result)
-        elif self.result == True:
-            outcome = 'Allow'
-        else:
-            outcome = 'Deny'
-        if len(self.token) > 40:
-            log_token = self.token[0:15]
-        else:
-            log_token = '<invalid>'
-        log_label = [self.service, self.api_host, self.vm_uuid, self.client_host, self.username, log_token]
-        if self.user:
-            log_label += [self.user.get('uuid'), self.user.get('full_name')]
-        return str(log_label) + ': ' + outcome
diff --git a/sdk/pam/arvados_version.py b/sdk/pam/arvados_version.py
deleted file mode 100644 (file)
index 9aabff4..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-import subprocess
-import time
-import os
-import re
-def git_version_at_commit():
-    curdir = os.path.dirname(os.path.abspath(__file__))
-    myhash = subprocess.check_output(['git', 'log', '-n1', '--first-parent',
-                                       '--format=%H', curdir]).strip()
-    myversion = subprocess.check_output([curdir+'/../../build/version-at-commit.sh', myhash]).strip().decode()
-    return myversion
-def save_version(setup_dir, module, v):
-  with open(os.path.join(setup_dir, module, "_version.py"), 'wt') as fp:
-      return fp.write("__version__ = '%s'\n" % v)
-def read_version(setup_dir, module):
-  with open(os.path.join(setup_dir, module, "_version.py"), 'rt') as fp:
-      return re.match("__version__ = '(.*)'$", fp.read()).groups()[0]
-def get_version(setup_dir, module):
-    env_version = os.environ.get("ARVADOS_BUILDING_VERSION")
-    if env_version:
-        save_version(setup_dir, module, env_version)
-    else:
-        try:
-            save_version(setup_dir, module, git_version_at_commit())
-        except (subprocess.CalledProcessError, OSError):
-            pass
-    return read_version(setup_dir, module)
diff --git a/sdk/pam/examples/shellinabox b/sdk/pam/examples/shellinabox
deleted file mode 100644 (file)
index 2d91ccb..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-# This example is a stock debian "login" file with libpam_arvados
-# replacing pam_unix, and the "noprompt" option in use. It can be
-# installed as /etc/pam.d/shellinabox .
-auth       optional   pam_faildelay.so  delay=3000000
-auth [success=ok new_authtok_reqd=ok ignore=ignore user_unknown=bad default=die] pam_securetty.so
-auth       requisite  pam_nologin.so
-session [success=ok ignore=ignore module_unknown=ignore default=bad] pam_selinux.so close
-session       required   pam_env.so readenv=1
-session       required   pam_env.so readenv=1 envfile=/etc/default/locale
-auth [success=1 default=ignore] pam_python.so /usr/local/lib/security/libpam_arvados.py api.example shell.example noprompt
-auth   requisite                       pam_deny.so
-auth   required                        pam_permit.so
-auth       optional   pam_group.so
-session    required   pam_limits.so
-session    optional   pam_lastlog.so
-session    optional   pam_motd.so  motd=/run/motd.dynamic
-session    optional   pam_motd.so
-session    optional   pam_mail.so standard
-@include common-account
-@include common-session
-@include common-password
-session [success=ok ignore=ignore module_unknown=ignore default=bad] pam_selinux.so open
diff --git a/sdk/pam/fpm-info.sh b/sdk/pam/fpm-info.sh
deleted file mode 100644 (file)
index 6c323f5..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-case "$TARGET" in
-    debian* | ubuntu*)
-        fpm_depends+=('libpam-python' 'libcurl3-gnutls')
-        ;;
-    centos*)
-        fpm_depends+=('python-pam')
-        ;;
-    *)
-        echo >&2 "ERROR: $PACKAGE: pam_python.so dependency unavailable in $TARGET."
-        return 1
-        ;;
-case "$FORMAT" in
-    deb)
-        fpm_args+=('--deb-recommends=system-log-daemon')
-        ;;
diff --git a/sdk/pam/gittaggers.py b/sdk/pam/gittaggers.py
deleted file mode 120000 (symlink)
index d59c02c..0000000
+++ /dev/null
@@ -1 +0,0 @@
\ No newline at end of file
diff --git a/sdk/pam/integration_tests/__init__.py b/sdk/pam/integration_tests/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/sdk/pam/integration_tests/test_pam.py b/sdk/pam/integration_tests/test_pam.py
deleted file mode 100644 (file)
index 32ae38d..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-"""These tests assume we are running (in a docker container) with
-arvados_pam configured and a test API server running.
-import pam
-import unittest
-# From services/api/test/fixtures/api_client_authorizations.yml
-# because that file is not available during integration tests:
-ACTIVE_TOKEN = '3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi'
-SPECTATOR_TOKEN = 'zw2f4gwx8hw8cjre7yp6v1zylhrhn3m5gvjq73rtpwhmknrybu'
-class IntegrationTest(unittest.TestCase):
-    def setUp(self):
-        self.p = pam.pam()
-    def test_allow(self):
-        self.assertTrue(self.p.authenticate('active', ACTIVE_TOKEN, service='login'))
-    def test_deny_bad_token(self):
-        self.assertFalse(self.p.authenticate('active', 'thisisaverybadtoken', service='login'))
-    def test_deny_empty_token(self):
-        self.assertFalse(self.p.authenticate('active', '', service='login'))
-    def test_deny_permission(self):
-        self.assertFalse(self.p.authenticate('spectator', SPECTATOR_TOKEN, service='login'))
diff --git a/sdk/pam/lib/libpam_arvados.py b/sdk/pam/lib/libpam_arvados.py
deleted file mode 100644 (file)
index 7c3406d..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-import sys
-from arvados_pam import *
diff --git a/sdk/pam/pam-configs/arvados b/sdk/pam/pam-configs/arvados
deleted file mode 100644 (file)
index 086e176..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-# 1. Change "api.example" to your ARVADOS_API_HOST
-# 2. Change "shell.example" to this host's hostname
-#    (as it appears in the Arvados virtual_machines list)
-# 3. Install in /usr/share/pam-configs/arvados
-# 4. Run `pam-auth-update arvados`
-Name: Arvados authentication
-Default: yes
-Priority: 256
-Auth-Type: Primary
-       [success=end default=ignore]    pam_python.so /usr/local/lib/security/libpam_arvados.py api.example shell.example
-       [success=end default=ignore]    pam_python.so /usr/local/lib/security/libpam_arvados.py api.example shell.example
diff --git a/sdk/pam/setup.py b/sdk/pam/setup.py
deleted file mode 100755 (executable)
index 59b49a1..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-from __future__ import absolute_import
-import glob
-import os
-import sys
-import re
-import subprocess
-from setuptools import setup, find_packages
-SETUP_DIR = os.path.dirname(__file__) or '.'
-README = os.path.join(SETUP_DIR, 'README.rst')
-import arvados_version
-version = arvados_version.get_version(SETUP_DIR, "arvados_pam")
-if os.environ.get('ARVADOS_BUILDING_VERSION', False):
-    pysdk_dep = "=={}".format(version)
-    # On dev releases, arvados-python-client may have a different timestamp
-    pysdk_dep = "<={}".format(version)
-short_tests_only = False
-if '--short-tests-only' in sys.argv:
-    short_tests_only = True
-    sys.argv.remove('--short-tests-only')
-      version=version,
-      description='Arvados PAM module',
-      long_description=open(README).read(),
-      author='Arvados',
-      author_email='info@arvados.org',
-      url='https://arvados.org',
-      download_url='https://github.com/arvados/arvados.git',
-      license='Apache 2.0',
-      packages=[
-          'arvados_pam',
-      ],
-      scripts=[
-      ],
-      data_files=[
-          ('lib/security', ['lib/libpam_arvados.py']),
-          ('share/pam-configs', ['pam-configs/arvados']),
-          ('share/doc/arvados-pam', ['LICENSE-2.0.txt', 'README.rst']),
-          ('share/doc/arvados-pam/examples', glob.glob('examples/*')),
-      ],
-      install_requires=[
-          'arvados-python-client{}'.format(pysdk_dep),
-      ],
-      test_suite='tests',
-      tests_require=['pbr<1.7.0', 'mock>=1.0', 'python-pam'],
-      zip_safe=False,
diff --git a/sdk/pam/tests/__init__.py b/sdk/pam/tests/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/sdk/pam/tests/integration_test.pl b/sdk/pam/tests/integration_test.pl
deleted file mode 100755 (executable)
index cbe9b0a..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env perl
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-use Authen::PAM qw(:constants);
-for my $case (['good', 1, 'active', '3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi'],
-              ['badtoken', 0, 'active', 'badtokenmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi'],
-              ['badusername', 0, 'baduser', '3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi']) {
-    dotest(@$case);
-print "=== OK ===\n";
-sub dotest {
-    my ($label, $expect_ok, $user, $token) = @_;
-    print "$label: ";
-    my $service_name = 'login';
-    $main::Token = $token;
-    my $pamh = new Authen::PAM($service_name, $user, \&token_conv_func);
-    ref($pamh) || die "Error code $pamh during PAM init!";
-    $pamh->pam_set_item(PAM_RHOST(), '::1');
-    $pamh->pam_set_item(PAM_RUSER(), 'none');
-    $pamh->pam_set_item(PAM_TTY(), '/dev/null');
-    my $flags = PAM_SILENT();
-    $res = $pamh->pam_authenticate($flags);
-    $msg = $pamh->pam_strerror($res);
-    print "Result (code $res): $msg\n";
-    if (($res == 0) != ($expect_ok == 1)) {
-        die "*** FAIL ***\n";
-    }
-sub token_conv_func {
-    my @res;
-    while ( @_ ) {
-        my $code = shift;
-        my $msg = shift;
-        my $ans;
-        print "Message (type $code): $msg\n";
-        if ($code == PAM_PROMPT_ECHO_OFF() || $code == PAM_PROMPT_ECHO_ON()) {
-            $ans = $main::Token;
-        }
-        push @res, (0,$ans);
-    }
-    push @res, PAM_SUCCESS();
-    return @res;
diff --git a/sdk/pam/tests/mocker.py b/sdk/pam/tests/mocker.py
deleted file mode 100644 (file)
index ec6f064..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-import mock
-import unittest
-class Mocker(unittest.TestCase):
-    ACTIVE_TOKEN = '3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi'
-    default_config = {
-        'arvados_api_host': 'zzzzz.api_host.example',
-        'virtual_machine_hostname': 'testvm2.shell',
-    }
-    default_request = {
-        'client_host': '::1',
-        'token': ACTIVE_TOKEN,
-        'username': 'active',
-    }
-    default_response = {
-        'links': {
-            'items': [{
-                'uuid': 'zzzzz-o0j2j-rah2ya1ohx9xaev',
-                'tail_uuid': 'zzzzz-tpzed-xurymjxw79nv3jz',
-                'head_uuid': 'zzzzz-2x53u-382brsig8rp3065',
-                'link_class': 'permission',
-                'name': 'can_login',
-                'properties': {
-                    'username': 'active',
-                },
-            }],
-        },
-        'users': {
-            'uuid': 'zzzzz-tpzed-xurymjxw79nv3jz',
-            'full_name': 'Active User',
-        },
-        'virtual_machines': {
-            'items': [{
-                'uuid': 'zzzzz-2x53u-382brsig8rp3065',
-                'hostname': 'testvm2.shell',
-            }],
-            'items_available': 1,
-        },
-    }
-    def setUp(self):
-        self.config = self.default_config.copy()
-        self.request = self.default_request.copy()
-        self.response = self.default_response.copy()
-        self.api_client = mock.MagicMock(name='api_client')
-        self.api_client.users().current().execute.side_effect = lambda: self.response['users']
-        self.api_client.virtual_machines().list().execute.side_effect = lambda: self.response['virtual_machines']
-        self.api_client.links().list().execute.side_effect = lambda: self.response['links']
-        patcher = mock.patch('arvados.api')
-        self.api = patcher.start()
-        self.addCleanup(patcher.stop)
-        self.api.side_effect = [self.api_client]
-        self.syslogged = []
-        patcher = mock.patch('syslog.syslog')
-        self.syslog = patcher.start()
-        self.addCleanup(patcher.stop)
-        self.syslog.side_effect = lambda s: self.syslogged.append(s)
diff --git a/sdk/pam/tests/test_auth_event.py b/sdk/pam/tests/test_auth_event.py
deleted file mode 100644 (file)
index f907b31..0000000
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-import arvados_pam
-import re
-from . import mocker
-class AuthEventTest(mocker.Mocker):
-    def attempt(self):
-        return arvados_pam.auth_event.AuthEvent(config=self.config, service='test_service', **self.request).can_login()
-    def test_success(self):
-        self.assertTrue(self.attempt())
-        self.api_client.virtual_machines().list.assert_called_with(
-            filters=[['hostname','=',self.config['virtual_machine_hostname']]])
-        self.api.assert_called_with(
-            'v1',
-            host=self.config['arvados_api_host'], token=self.request['token'],
-            insecure=False,
-            cache=False)
-        self.assertEqual(1, len(self.syslogged))
-        for i in ['test_service',
-                  self.request['username'],
-                  self.config['arvados_api_host'],
-                  self.response['virtual_machines']['items'][0]['uuid']]:
-            self.assertRegexpMatches(self.syslogged[0], re.escape(i))
-        self.assertRegexpMatches(self.syslogged[0], re.escape(self.request['token'][0:15]), 'token prefix not logged')
-        self.assertNotRegexpMatches(self.syslogged[0], re.escape(self.request['token'][15:30]), 'too much token logged')
-    def test_fail_vm_lookup(self):
-        self.api_client.virtual_machines().list().execute.side_effect = Exception("Test-induced failure")
-        self.assertFalse(self.attempt())
-        self.assertRegexpMatches(self.syslogged[0], 'Test-induced failure')
-    def test_vm_hostname_not_found(self):
-        self.response['virtual_machines'] = {
-            'items': [],
-            'items_available': 0,
-        }
-        self.assertFalse(self.attempt())
-    def test_vm_hostname_ambiguous(self):
-        self.response['virtual_machines'] = {
-            'items': [
-                {
-                    'uuid': 'zzzzz-2x53u-382brsig8rp3065',
-                    'hostname': 'testvm2.shell',
-                },
-                {
-                    'uuid': 'zzzzz-2x53u-382brsig8rp3065',
-                    'hostname': 'testvm2.shell',
-                },
-            ],
-            'items_available': 2,
-        }
-        self.assertFalse(self.attempt())
-    def test_server_ignores_vm_filters(self):
-        self.response['virtual_machines'] = {
-            'items': [
-                {
-                    'uuid': 'zzzzz-2x53u-382brsig8rp3065',
-                    'hostname': 'testvm22.shell', # <-----
-                },
-            ],
-            'items_available': 1,
-        }
-        self.assertFalse(self.attempt())
-    def test_fail_user_lookup(self):
-        self.api_client.users().current().execute.side_effect = Exception("Test-induced failure")
-        self.assertFalse(self.attempt())
-    def test_fail_permission_check(self):
-        self.api_client.links().list().execute.side_effect = Exception("Test-induced failure")
-        self.assertFalse(self.attempt())
-    def test_no_login_permission(self):
-        self.response['links'] = {
-            'items': [],
-        }
-        self.assertFalse(self.attempt())
-    def test_server_ignores_permission_filters(self):
-        self.response['links'] = {
-            'items': [{
-                'uuid': 'zzzzz-o0j2j-rah2ya1ohx9xaev',
-                'tail_uuid': 'zzzzz-tpzed-xurymjxw79nv3jz',
-                'head_uuid': 'zzzzz-2x53u-382brsig8rp3065',
-                'link_class': 'permission',
-                'name': 'CANT_login', # <-----
-                'properties': {
-                    'username': 'active',
-                },
-            }],
-        }
-        self.assertFalse(self.attempt())
diff --git a/sdk/pam/tests/test_pam_sm.py b/sdk/pam/tests/test_pam_sm.py
deleted file mode 100644 (file)
index 53597c0..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-import arvados_pam
-import mock
-from . import mocker
-class PamSMTest(mocker.Mocker):
-    def attempt(self):
-        return arvados_pam.pam_sm_authenticate(self.pamh, 0, self.argv)
-    def test_success(self):
-        self.assertEqual(self.pamh.PAM_SUCCESS, self.attempt())
-    def test_bad_user(self):
-        self.pamh.get_user = mock.MagicMock(return_value='badusername')
-        self.assertEqual(self.pamh.PAM_AUTH_ERR, self.attempt())
-    def test_bad_vm(self):
-        self.argv[2] = 'testvm22.shell'
-        self.assertEqual(self.pamh.PAM_AUTH_ERR, self.attempt())
-    def setUp(self):
-        super(PamSMTest, self).setUp()
-        self.pamh = mock.MagicMock()
-        self.pamh.get_user = mock.MagicMock(return_value='active')
-        self.pamh.PAM_SUCCESS = 12345
-        self.pamh.PAM_AUTH_ERR = 54321
-        self.argv = [__file__, 'zzzzz.arvadosapi.com', 'testvm2.shell']
index b54e8d9de64f970726dc49d07ca47e368491986a..9fa3febe1e75fddf4227fae9801cda7d976b4149 100644 (file)
@@ -141,7 +141,7 @@ class NodeTest < ActiveSupport::TestCase
     assert_equal "custom1", node2.hostname
-  test "update dns when nodemanager clears hostname and ip_address" do
+  test "update dns when hostname and ip_address are cleared" do
     act_as_system_user do
       node = ping_node(:new_with_custom_hostname, {})
       Node.expects(:dns_server_update).with(node.hostname, Node::UNUSED_NODE_IP)
diff --git a/services/nodemanager/.gitignore b/services/nodemanager/.gitignore
deleted file mode 120000 (symlink)
index ed3b362..0000000
+++ /dev/null
@@ -1 +0,0 @@
\ No newline at end of file
diff --git a/services/nodemanager/MANIFEST.in b/services/nodemanager/MANIFEST.in
deleted file mode 100644 (file)
index 8410420..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-include agpl-3.0.txt
-include README.rst
-include arvados_version.py
-include arvados-node-manager.service
diff --git a/services/nodemanager/README.rst b/services/nodemanager/README.rst
deleted file mode 100644 (file)
index 1d725e0..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-.. Copyright (C) The Arvados Authors. All rights reserved.
-.. SPDX-License-Identifier: AGPL-3.0
-Arvados Node Manager
-This package provides ``arvados-node-manager``.  It dynamically starts
-and stops compute nodes on an Arvados_ cloud installation based on job
-.. _Arvados: https://arvados.org/
-1. Install the package.
-2. Write a configuration file.  ``doc/ec2.example.cfg`` documents all
-   of the options available, with specific tunables for EC2 clouds.
-3. Run ``arvados-node-manager --config YOURCONFIGFILE`` using whatever
-   supervisor you like (e.g., runit).
-Testing and Development
-To run tests, just run::
-  python setup.py test
-Our `hacking guide
-provides an architectural overview of the Arvados Node Manager to help
-you find your way around the source.  The `Lifecycle of an Arvados
-compute node
-page explains how it works in concert with other Arvados components to
-prepare a node for compute work.
diff --git a/services/nodemanager/agpl-3.0.txt b/services/nodemanager/agpl-3.0.txt
deleted file mode 100644 (file)
index dba13ed..0000000
+++ /dev/null
@@ -1,661 +0,0 @@
-                       Version 3, 19 November 2007
- Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-                            Preamble
-  The GNU Affero General Public License is a free, copyleft license for
-software and other kinds of works, specifically designed to ensure
-cooperation with the community in the case of network server software.
-  The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works.  By contrast,
-our General Public Licenses are intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users.
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
-  Developers that use our General Public Licenses protect your rights
-with two steps: (1) assert copyright on the software, and (2) offer
-you this License which gives you legal permission to copy, distribute
-and/or modify the software.
-  A secondary benefit of defending all users' freedom is that
-improvements made in alternate versions of the program, if they
-receive widespread use, become available for other developers to
-incorporate.  Many developers of free software are heartened and
-encouraged by the resulting cooperation.  However, in the case of
-software used on network servers, this result may fail to come about.
-The GNU General Public License permits making a modified version and
-letting the public access it on a server without ever releasing its
-source code to the public.
-  The GNU Affero General Public License is designed specifically to
-ensure that, in such cases, the modified source code becomes available
-to the community.  It requires the operator of a network server to
-provide the source code of the modified version running there to the
-users of that server.  Therefore, public use of a modified version, on
-a publicly accessible server, gives the public access to the source
-code of the modified version.
-  An older license, called the Affero General Public License and
-published by Affero, was designed to accomplish similar goals.  This is
-a different license, not a version of the Affero GPL, but Affero has
-released a new version of the Affero GPL which permits relicensing under
-this license.
-  The precise terms and conditions for copying, distribution and
-modification follow.
-                       TERMS AND CONDITIONS
-  0. Definitions.
-  "This License" refers to version 3 of the GNU Affero General Public License.
-  "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-  "The Program" refers to any copyrightable work licensed under this
-License.  Each licensee is addressed as "you".  "Licensees" and
-"recipients" may be individuals or organizations.
-  To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy.  The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-  A "covered work" means either the unmodified Program or a work based
-on the Program.
-  To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy.  Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-  To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies.  Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-  An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License.  If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-  1. Source Code.
-  The "source code" for a work means the preferred form of the work
-for making modifications to it.  "Object code" means any non-source
-form of a work.
-  A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-  The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form.  A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-  The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities.  However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work.  For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-  The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-  The Corresponding Source for a work in source code form is that
-same work.
-  2. Basic Permissions.
-  All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met.  This License explicitly affirms your unlimited
-permission to run the unmodified Program.  The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work.  This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-  You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force.  You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright.  Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-  Conveying under any other circumstances is permitted solely under
-the conditions stated below.  Sublicensing is not allowed; section 10
-makes it unnecessary.
-  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-  No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-  When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-  4. Conveying Verbatim Copies.
-  You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-  You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-  5. Conveying Modified Source Versions.
-  You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-    a) The work must carry prominent notices stating that you modified
-    it, and giving a relevant date.
-    b) The work must carry prominent notices stating that it is
-    released under this License and any conditions added under section
-    7.  This requirement modifies the requirement in section 4 to
-    "keep intact all notices".
-    c) You must license the entire work, as a whole, under this
-    License to anyone who comes into possession of a copy.  This
-    License will therefore apply, along with any applicable section 7
-    additional terms, to the whole of the work, and all its parts,
-    regardless of how they are packaged.  This License gives no
-    permission to license the work in any other way, but it does not
-    invalidate such permission if you have separately received it.
-    d) If the work has interactive user interfaces, each must display
-    Appropriate Legal Notices; however, if the Program has interactive
-    interfaces that do not display Appropriate Legal Notices, your
-    work need not make them do so.
-  A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit.  Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-  6. Conveying Non-Source Forms.
-  You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-    a) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by the
-    Corresponding Source fixed on a durable physical medium
-    customarily used for software interchange.
-    b) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by a
-    written offer, valid for at least three years and valid for as
-    long as you offer spare parts or customer support for that product
-    model, to give anyone who possesses the object code either (1) a
-    copy of the Corresponding Source for all the software in the
-    product that is covered by this License, on a durable physical
-    medium customarily used for software interchange, for a price no
-    more than your reasonable cost of physically performing this
-    conveying of source, or (2) access to copy the
-    Corresponding Source from a network server at no charge.
-    c) Convey individual copies of the object code with a copy of the
-    written offer to provide the Corresponding Source.  This
-    alternative is allowed only occasionally and noncommercially, and
-    only if you received the object code with such an offer, in accord
-    with subsection 6b.
-    d) Convey the object code by offering access from a designated
-    place (gratis or for a charge), and offer equivalent access to the
-    Corresponding Source in the same way through the same place at no
-    further charge.  You need not require recipients to copy the
-    Corresponding Source along with the object code.  If the place to
-    copy the object code is a network server, the Corresponding Source
-    may be on a different server (operated by you or a third party)
-    that supports equivalent copying facilities, provided you maintain
-    clear directions next to the object code saying where to find the
-    Corresponding Source.  Regardless of what server hosts the
-    Corresponding Source, you remain obligated to ensure that it is
-    available for as long as needed to satisfy these requirements.
-    e) Convey the object code using peer-to-peer transmission, provided
-    you inform other peers where the object code and Corresponding
-    Source of the work are being offered to the general public at no
-    charge under subsection 6d.
-  A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-  A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling.  In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage.  For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product.  A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-  "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source.  The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-  If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information.  But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-  The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed.  Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-  Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-  7. Additional Terms.
-  "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law.  If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-  When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it.  (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.)  You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-  Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-    a) Disclaiming warranty or limiting liability differently from the
-    terms of sections 15 and 16 of this License; or
-    b) Requiring preservation of specified reasonable legal notices or
-    author attributions in that material or in the Appropriate Legal
-    Notices displayed by works containing it; or
-    c) Prohibiting misrepresentation of the origin of that material, or
-    requiring that modified versions of such material be marked in
-    reasonable ways as different from the original version; or
-    d) Limiting the use for publicity purposes of names of licensors or
-    authors of the material; or
-    e) Declining to grant rights under trademark law for use of some
-    trade names, trademarks, or service marks; or
-    f) Requiring indemnification of licensors and authors of that
-    material by anyone who conveys the material (or modified versions of
-    it) with contractual assumptions of liability to the recipient, for
-    any liability that these contractual assumptions directly impose on
-    those licensors and authors.
-  All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10.  If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term.  If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-  If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-  Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-  8. Termination.
-  You may not propagate or modify a covered work except as expressly
-provided under this License.  Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-  However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-  Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-  Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License.  If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-  9. Acceptance Not Required for Having Copies.
-  You are not required to accept this License in order to receive or
-run a copy of the Program.  Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance.  However,
-nothing other than this License grants you permission to propagate or
-modify any covered work.  These actions infringe copyright if you do
-not accept this License.  Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-  10. Automatic Licensing of Downstream Recipients.
-  Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License.  You are not responsible
-for enforcing compliance by third parties with this License.
-  An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations.  If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-  You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License.  For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-  11. Patents.
-  A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based.  The
-work thus licensed is called the contributor's "contributor version".
-  A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version.  For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-  Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-  In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement).  To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-  If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients.  "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-  If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-  A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License.  You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-  Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-  12. No Surrender of Others' Freedom.
-  If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all.  For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-  13. Remote Network Interaction; Use with the GNU General Public License.
-  Notwithstanding any other provision of this License, if you modify the
-Program, your modified version must prominently offer all users
-interacting with it remotely through a computer network (if your version
-supports such interaction) an opportunity to receive the Corresponding
-Source of your version by providing access to the Corresponding Source
-from a network server at no charge, through some standard or customary
-means of facilitating copying of software.  This Corresponding Source
-shall include the Corresponding Source for any work covered by version 3
-of the GNU General Public License that is incorporated pursuant to the
-following paragraph.
-  Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU General Public License into a single
-combined work, and to convey the resulting work.  The terms of this
-License will continue to apply to the part which is the covered work,
-but the work with which it is combined will remain governed by version
-3 of the GNU General Public License.
-  14. Revised Versions of this License.
-  The Free Software Foundation may publish revised and/or new versions of
-the GNU Affero General Public License from time to time.  Such new versions
-will be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-  Each version is given a distinguishing version number.  If the
-Program specifies that a certain numbered version of the GNU Affero General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation.  If the Program does not specify a version number of the
-GNU Affero General Public License, you may choose any version ever published
-by the Free Software Foundation.
-  If the Program specifies that a proxy can decide which future
-versions of the GNU Affero General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-  Later license versions may give you additional or different
-permissions.  However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
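-  15. Disclaimer of Warranty.
-  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-  16. Limitation of Liability.
-  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.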
-  17. Interpretation of Sections 15 and 16.
-  If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-                     END OF TERMS AND CONDITIONS
-            How to Apply These Terms to Your New Programs
-  If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-  To do so, attach the following notices to the program.  It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-    <one line to give the program's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Affero General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Affero General Public License for more details.
-    You should have received a copy of the GNU Affero General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-Also add information on how to contact you by electronic and paper mail.
-  If your software can interact with users remotely through a computer
-network, you should also make sure that it provides a way for users to
-get its source.  For example, if your program is a web application, its
-interface could display a "Source" link that leads users to an archive
-of the code.  There are many ways you could offer source, and different
-solutions will be better for different programs; see section 13 for the
-specific requirements.
-  You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU AGPL, see
-<http://www.gnu.org/licenses/>.
diff --git a/services/nodemanager/arvados-node-manager.service b/services/nodemanager/arvados-node-manager.service
deleted file mode 100644 (file)
index 38c525b..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-Description=Arvados Node Manager Daemon
-# systemd==229 (ubuntu:xenial) obeys StartLimitInterval in the [Unit] section
-# systemd>=230 (debian:9) obeys StartLimitIntervalSec in the [Unit] section
-ExecStart=/usr/bin/env sh -c '/usr/bin/arvados-node-manager --foreground --config /etc/arvados-node-manager/config.ini 2>&1 | cat'
-# systemd<=219 (centos:7, debian:8, ubuntu:trusty) obeys StartLimitInterval in the [Service] section
diff --git a/services/nodemanager/arvados_version.py b/services/nodemanager/arvados_version.py
deleted file mode 100644 (file)
index 0c65369..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-import subprocess
-import time
-import os
-import re
-SETUP_DIR = os.path.dirname(os.path.abspath(__file__))
-def choose_version_from():
-    sdk_ts = subprocess.check_output(
-        ['git', 'log', '--first-parent', '--max-count=1',
-         '--format=format:%ct', os.path.join(SETUP_DIR, "../../sdk/python")]).strip()
-    cwl_ts = subprocess.check_output(
-        ['git', 'log', '--first-parent', '--max-count=1',
-         '--format=format:%ct', SETUP_DIR]).strip()
-    if int(sdk_ts) > int(cwl_ts):
-        getver = os.path.join(SETUP_DIR, "../../sdk/python")
-    else:
-        getver = SETUP_DIR
-    return getver
-def git_version_at_commit():
-    curdir = choose_version_from()
-    myhash = subprocess.check_output(['git', 'log', '-n1', '--first-parent',
-                                       '--format=%H', curdir]).strip()
-    myversion = subprocess.check_output([curdir+'/../../build/version-at-commit.sh', myhash]).strip().decode()
-    return myversion
-def save_version(setup_dir, module, v):
-  with open(os.path.join(setup_dir, module, "_version.py"), 'wt') as fp:
-      return fp.write("__version__ = '%s'\n" % v)
-def read_version(setup_dir, module):
-  with open(os.path.join(setup_dir, module, "_version.py"), 'rt') as fp:
-      return re.match("__version__ = '(.*)'$", fp.read()).groups()[0]
-def get_version(setup_dir, module):
-    env_version = os.environ.get("ARVADOS_BUILDING_VERSION")
-    if env_version:
-        save_version(setup_dir, module, env_version)
-    else:
-        try:
-            save_version(setup_dir, module, git_version_at_commit())
-        except (subprocess.CalledProcessError, OSError):
-            pass
-    return read_version(setup_dir, module)
diff --git a/services/nodemanager/arvnodeman/__init__.py b/services/nodemanager/arvnodeman/__init__.py
deleted file mode 100644 (file)
index 3f94807..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import _strptime  # See <http://bugs.python.org/issue7980#msg221094>.
-import logging
-logger = logging.getLogger('arvnodeman')
diff --git a/services/nodemanager/arvnodeman/baseactor.py b/services/nodemanager/arvnodeman/baseactor.py
deleted file mode 100644 (file)
index bdfe5d4..0000000
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import errno
-import logging
-import os
-import signal
-import time
-import threading
-import traceback
-import pykka
-from .status import tracker
-class _TellCallableProxy(object):
-    """Internal helper class for proxying callables."""
-    def __init__(self, ref, attr_path):
-        self.actor_ref = ref
-        self._attr_path = attr_path
-    def __call__(self, *args, **kwargs):
-        message = {
-            'command': 'pykka_call',
-            'attr_path': self._attr_path,
-            'args': args,
-            'kwargs': kwargs,
-        }
-        self.actor_ref.tell(message)
-class TellActorProxy(pykka.ActorProxy):
-    """ActorProxy in which all calls are implemented as using tell().
-    The standard pykka.ActorProxy always uses ask() and returns a Future.  If
-    the target method raises an exception, it is placed in the Future object
-    and re-raised when get() is called on the Future.  Unfortunately, most
-    messaging in Node Manager is asynchronous and the caller does not store the
-    Future object returned by the call to ActorProxy.  As a result, exceptions
-    resulting from these calls end up in limbo, neither reported in the logs
-    nor handled by on_failure().
-    The TellActorProxy uses tell() instead of ask() and does not return a
-    Future object.  As a result, if the target method raises an exception, it
-    will be logged and on_failure() will be called as intended.
-    """
-    def __repr__(self):
-        return '<ActorProxy for %s, attr_path=%s>' % (
-            self.actor_ref, self._attr_path)
-    def __getattr__(self, name):
-        """Get a callable from the actor."""
-        attr_path = self._attr_path + (name,)
-        if attr_path not in self._known_attrs:
-            self._known_attrs = self._get_attributes()
-        attr_info = self._known_attrs.get(attr_path)
-        if attr_info is None:
-            raise AttributeError('%s has no attribute "%s"' % (self, name))
-        if attr_info['callable']:
-            if attr_path not in self._callable_proxies:
-                self._callable_proxies[attr_path] = _TellCallableProxy(
-                    self.actor_ref, attr_path)
-            return self._callable_proxies[attr_path]
-        else:
-            raise AttributeError('attribute "%s" is not a callable on %s' % (name, self))
-class TellableActorRef(pykka.ActorRef):
-    """ActorRef adding the tell_proxy() method to get TellActorProxy."""
-    def tell_proxy(self):
-        return TellActorProxy(self)
-class BaseNodeManagerActor(pykka.ThreadingActor):
-    """Base class for actors in node manager, redefining actor_ref as a
-    TellableActorRef and providing a default on_failure handler.
-    """
-    def __init__(self, *args, **kwargs):
-         super(pykka.ThreadingActor, self).__init__(*args, **kwargs)
-         self.actor_ref = TellableActorRef(self)
-         self._killfunc = kwargs.get("killfunc", os.kill)
-    def on_failure(self, exception_type, exception_value, tb):
-        lg = getattr(self, "_logger", logging)
-        if (exception_type in (threading.ThreadError, MemoryError) or
-            exception_type is OSError and exception_value.errno == errno.ENOMEM):
-            lg.critical("Unhandled exception is a fatal error, killing Node Manager")
-            self._killfunc(os.getpid(), signal.SIGKILL)
-        tracker.counter_add('actor_exceptions')
-    def ping(self):
-        return True
-    def get_thread(self):
-        return threading.current_thread()
-class WatchdogActor(pykka.ThreadingActor):
-    def __init__(self, timeout, *args, **kwargs):
-         super(pykka.ThreadingActor, self).__init__(*args, **kwargs)
-         self.timeout = timeout
-         self.actors = [a.proxy() for a in args]
-         self.actor_ref = TellableActorRef(self)
-         self._later = self.actor_ref.tell_proxy()
-         self._killfunc = kwargs.get("killfunc", os.kill)
-    def kill_self(self, e, act):
-        lg = getattr(self, "_logger", logging)
-        lg.critical("Watchdog exception", exc_info=e)
-        lg.critical("Actor %s watchdog ping time out, killing Node Manager", act)
-        self._killfunc(os.getpid(), signal.SIGKILL)
-    def on_start(self):
-        self._later.run()
-    def run(self):
-        a = None
-        try:
-            for a in self.actors:
-                a.ping().get(self.timeout)
-            time.sleep(20)
-            self._later.run()
-        except Exception as e:
-            self.kill_self(e, a)
diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py
deleted file mode 100644 (file)
index afc4f1c..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import logging
-import time
-import pykka
-from .config import actor_class
-def _notify_subscribers(response, subscribers):
-    """Send the response to all the subscriber methods.
-    If any of the subscriber actors have stopped, remove them from the
-    subscriber set.
-    """
-    dead_subscribers = set()
-    for subscriber in subscribers:
-        try:
-            subscriber(response)
-        except pykka.ActorDeadError:
-            dead_subscribers.add(subscriber)
-    subscribers.difference_update(dead_subscribers)
-class RemotePollLoopActor(actor_class):
-    """Abstract actor class to regularly poll a remote service.
-    This actor sends regular requests to a remote service, and sends each
-    response to subscribers.  It takes care of error handling, and retrying
-    requests with exponential backoff.
-    To use this actor, define the _send_request method.  If you also
-    define an _item_key method, this class will support subscribing to
-    a specific item by key in responses.
-    """
-    def __init__(self, client, timer_actor, poll_wait=60, max_poll_wait=180):
-        super(RemotePollLoopActor, self).__init__()
-        self._client = client
-        self._timer = timer_actor
-        self._later = self.actor_ref.tell_proxy()
-        self._polling_started = False
-        self.min_poll_wait = poll_wait
-        self.max_poll_wait = max_poll_wait
-        self.poll_wait = self.min_poll_wait
-        self.all_subscribers = set()
-        self.key_subscribers = {}
-        if hasattr(self, '_item_key'):
-            self.subscribe_to = self._subscribe_to
-    def on_start(self):
-        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, id(self.actor_urn[9:])))
-    def _start_polling(self):
-        if not self._polling_started:
-            self._polling_started = True
-            self._later.poll()
-    def subscribe(self, subscriber):
-        self.all_subscribers.add(subscriber)
-        self._logger.debug("%s subscribed to all events", subscriber.actor_ref.actor_urn)
-        self._start_polling()
-    # __init__ exposes this method to the proxy if the subclass defines
-    # _item_key.
-    def _subscribe_to(self, key, subscriber):
-        self.key_subscribers.setdefault(key, set()).add(subscriber)
-        self._logger.debug("%s subscribed to events for '%s'", subscriber.actor_ref.actor_urn, key)
-        self._start_polling()
-    def _send_request(self):
-        raise NotImplementedError("subclasses must implement request method")
-    def _got_response(self, response):
-        self.poll_wait = self.min_poll_wait
-        _notify_subscribers(response, self.all_subscribers)
-        if hasattr(self, '_item_key'):
-            items = {self._item_key(x): x for x in response}
-            for key, subscribers in self.key_subscribers.iteritems():
-                _notify_subscribers(items.get(key), subscribers)
-    def _got_error(self, error):
-        self.poll_wait = min(self.poll_wait * 2, self.max_poll_wait)
-        return "got error: {} - will try again in {} seconds".format(
-            error, self.poll_wait)
-    def is_common_error(self, exception):
-        return False
-    def poll(self, scheduled_start=None):
-        self._logger.debug("sending request")
-        start_time = time.time()
-        if scheduled_start is None:
-            scheduled_start = start_time
-        try:
-            response = self._send_request()
-        except Exception as error:
-            errmsg = self._got_error(error)
-            if self.is_common_error(error):
-                self._logger.warning(errmsg)
-            else:
-                self._logger.exception(errmsg)
-            next_poll = start_time + self.poll_wait
-        else:
-            self._got_response(response)
-            next_poll = scheduled_start + self.poll_wait
-            self._logger.info("got response with %d items in %s seconds, next poll at %s",
-                              len(response), (time.time() - scheduled_start),
-                              time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_poll)))
-        end_time = time.time()
-        if next_poll < end_time:  # We've drifted too much; start fresh.
-            next_poll = end_time + self.poll_wait
-        self._timer.schedule(next_poll, self._later.poll, next_poll)
diff --git a/services/nodemanager/arvnodeman/computenode/__init__.py b/services/nodemanager/arvnodeman/computenode/__init__.py
deleted file mode 100644 (file)
index b124c66..0000000
+++ /dev/null
@@ -1,201 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import calendar
-import functools
-import itertools
-import re
-import time
-from ..config import CLOUD_ERRORS
-from ..status import tracker
-from libcloud.common.exceptions import BaseHTTPError, RateLimitReachedError
-ARVADOS_TIMESUBSEC_RE = re.compile(r'(\.\d+)Z$')
-def arvados_node_fqdn(arvados_node, default_hostname='dynamic.compute'):
-    hostname = arvados_node.get('hostname') or default_hostname
-    return '{}.{}'.format(hostname, arvados_node['domain'])
-def arvados_node_mtime(node):
-    return arvados_timestamp(node['modified_at'])
-def arvados_timestamp(timestr):
-    subsec_match = ARVADOS_TIMESUBSEC_RE.search(timestr)
-    if subsec_match is None:
-        subsecs = .0
-    else:
-        subsecs = float(subsec_match.group(1))
-        timestr = timestr[:subsec_match.start()] + 'Z'
-    return calendar.timegm(time.strptime(timestr + 'UTC',
-                                         ARVADOS_TIMEFMT + '%Z')) + subsecs
-def timestamp_fresh(timestamp, fresh_time):
-    return (time.time() - timestamp) < fresh_time
-def arvados_node_missing(arvados_node, fresh_time):
-    """Indicate if cloud node corresponding to the arvados
-    node is "missing".
-    If True, this means the node has not pinged the API server within the timeout
-    period.  If False, the ping is up to date.  If the node has never pinged,
-    returns None.
-    """
-    if arvados_node["last_ping_at"] is None:
-        return None
-    else:
-        return not timestamp_fresh(arvados_timestamp(arvados_node["last_ping_at"]), fresh_time)
-class RetryMixin(object):
-    """Retry decorator for an method that makes remote requests.
-    Use this function to decorate method, and pass in a tuple of exceptions to
-    catch.  If the original method raises a known cloud driver error, or any of
-    the given exception types, this decorator will either go into a
-    sleep-and-retry loop with exponential backoff either by sleeping (if
-    self._timer is None) or by scheduling retries of the method (if self._timer
-    is a timer actor.)
-    """
-    def __init__(self, retry_wait, max_retry_wait, logger, cloud, timer=None):
-        self.min_retry_wait = max(1, retry_wait)
-        self.max_retry_wait = max(self.min_retry_wait, max_retry_wait)
-        self.retry_wait = retry_wait
-        self._logger = logger
-        self._cloud = cloud
-        self._timer = timer
-    @staticmethod
-    def _retry(errors=()):
-        def decorator(orig_func):
-            @functools.wraps(orig_func)
-            def retry_wrapper(self, *args, **kwargs):
-                while True:
-                    should_retry = False
-                    try:
-                        ret = orig_func(self, *args, **kwargs)
-                    except RateLimitReachedError as error:
-                        # If retry-after is zero, continue with exponential
-                        # backoff.
-                        if error.retry_after != 0:
-                            self.retry_wait = error.retry_after
-                        should_retry = True
-                    except BaseHTTPError as error:
-                        if error.headers and error.headers.get("retry-after"):
-                            try:
-                                retry_after = int(error.headers["retry-after"])
-                                # If retry-after is zero, continue with
-                                # exponential backoff.
-                                if retry_after != 0:
-                                    self.retry_wait = retry_after
-                                should_retry = True
-                            except ValueError:
-                                self._logger.warning(
-                                    "Unrecognizable Retry-After header: %r",
-                                    error.headers["retry-after"],
-                                    exc_info=error)
-                        if error.code == 429 or error.code >= 500:
-                            should_retry = True
-                    except CLOUD_ERRORS as error:
-                        tracker.counter_add('cloud_errors')
-                        should_retry = True
-                    except errors as error:
-                        should_retry = True
-                    except Exception as error:
-                        # As a libcloud workaround for drivers that don't use
-                        # typed exceptions, consider bare Exception() objects
-                        # retryable.
-                        if type(error) is Exception:
-                            tracker.counter_add('cloud_errors')
-                            should_retry = True
-                    else:
-                        # No exception
-                        self.retry_wait = self.min_retry_wait
-                        return ret
-                    # Only got here if an exception was caught.  Now determine what to do about it.
-                    if not should_retry:
-                        self.retry_wait = self.min_retry_wait
-                        self._logger.warning(
-                            "Re-raising error (no retry): %s",
-                            error, exc_info=error)
-                        raise
-                    # Retry wait out of bounds?
-                    if self.retry_wait < self.min_retry_wait:
-                        self.retry_wait = self.min_retry_wait
-                    elif self.retry_wait > self.max_retry_wait:
-                        self.retry_wait = self.max_retry_wait
-                    self._logger.warning(
-                        "Client error: %s - %s %s seconds",
-                        error,
-                        "scheduling retry in" if self._timer else "sleeping",
-                        self.retry_wait,
-                        exc_info=error)
-                    if self._timer:
-                        start_time = time.time()
-                        # reschedule to be called again
-                        self._timer.schedule(start_time + self.retry_wait,
-                                             getattr(self._later,
-                                                     orig_func.__name__),
-                                             *args, **kwargs)
-                    else:
-                        # sleep on it.
-                        time.sleep(self.retry_wait)
-                    self.retry_wait = min(self.retry_wait * 2,
-                                          self.max_retry_wait)
-                    if self._timer:
-                        # expect to be called again by timer so don't loop
-                        return
-            return retry_wrapper
-        return decorator
-class ShutdownTimer(object):
-    """Keep track of a cloud node's shutdown windows.
-    Instantiate this class with a timestamp of when a cloud node started,
-    and a list of durations (in minutes) of when the node must not and may
-    be shut down, alternating.  The class will tell you when a shutdown
-    window is open, and when the next open window will start.
-    """
-    def __init__(self, start_time, shutdown_windows):
-        # The implementation is easiest if we have an even number of windows,
-        # because then windows always alternate between open and closed.
-        # Rig that up: calculate the first shutdown window based on what's
-        # passed in.  Then, if we were given an odd number of windows, merge
-        # that first window into the last one, since they both# represent
-        # closed state.
-        first_window = shutdown_windows[0]
-        shutdown_windows = list(shutdown_windows[1:])
-        self._next_opening = start_time + (60 * first_window)
-        if len(shutdown_windows) % 2:
-            shutdown_windows.append(first_window)
-        else:
-            shutdown_windows[-1] += first_window
-        self.shutdown_windows = itertools.cycle([60 * n
-                                                 for n in shutdown_windows])
-        self._open_start = self._next_opening
-        self._open_for = next(self.shutdown_windows)
-    def _advance_opening(self):
-        while self._next_opening < time.time():
-            self._open_start = self._next_opening
-            self._next_opening += self._open_for + next(self.shutdown_windows)
-            self._open_for = next(self.shutdown_windows)
-    def next_opening(self):
-        self._advance_opening()
-        return self._next_opening
-    def window_open(self):
-        self._advance_opening()
-        return 0 < (time.time() - self._open_start) < self._open_for
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
deleted file mode 100644 (file)
index 77c515d..0000000
+++ /dev/null
@@ -1,536 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import functools
-import logging
-import time
-import re
-import libcloud.common.types as cloud_types
-from libcloud.common.exceptions import BaseHTTPError
-import pykka
-from .. import \
-    arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh, \
-    arvados_node_missing, RetryMixin
-from ...clientactor import _notify_subscribers
-from ... import config
-from ... import status
-from .transitions import transitions
-QuotaExceeded = "QuotaExceeded"
-class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
-    """Base class for actors that change a compute node's state.
-    This base class takes care of retrying changes and notifying
-    subscribers when the change is finished.
-    """
-    def __init__(self, cloud_client, arvados_client, timer_actor,
-                 retry_wait, max_retry_wait):
-        super(ComputeNodeStateChangeBase, self).__init__()
-        RetryMixin.__init__(self, retry_wait, max_retry_wait,
-                            None, cloud_client, timer_actor)
-        self._later = self.actor_ref.tell_proxy()
-        self._arvados = arvados_client
-        self.subscribers = set()
-    def _set_logger(self):
-        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
-    def on_start(self):
-        self._set_logger()
-    def _finished(self):
-        if self.subscribers is None:
-            raise Exception("Actor tried to finish twice")
-        _notify_subscribers(self.actor_ref.proxy(), self.subscribers)
-        self.subscribers = None
-        self._logger.info("finished")
-    def subscribe(self, subscriber):
-        if self.subscribers is None:
-            try:
-                subscriber(self.actor_ref.proxy())
-            except pykka.ActorDeadError:
-                pass
-        else:
-            self.subscribers.add(subscriber)
-    def _clean_arvados_node(self, arvados_node, explanation):
-        return self._arvados.nodes().update(
-            uuid=arvados_node['uuid'],
-            body={'hostname': None,
-                  'ip_address': None,
-                  'slot_number': None,
-                  'first_ping_at': None,
-                  'last_ping_at': None,
-                  'properties': {},
-                  'info': {'ec2_instance_id': None,
-                           'last_action': explanation}},
-            ).execute()
-    @staticmethod
-    def _finish_on_exception(orig_func):
-        @functools.wraps(orig_func)
-        def finish_wrapper(self, *args, **kwargs):
-            try:
-                return orig_func(self, *args, **kwargs)
-            except Exception as error:
-                self._logger.error("Actor error %s", error)
-                self._finished()
-        return finish_wrapper
-class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
-    """Actor to create and set up a cloud compute node.
-    This actor prepares an Arvados node record for a new compute node
-    (either creating one or cleaning one passed in), then boots the
-    actual compute node.  It notifies subscribers when the cloud node
-    is successfully created (the last step in the process for Node
-    Manager to handle).
-    """
-    def __init__(self, timer_actor, arvados_client, cloud_client,
-                 cloud_size, arvados_node=None,
-                 retry_wait=1, max_retry_wait=180):
-        super(ComputeNodeSetupActor, self).__init__(
-            cloud_client, arvados_client, timer_actor,
-            retry_wait, max_retry_wait)
-        self.cloud_size = cloud_size
-        self.arvados_node = None
-        self.cloud_node = None
-        self.error = None
-        if arvados_node is None:
-            self._later.create_arvados_node()
-        else:
-            self._later.prepare_arvados_node(arvados_node)
-    @ComputeNodeStateChangeBase._finish_on_exception
-    @RetryMixin._retry(config.ARVADOS_ERRORS)
-    def create_arvados_node(self):
-        self.arvados_node = self._arvados.nodes().create(
-            body={}, assign_slot=True).execute()
-        self._later.create_cloud_node()
-    @ComputeNodeStateChangeBase._finish_on_exception
-    @RetryMixin._retry(config.ARVADOS_ERRORS)
-    def prepare_arvados_node(self, node):
-        self._clean_arvados_node(node, "Prepared by Node Manager")
-        self.arvados_node = self._arvados.nodes().update(
-            uuid=node['uuid'], body={}, assign_slot=True).execute()
-        self._later.create_cloud_node()
-    @ComputeNodeStateChangeBase._finish_on_exception
-    @RetryMixin._retry()
-    def create_cloud_node(self):
-        self._logger.info("Sending create_node request for node size %s.",
-                          self.cloud_size.id)
-        try:
-            self.cloud_node = self._cloud.create_node(self.cloud_size,
-                                                      self.arvados_node)
-        except BaseHTTPError as e:
-            if e.code == 429 or "RequestLimitExceeded" in e.message:
-                # Don't consider API rate limits to be quota errors.
-                # re-raise so the Retry logic applies.
-                raise
-            # The set of possible error codes / messages isn't documented for
-            # all clouds, so use a keyword heuristic to determine if the
-            # failure is likely due to a quota.
-            if re.search(r'(exceed|quota|limit)', e.message, re.I):
-                self.error = QuotaExceeded
-                self._logger.warning("Quota exceeded: %s", e)
-                self._finished()
-                return
-            else:
-                # Something else happened, re-raise so the Retry logic applies.
-                raise
-        except Exception as e:
-            raise
-        # The information included in the node size object we get from libcloud
-        # is inconsistent between cloud drivers.  Replace libcloud NodeSize
-        # object with compatible CloudSizeWrapper object which merges the size
-        # info reported from the cloud with size information from the
-        # configuration file.
-        self.cloud_node.size = self.cloud_size
-        self._logger.info("Cloud node %s created.", self.cloud_node.id)
-        self._later.update_arvados_node_properties()
-    @ComputeNodeStateChangeBase._finish_on_exception
-    @RetryMixin._retry(config.ARVADOS_ERRORS)
-    def update_arvados_node_properties(self):
-        """Tell Arvados some details about the cloud node.
-        Currently we only include size/price from our request, which
-        we already knew before create_cloud_node(), but doing it here
-        gives us an opportunity to provide more detail from
-        self.cloud_node, too.
-        """
-        self.arvados_node['properties']['cloud_node'] = {
-            # Note this 'size' is the node size we asked the cloud
-            # driver to create -- not necessarily equal to the size
-            # reported by the cloud driver for the node that was
-            # created.
-            'size': self.cloud_size.id,
-            'price': self.cloud_size.price,
-        }
-        self.arvados_node = self._arvados.nodes().update(
-            uuid=self.arvados_node['uuid'],
-            body={'properties': self.arvados_node['properties']},
-        ).execute()
-        self._logger.info("%s updated properties.", self.arvados_node['uuid'])
-        self._later.post_create()
-    @RetryMixin._retry()
-    def post_create(self):
-        self._cloud.post_create_node(self.cloud_node)
-        self._logger.info("%s post-create work done.", self.cloud_node.id)
-        self._finished()
-    def stop_if_no_cloud_node(self):
-        if self.cloud_node is not None:
-            return False
-        self.stop()
-        return True
-class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
-    """Actor to shut down a compute node.
-    This actor simply destroys a cloud node, retrying as needed.
-    """
-    # Reasons for a shutdown to be cancelled.
-    WINDOW_CLOSED = "shutdown window closed"
-    DESTROY_FAILED = "destroy_node failed"
-    def __init__(self, timer_actor, cloud_client, arvados_client, node_monitor,
-                 cancellable=True, retry_wait=1, max_retry_wait=180):
-        # If a ShutdownActor is cancellable, it will ask the
-        # ComputeNodeMonitorActor if it's still eligible before taking each
-        # action, and stop the shutdown process if the node is no longer
-        # eligible.  Normal shutdowns based on job demand should be
-        # cancellable; shutdowns based on node misbehavior should not.
-        super(ComputeNodeShutdownActor, self).__init__(
-            cloud_client, arvados_client, timer_actor,
-            retry_wait, max_retry_wait)
-        self._monitor = node_monitor.proxy()
-        self.cloud_node = self._monitor.cloud_node.get()
-        self.cancellable = cancellable
-        self.cancel_reason = None
-        self.success = None
-    def _set_logger(self):
-        self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name))
-    def on_start(self):
-        super(ComputeNodeShutdownActor, self).on_start()
-        self._later.shutdown_node()
-    def _arvados_node(self):
-        return self._monitor.arvados_node.get()
-    def _finished(self, success_flag=None):
-        if success_flag is not None:
-            self.success = success_flag
-        return super(ComputeNodeShutdownActor, self)._finished()
-    def cancel_shutdown(self, reason, **kwargs):
-        if not self.cancellable:
-            return False
-        if self.cancel_reason is not None:
-            # already cancelled
-            return False
-        self.cancel_reason = reason
-        self._logger.info("Shutdown cancelled: %s.", reason)
-        self._finished(success_flag=False)
-        return True
-    def _cancel_on_exception(orig_func):
-        @functools.wraps(orig_func)
-        def finish_wrapper(self, *args, **kwargs):
-            try:
-                return orig_func(self, *args, **kwargs)
-            except Exception as error:
-                self._logger.error("Actor error %s", error)
-                self._logger.debug("", exc_info=True)
-                self._later.cancel_shutdown("Unhandled exception %s" % error, try_resume=False)
-        return finish_wrapper
-    @_cancel_on_exception
-    def shutdown_node(self):
-        if self.cancel_reason is not None:
-            # already cancelled
-            return
-        if self.cancellable:
-            self._logger.info("Checking that node is still eligible for shutdown")
-            eligible, reason = self._monitor.shutdown_eligible().get()
-            if not eligible:
-                self.cancel_shutdown("No longer eligible for shut down because %s" % reason,
-                                     try_resume=True)
-                return
-        # If boot failed, count the event
-        if self._monitor.get_state().get() == 'unpaired':
-            status.tracker.counter_add('boot_failures')
-        self._destroy_node()
-    def _destroy_node(self):
-        self._logger.info("Starting shutdown")
-        arv_node = self._arvados_node()
-        if self._cloud.destroy_node(self.cloud_node):
-            self.cancellable = False
-            self._logger.info("Shutdown success")
-            if arv_node:
-                self._later.clean_arvados_node(arv_node)
-            else:
-                self._finished(success_flag=True)
-        else:
-            self.cancel_shutdown(self.DESTROY_FAILED, try_resume=False)
-    @ComputeNodeStateChangeBase._finish_on_exception
-    @RetryMixin._retry(config.ARVADOS_ERRORS)
-    def clean_arvados_node(self, arvados_node):
-        self._clean_arvados_node(arvados_node, "Shut down by Node Manager")
-        self._finished(success_flag=True)
-class ComputeNodeUpdateActor(config.actor_class, RetryMixin):
-    """Actor to dispatch one-off cloud management requests.
-    This actor receives requests for small cloud updates, and
-    dispatches them to a real driver.  ComputeNodeMonitorActors use
-    this to perform maintenance tasks on themselves.  Having a
-    dedicated actor for this gives us the opportunity to control the
-    flow of requests; e.g., by backing off when errors occur.
-    """
-    def __init__(self, cloud_factory, timer_actor, max_retry_wait=180):
-        super(ComputeNodeUpdateActor, self).__init__()
-        RetryMixin.__init__(self, 1, max_retry_wait,
-                            None, cloud_factory(), timer_actor)
-        self._cloud = cloud_factory()
-        self._later = self.actor_ref.tell_proxy()
-    def _set_logger(self):
-        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
-    def on_start(self):
-        self._set_logger()
-    @RetryMixin._retry()
-    def sync_node(self, cloud_node, arvados_node):
-        if self._cloud.node_fqdn(cloud_node) != arvados_node_fqdn(arvados_node):
-            return self._cloud.sync_node(cloud_node, arvados_node)
-class ComputeNodeMonitorActor(config.actor_class):
-    """Actor to manage a running compute node.
-    This actor gets updates about a compute node's cloud and Arvados records.
-    It uses this information to notify subscribers when the node is eligible
-    for shutdown.
-    """
-    def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
-                 timer_actor, update_actor, cloud_client,
-                 arvados_node=None, poll_stale_after=600, node_stale_after=3600,
-                 boot_fail_after=1800, consecutive_idle_count=0
-    ):
-        super(ComputeNodeMonitorActor, self).__init__()
-        self._later = self.actor_ref.tell_proxy()
-        self._shutdowns = shutdown_timer
-        self._timer = timer_actor
-        self._update = update_actor
-        self._cloud = cloud_client
-        self.cloud_node = cloud_node
-        self.cloud_node_start_time = cloud_node_start_time
-        self.poll_stale_after = poll_stale_after
-        self.node_stale_after = node_stale_after
-        self.boot_fail_after = boot_fail_after
-        self.subscribers = set()
-        self.arvados_node = None
-        self.consecutive_idle_count = consecutive_idle_count
-        self.consecutive_idle = 0
-        self._later.update_arvados_node(arvados_node)
-        self.last_shutdown_opening = None
-        self._later.consider_shutdown()
-    def _set_logger(self):
-        self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name))
-    def on_start(self):
-        self._set_logger()
-        self._timer.schedule(self.cloud_node_start_time + self.boot_fail_after, self._later.consider_shutdown)
-    def subscribe(self, subscriber):
-        self.subscribers.add(subscriber)
-    def _debug(self, msg, *args):
-        self._logger.debug(msg, *args)
-    def get_state(self):
-        """Get node state, one of ['unpaired', 'busy', 'idle', 'down']."""
-        # If this node is not associated with an Arvados node, return
-        # 'unpaired' if we're in the boot grace period, and 'down' if not,
-        # so it isn't counted towards usable nodes.
-        if self.arvados_node is None:
-            if timestamp_fresh(self.cloud_node_start_time,
-                               self.boot_fail_after):
-                return 'unpaired'
-            else:
-                return 'down'
-        state = self.arvados_node['crunch_worker_state']
-        # If state information is not available because it is missing or the
-        # record is stale, return 'down'.
-        if not state or not timestamp_fresh(arvados_node_mtime(self.arvados_node),
-                                            self.node_stale_after):
-            state = 'down'
-        # There's a window between when a node pings for the first time and the
-        # value of 'slurm_state' is synchronized by crunch-dispatch.  In this
-        # window, the node will still report as 'down'.  Check that
-        # first_ping_at is truthy and consider the node 'idle' during the
-        # initial boot grace period.
-        if (state == 'down' and
-            self.arvados_node['first_ping_at'] and
-            timestamp_fresh(self.cloud_node_start_time,
-                            self.boot_fail_after) and
-            not self._cloud.broken(self.cloud_node)):
-            state = 'idle'
-        # "missing" means last_ping_at is stale, this should be
-        # considered "down"
-        if arvados_node_missing(self.arvados_node, self.node_stale_after):
-            state = 'down'
-        # Turns out using 'job_uuid' this way is a bad idea.  The node record
-        # is assigned the job_uuid before the job is locked (which removes it
-        # from the queue) which means the job will be double-counted as both in
-        # the wishlist and but also keeping a node busy.  This end result is
-        # excess nodes being booted.
-        #if state == 'idle' and self.arvados_node['job_uuid']:
-        #    state = 'busy'
-        # Update idle node times tracker
-        if state == 'idle':
-            status.tracker.idle_in(self.arvados_node['hostname'])
-        else:
-            status.tracker.idle_out(self.arvados_node['hostname'])
-        return state
-    def in_state(self, *states):
-        return self.get_state() in states
-    def shutdown_eligible(self):
-        """Determine if node is candidate for shut down.
-        Returns a tuple of (boolean, string) where the first value is whether
-        the node is candidate for shut down, and the second value is the
-        reason for the decision.
-        """
-        # If this node's size is invalid (because it has a stale arvados_node_size
-        # tag), return True so that it's properly shut down.
-        if self.cloud_node.size.id == 'invalid':
-            return (True, "node's size tag '%s' not recognizable" % (self.cloud_node.extra['arvados_node_size'],))
-        # Collect states and then consult state transition table whether we
-        # should shut down.  Possible states are:
-        # crunch_worker_state = ['unpaired', 'busy', 'idle', 'down']
-        # window = ["open", "closed"]
-        # boot_grace = ["boot wait", "boot exceeded"]
-        # idle_grace = ["not idle", "idle wait", "idle exceeded"]
-        if self.arvados_node and not timestamp_fresh(arvados_node_mtime(self.arvados_node), self.node_stale_after):
-            return (False, "node state is stale")
-        crunch_worker_state = self.get_state()
-        window = "open" if self._shutdowns.window_open() else "closed"
-        if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
-            boot_grace = "boot wait"
-        else:
-            boot_grace = "boot exceeded"
-        if crunch_worker_state == "idle":
-            # Must report as "idle" at least "consecutive_idle_count" times
-            if self.consecutive_idle < self.consecutive_idle_count:
-                idle_grace = 'idle wait'
-            else:
-                idle_grace = 'idle exceeded'
-        else:
-            idle_grace = 'not idle'
-        node_state = (crunch_worker_state, window, boot_grace, idle_grace)
-        t = transitions[node_state]
-        if t is not None:
-            # yes, shutdown eligible
-            return (True, "node state is %s" % (node_state,))
-        else:
-            # no, return a reason
-            return (False, "node state is %s" % (node_state,))
-    def consider_shutdown(self):
-        try:
-            eligible, reason = self.shutdown_eligible()
-            next_opening = self._shutdowns.next_opening()
-            if eligible:
-                self._debug("Suggesting shutdown because %s", reason)
-                _notify_subscribers(self.actor_ref.proxy(), self.subscribers)
-            else:
-                self._debug("Not eligible for shut down because %s", reason)
-                if self.last_shutdown_opening != next_opening:
-                    self._debug("Shutdown window closed.  Next at %s.",
-                                time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
-                    self._timer.schedule(next_opening, self._later.consider_shutdown)
-                    self.last_shutdown_opening = next_opening
-        except Exception:
-            self._logger.exception("Unexpected exception")
-    def offer_arvados_pair(self, arvados_node):
-        first_ping_s = arvados_node.get('first_ping_at')
-        if (self.arvados_node is not None) or (not first_ping_s):
-            return None
-        elif ((arvados_node['info'].get('ec2_instance_id') == self._cloud.node_id(self.cloud_node)) and
-              (arvados_timestamp(first_ping_s) >= self.cloud_node_start_time)):
-            self._later.update_arvados_node(arvados_node)
-            return self.cloud_node.id
-        else:
-            return None
-    def update_cloud_node(self, cloud_node):
-        if cloud_node is not None:
-            self.cloud_node = cloud_node
-            self._later.consider_shutdown()
-    def update_arvados_node(self, arvados_node):
-        """Called when the latest Arvados node record is retrieved.
-        Calls the updater's sync_node() method.
-        """
-        # This method is a little unusual in the way it just fires off the
-        # request without checking the result or retrying errors.  That's
-        # because this update happens every time we reload the Arvados node
-        # list: if a previous sync attempt failed, we'll see that the names
-        # are out of sync and just try again.  ComputeNodeUpdateActor has
-        # the logic to throttle those effective retries when there's trouble.
-        if arvados_node is not None:
-            self.arvados_node = arvados_node
-            self._update.sync_node(self.cloud_node, self.arvados_node)
-            if self.arvados_node['crunch_worker_state'] == "idle":
-                self.consecutive_idle += 1
-            else:
-                self.consecutive_idle = 0
-            self._later.consider_shutdown()
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
deleted file mode 100644 (file)
index 5b7785a..0000000
+++ /dev/null
@@ -1,118 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import subprocess32 as subprocess
-import time
-from . import ComputeNodeMonitorActor
-from . import ComputeNodeSetupActor as SetupActorBase
-from . import ComputeNodeShutdownActor as ShutdownActorBase
-from . import ComputeNodeUpdateActor as UpdateActorBase
-from .. import RetryMixin
-class SlurmMixin(object):
-    SLURM_END_STATES = frozenset(['down\n', 'down*\n',
-                                  'drain\n', 'drain*\n',
-                                  'fail\n', 'fail*\n'])
-    SLURM_DRAIN_STATES = frozenset(['drain\n', 'drng\n'])
-    def _update_slurm_node(self, nodename, updates):
-        cmd = ['scontrol', 'update', 'NodeName=' + nodename] + updates
-        try:
-            subprocess.check_output(cmd)
-        except:
-            self._logger.error(
-                "SLURM update %r failed", cmd, exc_info=True)
-    def _update_slurm_size_attrs(self, nodename, size):
-        self._update_slurm_node(nodename, [
-            'Weight=%i' % int(size.price * 1000),
-            'Features=instancetype=' + size.id,
-        ])
-    def _get_slurm_state(self, nodename):
-        return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', nodename])
-class ComputeNodeSetupActor(SlurmMixin, SetupActorBase):
-    def create_cloud_node(self):
-        hostname = self.arvados_node.get("hostname")
-        if hostname:
-            self._update_slurm_size_attrs(hostname, self.cloud_size)
-        return super(ComputeNodeSetupActor, self).create_cloud_node()
-class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
-    def on_start(self):
-        arv_node = self._arvados_node()
-        if arv_node is None:
-            self._nodename = None
-            return super(ComputeNodeShutdownActor, self).on_start()
-        else:
-            self._set_logger()
-            self._nodename = arv_node['hostname']
-            self._logger.info("Draining SLURM node %s", self._nodename)
-            self._later.issue_slurm_drain()
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
-    def cancel_shutdown(self, reason, try_resume=True):
-        if self._nodename:
-            if try_resume and self._get_slurm_state(self._nodename) in self.SLURM_DRAIN_STATES:
-                # Resume from "drng" or "drain"
-                self._update_slurm_node(self._nodename, ['State=RESUME'])
-            else:
-                # Node is in a state such as 'idle' or 'alloc' so don't
-                # try to resume it because that will just raise an error.
-                pass
-        return super(ComputeNodeShutdownActor, self).cancel_shutdown(reason)
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
-    def issue_slurm_drain(self):
-        if self.cancel_reason is not None:
-            return
-        if self._nodename:
-            self._update_slurm_node(self._nodename, [
-                'State=DRAIN', 'Reason=Node Manager shutdown'])
-            self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
-            self._later.await_slurm_drain()
-        else:
-            self._later.shutdown_node()
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
-    def await_slurm_drain(self):
-        if self.cancel_reason is not None:
-            return
-        output = self._get_slurm_state(self._nodename)
-        if output in ("drng\n", "alloc\n", "drng*\n", "alloc*\n"):
-            self._timer.schedule(time.time() + 10,
-                                 self._later.await_slurm_drain)
-        elif output in ("idle\n",):
-            # Not in "drng" but idle, don't shut down
-            self.cancel_shutdown("slurm state is %s" % output.strip(), try_resume=False)
-        else:
-            # any other state.
-            self._later.shutdown_node()
-    def _destroy_node(self):
-        if self._nodename:
-            self._update_slurm_node(self._nodename, [
-                'State=DOWN', 'Reason=Node Manager shutdown'])
-        super(ComputeNodeShutdownActor, self)._destroy_node()
-class ComputeNodeUpdateActor(SlurmMixin, UpdateActorBase):
-    def sync_node(self, cloud_node, arvados_node):
-        """Keep SLURM's node properties up to date."""
-        hostname = arvados_node.get("hostname")
-        features = arvados_node.get("slurm_node_features", "").split(",")
-        sizefeature = "instancetype=" + cloud_node.size.id
-        if hostname and sizefeature not in features:
-            # This probably means SLURM has restarted and lost our
-            # dynamically configured node weights and features.
-            self._update_slurm_size_attrs(hostname, cloud_node.size)
-        return super(ComputeNodeUpdateActor, self).sync_node(
-            cloud_node, arvados_node)
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/transitions.py b/services/nodemanager/arvnodeman/computenode/dispatch/transitions.py
deleted file mode 100644 (file)
index 93f50c1..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-transitions = {
- ('busy', 'closed', 'boot exceeded', 'idle exceeded'): None,
- ('busy', 'closed', 'boot exceeded', 'idle wait'): None,
- ('busy', 'closed', 'boot exceeded', 'not idle'): None,
- ('busy', 'closed', 'boot wait', 'idle exceeded'): None,
- ('busy', 'closed', 'boot wait', 'idle wait'): None,
- ('busy', 'closed', 'boot wait', 'not idle'): None,
- ('busy', 'open', 'boot exceeded', 'idle exceeded'): None,
- ('busy', 'open', 'boot exceeded', 'idle wait'): None,
- ('busy', 'open', 'boot exceeded', 'not idle'): None,
- ('busy', 'open', 'boot wait', 'idle exceeded'): None,
- ('busy', 'open', 'boot wait', 'idle wait'): None,
- ('busy', 'open', 'boot wait', 'not idle'): None,
- ('down', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
- ('down', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
- ('down', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
- ('down', 'closed', 'boot wait', 'idle exceeded'): None,
- ('down', 'closed', 'boot wait', 'idle wait'): None,
- ('down', 'closed', 'boot wait', 'not idle'): None,
- ('down', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
- ('down', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
- ('down', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
- ('down', 'open', 'boot wait', 'idle exceeded'): "START_SHUTDOWN",
- ('down', 'open', 'boot wait', 'idle wait'): "START_SHUTDOWN",
- ('down', 'open', 'boot wait', 'not idle'): "START_SHUTDOWN",
- ('idle', 'closed', 'boot exceeded', 'idle exceeded'): None,
- ('idle', 'closed', 'boot exceeded', 'idle wait'): None,
- ('idle', 'closed', 'boot exceeded', 'not idle'): None,
- ('idle', 'closed', 'boot wait', 'idle exceeded'): None,
- ('idle', 'closed', 'boot wait', 'idle wait'): None,
- ('idle', 'closed', 'boot wait', 'not idle'): None,
- ('idle', 'open', 'boot exceeded', 'idle exceeded'): "START_DRAIN",
- ('idle', 'open', 'boot exceeded', 'idle wait'): None,
- ('idle', 'open', 'boot exceeded', 'not idle'): None,
- ('idle', 'open', 'boot wait', 'idle exceeded'): "START_DRAIN",
- ('idle', 'open', 'boot wait', 'idle wait'): None,
- ('idle', 'open', 'boot wait', 'not idle'): None,
- ('unpaired', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
- ('unpaired', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
- ('unpaired', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
- ('unpaired', 'closed', 'boot wait', 'idle exceeded'): None,
- ('unpaired', 'closed', 'boot wait', 'idle wait'): None,
- ('unpaired', 'closed', 'boot wait', 'not idle'): None,
- ('unpaired', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
- ('unpaired', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
- ('unpaired', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
- ('unpaired', 'open', 'boot wait', 'idle exceeded'): None,
- ('unpaired', 'open', 'boot wait', 'idle wait'): None,
- ('unpaired', 'open', 'boot wait', 'not idle'): None,
- ('fail', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
- ('fail', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
- ('fail', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
- ('fail', 'closed', 'boot wait', 'idle exceeded'): "START_SHUTDOWN",
- ('fail', 'closed', 'boot wait', 'idle wait'): "START_SHUTDOWN",
- ('fail', 'closed', 'boot wait', 'not idle'): "START_SHUTDOWN",
- ('fail', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
- ('fail', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
- ('fail', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
- ('fail', 'open', 'boot wait', 'idle exceeded'): "START_SHUTDOWN",
- ('fail', 'open', 'boot wait', 'idle wait'): "START_SHUTDOWN",
- ('fail', 'open', 'boot wait', 'not idle'): "START_SHUTDOWN"}
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
deleted file mode 100644 (file)
index 48d19f5..0000000
+++ /dev/null
@@ -1,253 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import logging
-from operator import attrgetter
-import libcloud.common.types as cloud_types
-from libcloud.compute.base import NodeDriver, NodeAuthSSHKey
-from ...config import CLOUD_ERRORS
-from ...status import tracker
-from .. import RetryMixin
-class BaseComputeNodeDriver(RetryMixin):
-    """Abstract base class for compute node drivers.
-    libcloud drivers abstract away many of the differences between
-    cloud providers, but managing compute nodes requires some
-    cloud-specific features (e.g., keeping track of node FQDNs and
-    boot times).  Compute node drivers are responsible for translating
-    the node manager's cloud requests to a specific cloud's
-    vocabulary.
-    Subclasses must implement arvados_create_kwargs, sync_node,
-    node_fqdn, and node_start_time.
-    """
-    @RetryMixin._retry()
-    def _create_driver(self, driver_class, **auth_kwargs):
-        return driver_class(**auth_kwargs)
-    @RetryMixin._retry()
-    def sizes(self):
-        if self._sizes is None:
-            self._sizes = {sz.id: sz for sz in self.real.list_sizes()}
-        return self._sizes
-    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
-                 driver_class, retry_wait=1, max_retry_wait=180):
-        """Base initializer for compute node drivers.
-        Arguments:
-        * auth_kwargs: A dictionary of arguments that are passed into the
-          driver_class constructor to instantiate a libcloud driver.
-        * list_kwargs: A dictionary of arguments that are passed to the
-          libcloud driver's list_nodes method to return the list of compute
-          nodes.
-        * create_kwargs: A dictionary of arguments that are passed to the
-          libcloud driver's create_node method to create a new compute node.
-        * driver_class: The class of a libcloud driver to use.
-        """
-        super(BaseComputeNodeDriver, self).__init__(retry_wait, max_retry_wait,
-                                         logging.getLogger(self.__class__.__name__),
-                                         type(self),
-                                         None)
-        self.real = self._create_driver(driver_class, **auth_kwargs)
-        self.list_kwargs = list_kwargs
-        self.create_kwargs = create_kwargs
-        # Transform entries in create_kwargs.  For each key K, if this class
-        # has an _init_K method, remove the entry and call _init_K with the
-        # corresponding value.  If _init_K returns None, the entry stays out
-        # of the dictionary (we expect we're holding the value somewhere
-        # else, like an instance variable).  Otherwise, _init_K returns a
-        # key-value tuple pair, and we add that entry to create_kwargs.
-        for key in self.create_kwargs.keys():
-            init_method = getattr(self, '_init_' + key, None)
-            if init_method is not None:
-                new_pair = init_method(self.create_kwargs.pop(key))
-                if new_pair is not None:
-                    self.create_kwargs[new_pair[0]] = new_pair[1]
-        self._sizes = None
-    def _init_ping_host(self, ping_host):
-        self.ping_host = ping_host
-    def _init_ssh_key(self, filename):
-        with open(filename) as ssh_file:
-            key = NodeAuthSSHKey(ssh_file.read())
-        return 'auth', key
-    def search_for_now(self, term, list_method, key=attrgetter('id'), **kwargs):
-        """Return one matching item from a list of cloud objects.
-        Raises ValueError if the number of matching objects is not exactly 1.
-        Arguments:
-        * term: The value that identifies a matching item.
-        * list_method: A string that names the method to call for a
-          list of objects.
-        * key: A function that accepts a cloud object and returns a
-          value search for a `term` match on each item.  Returns the
-          object's 'id' attribute by default.
-        """
-        try:
-            list_func = getattr(self, list_method)
-        except AttributeError:
-            list_func = getattr(self.real, list_method)
-        items = list_func(**kwargs)
-        results = [item for item in items if key(item) == term]
-        count = len(results)
-        if count != 1:
-            raise ValueError("{} returned {} results for {!r}".format(
-                    list_method, count, term))
-        return results[0]
-    def search_for(self, term, list_method, key=attrgetter('id'), **kwargs):
-        """Return one cached matching item from a list of cloud objects.
-        See search_for_now() for details of arguments and exceptions.
-        This method caches results, so it's good to find static cloud objects
-        like node sizes, regions, etc.
-        """
-        cache_key = (list_method, term)
-        if cache_key not in self.SEARCH_CACHE:
-            self.SEARCH_CACHE[cache_key] = self.search_for_now(
-                term, list_method, key, **kwargs)
-        return self.SEARCH_CACHE[cache_key]
-    def list_nodes(self, **kwargs):
-        l = self.list_kwargs.copy()
-        l.update(kwargs)
-        try:
-            return self.real.list_nodes(**l)
-        except CLOUD_ERRORS:
-            tracker.counter_add('list_nodes_errors')
-            raise
-    def create_cloud_name(self, arvados_node):
-        """Return a cloud node name for the given Arvados node record.
-        Subclasses must override this method.  It should return a string
-        that can be used as the name for a newly-created cloud node,
-        based on identifying information in the Arvados node record.
-        Arguments:
-        * arvados_node: This Arvados node record to seed the new cloud node.
-        """
-        raise NotImplementedError("BaseComputeNodeDriver.create_cloud_name")
-    def arvados_create_kwargs(self, size, arvados_node):
-        """Return dynamic keyword arguments for create_node.
-        Subclasses must override this method.  It should return a dictionary
-        of keyword arguments to pass to the libcloud driver's create_node
-        method.  These arguments will extend the static arguments in
-        create_kwargs.
-        Arguments:
-        * size: The node size that will be created (libcloud NodeSize object)
-        * arvados_node: The Arvados node record that will be associated
-          with this cloud node, as returned from the API server.
-        """
-        raise NotImplementedError("BaseComputeNodeDriver.arvados_create_kwargs")
-    def broken(self, cloud_node):
-        """Return true if libcloud has indicated the node is in a "broken" state."""
-        return False
-    def _make_ping_url(self, arvados_node):
-        return 'https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.format(
-            self.ping_host, arvados_node['uuid'],
-            arvados_node['info']['ping_secret'])
-    @staticmethod
-    def _name_key(cloud_object):
-        return cloud_object.name
-    def create_node(self, size, arvados_node):
-        try:
-            kwargs = self.create_kwargs.copy()
-            kwargs.update(self.arvados_create_kwargs(size, arvados_node))
-            kwargs['size'] = size.real
-            return self.real.create_node(**kwargs)
-        except CLOUD_ERRORS as create_error:
-            # Workaround for bug #6702: sometimes the create node request
-            # succeeds but times out and raises an exception instead of
-            # returning a result.  If this happens, we get stuck in a retry
-            # loop forever because subsequent create_node attempts will fail
-            # due to node name collision.  So check if the node we intended to
-            # create shows up in the cloud node list and return it if found.
-            try:
-                return self.search_for_now(kwargs['name'], 'list_nodes', self._name_key)
-            except ValueError:
-                tracker.counter_add('create_node_errors')
-                raise create_error
-    def post_create_node(self, cloud_node):
-        # ComputeNodeSetupActor calls this method after the cloud node is
-        # created.  Any setup tasks that need to happen afterward (e.g.,
-        # tagging) should be done in this method.
-        pass
-    def sync_node(self, cloud_node, arvados_node):
-        # When a compute node first pings the API server, the API server
-        # will automatically assign some attributes on the corresponding
-        # node record, like hostname.  This method should propagate that
-        # information back to the cloud node appropriately.
-        raise NotImplementedError("BaseComputeNodeDriver.sync_node")
-    @classmethod
-    def node_fqdn(cls, node):
-        # This method should return the FQDN of the node object argument.
-        # Different clouds store this in different places.
-        raise NotImplementedError("BaseComputeNodeDriver.node_fqdn")
-    @classmethod
-    def node_start_time(cls, node):
-        # This method should return the time the node was started, in
-        # seconds since the epoch UTC.
-        raise NotImplementedError("BaseComputeNodeDriver.node_start_time")
-    def destroy_node(self, cloud_node):
-        try:
-            return self.real.destroy_node(cloud_node)
-        except CLOUD_ERRORS:
-            # Sometimes the destroy node request succeeds but times out and
-            # raises an exception instead of returning success.  If this
-            # happens, we get a noisy stack trace.  Check if the node is still
-            # on the node list.  If it is gone, we can declare victory.
-            try:
-                self.search_for_now(cloud_node.id, 'list_nodes')
-            except ValueError:
-                # If we catch ValueError, that means search_for_now didn't find
-                # it, which means destroy_node actually succeeded.
-                return True
-            # The node is still on the list.  Re-raise.
-            tracker.counter_add('destroy_node_errors')
-            raise
-    # Now that we've defined all our own methods, delegate generic, public
-    # attributes of libcloud drivers that we haven't defined ourselves.
-    def _delegate_to_real(attr_name):
-        return property(
-            lambda self: getattr(self.real, attr_name),
-            lambda self, value: setattr(self.real, attr_name, value),
-            doc=getattr(getattr(NodeDriver, attr_name), '__doc__', None))
-    # node id
-    @classmethod
-    def node_id(cls):
-        raise NotImplementedError("BaseComputeNodeDriver.node_id")
-    _locals = locals()
-    for _attr_name in dir(NodeDriver):
-        if (not _attr_name.startswith('_')) and (_attr_name not in _locals):
-            _locals[_attr_name] = _delegate_to_real(_attr_name)
diff --git a/services/nodemanager/arvnodeman/computenode/driver/azure.py b/services/nodemanager/arvnodeman/computenode/driver/azure.py
deleted file mode 100644 (file)
index 35c8b5a..0000000
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import pipes
-import time
-import libcloud.compute.base as cloud_base
-import libcloud.compute.providers as cloud_provider
-import libcloud.compute.types as cloud_types
-from libcloud.common.exceptions import BaseHTTPError
-from . import BaseComputeNodeDriver
-from .. import arvados_node_fqdn, arvados_timestamp, ARVADOS_TIMEFMT
-class ComputeNodeDriver(BaseComputeNodeDriver):
-    DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.AZURE_ARM)
-    SEARCH_CACHE = {}
-    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
-                 driver_class=DEFAULT_DRIVER):
-        if not list_kwargs.get("ex_resource_group"):
-            raise Exception("Must include ex_resource_group in Cloud List configuration (list_kwargs)")
-        create_kwargs["ex_resource_group"] = list_kwargs["ex_resource_group"]
-        self.tags = {key[4:]: value
-                     for key, value in create_kwargs.iteritems()
-                     if key.startswith('tag_')}
-        # filter out tags from create_kwargs
-        create_kwargs = {key: value
-                         for key, value in create_kwargs.iteritems()
-                         if not key.startswith('tag_')}
-        super(ComputeNodeDriver, self).__init__(
-            auth_kwargs, list_kwargs, create_kwargs,
-            driver_class)
-    def create_cloud_name(self, arvados_node):
-        uuid_parts = arvados_node['uuid'].split('-', 2)
-        return 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
-    def arvados_create_kwargs(self, size, arvados_node):
-        tags = {
-            # Set up tag indicating the Arvados assigned Cloud Size id.
-            'arvados_node_size': size.id,
-            'booted_at': time.strftime(ARVADOS_TIMEFMT, time.gmtime()),
-            'arv-ping-url': self._make_ping_url(arvados_node)
-        }
-        tags.update(self.tags)
-        name = self.create_cloud_name(arvados_node)
-        customdata = """#!/bin/sh
-mkdir -p    /var/tmp/arv-node-data/meta-data
-echo %s > /var/tmp/arv-node-data/arv-ping-url
-echo %s > /var/tmp/arv-node-data/meta-data/instance-id
-echo %s > /var/tmp/arv-node-data/meta-data/instance-type
-""" % (pipes.quote(tags['arv-ping-url']),
-       pipes.quote(name),
-       pipes.quote(size.id))
-        return {
-            'name': name,
-            'ex_tags': tags,
-            'ex_customdata': customdata
-        }
-    def sync_node(self, cloud_node, arvados_node):
-        try:
-            self.real.ex_create_tags(cloud_node,
-                                     {'hostname': arvados_node_fqdn(arvados_node)})
-            return True
-        except BaseHTTPError as b:
-            return False
-    def _init_image(self, urn):
-        return "image", self.get_image(urn)
-    def list_nodes(self):
-        # Azure only supports filtering node lists by resource group.
-        # Do our own filtering based on tag.
-        nodes = [node for node in
-                super(ComputeNodeDriver, self).list_nodes(ex_fetch_nic=False, ex_fetch_power_state=False)
-                if node.extra.get("tags", {}).get("arvados-class") == self.tags["arvados-class"]]
-        for n in nodes:
-            # Need to populate Node.size
-            if not n.size:
-                n.size = self.sizes()[n.extra["properties"]["hardwareProfile"]["vmSize"]]
-            n.extra['arvados_node_size'] = n.extra.get('tags', {}).get('arvados_node_size') or n.size.id
-        return nodes
-    def broken(self, cloud_node):
-        """Return true if libcloud has indicated the node is in a "broken" state."""
-        # UNKNOWN means the node state is unrecognized, which in practice means some combination
-        # of failure that the Azure libcloud driver doesn't know how to interpret.
-        return (cloud_node.state in (cloud_types.NodeState.ERROR, cloud_types.NodeState.UNKNOWN))
-    @classmethod
-    def node_fqdn(cls, node):
-        return node.extra["tags"].get("hostname")
-    @classmethod
-    def node_start_time(cls, node):
-        return arvados_timestamp(node.extra["tags"].get("booted_at"))
-    @classmethod
-    def node_id(cls, node):
-        return node.name
diff --git a/services/nodemanager/arvnodeman/computenode/driver/dummy.py b/services/nodemanager/arvnodeman/computenode/driver/dummy.py
deleted file mode 100644 (file)
index 14845ac..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import time
-import libcloud.compute.providers as cloud_provider
-import libcloud.compute.types as cloud_types
-from . import BaseComputeNodeDriver
-from .. import arvados_node_fqdn
-class ComputeNodeDriver(BaseComputeNodeDriver):
-    """Compute node driver wrapper for libcloud's dummy driver.
-    This class provides the glue necessary to run the node manager with a
-    dummy cloud.  It's useful for testing.
-    """
-    DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.DUMMY)
-    DEFAULT_REAL = DEFAULT_DRIVER('ComputeNodeDriver')
-    DUMMY_START_TIME = time.time()
-    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
-                 driver_class=DEFAULT_DRIVER):
-        super(ComputeNodeDriver, self).__init__(
-            auth_kwargs, list_kwargs, create_kwargs, driver_class)
-        if driver_class is self.DEFAULT_DRIVER:
-            self.real = self.DEFAULT_REAL
-    def _ensure_private_ip(self, node):
-        if not node.private_ips:
-            node.private_ips = ['10.10.0.{}'.format(node.id)]
-    def arvados_create_kwargs(self, size, arvados_node):
-        return {}
-    def list_nodes(self):
-        nodelist = super(ComputeNodeDriver, self).list_nodes()
-        for node in nodelist:
-            self._ensure_private_ip(node)
-            node.size = self.sizes()["1"]
-        return nodelist
-    def create_node(self, size, arvados_node):
-        node = super(ComputeNodeDriver, self).create_node(size, arvados_node)
-        self._ensure_private_ip(node)
-        return node
-    def sync_node(self, cloud_node, arvados_node):
-        cloud_node.name = arvados_node_fqdn(arvados_node)
-    @classmethod
-    def node_fqdn(cls, node):
-        return node.name
-    @classmethod
-    def node_start_time(cls, node):
-        return cls.DUMMY_START_TIME
diff --git a/services/nodemanager/arvnodeman/computenode/driver/ec2.py b/services/nodemanager/arvnodeman/computenode/driver/ec2.py
deleted file mode 100644 (file)
index 418a9f9..0000000
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import time
-import libcloud.compute.base as cloud_base
-import libcloud.compute.providers as cloud_provider
-import libcloud.compute.types as cloud_types
-from libcloud.compute.drivers import ec2 as cloud_ec2
-from . import BaseComputeNodeDriver
-from .. import arvados_node_fqdn
-### Monkeypatch libcloud to support AWS' new SecurityGroup API.
-# These classes can be removed when libcloud support specifying
-# security groups with the SecurityGroupId parameter.
-class ANMEC2Connection(cloud_ec2.EC2Connection):
-    def request(self, *args, **kwargs):
-        params = kwargs.get('params')
-        if (params is not None) and (params.get('Action') == 'RunInstances'):
-            for key in params.keys():
-                if key.startswith('SecurityGroup.'):
-                    new_key = key.replace('Group.', 'GroupId.', 1)
-                    params[new_key] = params.pop(key).id
-            kwargs['params'] = params
-        return super(ANMEC2Connection, self).request(*args, **kwargs)
-class ANMEC2NodeDriver(cloud_ec2.EC2NodeDriver):
-    connectionCls = ANMEC2Connection
-class ComputeNodeDriver(BaseComputeNodeDriver):
-    """Compute node driver wrapper for EC2.
-    This translates cloud driver requests to EC2's specific parameters.
-    """
-### End monkeypatch
-    SEARCH_CACHE = {}
-    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
-                 driver_class=DEFAULT_DRIVER):
-        # We need full lists of keys up front because these loops modify
-        # dictionaries in-place.
-        for key in list_kwargs.keys():
-            list_kwargs[key.replace('_', ':')] = list_kwargs.pop(key)
-        self.tags = {key[4:]: value
-                     for key, value in list_kwargs.iteritems()
-                     if key.startswith('tag:')}
-        # Tags are assigned at instance creation time
-        create_kwargs.setdefault('ex_metadata', {})
-        create_kwargs['ex_metadata'].update(self.tags)
-        super(ComputeNodeDriver, self).__init__(
-            auth_kwargs, {'ex_filters': list_kwargs}, create_kwargs,
-            driver_class)
-    def _init_image_id(self, image_id):
-        return 'image', self.search_for(image_id, 'list_images', ex_owner='self')
-    def _init_security_groups(self, group_names):
-        return 'ex_security_groups', [
-            self.search_for(gname.strip(), 'ex_get_security_groups')
-            for gname in group_names.split(',')]
-    def _init_subnet_id(self, subnet_id):
-        return 'ex_subnet', self.search_for(subnet_id, 'ex_list_subnets')
-    create_cloud_name = staticmethod(arvados_node_fqdn)
-    def arvados_create_kwargs(self, size, arvados_node):
-        kw = {'name': self.create_cloud_name(arvados_node),
-                'ex_userdata': self._make_ping_url(arvados_node)}
-        # libcloud/ec2 disk sizes are in GB, Arvados/SLURM "scratch" value is in MB
-        scratch = int(size.scratch / 1000) + 1
-        if scratch > size.disk:
-            volsize = scratch - size.disk
-            if volsize > 16384:
-                # Must be 1-16384 for General Purpose SSD (gp2) devices
-                # https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_EbsBlockDevice.html
-                self._logger.warning("Requested EBS volume size %d is too large, capping size request to 16384 GB", volsize)
-                volsize = 16384
-            kw["ex_blockdevicemappings"] = [{
-                "DeviceName": "/dev/xvdt",
-                "Ebs": {
-                    "DeleteOnTermination": True,
-                    "VolumeSize": volsize,
-                    "VolumeType": "gp2"
-                }}]
-        if size.preemptible:
-            # Request a Spot instance for this node
-            kw['ex_spot_market'] = True
-        return kw
-    def sync_node(self, cloud_node, arvados_node):
-        self.real.ex_create_tags(cloud_node,
-                                 {'Name': arvados_node_fqdn(arvados_node)})
-    def create_node(self, size, arvados_node):
-        # Set up tag indicating the Arvados assigned Cloud Size id.
-        self.create_kwargs['ex_metadata'].update({'arvados_node_size': size.id})
-        return super(ComputeNodeDriver, self).create_node(size, arvados_node)
-    def list_nodes(self):
-        # Need to populate Node.size
-        nodes = super(ComputeNodeDriver, self).list_nodes()
-        for n in nodes:
-            if not n.size:
-                n.size = self.sizes()[n.extra["instance_type"]]
-            n.extra['arvados_node_size'] = n.extra.get('tags', {}).get('arvados_node_size') or n.size.id
-        return nodes
-    @classmethod
-    def node_fqdn(cls, node):
-        return node.name
-    @classmethod
-    def node_start_time(cls, node):
-        time_str = node.extra['launch_time'].split('.', 2)[0] + 'UTC'
-        return time.mktime(time.strptime(
-                time_str,'%Y-%m-%dT%H:%M:%S%Z')) - time.timezone
-    @classmethod
-    def node_id(cls, node):
-        return node.id
diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py
deleted file mode 100644 (file)
index 23a1017..0000000
+++ /dev/null
@@ -1,181 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import functools
-import json
-import time
-import libcloud.compute.providers as cloud_provider
-import libcloud.compute.types as cloud_types
-from . import BaseComputeNodeDriver
-from .. import arvados_node_fqdn, arvados_timestamp, ARVADOS_TIMEFMT
-class ComputeNodeDriver(BaseComputeNodeDriver):
-    """Compute node driver wrapper for GCE
-    This translates cloud driver requests to GCE's specific parameters.
-    """
-    DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.GCE)
-    SEARCH_CACHE = {}
-    def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
-                 driver_class=DEFAULT_DRIVER):
-        list_kwargs = list_kwargs.copy()
-        tags_str = list_kwargs.pop('tags', '')
-        if not tags_str.strip():
-            self.node_tags = frozenset()
-        else:
-            self.node_tags = frozenset(t.strip() for t in tags_str.split(','))
-        create_kwargs = create_kwargs.copy()
-        create_kwargs.setdefault('external_ip', None)
-        create_kwargs.setdefault('ex_metadata', {})
-        self._project = auth_kwargs.get("project")
-        super(ComputeNodeDriver, self).__init__(
-            auth_kwargs, list_kwargs, create_kwargs,
-            driver_class)
-        self._disktype_links = {dt.name: self._object_link(dt)
-                                for dt in self.real.ex_list_disktypes()}
-    @staticmethod
-    def _object_link(cloud_object):
-        return cloud_object.extra.get('selfLink')
-    def _init_image(self, image_name):
-        return 'image', self.search_for(
-            image_name, 'list_images', self._name_key, ex_project=self._project)
-    def _init_network(self, network_name):
-        return 'ex_network', self.search_for(
-            network_name, 'ex_list_networks', self._name_key)
-    def _init_service_accounts(self, service_accounts_str):
-        return 'ex_service_accounts', json.loads(service_accounts_str)
-    def _init_ssh_key(self, filename):
-        # SSH keys are delivered to GCE nodes via ex_metadata: see
-        # http://stackoverflow.com/questions/26752617/creating-sshkeys-for-gce-instance-using-libcloud
-        with open(filename) as ssh_file:
-            self.create_kwargs['ex_metadata']['sshKeys'] = (
-                'root:' + ssh_file.read().strip())
-    def create_cloud_name(self, arvados_node):
-        uuid_parts = arvados_node['uuid'].split('-', 2)
-        return 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
-    def arvados_create_kwargs(self, size, arvados_node):
-        name = self.create_cloud_name(arvados_node)
-        if size.scratch > 375000:
-            self._logger.warning("Requested %d MB scratch space, but GCE driver currently only supports attaching a single 375 GB disk.", size.scratch)
-        disks = [
-            {'autoDelete': True,
-             'boot': True,
-             'deviceName': name,
-             'initializeParams':
-                 {'diskName': name,
-                  'diskType': self._disktype_links['pd-standard'],
-                  'sourceImage': self._object_link(self.create_kwargs['image']),
-                  },
-             'type': 'PERSISTENT',
-             },
-            {'autoDelete': True,
-             'boot': False,
-             # Boot images rely on this device name to find the SSD.
-             # Any change must be coordinated in the image.
-             'deviceName': 'tmp',
-             'initializeParams':
-                 {'diskType': self._disktype_links['local-ssd'],
-                  },
-             'type': 'SCRATCH',
-             },
-            ]
-        result = {'name': name,
-                  'ex_metadata': self.create_kwargs['ex_metadata'].copy(),
-                  'ex_tags': list(self.node_tags),
-                  'ex_disks_gce_struct': disks,
-                  }
-        result['ex_metadata'].update({
-            'arvados_node_size': size.id,
-            'arv-ping-url': self._make_ping_url(arvados_node),
-            'booted_at': time.strftime(ARVADOS_TIMEFMT, time.gmtime()),
-            'hostname': arvados_node_fqdn(arvados_node),
-        })
-        return result
-    def list_nodes(self):
-        # The GCE libcloud driver only supports filtering node lists by zone.
-        # Do our own filtering based on tag list.
-        nodelist = [node for node in
-                    super(ComputeNodeDriver, self).list_nodes()
-                    if self.node_tags.issubset(node.extra.get('tags', []))]
-        for node in nodelist:
-            # As of 0.18, the libcloud GCE driver sets node.size to the size's name.
-            # It's supposed to be the actual size object.  Check that it's not,
-            # and monkeypatch the results when that's the case.
-            if not hasattr(node.size, 'id'):
-                node.size = self.sizes()[node.size]
-            # Get arvados-assigned cloud size id
-            node.extra['arvados_node_size'] = node.extra.get('metadata', {}).get('arvados_node_size') or node.size.id
-        return nodelist
-    @classmethod
-    def _find_metadata(cls, metadata_items, key):
-        # Given a list of two-item metadata dictonaries, return the one with
-        # the named key.  Raise KeyError if not found.
-        try:
-            return next(data_dict for data_dict in metadata_items
-                        if data_dict.get('key') == key)
-        except StopIteration:
-            raise KeyError(key)
-    @classmethod
-    def _get_metadata(cls, metadata_items, key, *default):
-        try:
-            return cls._find_metadata(metadata_items, key)['value']
-        except KeyError:
-            if default:
-                return default[0]
-            raise
-    def sync_node(self, cloud_node, arvados_node):
-        # Update the cloud node record to ensure we have the correct metadata
-        # fingerprint.
-        cloud_node = self.real.ex_get_node(cloud_node.name, cloud_node.extra['zone'])
-        # We can't store the FQDN on the name attribute or anything like it,
-        # because (a) names are static throughout the node's life (so FQDN
-        # isn't available because we don't know it at node creation time) and
-        # (b) it can't contain dots.  Instead stash it in metadata.
-        hostname = arvados_node_fqdn(arvados_node)
-        metadata_req = cloud_node.extra['metadata'].copy()
-        metadata_items = metadata_req.setdefault('items', [])
-        try:
-            self._find_metadata(metadata_items, 'hostname')['value'] = hostname
-        except KeyError:
-            metadata_items.append({'key': 'hostname', 'value': hostname})
-        self.real.ex_set_node_metadata(cloud_node, metadata_items)
-    @classmethod
-    def node_fqdn(cls, node):
-        # See sync_node comment.
-        return cls._get_metadata(node.extra['metadata'].get('items', []),
-                                 'hostname', '')
-    @classmethod
-    def node_start_time(cls, node):
-        try:
-            return arvados_timestamp(cls._get_metadata(
-                    node.extra['metadata']['items'], 'booted_at'))
-        except KeyError:
-            return 0
-    @classmethod
-    def node_id(cls, node):
-        return node.id
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
deleted file mode 100644 (file)
index 4857e89..0000000
+++ /dev/null
@@ -1,184 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import ConfigParser
-import importlib
-import logging
-import sys
-import arvados
-import httplib2
-import pykka
-from apiclient import errors as apierror
-from .baseactor import BaseNodeManagerActor
-from functools import partial
-from libcloud.common.types import LibcloudError
-from libcloud.common.exceptions import BaseHTTPError
-# IOError is the base class for socket.error, ssl.SSLError, and friends.
-# It seems like it hits the sweet spot for operations we want to retry:
-# it's low-level, but unlikely to catch code bugs.
-actor_class = BaseNodeManagerActor
-class NodeManagerConfig(ConfigParser.SafeConfigParser):
-    """Node Manager Configuration class.
-    This a standard Python ConfigParser, with additional helper methods to
-    create objects instantiated with configuration information.
-    """
-    LOGGING_NONLEVELS = frozenset(['file'])
-    def __init__(self, *args, **kwargs):
-        # Can't use super() because SafeConfigParser is an old-style class.
-        ConfigParser.SafeConfigParser.__init__(self, *args, **kwargs)
-        for sec_name, settings in {
-            'Arvados': {'insecure': 'no',
-                        'timeout': '15',
-                        'jobs_queue': 'yes',
-                        'slurm_queue': 'yes'
-                    },
-            'Daemon': {'min_nodes': '0',
-                       'max_nodes': '1',
-                       'poll_time': '60',
-                       'cloudlist_poll_time': '0',
-                       'nodelist_poll_time': '0',
-                       'wishlist_poll_time': '0',
-                       'max_poll_time': '300',
-                       'poll_stale_after': '600',
-                       'max_total_price': '0',
-                       'boot_fail_after': str(sys.maxint),
-                       'node_stale_after': str(60 * 60 * 2),
-                       'watchdog': '600',
-                       'node_mem_scaling': '0.95',
-                       'consecutive_idle_count': '2'},
-            'Manage': {'address': '',
-                       'port': '-1',
-                       'ManagementToken': ''},
-            'Logging': {'file': '/dev/stderr',
-                        'level': 'WARNING'}
-        }.iteritems():
-            if not self.has_section(sec_name):
-                self.add_section(sec_name)
-            for opt_name, value in settings.iteritems():
-                if not self.has_option(sec_name, opt_name):
-                    self.set(sec_name, opt_name, value)
-    def get_section(self, section, transformers={}, default_transformer=None):
-        transformer_map = {
-            str: self.get,
-            int: self.getint,
-            bool: self.getboolean,
-            float: self.getfloat,
-        }
-        result = self._dict()
-        for key, value in self.items(section):
-            transformer = None
-            if transformers.get(key) in transformer_map:
-                transformer = partial(transformer_map[transformers[key]], section)
-            elif default_transformer in transformer_map:
-                transformer = partial(transformer_map[default_transformer], section)
-            if transformer is not None:
-                try:
-                    value = transformer(key)
-                except (TypeError, ValueError):
-                    pass
-            result[key] = value
-        return result
-    def log_levels(self):
-        return {key: getattr(logging, self.get('Logging', key).upper())
-                for key in self.options('Logging')
-                if key not in self.LOGGING_NONLEVELS}
-    def dispatch_classes(self):
-        mod_name = 'arvnodeman.computenode.dispatch'
-        if self.has_option('Daemon', 'dispatcher'):
-            mod_name = '{}.{}'.format(mod_name,
-                                      self.get('Daemon', 'dispatcher'))
-        module = importlib.import_module(mod_name)
-        return (module.ComputeNodeSetupActor,
-                module.ComputeNodeShutdownActor,
-                module.ComputeNodeUpdateActor,
-                module.ComputeNodeMonitorActor)
-    def new_arvados_client(self):
-        if self.has_option('Daemon', 'certs_file'):
-            certs_file = self.get('Daemon', 'certs_file')
-        else:
-            certs_file = None
-        insecure = self.getboolean('Arvados', 'insecure')
-        http = httplib2.Http(timeout=self.getint('Arvados', 'timeout'),
-                             ca_certs=certs_file,
-                             disable_ssl_certificate_validation=insecure)
-        return arvados.api(version='v1',
-                           host=self.get('Arvados', 'host'),
-                           token=self.get('Arvados', 'token'),
-                           insecure=insecure,
-                           http=http)
-    def new_cloud_client(self):
-        module = importlib.import_module('arvnodeman.computenode.driver.' +
-                                         self.get('Cloud', 'provider'))
-        driver_class = module.ComputeNodeDriver.DEFAULT_DRIVER
-        if self.has_option('Cloud', 'driver_class'):
-            d = self.get('Cloud', 'driver_class').split('.')
-            mod = '.'.join(d[:-1])
-            cls = d[-1]
-            driver_class = importlib.import_module(mod).__dict__[cls]
-        auth_kwargs = self.get_section('Cloud Credentials')
-        if 'timeout' in auth_kwargs:
-            auth_kwargs['timeout'] = int(auth_kwargs['timeout'])
-        return module.ComputeNodeDriver(auth_kwargs,
-                                        self.get_section('Cloud List'),
-                                        self.get_section('Cloud Create'),
-                                        driver_class=driver_class)
-    def node_sizes(self):
-        """Finds all acceptable NodeSizes for our installation.
-        Returns a list of (NodeSize, kwargs) pairs for each NodeSize object
-        returned by libcloud that matches a size listed in our config file.
-        """
-        all_sizes = self.new_cloud_client().list_sizes()
-        size_kwargs = {}
-        section_types = {
-            'instance_type': str,
-            'price': float,
-            'preemptible': bool,
-        }
-        for sec_name in self.sections():
-            sec_words = sec_name.split(None, 2)
-            if sec_words[0] != 'Size':
-                continue
-            size_spec = self.get_section(sec_name, section_types, int)
-            if 'preemptible' not in size_spec:
-                size_spec['preemptible'] = False
-            if 'instance_type' not in size_spec:
-                # Assume instance type is Size name if missing
-                size_spec['instance_type'] = sec_words[1]
-            size_spec['id'] = sec_words[1]
-            size_kwargs[sec_words[1]] = size_spec
-        # EC2 node sizes are identified by id. GCE sizes are identified by name.
-        matching_sizes = []
-        for size in all_sizes:
-            matching_sizes += [
-                (size, size_kwargs[s]) for s in size_kwargs
-                if size_kwargs[s]['instance_type'] == size.id
-                or size_kwargs[s]['instance_type'] == size.name
-            ]
-        return matching_sizes
-    def shutdown_windows(self):
-        return [float(n)
-                for n in self.get('Cloud', 'shutdown_windows').split(',')]
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
deleted file mode 100644 (file)
index 1edf4dc..0000000
+++ /dev/null
@@ -1,583 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import functools
-import logging
-import time
-import pykka
-from . import computenode as cnode
-from . import status
-from .computenode import dispatch
-from .config import actor_class
-class _ComputeNodeRecord(object):
-    def __init__(self, actor=None, cloud_node=None, arvados_node=None,
-                 assignment_time=float('-inf')):
-        self.actor = actor
-        self.cloud_node = cloud_node
-        self.arvados_node = arvados_node
-        self.assignment_time = assignment_time
-        self.shutdown_actor = None
-class _BaseNodeTracker(object):
-    def __init__(self):
-        self.nodes = {}
-        self.orphans = {}
-    # Proxy the methods listed below to self.nodes.
-    def _proxy_method(name):
-        method = getattr(dict, name)
-        @functools.wraps(method, ('__name__', '__doc__'))
-        def wrapper(self, *args, **kwargs):
-            return method(self.nodes, *args, **kwargs)
-        return wrapper
-    for _method_name in ['__contains__', '__getitem__', '__len__', 'get']:
-        locals()[_method_name] = _proxy_method(_method_name)
-    def record_key(self, record):
-        return self.item_key(getattr(record, self.RECORD_ATTR))
-    def add(self, record):
-        self.nodes[self.record_key(record)] = record
-    def update_record(self, key, item):
-        setattr(self.nodes[key], self.RECORD_ATTR, item)
-    def update_from(self, response):
-        unseen = set(self.nodes.iterkeys())
-        for item in response:
-            key = self.item_key(item)
-            if key in unseen:
-                unseen.remove(key)
-                self.update_record(key, item)
-            else:
-                yield key, item
-        self.orphans = {key: self.nodes.pop(key) for key in unseen}
-    def unpaired(self):
-        return (record for record in self.nodes.itervalues()
-                if getattr(record, self.PAIR_ATTR) is None)
-class _CloudNodeTracker(_BaseNodeTracker):
-    RECORD_ATTR = 'cloud_node'
-    PAIR_ATTR = 'arvados_node'
-    item_key = staticmethod(lambda cloud_node: cloud_node.id)
-class _ArvadosNodeTracker(_BaseNodeTracker):
-    RECORD_ATTR = 'arvados_node'
-    PAIR_ATTR = 'cloud_node'
-    item_key = staticmethod(lambda arvados_node: arvados_node['uuid'])
-    def find_stale_node(self, stale_time):
-        # Try to select a stale node record that have an assigned slot first
-        for record in sorted(self.nodes.itervalues(),
-                             key=lambda r: r.arvados_node['slot_number'],
-                             reverse=True):
-            node = record.arvados_node
-            if (not cnode.timestamp_fresh(cnode.arvados_node_mtime(node),
-                                          stale_time) and
-                  not cnode.timestamp_fresh(record.assignment_time,
-                                            stale_time)):
-                return node
-        return None
-class NodeManagerDaemonActor(actor_class):
-    """Node Manager daemon.
-    This actor subscribes to all information polls about cloud nodes,
-    Arvados nodes, and the job queue.  It creates a ComputeNodeMonitorActor
-    for every cloud node, subscribing them to poll updates
-    appropriately.  It creates and destroys cloud nodes based on job queue
-    demand, and stops the corresponding ComputeNode actors when their work
-    is done.
-    """
-    def __init__(self, server_wishlist_actor, arvados_nodes_actor,
-                 cloud_nodes_actor, cloud_update_actor, timer_actor,
-                 arvados_factory, cloud_factory,
-                 shutdown_windows, server_calculator,
-                 min_nodes, max_nodes,
-                 poll_stale_after=600,
-                 boot_fail_after=1800,
-                 node_stale_after=7200,
-                 node_setup_class=dispatch.ComputeNodeSetupActor,
-                 node_shutdown_class=dispatch.ComputeNodeShutdownActor,
-                 node_actor_class=dispatch.ComputeNodeMonitorActor,
-                 max_total_price=0,
-                 consecutive_idle_count=1):
-        super(NodeManagerDaemonActor, self).__init__()
-        self._node_setup = node_setup_class
-        self._node_shutdown = node_shutdown_class
-        self._node_actor = node_actor_class
-        self._cloud_updater = cloud_update_actor
-        self._timer = timer_actor
-        self._new_arvados = arvados_factory
-        self._new_cloud = cloud_factory
-        self._cloud_driver = self._new_cloud()
-        self._later = self.actor_ref.tell_proxy()
-        self.shutdown_windows = shutdown_windows
-        self.server_calculator = server_calculator
-        self.min_cloud_size = self.server_calculator.cheapest_size()
-        self.min_nodes = min_nodes
-        self.max_nodes = max_nodes
-        self.node_quota = max_nodes
-        self.max_total_price = max_total_price
-        self.poll_stale_after = poll_stale_after
-        self.boot_fail_after = boot_fail_after
-        self.node_stale_after = node_stale_after
-        self.consecutive_idle_count = consecutive_idle_count
-        self.last_polls = {}
-        for poll_name in ['server_wishlist', 'arvados_nodes', 'cloud_nodes']:
-            poll_actor = locals()[poll_name + '_actor']
-            poll_actor.subscribe(getattr(self._later, 'update_' + poll_name))
-            setattr(self, '_{}_actor'.format(poll_name), poll_actor)
-            self.last_polls[poll_name] = -self.poll_stale_after
-        self.cloud_nodes = _CloudNodeTracker()
-        self.arvados_nodes = _ArvadosNodeTracker()
-        self.booting = {}       # Actor IDs to ComputeNodeSetupActors
-        self.sizes_booting = {} # Actor IDs to node size
-    def on_start(self):
-        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
-        self._logger.debug("Daemon started")
-    def _update_poll_time(self, poll_key):
-        self.last_polls[poll_key] = time.time()
-    def _pair_nodes(self, node_record, arvados_node):
-        self._logger.info("Cloud node %s is now paired with Arvados node %s with hostname %s",
-                          node_record.cloud_node.name, arvados_node['uuid'], arvados_node['hostname'])
-        self._arvados_nodes_actor.subscribe_to(
-            arvados_node['uuid'], node_record.actor.update_arvados_node)
-        node_record.arvados_node = arvados_node
-        self.arvados_nodes.add(node_record)
-    def _new_node(self, cloud_node):
-        start_time = self._cloud_driver.node_start_time(cloud_node)
-        shutdown_timer = cnode.ShutdownTimer(start_time,
-                                             self.shutdown_windows)
-        actor = self._node_actor.start(
-            cloud_node=cloud_node,
-            cloud_node_start_time=start_time,
-            shutdown_timer=shutdown_timer,
-            update_actor=self._cloud_updater,
-            timer_actor=self._timer,
-            arvados_node=None,
-            poll_stale_after=self.poll_stale_after,
-            node_stale_after=self.node_stale_after,
-            cloud_client=self._cloud_driver,
-            boot_fail_after=self.boot_fail_after,
-            consecutive_idle_count=self.consecutive_idle_count)
-        actorTell = actor.tell_proxy()
-        actorTell.subscribe(self._later.node_can_shutdown)
-        self._cloud_nodes_actor.subscribe_to(cloud_node.id,
-                                             actorTell.update_cloud_node)
-        record = _ComputeNodeRecord(actor.proxy(), cloud_node)
-        return record
-    def _register_cloud_node(self, node):
-        rec = self.cloud_nodes.get(node.id)
-        if rec is None:
-            self._logger.info("Registering new cloud node %s", node.id)
-            record = self._new_node(node)
-            self.cloud_nodes.add(record)
-        else:
-            rec.cloud_node = node
-    def update_cloud_nodes(self, nodelist):
-        self._update_poll_time('cloud_nodes')
-        for _, node in self.cloud_nodes.update_from(nodelist):
-            self._register_cloud_node(node)
-        self.try_pairing()
-        for record in self.cloud_nodes.orphans.itervalues():
-            if record.shutdown_actor:
-                try:
-                    record.shutdown_actor.stop()
-                except pykka.ActorDeadError:
-                    pass
-                record.shutdown_actor = None
-            # A recently booted node is a node that successfully completed the
-            # setup actor but has not yet appeared in the cloud node list.
-            # This will have the tag _nodemanager_recently_booted on it, which
-            # means (if we're not shutting it down) we want to put it back into
-            # the cloud node list.  Once it really appears in the cloud list,
-            # the object in record.cloud_node will be replaced by a new one
-            # that lacks the "_nodemanager_recently_booted" tag.
-            if hasattr(record.cloud_node, "_nodemanager_recently_booted"):
-                self.cloud_nodes.add(record)
-            else:
-                # Node disappeared from the cloud node list. If it's paired,
-                # remove its idle time counter.
-                if record.arvados_node:
-                    status.tracker.idle_out(record.arvados_node.get('hostname'))
-                # Stop the monitor actor if necessary and forget about the node.
-                if record.actor:
-                    try:
-                        record.actor.stop()
-                    except pykka.ActorDeadError:
-                        pass
-                    record.actor = None
-                record.cloud_node = None
-    def _register_arvados_node(self, key, arv_node):
-        self._logger.info("Registering new Arvados node %s", key)
-        record = _ComputeNodeRecord(arvados_node=arv_node)
-        self.arvados_nodes.add(record)
-    def update_arvados_nodes(self, nodelist):
-        self._update_poll_time('arvados_nodes')
-        for key, node in self.arvados_nodes.update_from(nodelist):
-            self._register_arvados_node(key, node)
-        self.try_pairing()
-    def try_pairing(self):
-        for record in self.cloud_nodes.unpaired():
-            for arv_rec in self.arvados_nodes.unpaired():
-                if record.actor is not None and record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
-                    self._pair_nodes(record, arv_rec.arvados_node)
-                    break
-    def _nodes_booting(self, size):
-        s = sum(1
-                for c in self.booting.iterkeys()
-                if size is None or self.sizes_booting[c].id == size.id)
-        return s
-    def _node_states(self, size):
-        proxy_states = []
-        states = []
-        for rec in self.cloud_nodes.nodes.itervalues():
-            if size is None or rec.cloud_node.size.id == size.id:
-                if rec.shutdown_actor is None and rec.actor is not None:
-                    proxy_states.append(rec.actor.get_state())
-                else:
-                    states.append("shutdown")
-        return states + pykka.get_all(proxy_states)
-    def _update_tracker(self):
-        updates = {
-            k: 0
-            for k in status.tracker.keys()
-            if k.startswith('nodes_')
-        }
-        for s in self._node_states(size=None):
-            updates.setdefault('nodes_'+s, 0)
-            updates['nodes_'+s] += 1
-        updates['nodes_wish'] = len(self.last_wishlist)
-        updates['node_quota'] = self.node_quota
-        status.tracker.update(updates)
-    def _state_counts(self, size):
-        states = self._node_states(size)
-        counts = {
-            "booting": self._nodes_booting(size),
-            "unpaired": 0,
-            "busy": 0,
-            "idle": 0,
-            "fail": 0,
-            "down": 0,
-            "shutdown": 0
-        }
-        for s in states:
-            counts[s] = counts[s] + 1
-        return counts
-    def _nodes_up(self, counts):
-        up = counts["booting"] + counts["unpaired"] + counts["idle"] + counts["busy"]
-        return up
-    def _total_price(self):
-        cost = 0
-        cost += sum(self.sizes_booting[c].price
-                    for c in self.booting.iterkeys())
-        cost += sum(c.cloud_node.size.price
-                    for c in self.cloud_nodes.nodes.itervalues())
-        return cost
-    def _size_wishlist(self, size):
-        return sum(1 for c in self.last_wishlist if c.id == size.id)
-    def _nodes_wanted(self, size):
-        total_node_count = self._nodes_booting(None) + len(self.cloud_nodes)
-        under_min = self.min_nodes - total_node_count
-        over_max = total_node_count - self.node_quota
-        total_price = self._total_price()
-        counts = self._state_counts(size)
-        up_count = self._nodes_up(counts)
-        busy_count = counts["busy"]
-        wishlist_count = self._size_wishlist(size)
-        self._logger.info("%s: wishlist %i, up %i (booting %i, unpaired %i, idle %i, busy %i), down %i, shutdown %i", size.id,
-                          wishlist_count,
-                          up_count,
-                          counts["booting"],
-                          counts["unpaired"],
-                          counts["idle"],
-                          busy_count,
-                          counts["down"]+counts["fail"],
-                          counts["shutdown"])
-        if over_max >= 0:
-            return -over_max
-        elif under_min > 0 and size.id == self.min_cloud_size.id:
-            return under_min
-        wanted = wishlist_count - (up_count - busy_count)
-        if wanted > 0 and self.max_total_price and ((total_price + (size.price*wanted)) > self.max_total_price):
-            can_boot = int((self.max_total_price - total_price) / size.price)
-            if can_boot == 0:
-                self._logger.info("Not booting %s (price %s) because with it would exceed max_total_price of %s (current total_price is %s)",
-                                  size.id, size.price, self.max_total_price, total_price)
-            return can_boot
-        else:
-            return wanted
-    def _nodes_excess(self, size):
-        counts = self._state_counts(size)
-        up_count = self._nodes_up(counts)
-        if size.id == self.min_cloud_size.id:
-            up_count -= self.min_nodes
-        return up_count - (counts["busy"] + self._size_wishlist(size))
-    def update_server_wishlist(self, wishlist):
-        self._update_poll_time('server_wishlist')
-        requestable_nodes = self.node_quota - (self._nodes_booting(None) + len(self.cloud_nodes))
-        self.last_wishlist = wishlist[:requestable_nodes]
-        for size in reversed(self.server_calculator.cloud_sizes):
-            try:
-                nodes_wanted = self._nodes_wanted(size)
-                if nodes_wanted > 0:
-                    self._later.start_node(size)
-                elif (nodes_wanted < 0) and self.booting:
-                    self._later.stop_booting_node(size)
-            except Exception:
-                self._logger.exception("while calculating nodes wanted for size %s", getattr(size, "id", "(id not available)"))
-        try:
-            self._update_tracker()
-        except:
-            self._logger.exception("while updating tracker")
-    def _check_poll_freshness(orig_func):
-        """Decorator to inhibit a method when poll information is stale.
-        This decorator checks the timestamps of all the poll information the
-        daemon has received.  The decorated method is only called if none
-        of the timestamps are considered stale.
-        """
-        @functools.wraps(orig_func)
-        def wrapper(self, *args, **kwargs):
-            now = time.time()
-            if all(now - t < self.poll_stale_after
-                   for t in self.last_polls.itervalues()):
-                return orig_func(self, *args, **kwargs)
-            else:
-                return None
-        return wrapper
-    @_check_poll_freshness
-    def start_node(self, cloud_size):
-        nodes_wanted = self._nodes_wanted(cloud_size)
-        if nodes_wanted < 1:
-            return None
-        if not self.cancel_node_shutdown(cloud_size):
-            arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
-            self._logger.info("Want %i more %s nodes.  Booting a node.",
-                              nodes_wanted, cloud_size.id)
-            new_setup = self._node_setup.start(
-                timer_actor=self._timer,
-                arvados_client=self._new_arvados(),
-                arvados_node=arvados_node,
-                cloud_client=self._new_cloud(),
-                cloud_size=self.server_calculator.find_size(cloud_size.id))
-            self.booting[new_setup.actor_urn] = new_setup.proxy()
-            self.sizes_booting[new_setup.actor_urn] = cloud_size
-            if arvados_node is not None:
-                self.arvados_nodes[arvados_node['uuid']].assignment_time = (
-                    time.time())
-            new_setup.tell_proxy().subscribe(self._later.node_setup_finished)
-        if nodes_wanted > 1:
-            self._later.start_node(cloud_size)
-    def _get_actor_attrs(self, actor, *attr_names):
-        return pykka.get_all([getattr(actor, name) for name in attr_names])
-    def node_setup_finished(self, setup_proxy):
-        # Called when a SetupActor has completed.
-        cloud_node, arvados_node, error = self._get_actor_attrs(
-            setup_proxy, 'cloud_node', 'arvados_node', 'error')
-        setup_proxy.stop()
-        if cloud_node is None:
-            # If cloud_node is None then the node create wasn't successful.
-            if error == dispatch.QuotaExceeded:
-                # We've hit a quota limit, so adjust node_quota to stop trying to
-                # boot new nodes until the node count goes down.
-                self.node_quota = len(self.cloud_nodes)
-                self._logger.warning("After quota exceeded error setting node quota to %s", self.node_quota)
-        else:
-            # Node creation succeeded.  Update cloud node list.
-            cloud_node._nodemanager_recently_booted = True
-            self._register_cloud_node(cloud_node)
-            # Different quota policies may in force depending on the cloud
-            # provider, account limits, and the specific mix of nodes sizes
-            # that are already created.  If we are right at the quota limit,
-            # we want to probe to see if the last quota still applies or if we
-            # are allowed to create more nodes.
-            #
-            # For example, if the quota is actually based on core count, the
-            # quota might be 20 single-core machines or 10 dual-core machines.
-            # If we previously set node_quota to 10 dual core machines, but are
-            # now booting single core machines (actual quota 20), we want to
-            # allow the quota to expand so we don't get stuck at 10 machines
-            # forever.
-            if len(self.cloud_nodes) >= self.node_quota:
-                self.node_quota = len(self.cloud_nodes)+1
-                self._logger.warning("After successful boot setting node quota to %s", self.node_quota)
-        self.node_quota = min(self.node_quota, self.max_nodes)
-        del self.booting[setup_proxy.actor_ref.actor_urn]
-        del self.sizes_booting[setup_proxy.actor_ref.actor_urn]
-    @_check_poll_freshness
-    def stop_booting_node(self, size):
-        nodes_excess = self._nodes_excess(size)
-        if (nodes_excess < 1) or not self.booting:
-            return None
-        for key, node in self.booting.iteritems():
-            try:
-                if node and node.cloud_size.get().id == size.id and node.stop_if_no_cloud_node().get(2):
-                    del self.booting[key]
-                    del self.sizes_booting[key]
-                    if nodes_excess > 1:
-                        self._later.stop_booting_node(size)
-                    return
-            except pykka.Timeout:
-                pass
-    @_check_poll_freshness
-    def cancel_node_shutdown(self, size):
-        # Go through shutdown actors and see if there are any of the appropriate size that can be cancelled
-        for record in self.cloud_nodes.nodes.itervalues():
-            try:
-                if (record.shutdown_actor is not None and
-                    record.cloud_node.size.id == size.id and
-                    record.shutdown_actor.cancel_shutdown("Node size is in wishlist").get(2)):
-                        return True
-            except (pykka.ActorDeadError, pykka.Timeout) as e:
-                pass
-        return False
-    def _begin_node_shutdown(self, node_actor, cancellable):
-        cloud_node_obj = node_actor.cloud_node.get()
-        cloud_node_id = cloud_node_obj.id
-        record = self.cloud_nodes[cloud_node_id]
-        if record.shutdown_actor is not None:
-            return None
-        shutdown = self._node_shutdown.start(
-            timer_actor=self._timer, cloud_client=self._new_cloud(),
-            arvados_client=self._new_arvados(),
-            node_monitor=node_actor.actor_ref, cancellable=cancellable)
-        record.shutdown_actor = shutdown.proxy()
-        shutdown.tell_proxy().subscribe(self._later.node_finished_shutdown)
-    @_check_poll_freshness
-    def node_can_shutdown(self, node_actor):
-        try:
-            if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
-                self._begin_node_shutdown(node_actor, cancellable=True)
-            elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
-                # Node is unpaired, which means it probably exceeded its booting
-                # grace period without a ping, so shut it down so we can boot a new
-                # node in its place.
-                self._begin_node_shutdown(node_actor, cancellable=False)
-            elif node_actor.in_state('down', 'fail').get():
-                # Node is down and unlikely to come back.
-                self._begin_node_shutdown(node_actor, cancellable=False)
-        except pykka.ActorDeadError as e:
-            # The monitor actor sends shutdown suggestions every time the
-            # node's state is updated, and these go into the daemon actor's
-            # message queue.  It's possible that the node has already been shut
-            # down (which shuts down the node monitor actor).  In that case,
-            # this message is stale and we'll get ActorDeadError when we try to
-            # access node_actor.  Log the error.
-            self._logger.debug("ActorDeadError in node_can_shutdown: %s", e)
-    def node_finished_shutdown(self, shutdown_actor):
-        try:
-            cloud_node, success = self._get_actor_attrs(
-                shutdown_actor, 'cloud_node', 'success')
-        except pykka.ActorDeadError:
-            return
-        cloud_node_id = cloud_node.id
-        try:
-            shutdown_actor.stop()
-        except pykka.ActorDeadError:
-            pass
-        try:
-            record = self.cloud_nodes[cloud_node_id]
-        except KeyError:
-            # Cloud node was already removed from the cloud node list
-            # supposedly while the destroy_node call was finishing its
-            # job.
-            return
-        record.shutdown_actor = None
-        if not success:
-            return
-        # Shutdown was successful, so stop the monitor actor, otherwise it
-        # will keep offering the node as a candidate for shutdown.
-        record.actor.stop()
-        record.actor = None
-        # If the node went from being booted to being shut down without ever
-        # appearing in the cloud node list, it will have the
-        # _nodemanager_recently_booted tag, so get rid of it so that the node
-        # can be forgotten completely.
-        if hasattr(record.cloud_node, "_nodemanager_recently_booted"):
-            del record.cloud_node._nodemanager_recently_booted
-    def shutdown(self):
-        self._logger.info("Shutting down after signal.")
-        self.poll_stale_after = -1  # Inhibit starting/stopping nodes
-        # Shut down pollers
-        self._server_wishlist_actor.stop()
-        self._arvados_nodes_actor.stop()
-        self._cloud_nodes_actor.stop()
-        # Clear cloud node list
-        self.update_cloud_nodes([])
-        # Stop setup actors unless they are in the middle of setup.
-        setup_stops = {key: node.stop_if_no_cloud_node()
-                       for key, node in self.booting.iteritems()}
-        self.booting = {key: self.booting[key]
-                        for key in setup_stops if not setup_stops[key].get()}
-        self._later.await_shutdown()
-    def await_shutdown(self):
-        if self.booting:
-            self._timer.schedule(time.time() + 1, self._later.await_shutdown)
-        else:
-            self.stop()
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
deleted file mode 100644 (file)
index 7ca9c95..0000000
+++ /dev/null
@@ -1,255 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import logging
-import re
-import subprocess32 as subprocess
-import arvados.util
-from . import clientactor
-from .config import ARVADOS_ERRORS
-class ServerCalculator(object):
-    """Generate cloud server wishlists from an Arvados job queue.
-    Instantiate this class with a list of cloud node sizes you're willing to
-    use, plus keyword overrides from the configuration.  Then you can pass
-    job queues to servers_for_queue.  It will return a list of node sizes
-    that would best satisfy the jobs, choosing the cheapest size that
-    satisfies each job, and ignoring jobs that can't be satisfied.
-    """
-    class InvalidCloudSize(object):
-        """
-        Dummy CloudSizeWrapper-like class, to be used when a cloud node doesn't
-        have a recognizable arvados_node_size tag.
-        """
-        def __init__(self):
-            self.id = 'invalid'
-            self.name = 'invalid'
-            self.ram = 0
-            self.disk = 0
-            self.scratch = 0
-            self.cores = 0
-            self.bandwidth = 0
-            # price is multiplied by 1000 to get the node weight
-            # the maximum node weight is                  4294967280
-            # so use invalid node weight 4294967 * 1000 = 4294967000
-            self.price = 4294967
-            self.preemptible = False
-            self.extra = {}
-        def meets_constraints(self, **kwargs):
-            return False
-    class CloudSizeWrapper(object):
-        def __init__(self, real_size, node_mem_scaling, **kwargs):
-            self.real = real_size
-            for name in ['id', 'name', 'ram', 'disk', 'bandwidth', 'price',
-                         'extra']:
-                setattr(self, name, getattr(self.real, name))
-            self.cores = kwargs.pop('cores')
-            # libcloud disk sizes are in GB, Arvados/SLURM are in MB
-            # multiply by 1000 instead of 1024 to err on low side
-            if self.disk is None:
-                self.disk = 0
-            self.scratch = self.disk * 1000
-            self.ram = int(self.ram * node_mem_scaling)
-            self.preemptible = False
-            for name, override in kwargs.iteritems():
-                if name == 'instance_type': continue
-                if not hasattr(self, name):
-                    raise ValueError("unrecognized size field '%s'" % (name,))
-                setattr(self, name, override)
-            if self.price is None:
-                raise ValueError("Required field 'price' is None")
-        def meets_constraints(self, **kwargs):
-            for name, want_value in kwargs.iteritems():
-                have_value = getattr(self, name)
-                if (have_value != 0) and (have_value < want_value):
-                    return False
-            return True
-    def __init__(self, server_list, max_nodes=None, max_price=None,
-                 node_mem_scaling=0.95):
-        self.cloud_sizes = [self.CloudSizeWrapper(s, node_mem_scaling, **kws)
-                            for s, kws in server_list]
-        self.cloud_sizes.sort(key=lambda s: s.price)
-        self.max_nodes = max_nodes or float('inf')
-        self.max_price = max_price or float('inf')
-        self.logger = logging.getLogger('arvnodeman.jobqueue')
-        self.logger.info("Using cloud node sizes:")
-        for s in self.cloud_sizes:
-            self.logger.info(str(s.__dict__))
-    @staticmethod
-    def coerce_int(x, fallback):
-        try:
-            return int(x)
-        except (TypeError, ValueError):
-            return fallback
-    def cloud_size_for_constraints(self, constraints):
-        specified_size = constraints.get('instance_type')
-        want_value = lambda key: self.coerce_int(constraints.get(key), 0)
-        wants = {'cores': want_value('min_cores_per_node'),
-                 'ram': want_value('min_ram_mb_per_node'),
-                 'scratch': want_value('min_scratch_mb_per_node')}
-        # EC2 node sizes are identified by id. GCE sizes are identified by name.
-        for size in self.cloud_sizes:
-            if (size.meets_constraints(**wants) and
-                (specified_size is None or
-                    size.id == specified_size or size.name == specified_size)):
-                        return size
-        return None
-    def servers_for_queue(self, queue):
-        servers = []
-        unsatisfiable_jobs = {}
-        for job in queue:
-            constraints = job['runtime_constraints']
-            want_count = max(1, self.coerce_int(constraints.get('min_nodes'), 1))
-            cloud_size = self.cloud_size_for_constraints(constraints)
-            if cloud_size is None:
-                unsatisfiable_jobs[job['uuid']] = (
-                    "Constraints cannot be satisfied by any node type")
-            elif (want_count > self.max_nodes):
-                unsatisfiable_jobs[job['uuid']] = (
-                    "Job's min_nodes constraint is greater than the configured "
-                    "max_nodes (%d)" % self.max_nodes)
-            elif (want_count*cloud_size.price <= self.max_price):
-                servers.extend([cloud_size] * want_count)
-            else:
-                unsatisfiable_jobs[job['uuid']] = (
-                    "Job's price (%d) is above system's max_price "
-                    "limit (%d)" % (want_count*cloud_size.price, self.max_price))
-        return (servers, unsatisfiable_jobs)
-    def cheapest_size(self):
-        return self.cloud_sizes[0]
-    def find_size(self, sizeid):
-        for s in self.cloud_sizes:
-            if s.id == sizeid:
-                return s
-        return self.InvalidCloudSize()
-class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
-    """Actor to generate server wishlists from the job queue.
-    This actor regularly polls Arvados' job queue, and uses the provided
-    ServerCalculator to turn that into a list of requested node sizes.  That
-    list is sent to subscribers on every poll.
-    """
-    def __init__(self, client, timer_actor, server_calc,
-                 jobs_queue, slurm_queue, *args, **kwargs):
-        super(JobQueueMonitorActor, self).__init__(
-            client, timer_actor, *args, **kwargs)
-        self.jobs_queue = jobs_queue
-        self.slurm_queue = slurm_queue
-        self._calculator = server_calc
-    @staticmethod
-    def coerce_to_mb(x):
-        v, u = x[:-1], x[-1]
-        if u in ("M", "m"):
-            return int(v)
-        elif u in ("G", "g"):
-            return float(v) * 2**10
-        elif u in ("T", "t"):
-            return float(v) * 2**20
-        elif u in ("P", "p"):
-            return float(v) * 2**30
-        else:
-            return int(x)
-    def _send_request(self):
-        queuelist = []
-        if self.slurm_queue:
-            # cpus, memory, tempory disk space, reason, job name, feature constraints, priority
-            squeue_out = subprocess.check_output(["squeue", "--state=PENDING", "--noheader", "--format=%c|%m|%d|%r|%j|%f|%Q"])
-            for out in squeue_out.splitlines():
-                try:
-                    cpu, ram, disk, reason, jobname, features, priority = out.split("|", 6)
-                except ValueError:
-                    self._logger.warning("ignored malformed line in squeue output: %r", out)
-                    continue
-                if '-dz642-' not in jobname:
-                    continue
-                if not re.search(r'BadConstraints|ReqNodeNotAvail|Resources|Priority', reason):
-                    continue
-                for feature in features.split(','):
-                    m = re.match(r'instancetype=(.*)', feature)
-                    if not m:
-                        continue
-                    instance_type = m.group(1)
-                    # Ignore cpu/ram/scratch requirements, bring up
-                    # the requested node type.
-                    queuelist.append({
-                        "uuid": jobname,
-                        "runtime_constraints": {
-                            "instance_type": instance_type,
-                        },
-                        "priority": int(priority)
-                    })
-                    break
-                else:
-                    # No instance type specified. Choose a node type
-                    # to suit cpu/ram/scratch requirements.
-                    queuelist.append({
-                        "uuid": jobname,
-                        "runtime_constraints": {
-                            "min_cores_per_node": cpu,
-                            "min_ram_mb_per_node": self.coerce_to_mb(ram),
-                            "min_scratch_mb_per_node": self.coerce_to_mb(disk)
-                        },
-                        "priority": int(priority)
-                    })
-            queuelist.sort(key=lambda x: x.get('priority', 1), reverse=True)
-        if self.jobs_queue:
-            queuelist.extend(self._client.jobs().queue().execute()['items'])
-        return queuelist
-    def _got_response(self, queue):
-        server_list, unsatisfiable_jobs = self._calculator.servers_for_queue(queue)
-        # Cancel any job/container with unsatisfiable requirements, emitting
-        # a log explaining why.
-        for job_uuid, reason in unsatisfiable_jobs.iteritems():
-            try:
-                self._client.logs().create(body={
-                    'object_uuid': job_uuid,
-                    'event_type': 'stderr',
-                    'properties': {'text': reason},
-                }).execute()
-                # Cancel the job depending on its type
-                if arvados.util.container_uuid_pattern.match(job_uuid):
-                    subprocess.check_call(['scancel', '--name='+job_uuid])
-                elif arvados.util.job_uuid_pattern.match(job_uuid):
-                    self._client.jobs().cancel(uuid=job_uuid).execute()
-                else:
-                    raise Exception('Unknown job type')
-                self._logger.debug("Cancelled unsatisfiable job '%s'", job_uuid)
-            except Exception as error:
-                self._logger.error("Trying to cancel job '%s': %s",
-                                   job_uuid,
-                                   error)
-        self._logger.debug("Calculated wishlist: %s",
-                           ', '.join(s.id for s in server_list) or "(empty)")
-        return super(JobQueueMonitorActor, self)._got_response(server_list)
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
deleted file mode 100644 (file)
index 34ea9ad..0000000
+++ /dev/null
@@ -1,171 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import argparse
-import logging
-import signal
-import sys
-import time
-import daemon
-import pykka
-import libcloud
-from . import config as nmconfig
-from . import status
-from .baseactor import WatchdogActor
-from .daemon import NodeManagerDaemonActor
-from .jobqueue import JobQueueMonitorActor, ServerCalculator
-from .nodelist import ArvadosNodeListMonitorActor, CloudNodeListMonitorActor
-from .timedcallback import TimedCallBackActor
-from ._version import __version__
-node_daemon = None
-watchdog = None
-def abort(msg, code=1):
-    print("arvados-node-manager: " + msg)
-    sys.exit(code)
-def parse_cli(args):
-    parser = argparse.ArgumentParser(
-        prog='arvados-node-manager',
-        description="Dynamically allocate Arvados cloud compute nodes")
-    parser.add_argument(
-        '--version', action='version',
-        version="%s %s" % (sys.argv[0], __version__),
-        help='Print version and exit.')
-    parser.add_argument(
-        '--foreground', action='store_true', default=False,
-        help="Run in the foreground.  Don't daemonize.")
-    parser.add_argument(
-        '--config', help="Path to configuration file")
-    return parser.parse_args(args)
-def load_config(path):
-    if not path:
-        abort("No --config file specified", 2)
-    config = nmconfig.NodeManagerConfig()
-    try:
-        with open(path) as config_file:
-            config.readfp(config_file)
-    except (IOError, OSError) as error:
-        abort("Error reading configuration file {}: {}".format(path, error))
-    return config
-def setup_logging(path, level, **sublevels):
-    handler = logging.FileHandler(path)
-    handler.setFormatter(logging.Formatter(
-            '%(asctime)s %(name)s[%(process)d] %(levelname)s: %(message)s',
-            '%Y-%m-%d %H:%M:%S'))
-    root_logger = logging.getLogger()
-    root_logger.addHandler(handler)
-    root_logger.setLevel(level)
-    for logger_name, sublevel in sublevels.iteritems():
-        sublogger = logging.getLogger(logger_name)
-        sublogger.setLevel(sublevel)
-    return root_logger
-def build_server_calculator(config):
-    cloud_size_list = config.node_sizes()
-    if not cloud_size_list:
-        abort("No valid node sizes configured")
-    return ServerCalculator(cloud_size_list,
-                            config.getint('Daemon', 'max_nodes'),
-                            config.getfloat('Daemon', 'max_total_price'),
-                            config.getfloat('Daemon', 'node_mem_scaling'))
-def launch_pollers(config, server_calculator):
-    poll_time = config.getfloat('Daemon', 'poll_time')
-    max_poll_time = config.getint('Daemon', 'max_poll_time')
-    cloudlist_poll_time = config.getfloat('Daemon', 'cloudlist_poll_time') or poll_time
-    nodelist_poll_time = config.getfloat('Daemon', 'nodelist_poll_time') or poll_time
-    wishlist_poll_time = config.getfloat('Daemon', 'wishlist_poll_time') or poll_time
-    timer = TimedCallBackActor.start(poll_time / 10.0).tell_proxy()
-    cloud_node_poller = CloudNodeListMonitorActor.start(
-        config.new_cloud_client(), timer, server_calculator, cloudlist_poll_time, max_poll_time).tell_proxy()
-    arvados_node_poller = ArvadosNodeListMonitorActor.start(
-        config.new_arvados_client(), timer, nodelist_poll_time, max_poll_time).tell_proxy()
-    job_queue_poller = JobQueueMonitorActor.start(
-        config.new_arvados_client(), timer, server_calculator,
-        config.getboolean('Arvados', 'jobs_queue'),
-        config.getboolean('Arvados', 'slurm_queue'),
-        wishlist_poll_time, max_poll_time
-    ).tell_proxy()
-    return timer, cloud_node_poller, arvados_node_poller, job_queue_poller
-_caught_signals = {}
-def shutdown_signal(signal_code, frame):
-    current_count = _caught_signals.get(signal_code, 0)
-    _caught_signals[signal_code] = current_count + 1
-    if node_daemon is None:
-        pykka.ActorRegistry.stop_all()
-        sys.exit(-signal_code)
-    elif current_count == 0:
-        watchdog.stop()
-        node_daemon.shutdown()
-    elif current_count == 1:
-        pykka.ActorRegistry.stop_all()
-    else:
-        sys.exit(-signal_code)
-def main(args=None):
-    global node_daemon, watchdog
-    args = parse_cli(args)
-    config = load_config(args.config)
-    if not args.foreground:
-        daemon.DaemonContext().open()
-    for sigcode in [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]:
-        signal.signal(sigcode, shutdown_signal)
-    status.Server(config).start()
-    try:
-        root_logger = setup_logging(config.get('Logging', 'file'), **config.log_levels())
-        root_logger.info("%s %s started, libcloud %s", sys.argv[0], __version__, libcloud.__version__)
-        node_setup, node_shutdown, node_update, node_monitor = \
-            config.dispatch_classes()
-        server_calculator = build_server_calculator(config)
-        timer, cloud_node_poller, arvados_node_poller, job_queue_poller = \
-            launch_pollers(config, server_calculator)
-        cloud_node_updater = node_update.start(config.new_cloud_client, timer).tell_proxy()
-        node_daemon = NodeManagerDaemonActor.start(
-            job_queue_poller, arvados_node_poller, cloud_node_poller,
-            cloud_node_updater, timer,
-            config.new_arvados_client, config.new_cloud_client,
-            config.shutdown_windows(),
-            server_calculator,
-            config.getint('Daemon', 'min_nodes'),
-            config.getint('Daemon', 'max_nodes'),
-            config.getint('Daemon', 'poll_stale_after'),
-            config.getint('Daemon', 'boot_fail_after'),
-            config.getint('Daemon', 'node_stale_after'),
-            node_setup, node_shutdown, node_monitor,
-            max_total_price=config.getfloat('Daemon', 'max_total_price'),
-            consecutive_idle_count=config.getint('Daemon', 'consecutive_idle_count'),).tell_proxy()
-        watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
-                            cloud_node_poller.actor_ref,
-                            arvados_node_poller.actor_ref,
-                            job_queue_poller.actor_ref,
-                            node_daemon.actor_ref)
-        signal.pause()
-        daemon_stopped = node_daemon.actor_ref.actor_stopped.is_set
-        while not daemon_stopped():
-            time.sleep(1)
-    except Exception:
-        logging.exception("Uncaught exception during setup")
-    finally:
-        pykka.ActorRegistry.stop_all()
-if __name__ == '__main__':
-    main()
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
deleted file mode 100644 (file)
index 0abb3b3..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import subprocess32 as subprocess
-from . import clientactor
-from . import config
-import arvados.util
-class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
-    """Actor to poll the Arvados node list.
-    This actor regularly polls the list of Arvados node records,
-    augments it with the latest SLURM node info (`sinfo`), and sends
-    it to subscribers.
-    """
-    def is_common_error(self, exception):
-        return isinstance(exception, config.ARVADOS_ERRORS)
-    def _item_key(self, node):
-        return node['uuid']
-    def _send_request(self):
-        nodelist = arvados.util.list_all(self._client.nodes().list)
-        # node hostname, state
-        sinfo_out = subprocess.check_output(["sinfo", "--noheader", "--format=%n|%t|%f"])
-        nodestates = {}
-        nodefeatures = {}
-        for out in sinfo_out.splitlines():
-            try:
-                nodename, state, features = out.split("|", 3)
-            except ValueError:
-                continue
-            if state in ('alloc', 'alloc*',
-                         'comp',  'comp*',
-                         'mix',   'mix*',
-                         'drng',  'drng*'):
-                nodestates[nodename] = 'busy'
-            elif state in ('idle', 'fail'):
-                nodestates[nodename] = state
-            else:
-                nodestates[nodename] = 'down'
-            if features != "(null)":
-                nodefeatures[nodename] = features
-        for n in nodelist:
-            if n["slot_number"] and n["hostname"] and n["hostname"] in nodestates:
-                n["crunch_worker_state"] = nodestates[n["hostname"]]
-            else:
-                n["crunch_worker_state"] = 'down'
-            n["slurm_node_features"] = nodefeatures.get(n["hostname"], "")
-        return nodelist
-class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
-    """Actor to poll the cloud node list.
-    This actor regularly polls the cloud to get a list of running compute
-    nodes, and sends it to subscribers.
-    """
-    def __init__(self, client, timer_actor, server_calc, *args, **kwargs):
-        super(CloudNodeListMonitorActor, self).__init__(
-            client, timer_actor, *args, **kwargs)
-        self._calculator = server_calc
-    def is_common_error(self, exception):
-        return isinstance(exception, config.CLOUD_ERRORS)
-    def _item_key(self, node):
-        return node.id
-    def _send_request(self):
-        nodes = self._client.list_nodes()
-        for n in nodes:
-            # Replace the libcloud NodeSize object with compatible
-            # CloudSizeWrapper object which merges the size info reported from
-            # the cloud with size information from the configuration file.
-            n.size = self._calculator.find_size(n.extra['arvados_node_size'])
-        return nodes
diff --git a/services/nodemanager/arvnodeman/status.py b/services/nodemanager/arvnodeman/status.py
deleted file mode 100644 (file)
index 1e18996..0000000
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-from future import standard_library
-import http.server
-import time
-import json
-import logging
-import socketserver
-import threading
-from ._version import __version__
-_logger = logging.getLogger('status.Handler')
-class Server(socketserver.ThreadingMixIn, http.server.HTTPServer, object):
-    def __init__(self, config):
-        port = config.getint('Manage', 'port')
-        self.enabled = port >= 0
-        if not self.enabled:
-            _logger.warning("Management server disabled. "+
-                            "Use [Manage] config section to enable.")
-            return
-        self._config = config
-        self._tracker = tracker
-        self._tracker.update({'config_max_nodes': config.getint('Daemon', 'max_nodes')})
-        super(Server, self).__init__(
-            (config.get('Manage', 'address'), port), Handler)
-        self._thread = threading.Thread(target=self.serve_forever)
-        self._thread.daemon = True
-    def start(self):
-        if self.enabled:
-            self._thread.start()
-class Handler(http.server.BaseHTTPRequestHandler, object):
-    def do_GET(self):
-        if self.path == '/status.json':
-            self.send_response(200)
-            self.send_header('Content-type', 'application/json')
-            self.end_headers()
-            self.wfile.write(tracker.get_json())
-        elif self.path == '/_health/ping':
-            code, msg = self.check_auth()
-            if code != 200:
-              self.send_response(code)
-              self.wfile.write(msg)
-            else:
-              self.send_response(200)
-              self.send_header('Content-type', 'application/json')
-              self.end_headers()
-              self.wfile.write(json.dumps({"health":"OK"}))
-        else:
-            self.send_response(404)
-    def log_message(self, fmt, *args, **kwargs):
-        _logger.info(fmt, *args, **kwargs)
-    def check_auth(self):
-        mgmt_token = self.server._config.get('Manage', 'ManagementToken')
-        auth_header = self.headers.get('Authorization', None)
-        if mgmt_token == '':
-          return 404, "disabled"
-        elif auth_header == None:
-          return 401, "authorization required"
-        elif auth_header != 'Bearer '+mgmt_token:
-          return 403, "authorization error"
-        return 200, ""
-class Tracker(object):
-    def __init__(self):
-        self._mtx = threading.Lock()
-        self._latest = {
-            'list_nodes_errors': 0,
-            'create_node_errors': 0,
-            'destroy_node_errors': 0,
-            'boot_failures': 0,
-            'actor_exceptions': 0
-        }
-        self._version = {'Version' : __version__}
-        self._idle_nodes = {}
-    def get_json(self):
-        with self._mtx:
-            times = {'idle_times' : {}}
-            now = time.time()
-            for node, ts in self._idle_nodes.items():
-                times['idle_times'][node] = int(now - ts)
-            return json.dumps(
-                dict(dict(self._latest, **self._version), **times))
-    def keys(self):
-        with self._mtx:
-            return self._latest.keys()
-    def get(self, key):
-        with self._mtx:
-            return self._latest.get(key)
-    def update(self, updates):
-        with self._mtx:
-            self._latest.update(updates)
-    def counter_add(self, counter, value=1):
-        with self._mtx:
-            self._latest.setdefault(counter, 0)
-            self._latest[counter] += value
-    def idle_in(self, nodename):
-        with self._mtx:
-            if self._idle_nodes.get(nodename):
-                return
-            self._idle_nodes[nodename] = time.time()
-    def idle_out(self, nodename):
-        with self._mtx:
-            try:
-                del self._idle_nodes[nodename]
-            except KeyError:
-                pass
-tracker = Tracker()
diff --git a/services/nodemanager/arvnodeman/test/fake_driver.py b/services/nodemanager/arvnodeman/test/fake_driver.py
deleted file mode 100644 (file)
index 2a592f9..0000000
+++ /dev/null
@@ -1,226 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-import re
-import urllib
-import ssl
-import time
-from arvnodeman.computenode import ARVADOS_TIMEFMT
-from libcloud.compute.base import NodeSize, Node, NodeDriver, NodeState, NodeImage
-from libcloud.compute.drivers.gce import GCEDiskType
-from libcloud.common.exceptions import BaseHTTPError, RateLimitReachedError
-all_nodes = []
-create_calls = 0
-quota = 2
-class FakeDriver(NodeDriver):
-    def __init__(self, *args, **kwargs):
-        self.name = "FakeDriver"
-    def list_sizes(self, **kwargs):
-        return [NodeSize("Standard_D3", "Standard_D3", 3500, 200, 0, 0, self),
-                NodeSize("Standard_D4", "Standard_D4", 7000, 400, 0, 0, self)]
-    def list_nodes(self, **kwargs):
-        return all_nodes
-    def create_node(self, name=None,
-                    size=None,
-                    image=None,
-                    auth=None,
-                    ex_storage_account=None,
-                    ex_customdata=None,
-                    ex_resource_group=None,
-                    ex_user_name=None,
-                    ex_tags=None,
-                    ex_metadata=None,
-                    ex_network=None,
-                    ex_userdata=None):
-        global all_nodes, create_calls
-        create_calls += 1
-        nodeid = "node%i" % create_calls
-        if ex_tags is None:
-            ex_tags = {}
-        ex_tags.update({'arvados_node_size': size.id})
-        n = Node(nodeid, nodeid, NodeState.RUNNING, [], [], self, size=size, extra={"tags": ex_tags})
-        all_nodes.append(n)
-        if ex_customdata:
-            ping_url = re.search(r"echo '(.*)' > /var/tmp/arv-node-data/arv-ping-url", ex_customdata).groups(1)[0]
-        if ex_userdata:
-            ping_url = ex_userdata
-        elif ex_metadata:
-            ping_url = ex_metadata["arv-ping-url"]
-        ping_url += "&instance_id=" + nodeid
-        ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
-        ctx.verify_mode = ssl.CERT_NONE
-        f = urllib.urlopen(ping_url, "", context=ctx)
-        f.close()
-        return n
-    def destroy_node(self, cloud_node):
-        global all_nodes
-        all_nodes = [n for n in all_nodes if n.id != cloud_node.id]
-        return True
-    def get_image(self, img):
-        pass
-    def ex_create_tags(self, cloud_node, tags):
-        pass
-class QuotaDriver(FakeDriver):
-    def create_node(self, name=None,
-                    size=None,
-                    image=None,
-                    auth=None,
-                    ex_storage_account=None,
-                    ex_customdata=None,
-                    ex_resource_group=None,
-                    ex_user_name=None,
-                    ex_tags=None,
-                    ex_network=None):
-        global all_nodes, create_calls, quota
-        if len(all_nodes) >= quota:
-            raise BaseHTTPError(503, "Quota exceeded")
-        else:
-            return super(QuotaDriver, self).create_node(name=name,
-                    size=size,
-                    image=image,
-                    auth=auth,
-                    ex_storage_account=ex_storage_account,
-                    ex_customdata=ex_customdata,
-                    ex_resource_group=ex_resource_group,
-                    ex_user_name=ex_user_name,
-                    ex_tags=ex_tags,
-                    ex_network=ex_network)
-    def destroy_node(self, cloud_node):
-        global all_nodes, quota
-        all_nodes = [n for n in all_nodes if n.id != cloud_node.id]
-        if len(all_nodes) == 0:
-            quota = 4
-        return True
-class FailingDriver(FakeDriver):
-    def create_node(self, name=None,
-                    size=None,
-                    image=None,
-                    auth=None,
-                    ex_storage_account=None,
-                    ex_customdata=None,
-                    ex_resource_group=None,
-                    ex_user_name=None,
-                    ex_tags=None,
-                    ex_network=None):
-        raise Exception("nope")
-class RetryDriver(FakeDriver):
-    def create_node(self, name=None,
-                    size=None,
-                    image=None,
-                    auth=None,
-                    ex_storage_account=None,
-                    ex_customdata=None,
-                    ex_resource_group=None,
-                    ex_user_name=None,
-                    ex_tags=None,
-                    ex_network=None):
-        global create_calls
-        create_calls += 1
-        if create_calls < 2:
-            raise RateLimitReachedError(429, "Rate limit exceeded",
-                                        headers={'retry-after': '2'})
-        elif create_calls < 3:
-            raise BaseHTTPError(429, "Rate limit exceeded",
-                                {'retry-after': '1'})
-        else:
-            return super(RetryDriver, self).create_node(name=name,
-                    size=size,
-                    image=image,
-                    auth=auth,
-                    ex_storage_account=ex_storage_account,
-                    ex_customdata=ex_customdata,
-                    ex_resource_group=ex_resource_group,
-                    ex_user_name=ex_user_name,
-                    ex_tags=ex_tags,
-                    ex_network=ex_network)
-class FakeAwsDriver(FakeDriver):
-    def create_node(self, name=None,
-                    size=None,
-                    image=None,
-                    auth=None,
-                    ex_userdata=None,
-                    ex_metadata=None,
-                    ex_blockdevicemappings=None):
-        n = super(FakeAwsDriver, self).create_node(name=name,
-                                                      size=size,
-                                                      image=image,
-                                                      auth=auth,
-                                                      ex_metadata=ex_metadata,
-                                                      ex_userdata=ex_userdata)
-        n.extra = {
-            "launch_time": time.strftime(ARVADOS_TIMEFMT, time.gmtime())[:-1],
-            "tags" : {
-                "arvados_node_size": size.id
-            }
-        }
-        return n
-    def list_sizes(self, **kwargs):
-        return [NodeSize("m3.xlarge", "Extra Large Instance", 3500, 80, 0, 0, self),
-                NodeSize("m4.xlarge", "Extra Large Instance", 3500, 0, 0, 0, self),
-                NodeSize("m4.2xlarge", "Double Extra Large Instance", 7000, 0, 0, 0, self)]
-class FakeGceDriver(FakeDriver):
-    def create_node(self, name=None,
-                    size=None,
-                    image=None,
-                    auth=None,
-                    external_ip=None,
-                    ex_metadata=None,
-                    ex_tags=None,
-                    ex_disks_gce_struct=None):
-        n = super(FakeGceDriver, self).create_node(name=name,
-                                                   size=size,
-                                                   image=image,
-                                                   auth=auth,
-                                                   ex_metadata=ex_metadata)
-        n.extra = {
-            "metadata": {
-                "items": [{"key": k, "value": v} for k,v in ex_metadata.iteritems()],
-                "arvados_node_size": size.id
-            },
-            "zone": "fake"
-        }
-        return n
-    def list_images(self, ex_project=None):
-        return [NodeImage("fake_image_id", "fake_image_id", self)]
-    def list_sizes(self, **kwargs):
-        return [NodeSize("n1-standard-1", "Standard", 3750, None, 0, 0, self),
-                NodeSize("n1-standard-2", "Double standard", 7500, None, 0, 0, self)]
-    def ex_list_disktypes(self, zone=None):
-        return [GCEDiskType("pd-standard", "pd-standard", zone, self,
-                            extra={"selfLink": "pd-standard"}),
-                GCEDiskType("local-ssd", "local-ssd", zone, self,
-                            extra={"selfLink": "local-ssd"})]
-    def ex_get_node(self, name, zone=None):
-        global all_nodes
-        for n in all_nodes:
-            if n.id == name:
-                return n
-        return None
-    def ex_set_node_metadata(self, n, items):
-        n.extra["metadata"]["items"] = items
diff --git a/services/nodemanager/arvnodeman/timedcallback.py b/services/nodemanager/arvnodeman/timedcallback.py
deleted file mode 100644 (file)
index e7e3f25..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import heapq
-import time
-import pykka
-from .config import actor_class
-class TimedCallBackActor(actor_class):
-    """Send messages to other actors on a schedule.
-    Other actors can call the schedule() method to schedule delivery of a
-    message at a later time.  This actor runs the necessary event loop for
-    delivery.
-    """
-    def __init__(self, max_sleep=1, timefunc=None):
-        super(TimedCallBackActor, self).__init__()
-        self._proxy = self.actor_ref.tell_proxy()
-        self.messages = []
-        self.max_sleep = max_sleep
-        if timefunc is None:
-            self._timefunc = time.time
-        else:
-            self._timefunc = timefunc
-    def schedule(self, delivery_time, receiver, *args, **kwargs):
-        if not self.messages:
-            self._proxy.deliver()
-        heapq.heappush(self.messages, (delivery_time, receiver, args, kwargs))
-    def deliver(self):
-        if not self.messages:
-            return
-        til_next = self.messages[0][0] - self._timefunc()
-        if til_next <= 0:
-            t, receiver, args, kwargs = heapq.heappop(self.messages)
-            try:
-                receiver(*args, **kwargs)
-            except pykka.ActorDeadError:
-                pass
-        else:
-            time.sleep(min(til_next, self.max_sleep))
-        self._proxy.deliver()
diff --git a/services/nodemanager/bin/arvados-node-manager b/services/nodemanager/bin/arvados-node-manager
deleted file mode 100755 (executable)
index 72e0831..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-from arvnodeman.launcher import main
diff --git a/services/nodemanager/doc/azure.example.cfg b/services/nodemanager/doc/azure.example.cfg
deleted file mode 100644 (file)
index 8ba6801..0000000
+++ /dev/null
@@ -1,202 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-# Azure configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-# Management server listening address (default
-#address =
-# Management server port number (default -1, server is disabled)
-#port = 8989
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the SLURM dispatcher drains nodes
-# through SLURM before shutting them down.
-#dispatcher = slurm
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-# Upper limit on rate of spending (in $/hr), will not boot additional nodes
-# if total price of already running nodes meets or exceeds this threshold.
-# default 0 means no limit.
-max_total_price = 0
-# Poll Azure nodes and Arvados for new information every N seconds.
-poll_time = 60
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 300
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 600
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 1800
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-# Number of consecutive times a node must report as "idle" before it
-# will be considered eligible for shutdown.  Node status is checked
-# each poll period, and node can go idle at any point during a poll
-# period (meaning a node could be reported as idle that has only been
-# idle for 1 second).  With a 60 second poll period, three consecutive
-# status updates of "idle" suggests the node has been idle at least
-# 121 seconds.
-consecutive_idle_count = 3
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-# Log file path
-file = /var/log/arvados/node-manager.log
-# Log level for most Node Manager messages.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = INFO
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-host = zyxwv.arvadosapi.com
-timeout = 15
-jobs_queue = yes   # Get work request from Arvados jobs queue (jobs API)
-slurm_queue = yes  # Get work request from squeue (containers API)
-# Accept an untrusted SSL certificate from the API server?
-insecure = no
-provider = azure
-# Shutdown windows define periods of time when a node may and may not be shut
-# down.  These are windows in full minutes, separated by commas.  Counting from
-# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
-# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
-# and so on.  For example, "20, 999999" means the node may shut down between
-# the 20th and 999999th minutes of uptime.
-# Azure bills by the minute, so it makes sense to aggressively shut down idle
-# nodes.  Specify at least two windows.  You can add as many as you need beyond
-# that.
-shutdown_windows = 20, 999999
-[Cloud Credentials]
-# Use "azure account list" with the azure CLI to get these values.
-tenant_id = 00000000-0000-0000-0000-000000000000
-subscription_id = 00000000-0000-0000-0000-000000000000
-# The following directions are based on
-# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
-# azure config mode arm
-# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YouApplicationUri>" --password <Your_Password>
-# azure ad sp create "<Application_Id>"
-# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/{subscriptionId}/
-# Use <Application_Id> for "key" and the <Your_Password> for "secret"
-key = 00000000-0000-0000-0000-000000000000
-secret = PASSWORD
-timeout = 60
-region = East US
-[Cloud List]
-# The resource group in which the compute node virtual machines will be created
-# and listed.
-ex_resource_group = ArvadosResourceGroup
-[Cloud Create]
-# The image id, in the form "Publisher:Offer:SKU:Version"
-image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
-# Path to a local ssh key file that will be used to provision new nodes.
-ssh_key = /home/arvadosuser/.ssh/id_rsa.pub
-# The account name for the admin user that will be provisioned on new nodes.
-ex_user_name = arvadosuser
-# The Azure storage account that will be used to store the node OS disk images.
-ex_storage_account = arvadosstorage
-# The virtual network the VMs will be associated with.
-ex_network = ArvadosNetwork
-# Optional subnet of the virtual network.
-#ex_subnet = default
-# Node tags
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-# the API server to ping
-ping_host = hostname:port
-# You can define any number of Size sections to list Azure sizes you're willing
-# to use.  The Node Manager should boot the cheapest size(s) that can run jobs
-# in the queue.  You must also provide price per hour as the Azure compute
-# driver currently does not report prices.
-# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
-# for a list of known machine types that may be used as a Size parameter.
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.  You can also override Microsoft's provided
-# data fields by setting them here.
-[Size Standard_D3]
-cores = 4
-price = 0.56
-[Size Standard_D4]
-cores = 8
-price = 1.12
diff --git a/services/nodemanager/doc/ec2.example.cfg b/services/nodemanager/doc/ec2.example.cfg
deleted file mode 100644 (file)
index 3bc905b..0000000
+++ /dev/null
@@ -1,205 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-# EC2 configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-# Management server listening address (default
-#address =
-# Management server port number (default -1, server is disabled)
-#port = 8989
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the SLURM dispatcher drains nodes
-# through SLURM before shutting them down.
-#dispatcher = slurm
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-# Upper limit on rate of spending (in $/hr), will not boot additional nodes
-# if total price of already running nodes meets or exceeds this threshold.
-# default 0 means no limit.
-max_total_price = 0
-# Poll EC2 nodes and Arvados for new information every N seconds.
-poll_time = 60
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 300
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 600
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 1800
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-# Number of consecutive times a node must report as "idle" before it
-# will be considered eligible for shutdown.  Node status is checked
-# each poll period, and node can go idle at any point during a poll
-# period (meaning a node could be reported as idle that has only been
-# idle for 1 second).  With a 60 second poll period, three consecutive
-# status updates of "idle" suggests the node has been idle at least
-# 121 seconds.
-consecutive_idle_count = 3
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-# Log file path
-file = /var/log/arvados/node-manager.log
-# Log level for most Node Manager messages.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = INFO
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-host = zyxwv.arvadosapi.com
-timeout = 15
-jobs_queue = yes   # Get work request from Arvados jobs queue (jobs API)
-slurm_queue = yes  # Get work request from squeue (containers API)
-# Accept an untrusted SSL certificate from the API server?
-insecure = no
-provider = ec2
-# It's usually most cost-effective to shut down compute nodes during narrow
-# windows of time.  For example, EC2 bills each node by the hour, so the best
-# time to shut down a node is right before a new hour of uptime starts.
-# Shutdown windows define these periods of time.  These are windows in
-# full minutes, separated by commas.  Counting from the time the node is
-# booted, the node WILL NOT shut down for N1 minutes; then it MAY shut down
-# for N2 minutes; then it WILL NOT shut down for N3 minutes; and so on.
-# For example, "54, 5, 1" means the node may shut down from the 54th to the
-# 59th minute of each hour of uptime.
-# Specify at least two windows.  You can add as many as you need beyond that.
-shutdown_windows = 54, 5, 1
-[Cloud Credentials]
-key = KEY
-secret = SECRET_KEY
-region = us-east-1
-timeout = 60
-[Cloud List]
-# This section defines filters that find compute nodes.
-# Tags that you specify here will automatically be added to nodes you create.
-# Replace colons in Amazon filters with underscores
-# (e.g., write "tag:mytag" as "tag_mytag").
-instance-state-name = running
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-[Cloud Create]
-# New compute nodes will send pings to Arvados at this host.
-# You may specify a port, and use brackets to disambiguate IPv6 addresses.
-ping_host = hostname:port
-# Give the name of an SSH key on AWS...
-ex_keyname = string
-# ... or a file path for an SSH key that can log in to the compute node.
-# (One or the other, not both.)
-# ssh_key = path
-# The EC2 IDs of the image and subnet compute nodes should use.
-image_id = idstring
-subnet_id = idstring
-# Comma-separated EC2 IDs for the security group(s) assigned to each
-# compute node.
-security_groups = idstring1, idstring2
-# Apply an Instance Profile ARN to the newly created compute nodes
-# For more info, see:
-# https://aws.amazon.com/premiumsupport/knowledge-center/iam-policy-restrict-vpc/
-# ex_iamprofile = arn:aws:iam::ACCOUNTNUMBER:instance-profile/ROLENAME
-# You can define any number of Size sections to list EC2 sizes you're
-# willing to use.  The Node Manager should boot the cheapest size(s) that
-# can run jobs in the queue.
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in MB) for Crunch jobs.  You can also override Amazon's provided
-# data fields (such as price per hour) by setting them here.
-# Additionally, you can ask for a preemptible instance (AWS's spot instance)
-# by adding the appropriate boolean configuration flag. If you want to have
-# both spot & reserved versions of the same size, you can do so by renaming
-# the Size section and specifying the instance type inside it.
-# 100 GB scratch space
-[Size m4.large]
-cores = 2
-price = 0.126
-scratch = 100000
-# 10 GB scratch space
-[Size m4.large.spot]
-instance_type = m4.large
-preemptible = true
-cores = 2
-price = 0.126
-scratch = 10000
-# 200 GB scratch space
-[Size m4.xlarge]
-cores = 4
-price = 0.252
-scratch = 200000
diff --git a/services/nodemanager/doc/gce.example.cfg b/services/nodemanager/doc/gce.example.cfg
deleted file mode 100644 (file)
index acd3fd1..0000000
+++ /dev/null
@@ -1,187 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-# Google Compute Engine configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-# Management server listening address (default
-#address =
-# Management server port number (default -1, server is disabled)
-#port = 8989
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-# Poll compute nodes and Arvados for new information every N seconds.
-poll_time = 60
-# Upper limit on rate of spending (in $/hr), will not boot additional nodes
-# if total price of already running nodes meets or exceeds this threshold.
-# default 0 means no limit.
-max_total_price = 0
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 300
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 600
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-# Number of consecutive times a node must report as "idle" before it
-# will be considered eligible for shutdown.  Node status is checked
-# each poll period, and node can go idle at any point during a poll
-# period (meaning a node could be reported as idle that has only been
-# idle for 1 second).  With a 60 second poll period, three consecutive
-# status updates of "idle" suggests the node has been idle at least
-# 121 seconds.
-consecutive_idle_count = 3
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-# Log file path
-file = /var/log/arvados/node-manager.log
-# Log level for most Node Manager messages.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = INFO
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-host = zyxwv.arvadosapi.com
-timeout = 15
-jobs_queue = yes   # Get work request from Arvados jobs queue (jobs API)
-slurm_queue = yes  # Get work request from squeue (containers API)
-# Accept an untrusted SSL certificate from the API server?
-insecure = no
-provider = gce
-# Shutdown windows define periods of time when a node may and may not
-# be shut down.  These are windows in full minutes, separated by
-# commas.  Counting from the time the node is booted, the node WILL
-# NOT shut down for N1 minutes; then it MAY shut down for N2 minutes;
-# then it WILL NOT shut down for N3 minutes; and so on.  For example,
-# "54, 5, 1" means the node may shut down from the 54th to the 59th
-# minute of each hour of uptime.
-# GCE bills by the minute, and does not provide information about when
-# a node booted.  Node Manager will store this information in metadata
-# when it boots a node; if that information is not available, it will
-# assume the node booted at the epoch.  These shutdown settings are
-# very aggressive.  You may want to adjust this if you want more
-# continuity of service from a single node.
-shutdown_windows = 20, 999999
-[Cloud Credentials]
-user_id = client_email_address@developer.gserviceaccount.com
-key = path_to_certificate.pem
-project = project-id-from-google-cloud-dashboard
-timeout = 60
-# Valid location (zone) names: https://cloud.google.com/compute/docs/zones
-datacenter = us-central1-a
-# Optional settings. For full documentation see
-# http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#libcloud.compute.drivers.gce.GCENodeDriver
-# auth_type = SA               # SA, IA or GCE
-# scopes = https://www.googleapis.com/auth/compute
-# credential_file =
-[Cloud List]
-# A comma-separated list of tags that must be applied to a node for it to
-# be considered a compute node.
-# The driver will automatically apply these tags to nodes it creates.
-tags = zyxwv, compute
-[Cloud Create]
-# New compute nodes will send pings to Arvados at this host.
-# You may specify a port, and use brackets to disambiguate IPv6 addresses.
-ping_host = hostname:port
-# A file path for an SSH key that can log in to the compute node.
-# ssh_key = path
-# The GCE image name and network zone name to use when creating new nodes.
-image = debian-7
-# network = your_network_name
-# JSON string of service account authorizations for this cluster.
-# See http://libcloud.readthedocs.org/en/latest/compute/drivers/gce.html#specifying-service-account-scopes
-# service_accounts = [{'email':'account@example.com', 'scopes':['storage-ro']}]
-# You can define any number of Size sections to list node sizes you're
-# willing to use.  The Node Manager should boot the cheapest size(s) that
-# can run jobs in the queue.
-# The Size fields are interpreted the same way as with a libcloud NodeSize:
-# http://libcloud.readthedocs.org/en/latest/compute/api.html#libcloud.compute.base.NodeSize
-# See https://cloud.google.com/compute/docs/machine-types for a list
-# of known machine types that may be used as a Size parameter.
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.
-# You can also override Google's provided data fields (such as price per hour)
-# by setting them here.
-[Size n1-standard-2]
-cores = 2
-price = 0.076
-scratch = 100
-[Size n1-standard-4]
-cores = 4
-price = 0.152
-scratch = 200
\ No newline at end of file
diff --git a/services/nodemanager/doc/local.example.cfg b/services/nodemanager/doc/local.example.cfg
deleted file mode 100644 (file)
index 1221775..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-# You can use this configuration to run a development Node Manager for
-# testing.  It uses libcloud's dummy driver and your own development API server.
-# When new cloud nodes are created, you'll need to simulate the ping that
-# they send to the Arvados API server.  The easiest way I've found to do that
-# is through the API server Rails console: load the Node object, set its
-# IP address to 10.10.0.N (where N is the cloud node's ID), and save.
-address =
-port = 8989
-min_nodes = 0
-max_nodes = 8
-poll_time = 15
-max_poll_time = 60
-poll_stale_after = 600
-node_stale_after = 300
-certs_file = /etc/ssl/certs/ca-certificates.crt
-level = DEBUG
-pykka = DEBUG
-apiclient = WARNING
-host = localhost:3030
-# This is the token for the test fixture's admin user.
-token = 4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h
-insecure = yes
-timeout = 15
-provider = dummy
-shutdown_windows = 1, 1
-timeout = 15
-[Cloud Credentials]
-creds = dummycreds
-[Cloud List]
-[Cloud Create]
-[Size 2]
-cores = 4
-scratch = 1234
diff --git a/services/nodemanager/fpm-info.sh b/services/nodemanager/fpm-info.sh
deleted file mode 100644 (file)
index c4a9dbb..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-case "$TARGET" in
-    debian* | ubuntu*)
-        fpm_depends+=(libcurl3-gnutls libpython2.7)
-        ;;
diff --git a/services/nodemanager/gittaggers.py b/services/nodemanager/gittaggers.py
deleted file mode 120000 (symlink)
index a9ad861..0000000
+++ /dev/null
@@ -1 +0,0 @@
\ No newline at end of file
diff --git a/services/nodemanager/setup.py b/services/nodemanager/setup.py
deleted file mode 100644 (file)
index 75e8f85..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import
-import os
-import sys
-import re
-from setuptools import setup, find_packages
-SETUP_DIR = os.path.dirname(__file__) or '.'
-README = os.path.join(SETUP_DIR, 'README.rst')
-import arvados_version
-version = arvados_version.get_version(SETUP_DIR, "arvnodeman")
-if os.environ.get('ARVADOS_BUILDING_VERSION', False):
-    pysdk_dep = "=={}".format(version)
-    # On dev releases, arvados-python-client may have a different timestamp
-    pysdk_dep = "<={}".format(version)
-short_tests_only = False
-if '--short-tests-only' in sys.argv:
-    short_tests_only = True
-    sys.argv.remove('--short-tests-only')
-      version=version,
-      description='Arvados compute node manager',
-      long_description=open(README).read(),
-      author='Arvados',
-      author_email='info@arvados.org',
-      url="https://arvados.org",
-      license='GNU Affero General Public License, version 3.0',
-      packages=find_packages(),
-      scripts=['bin/arvados-node-manager'],
-      data_files=[
-          ('share/doc/arvados-node-manager', ['agpl-3.0.txt', 'README.rst', 'arvados-node-manager.service']),
-      ],
-      install_requires=[
-          'apache-libcloud==2.5.0', # 2.6.0 cannot create azure nodes, #15649
-          'arvados-python-client{}'.format(pysdk_dep),
-          'future',
-          'pykka < 2',
-          'python-daemon',
-          'setuptools',
-          'subprocess32>=3.5.1',
-      ],
-      test_suite='tests',
-      tests_require=[
-          'requests',
-          'pbr<1.7.0',
-          'mock>=1.0',
-          'apache-libcloud==2.5.0',
-          'subprocess32>=3.5.1',
-      ],
-      zip_safe=False,
diff --git a/services/nodemanager/tests/__init__.py b/services/nodemanager/tests/__init__.py
deleted file mode 100644 (file)
index 20e02f9..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-import logging
-import os
-# Set the ANMTEST_LOGLEVEL environment variable to enable logging at that level.
-loglevel = os.environ.get('ANMTEST_LOGLEVEL', 'CRITICAL')
-logging.basicConfig(level=getattr(logging, loglevel.upper()))
-# Set the ANMTEST_TIMEOUT environment variable to the maximum amount of time to
-# wait for tested actors to respond to important messages.  The default value
-# is very conservative, because a small value may produce false negatives on
-# slower systems.  If you're debugging a known timeout issue, however, you may
-# want to set this lower to speed up tests.
-pykka_timeout = int(os.environ.get('ANMTEST_TIMEOUT', '10'))
diff --git a/services/nodemanager/tests/fake_azure.cfg.template b/services/nodemanager/tests/fake_azure.cfg.template
deleted file mode 100644 (file)
index e5deac8..0000000
+++ /dev/null
@@ -1,194 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-# Azure configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-# Management server listening address (default 127.0.0.1)
-address =
-# Management server port number (default -1, server is disabled)
-port = 8989
-MangementToken = xxx
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the SLURM dispatcher drains nodes
-# through SLURM before shutting them down.
-#dispatcher = slurm
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-# Upper limit on rate of spending (in $/hr), will not boot additional nodes
-# if total price of already running nodes meets or exceeds this threshold.
-# default 0 means no limit.
-max_total_price = 0
-# Poll Azure nodes and Arvados for new information every N seconds.
-poll_time = 0.5
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 1
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 1
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 45
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-# Log file path
-#file = node-manager.log
-# Log level for most Node Manager messages.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = DEBUG
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-host = {host}
-token = {token}
-timeout = 15
-jobs_queue = no
-# Accept an untrusted SSL certificate from the API server?
-insecure = yes
-provider = azure
-driver_class = {driver_class}
-# Shutdown windows define periods of time when a node may and may not be shut
-# down.  These are windows in full minutes, separated by commas.  Counting from
-# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
-# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
-# and so on.  For example, "20, 999999" means the node may shut down between
-# the 20th and 999999th minutes of uptime.
-# Azure bills by the minute, so it makes sense to aggressively shut down idle
-# nodes.  Specify at least two windows.  You can add as many as you need beyond
-# that.
-shutdown_windows = 0.05, 999999
-[Cloud Credentials]
-# Use "azure account list" with the azure CLI to get these values.
-tenant_id = 00000000-0000-0000-0000-000000000000
-subscription_id = 00000000-0000-0000-0000-000000000000
-# The following directions are based on
-# https://azure.microsoft.com/en-us/documentation/articles/resource-group-authenticate-service-principal/
-# azure config mode arm
-# azure ad app create --name "<Your Application Display Name>" --home-page "<https://YourApplicationHomePage>" --identifier-uris "<https://YourApplicationUri>" --password <Your_Password>
-# azure ad sp create "<Application_Id>"
-# azure role assignment create --objectId "<Object_Id>" -o Owner -c /subscriptions/<subscriptionId>/
-# Use <Application_Id> for "key" and the <Your_Password> for "secret"
-key = 00000000-0000-0000-0000-000000000000
-secret = PASSWORD
-timeout = 60
-region = East US
-[Cloud List]
-# The resource group in which the compute node virtual machines will be created
-# and listed.
-ex_resource_group = ArvadosResourceGroup
-[Cloud Create]
-# The image id, in the form "Publisher:Offer:SKU:Version"
-image = Canonical:UbuntuServer:14.04.3-LTS:14.04.201508050
-# Path to a local ssh key file that will be used to provision new nodes.
-ssh_key = {ssh_key}
-# The account name for the admin user that will be provisioned on new nodes.
-ex_user_name = arvadosuser
-# The Azure storage account that will be used to store the node OS disk images.
-ex_storage_account = arvadosstorage
-# The virtual network the VMs will be associated with.
-ex_network = ArvadosNetwork
-# Optional subnet of the virtual network.
-#ex_subnet = default
-# Node tags
-tag_arvados-class = dynamic-compute
-tag_cluster = zyxwv
-# the API server to ping
-ping_host = {host}
-# You can define any number of Size sections to list Azure sizes you're willing
-# to use.  The Node Manager should boot the cheapest size(s) that can run jobs
-# in the queue.  You must also provide price per hour as the Azure driver
-# compute currently does not report prices.
-# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
-# for a list of known machine types that may be used as a Size parameter.
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.  You can also override Microsoft's provided
-# data fields by setting them here.
-[Size Standard_D3]
-cores = 4
-price = 0.56
-[Size Standard_D4]
-cores = 8
-price = 1.12
diff --git a/services/nodemanager/tests/fake_ec2.cfg.template b/services/nodemanager/tests/fake_ec2.cfg.template
deleted file mode 100644 (file)
index 2bb7d0e..0000000
+++ /dev/null
@@ -1,162 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-# EC2 configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-# Management server listening address (default 127.0.0.1)
-#address =
-# Management server port number (default -1, server is disabled)
-#port = 8989
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the SLURM dispatcher drains nodes
-# through SLURM before shutting them down.
-#dispatcher = slurm
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-# Upper limit on rate of spending (in $/hr), will not boot additional nodes
-# if total price of already running nodes meets or exceeds this threshold.
-# default 0 means no limit.
-max_total_price = 0
-# Poll EC2 nodes and Arvados for new information every N seconds.
-poll_time = 0.5
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 1
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 1
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 45
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-# Log file path
-#file = node-manager.log
-# Log level for most Node Manager messages.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = DEBUG
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-host = {host}
-token = {token}
-timeout = 15
-jobs_queue = no
-# Accept an untrusted SSL certificate from the API server?
-insecure = yes
-provider = ec2
-driver_class = {driver_class}
-# Shutdown windows define periods of time when a node may and may not be shut
-# down.  These are windows in full minutes, separated by commas.  Counting from
-# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
-# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
-# and so on.  For example, "20, 999999" means the node may shut down between
-# the 20th and 999999th minutes of uptime.
-# Azure bills by the minute, so it makes sense to aggressively shut down idle
-# nodes.  Specify at least two windows.  You can add as many as you need beyond
-# that.
-shutdown_windows = 0.05, 999999
-[Cloud Credentials]
-key = 00000000-0000-0000-0000-000000000000
-secret = PASSWORD
-timeout = 60
-region = East US
-[Cloud List]
-[Cloud Create]
-# The image id
-image = fake_image_id
-# Path to a local ssh key file that will be used to provision new nodes.
-ssh_key = {ssh_key}
-# the API server to ping
-ping_host = {host}
-# You can define any number of Size sections to list Azure sizes you're willing
-# to use.  The Node Manager should boot the cheapest size(s) that can run jobs
-# in the queue.  You must also provide price per hour as the Azure driver
-# compute currently does not report prices.
-# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
-# for a list of known machine types that may be used as a Size parameter.
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.  You can also override Microsoft's provided
-# data fields by setting them here.
-[Size m4.xlarge]
-cores = 4
-price = 0.56
-scratch = 250
-[Size m4.2xlarge]
-cores = 8
-price = 1.12
-scratch = 500
diff --git a/services/nodemanager/tests/fake_gce.cfg.template b/services/nodemanager/tests/fake_gce.cfg.template
deleted file mode 100644 (file)
index 11131ef..0000000
+++ /dev/null
@@ -1,159 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-# GCE configuration for Arvados Node Manager.
-# All times are in seconds unless specified otherwise.
-# The management server responds to http://addr:port/status.json with
-# a snapshot of internal state.
-# Management server listening address (default 127.0.0.1)
-#address =
-# Management server port number (default -1, server is disabled)
-#port = 8989
-# The dispatcher can customize the start and stop procedure for
-# cloud nodes.  For example, the SLURM dispatcher drains nodes
-# through SLURM before shutting them down.
-#dispatcher = slurm
-# Node Manager will ensure that there are at least this many nodes running at
-# all times.  If node manager needs to start new idle nodes for the purpose of
-# satisfying min_nodes, it will use the cheapest node type.  However, depending
-# on usage patterns, it may also satisfy min_nodes by keeping alive some
-# more-expensive nodes
-min_nodes = 0
-# Node Manager will not start any compute nodes when at least this
-# many are running.
-max_nodes = 8
-# Upper limit on rate of spending (in $/hr), will not boot additional nodes
-# if total price of already running nodes meets or exceeds this threshold.
-# default 0 means no limit.
-max_total_price = 0
-# Poll GCE nodes and Arvados for new information every N seconds.
-poll_time = 0.5
-# Polls have exponential backoff when services fail to respond.
-# This is the longest time to wait between polls.
-max_poll_time = 1
-# If Node Manager can't successfully poll a service for this long,
-# it will never start or stop compute nodes, on the assumption that its
-# information is too outdated.
-poll_stale_after = 1
-# If Node Manager boots a cloud node, and it does not pair with an Arvados
-# node before this long, assume that there was a cloud bootstrap failure and
-# shut it down.  Note that normal shutdown windows apply (see the Cloud
-# section), so this should be shorter than the first shutdown window value.
-boot_fail_after = 45
-# "Node stale time" affects two related behaviors.
-# 1. If a compute node has been running for at least this long, but it
-# isn't paired with an Arvados node, do not shut it down, but leave it alone.
-# This prevents the node manager from shutting down a node that might
-# actually be doing work, but is having temporary trouble contacting the
-# API server.
-# 2. When the Node Manager starts a new compute node, it will try to reuse
-# an Arvados node that hasn't been updated for this long.
-node_stale_after = 14400
-# Scaling factor to be applied to nodes' available RAM size. Usually there's a
-# variable discrepancy between the advertised RAM value on cloud nodes and the
-# actual amount available.
-# If not set, this value will be set to 0.95
-node_mem_scaling = 0.95
-# File path for Certificate Authorities
-certs_file = /etc/ssl/certs/ca-certificates.crt
-# Log file path
-#file = node-manager.log
-# Log level for most Node Manager messages.
-# WARNING lets you know when polling a service fails.
-# INFO additionally lets you know when a compute node is started or stopped.
-level = DEBUG
-# You can also set different log levels for specific libraries.
-# Pykka is the Node Manager's actor library.
-# Setting this to DEBUG will display tracebacks for uncaught
-# exceptions in the actors, but it's also very chatty.
-pykka = WARNING
-# Setting apiclient to INFO will log the URL of every Arvados API request.
-apiclient = WARNING
-host = {host}
-token = {token}
-timeout = 15
-jobs_queue = no
-# Accept an untrusted SSL certificate from the API server?
-insecure = yes
-provider = gce
-driver_class = {driver_class}
-# Shutdown windows define periods of time when a node may and may not be shut
-# down.  These are windows in full minutes, separated by commas.  Counting from
-# the time the node is booted, the node WILL NOT shut down for N1 minutes; then
-# it MAY shut down for N2 minutes; then it WILL NOT shut down for N3 minutes;
-# and so on.  For example, "20, 999999" means the node may shut down between
-# the 20th and 999999th minutes of uptime.
-# Azure bills by the minute, so it makes sense to aggressively shut down idle
-# nodes.  Specify at least two windows.  You can add as many as you need beyond
-# that.
-shutdown_windows = 0.05, 999999
-[Cloud Credentials]
-key = 00000000-0000-0000-0000-000000000000
-secret = PASSWORD
-timeout = 60
-region = East US
-[Cloud List]
-[Cloud Create]
-# The image id
-image = fake_image_id
-# Path to a local ssh key file that will be used to provision new nodes.
-ssh_key = {ssh_key}
-# the API server to ping
-ping_host = {host}
-# You can define any number of Size sections to list Azure sizes you're willing
-# to use.  The Node Manager should boot the cheapest size(s) that can run jobs
-# in the queue.  You must also provide price per hour as the Azure driver
-# compute currently does not report prices.
-# See https://azure.microsoft.com/en-us/pricing/details/virtual-machines/
-# for a list of known machine types that may be used as a Size parameter.
-# Each size section MUST define the number of cores available in this
-# size class (since libcloud does not provide any consistent API for exposing
-# this setting).
-# You may also want to define the amount of scratch space (expressed
-# in GB) for Crunch jobs.  You can also override Microsoft's provided
-# data fields by setting them here.
-[Size n1-standard-1]
-cores = 1
-price = 0.56
-[Size n1-standard-2]
-cores = 2
-price = 1.12
\ No newline at end of file
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
deleted file mode 100755 (executable)
index 1ba2957..0000000
+++ /dev/null
@@ -1,494 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-"""Integration test framework for node manager.
-Runs full node manager with an API server (needs ARVADOS_API_HOST and
-ARVADOS_API_TOKEN).  Stubs out the cloud driver and slurm commands to mock
-specific behaviors.  Monitors the log output to verify an expected sequence of
-events or behaviors for each test.
-import subprocess32 as subprocess
-import os
-import sys
-import re
-import time
-import logging
-import stat
-import tempfile
-import shutil
-import errno
-from functools import partial
-import arvados
-import StringIO
-formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
-handler = logging.StreamHandler(sys.stderr)
-logger = logging.getLogger("logger")
-detail = logging.getLogger("detail")
-if os.environ.get("ANMTEST_LOGLEVEL"):
-    detail_content = sys.stderr
-    detail_content = StringIO.StringIO()
-handler = logging.StreamHandler(detail_content)
-fake_slurm = None
-compute_nodes = None
-all_jobs = None
-unsatisfiable_job_scancelled = None
-def update_script(path, val):
-    with open(path+"_", "w") as f:
-        f.write(val)
-    os.chmod(path+"_", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
-    os.rename(path+"_", path)
-    detail.info("Update script %s: %s", path, val)
-def set_squeue(g):
-    global all_jobs
-    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
-    return 0
-def set_queue_unsatisfiable(g):
-    global all_jobs, unsatisfiable_job_scancelled
-    # Simulate a job requesting a 99 core node.
-    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
-    update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
-                  "\ntouch %s" % unsatisfiable_job_scancelled)
-    return 0
-def job_cancelled(g):
-    global unsatisfiable_job_scancelled
-    cancelled_job = g.group(1)
-    api = arvados.api('v1')
-    # Check that 'scancel' was called
-    if not os.path.isfile(unsatisfiable_job_scancelled):
-        return 1
-    # Check for the log entry
-    log_entry = api.logs().list(
-        filters=[
-            ['object_uuid', '=', cancelled_job],
-            ['event_type', '=', 'stderr'],
-        ]).execute()['items'][0]
-    if not re.match(
-            r"Constraints cannot be satisfied",
-            log_entry['properties']['text']):
-        return 1
-    return 0
-def node_paired(g):
-    global compute_nodes
-    compute_nodes[g.group(1)] = g.group(3)
-    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s|alloc|(null)'" % (v) for k,v in compute_nodes.items()))
-    for k,v in all_jobs.items():
-        if v == "ReqNodeNotAvail":
-            all_jobs[k] = "Running"
-            break
-    set_squeue(g)
-    return 0
-def node_busy(g):
-    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n" +
-                  "\n".join("echo '%s|idle|(null)'" % (v) for k,v in compute_nodes.items()))
-    return 0
-def node_shutdown(g):
-    global compute_nodes
-    if g.group(1) in compute_nodes:
-        del compute_nodes[g.group(1)]
-        return 0
-    else:
-        return 1
-def jobs_req(g):
-    global all_jobs
-    for k,v in all_jobs.items():
-        all_jobs[k] = "ReqNodeNotAvail"
-    set_squeue(g)
-    return 0
-def noop(g):
-    return 0
-def fail(checks, pattern, g):
-    return 1
-def expect_count(count, checks, pattern, g):
-    if count == 0:
-        return 1
-    else:
-        checks[pattern] = partial(expect_count, count-1)
-        return 0
-def run_test(name, actions, checks, driver_class, jobs, provider):
-    code = 0
-    global unsatisfiable_job_scancelled
-    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
-                                                "scancel_called")
-    # Delete any stale node records
-    api = arvados.api('v1')
-    for n in api.nodes().list().execute()['items']:
-        api.nodes().delete(uuid=n["uuid"]).execute()
-    logger.info("Start %s", name)
-    global fake_slurm
-    fake_slurm = tempfile.mkdtemp()
-    detail.info("fake_slurm is %s", fake_slurm)
-    global compute_nodes
-    compute_nodes = {}
-    global all_jobs
-    all_jobs = jobs
-    env = os.environ.copy()
-    env["PATH"] = fake_slurm + ":" + env["PATH"]
-    # Reset fake squeue/sinfo to empty
-    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
-    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")
-    # Write configuration file for test
-    with open("tests/fake_%s.cfg.template" % provider) as f:
-        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
-        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
-            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
-                                      token=os.environ["ARVADOS_API_TOKEN"],
-                                      driver_class=driver_class,
-                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))
-    # Tests must complete in less than 30 seconds.
-    timeout = time.time() + 30
-    terminated = False
-    # Now start node manager
-    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
-                         bufsize=0, stderr=subprocess.PIPE, env=env)
-    # Test main loop:
-    # - Read line
-    # - Apply negative checks (things that are not supposed to happen)
-    # - Check timeout
-    # - Check if the next action should trigger
-    # - If all actions are exhausted, terminate with test success
-    # - If it hits timeout with actions remaining, terminate with test failed
-    try:
-        # naive line iteration over pipes gets buffered, which isn't what we want,
-        # see https://bugs.python.org/issue3907
-        for line in iter(p.stderr.readline, ""):
-            detail_content.write(line)
-            for k,v in checks.items():
-                g = re.match(k, line)
-                if g:
-                    detail.info("Matched check %s", k)
-                    code += v(checks, k, g)
-                    if code != 0:
-                        detail.error("Check failed")
-                        if not terminated:
-                            p.kill()
-                            terminated = True
-            if terminated:
-                continue
-            if time.time() > timeout:
-                detail.error("Exceeded timeout with actions remaining: %s", actions)
-                code += 1
-                if not terminated:
-                    p.kill()
-                    terminated = True
-            k, v = actions[0]
-            g = re.match(k, line)
-            if g:
-                detail.info("Matched action %s", k)
-                actions.pop(0)
-                code += v(g)
-                if code != 0:
-                    detail.error("Action failed")
-                    p.kill()
-                    terminated = True
-            if not actions:
-                p.kill()
-                terminated = True
-    except KeyboardInterrupt:
-        p.kill()
-    if actions:
-        detail.error("Ended with remaining actions: %s", actions)
-        code = 1
-    shutil.rmtree(fake_slurm)
-    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))
-    if code == 0:
-        logger.info("%s passed", name)
-    else:
-        if isinstance(detail_content, StringIO.StringIO):
-            detail_content.seek(0)
-            chunk = detail_content.read(4096)
-            while chunk:
-                try:
-                    sys.stderr.write(chunk)
-                    chunk = detail_content.read(4096)
-                except IOError as e:
-                    if e.errno == errno.EAGAIN:
-                        # try again (probably pipe buffer full)
-                        pass
-                    else:
-                        raise
-        logger.info("%s failed", name)
-    return code
-def main():
-    # Test lifecycle.
-    tests = {
-        "test_unsatisfiable_jobs" : (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_queue_unsatisfiable),
-                (r".*Cancelled unsatisfiable job '(\S+)'", job_cancelled),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": fail,
-                r".*Trying to cancel job '(\S+)'": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_single_node_azure": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Suggesting shutdown because node state is \('down', .*\)": fail,
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
-                r".*Setting node quota.*": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_multiple_nodes": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 4),
-                r".*Setting node quota.*": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_hit_quota": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown)
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 2),
-                r".*Sending create_node request.*": partial(expect_count, 5)
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.QuotaDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_probe_quota": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*sending request", jobs_req),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 6),
-                r".*Sending create_node request.*": partial(expect_count, 9)
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.QuotaDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_no_hang_failing_node_create": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Client error: nope", noop),
-                (r".*Client error: nope", noop),
-                (r".*Client error: nope", noop),
-                (r".*Client error: nope", noop),
-            ],
-            # Checks (things that shouldn't happen)
-            {},
-            # Driver class
-            "arvnodeman.test.fake_driver.FailingDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf2": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf3": "ReqNodeNotAvail",
-             "34t0i-dz642-h42bg3hq4bdfpf4": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_retry_create": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Rate limit exceeded - scheduling retry in 2 seconds", noop),
-                (r".*Rate limit exceeded - scheduling retry in 1 seconds", noop),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", noop),
-            ],
-            # Checks (things that shouldn't happen)
-            {},
-            # Driver class
-            "arvnodeman.test.fake_driver.RetryDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf1": "ReqNodeNotAvail"},
-            # Provider
-            "azure"),
-        "test_single_node_aws": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
-                r".*Setting node quota.*": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeAwsDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
-            # Provider
-            "ec2"),
-        "test_single_node_gce": (
-            # Actions (pattern -> action)
-            [
-                (r".*Daemon started", set_squeue),
-                (r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)", node_paired),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Not eligible for shut down because node state is \('busy', 'open', .*\)", node_busy),
-                (r".*ComputeNodeMonitorActor\..*\.([^[]*).*Suggesting shutdown because node state is \('idle', 'open', .*\)", noop),
-                (r".*ComputeNodeShutdownActor\..*\.([^[]*).*Shutdown success", node_shutdown),
-            ],
-            # Checks (things that shouldn't happen)
-            {
-                r".*Cloud node (\S+) is now paired with Arvados node (\S+) with hostname (\S+)": partial(expect_count, 1),
-                r".*Setting node quota.*": fail,
-            },
-            # Driver class
-            "arvnodeman.test.fake_driver.FakeGceDriver",
-            # Jobs
-            {"34t0i-dz642-h42bg3hq4bdfpf9": "ReqNodeNotAvail"},
-            # Provider
-            "gce")
-    }
-    code = 0
-    if len(sys.argv) > 1:
-        code = run_test(sys.argv[1], *tests[sys.argv[1]])
-    else:
-        for t in sorted(tests.keys()):
-            code += run_test(t, *tests[t])
-    if code == 0:
-        logger.info("Tests passed")
-    else:
-        logger.info("Tests failed")
-    exit(code)
-if __name__ == '__main__':
-    main()
diff --git a/services/nodemanager/tests/stress_test.cwl b/services/nodemanager/tests/stress_test.cwl
deleted file mode 100644 (file)
index 082df64..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-# Usage: arvados-cwl-runner stress_test.cwl
-# Submits 100 jobs or containers, creating load on node manager and
-# scheduler.
-class: Workflow
-cwlVersion: v1.0
-  ScatterFeatureRequirement: {}
-  InlineJavascriptRequirement: {}
-inputs: []
-outputs: []
-  step1:
-    in: []
-    out: [out]
-    run:
-      class: ExpressionTool
-      inputs: []
-      outputs:
-        out: int[]
-      expression: |
-        ${
-          var r = [];
-          for (var i = 1; i <= 100; i++) {
-            r.push(i);
-          }
-          return {out: r};
-        }
-  step2:
-    in:
-      num: step1/out
-    out: []
-    scatter: num
-    run:
-      class: CommandLineTool
-      requirements:
-        ShellCommandRequirement: {}
-      inputs:
-        num: int
-      outputs: []
-      arguments: [echo, "starting",
-        {shellQuote: false, valueFrom: "&&"},
-        sleep, $((101-inputs.num)*2),
-        {shellQuote: false, valueFrom: "&&"},
-        echo, "the number of the day is", $(inputs.num)]
diff --git a/services/nodemanager/tests/test_arguments.py b/services/nodemanager/tests/test_arguments.py
deleted file mode 100644 (file)
index e325e52..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-import io
-import os
-import sys
-import tempfile
-import unittest
-import arvnodeman.launcher as nodeman
-from . import testutil
-class ArvNodemArgumentsTestCase(unittest.TestCase):
-    def run_nodeman(self, args):
-        return nodeman.main(args)
-    def test_unsupported_arg(self):
-        with self.assertRaises(SystemExit):
-            self.run_nodeman(['-x=unknown'])
-    def test_version_argument(self):
-        err = io.BytesIO()
-        out = io.BytesIO()
-        with testutil.redirected_streams(stdout=out, stderr=err):
-            with self.assertRaises(SystemExit):
-                self.run_nodeman(['--version'])
-        self.assertEqual(out.getvalue(), '')
-        self.assertRegexpMatches(err.getvalue(), "[0-9]+\.[0-9]+\.[0-9]+")
diff --git a/services/nodemanager/tests/test_clientactor.py b/services/nodemanager/tests/test_clientactor.py
deleted file mode 100644 (file)
index 19e094d..0000000
+++ /dev/null
@@ -1,152 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import unittest
-import mock
-import pykka
-import arvnodeman.clientactor as clientactor
-from . import testutil
-class RemotePollLoopActorTestCase(testutil.RemotePollLoopActorTestMixin,
-                                  unittest.TestCase):
-    class MockClientError(Exception):
-        pass
-    class TestActor(clientactor.RemotePollLoopActor):
-        LOGGER_NAME = 'arvnodeman.testpoll'
-        def _send_request(self):
-            return self._client()
-    TestActor.CLIENT_ERRORS = (MockClientError,)
-    TEST_CLASS = TestActor
-    def build_monitor(self, side_effect, *args, **kwargs):
-        super(RemotePollLoopActorTestCase, self).build_monitor(*args, **kwargs)
-        self.client.side_effect = side_effect
-    def test_poll_loop_starts_after_subscription(self):
-        self.build_monitor(['test1'])
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with('test1')
-        self.assertTrue(self.timer.schedule.called)
-    def test_poll_loop_continues_after_failure(self):
-        self.build_monitor(self.MockClientError)
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.assertTrue(self.stop_proxy(self.monitor),
-                        "poll loop died after error")
-        self.assertTrue(self.timer.schedule.called,
-                        "poll loop did not reschedule after error")
-        self.assertFalse(self.subscriber.called,
-                         "poll loop notified subscribers after error")
-    def test_late_subscribers_get_responses(self):
-        self.build_monitor(['pre_late_test', 'late_test'])
-        mock_subscriber = mock.Mock(name='mock_subscriber')
-        self.monitor.subscribe(mock_subscriber).get(self.TIMEOUT)
-        self.monitor.subscribe(self.subscriber)
-        self.monitor.poll().get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with('late_test')
-    def test_survive_dead_subscriptions(self):
-        self.build_monitor(['survive1', 'survive2'])
-        dead_subscriber = mock.Mock(name='dead_subscriber')
-        dead_subscriber.side_effect = pykka.ActorDeadError
-        self.monitor.subscribe(dead_subscriber)
-        self.monitor.subscribe(self.subscriber)
-        self.monitor.poll().get(self.TIMEOUT)
-        self.assertTrue(self.stop_proxy(self.monitor),
-                        "poll loop died from dead subscriber")
-        self.subscriber.assert_called_with('survive2')
-    def check_poll_timers(self, *test_times):
-        schedule_mock = self.timer.schedule
-        last_expect = None
-        with mock.patch('time.time') as time_mock:
-            for fake_time, expect_next in test_times:
-                time_mock.return_value = fake_time
-                self.monitor.poll(last_expect).get(self.TIMEOUT)
-                self.assertTrue(schedule_mock.called)
-                self.assertEqual(expect_next, schedule_mock.call_args[0][0])
-                schedule_mock.reset_mock()
-                last_expect = expect_next
-    def test_poll_timing_on_consecutive_successes_with_drift(self):
-        self.build_monitor(['1', '2'], poll_wait=3, max_poll_wait=14)
-        self.check_poll_timers((0, 3), (4, 6))
-    def test_poll_backoff_on_failures(self):
-        self.build_monitor(self.MockClientError, poll_wait=3, max_poll_wait=14)
-        self.check_poll_timers((0, 6), (6, 18), (18, 32))
-    def test_poll_timing_after_error_recovery(self):
-        self.build_monitor(['a', self.MockClientError(), 'b'],
-                           poll_wait=3, max_poll_wait=14)
-        self.check_poll_timers((0, 3), (4, 10), (10, 13))
-    def test_no_subscriptions_by_key_without_support(self):
-        self.build_monitor([])
-        with self.assertRaises(AttributeError):
-            self.monitor.subscribe_to('key')
-class RemotePollLoopActorWithKeysTestCase(testutil.RemotePollLoopActorTestMixin,
-                                          unittest.TestCase):
-    class TestActor(RemotePollLoopActorTestCase.TestActor):
-        def _item_key(self, item):
-            return item['key']
-    TEST_CLASS = TestActor
-    def build_monitor(self, side_effect, *args, **kwargs):
-        super(RemotePollLoopActorWithKeysTestCase, self).build_monitor(
-            *args, **kwargs)
-        self.client.side_effect = side_effect
-    def test_key_subscription(self):
-        self.build_monitor([[{'key': 1}, {'key': 2}]])
-        self.monitor.subscribe_to(2, self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with({'key': 2})
-    def test_survive_dead_key_subscriptions(self):
-        item = {'key': 3}
-        self.build_monitor([[item], [item]])
-        dead_subscriber = mock.Mock(name='dead_subscriber')
-        dead_subscriber.side_effect = pykka.ActorDeadError
-        self.monitor.subscribe_to(3, dead_subscriber)
-        self.monitor.subscribe_to(3, self.subscriber)
-        self.monitor.poll().get(self.TIMEOUT)
-        self.assertTrue(self.stop_proxy(self.monitor),
-                        "poll loop died from dead key subscriber")
-        self.subscriber.assert_called_with(item)
-    def test_mixed_subscriptions(self):
-        item = {'key': 4}
-        self.build_monitor([[item], [item]])
-        key_subscriber = mock.Mock(name='key_subscriber')
-        self.monitor.subscribe(self.subscriber)
-        self.monitor.subscribe_to(4, key_subscriber)
-        self.monitor.poll().get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with([item])
-        key_subscriber.assert_called_with(item)
-    def test_subscription_to_missing_key(self):
-        self.build_monitor([[]])
-        self.monitor.subscribe_to('nonesuch', self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with(None)
-if __name__ == '__main__':
-    unittest.main()
diff --git a/services/nodemanager/tests/test_computenode.py b/services/nodemanager/tests/test_computenode.py
deleted file mode 100644 (file)
index 898112b..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import time
-import unittest
-import arvados.errors as arverror
-import mock
-import arvnodeman.computenode as cnode
-from . import testutil
-@mock.patch('time.time', return_value=1)
-class ShutdownTimerTestCase(unittest.TestCase):
-    def test_two_length_window(self, time_mock):
-        timer = cnode.ShutdownTimer(time_mock.return_value, [8, 2])
-        self.assertEqual(481, timer.next_opening())
-        self.assertFalse(timer.window_open())
-        time_mock.return_value += 500
-        self.assertEqual(1081, timer.next_opening())
-        self.assertTrue(timer.window_open())
-        time_mock.return_value += 200
-        self.assertEqual(1081, timer.next_opening())
-        self.assertFalse(timer.window_open())
-    def test_three_length_window(self, time_mock):
-        timer = cnode.ShutdownTimer(time_mock.return_value, [6, 3, 1])
-        self.assertEqual(361, timer.next_opening())
-        self.assertFalse(timer.window_open())
-        time_mock.return_value += 400
-        self.assertEqual(961, timer.next_opening())
-        self.assertTrue(timer.window_open())
-        time_mock.return_value += 200
-        self.assertEqual(961, timer.next_opening())
-        self.assertFalse(timer.window_open())
-class ArvadosTimestamp(unittest.TestCase):
-    def test_arvados_timestamp(self):
-        self.assertEqual(1527710178, cnode.arvados_timestamp('2018-05-30T19:56:18Z'))
-        self.assertEqual(1527710178.999371, cnode.arvados_timestamp('2018-05-30T19:56:18.999371Z'))
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
deleted file mode 100644 (file)
index aee3cbd..0000000
+++ /dev/null
@@ -1,562 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import time
-import unittest
-import arvados.errors as arverror
-import httplib2
-import mock
-import pykka
-import threading
-from libcloud.common.exceptions import BaseHTTPError
-import arvnodeman.computenode.dispatch as dispatch
-import arvnodeman.status as status
-from arvnodeman.computenode.driver import BaseComputeNodeDriver
-from . import testutil
-class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
-    ACTOR_CLASS = dispatch.ComputeNodeSetupActor
-    def make_mocks(self, arvados_effect=None):
-        if arvados_effect is None:
-            arvados_effect = [testutil.arvados_node_mock(
-                slot_number=None,
-                hostname=None,
-                first_ping_at=None,
-                last_ping_at=None,
-            )]
-        self.arvados_effect = arvados_effect
-        self.timer = testutil.MockTimer()
-        self.api_client = mock.MagicMock(name='api_client')
-        self.api_client.nodes().create().execute.side_effect = arvados_effect
-        self.api_client.nodes().update().execute.side_effect = arvados_effect
-        self.cloud_client = mock.MagicMock(name='cloud_client')
-        self.cloud_client.create_node.return_value = testutil.cloud_node_mock(1)
-    def make_actor(self, arv_node=None):
-        if not hasattr(self, 'timer'):
-            self.make_mocks(arvados_effect=[arv_node] if arv_node else None)
-        self.setup_actor = self.ACTOR_CLASS.start(
-            self.timer, self.api_client, self.cloud_client,
-            testutil.MockSize(1), arv_node).proxy()
-    def assert_node_properties_updated(self, uuid=None,
-                                       size=testutil.MockSize(1)):
-        self.api_client.nodes().update.assert_any_call(
-            uuid=(uuid or self.arvados_effect[-1]['uuid']),
-            body={
-                'properties': {
-                    'cloud_node': {
-                        'size': size.id,
-                        'price': size.price}}})
-    def test_creation_without_arvados_node(self):
-        self.make_actor()
-        finished = threading.Event()
-        self.setup_actor.subscribe(lambda _: finished.set())
-        self.assertEqual(self.arvados_effect[-1],
-                         self.setup_actor.arvados_node.get(self.TIMEOUT))
-        assert(finished.wait(self.TIMEOUT))
-        self.api_client.nodes().create.called_with(body={}, assign_slot=True)
-        self.assertEqual(1, self.api_client.nodes().create().execute.call_count)
-        self.assertEqual(1, self.api_client.nodes().update().execute.call_count)
-        self.assert_node_properties_updated()
-        self.assertEqual(self.cloud_client.create_node(),
-                         self.setup_actor.cloud_node.get(self.TIMEOUT))
-    def test_creation_with_arvados_node(self):
-        self.make_mocks(arvados_effect=[testutil.arvados_node_mock()]*2)
-        self.make_actor(testutil.arvados_node_mock())
-        finished = threading.Event()
-        self.setup_actor.subscribe(lambda _: finished.set())
-        self.assertEqual(self.arvados_effect[-1],
-                         self.setup_actor.arvados_node.get(self.TIMEOUT))
-        assert(finished.wait(self.TIMEOUT))
-        self.assert_node_properties_updated()
-        self.api_client.nodes().create.called_with(body={}, assign_slot=True)
-        self.assertEqual(3, self.api_client.nodes().update().execute.call_count)
-        self.assertEqual(self.cloud_client.create_node(),
-                         self.setup_actor.cloud_node.get(self.TIMEOUT))
-    def test_failed_arvados_calls_retried(self):
-        self.make_mocks([
-                arverror.ApiError(httplib2.Response({'status': '500'}), ""),
-                testutil.arvados_node_mock(),
-                ])
-        self.make_actor()
-        self.wait_for_assignment(self.setup_actor, 'arvados_node')
-    def test_failed_cloud_calls_retried(self):
-        self.make_mocks()
-        self.cloud_client.create_node.side_effect = [
-            Exception("test cloud creation error"),
-            self.cloud_client.create_node.return_value,
-            ]
-        self.make_actor()
-        self.wait_for_assignment(self.setup_actor, 'cloud_node')
-    def test_basehttperror_retried(self):
-        self.make_mocks()
-        self.cloud_client.create_node.side_effect = [
-            BaseHTTPError(500, "Try again"),
-            self.cloud_client.create_node.return_value,
-            ]
-        self.make_actor()
-        self.wait_for_assignment(self.setup_actor, 'cloud_node')
-        self.setup_actor.ping().get(self.TIMEOUT)
-        self.assertEqual(1, self.cloud_client.post_create_node.call_count)
-    def test_instance_exceeded_not_retried(self):
-        self.make_mocks()
-        self.cloud_client.create_node.side_effect = [
-            BaseHTTPError(400, "InstanceLimitExceeded"),
-            self.cloud_client.create_node.return_value,
-            ]
-        self.make_actor()
-        done = self.FUTURE_CLASS()
-        self.setup_actor.subscribe(done.set)
-        done.get(self.TIMEOUT)
-        self.assertEqual(0, self.cloud_client.post_create_node.call_count)
-    def test_failed_post_create_retried(self):
-        self.make_mocks()
-        self.cloud_client.post_create_node.side_effect = [
-            Exception("test cloud post-create error"), None]
-        self.make_actor()
-        done = self.FUTURE_CLASS()
-        self.setup_actor.subscribe(done.set)
-        done.get(self.TIMEOUT)
-        self.assertEqual(2, self.cloud_client.post_create_node.call_count)
-    def test_stop_when_no_cloud_node(self):
-        self.make_mocks(
-            arverror.ApiError(httplib2.Response({'status': '500'}), ""))
-        self.make_actor()
-        self.assertTrue(
-            self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
-        self.assertTrue(
-            self.setup_actor.actor_ref.actor_stopped.wait(self.TIMEOUT))
-    def test_no_stop_when_cloud_node(self):
-        self.make_actor()
-        self.wait_for_assignment(self.setup_actor, 'cloud_node')
-        self.assertFalse(
-            self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
-        self.assertTrue(self.stop_proxy(self.setup_actor),
-                        "actor was stopped by stop_if_no_cloud_node")
-    def test_subscribe(self):
-        self.make_mocks(
-            arverror.ApiError(httplib2.Response({'status': '500'}), ""))
-        self.make_actor()
-        subscriber = mock.Mock(name='subscriber_mock')
-        self.setup_actor.subscribe(subscriber)
-        retry_resp = [testutil.arvados_node_mock()]
-        self.api_client.nodes().create().execute.side_effect = retry_resp
-        self.api_client.nodes().update().execute.side_effect = retry_resp
-        self.wait_for_assignment(self.setup_actor, 'cloud_node')
-        self.setup_actor.ping().get(self.TIMEOUT)
-        self.assertEqual(self.setup_actor.actor_ref.actor_urn,
-                         subscriber.call_args[0][0].actor_ref.actor_urn)
-    def test_late_subscribe(self):
-        self.make_actor()
-        subscriber = mock.Mock(name='subscriber_mock')
-        self.wait_for_assignment(self.setup_actor, 'cloud_node')
-        self.setup_actor.subscribe(subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.setup_actor)
-        self.assertEqual(self.setup_actor.actor_ref.actor_urn,
-                         subscriber.call_args[0][0].actor_ref.actor_urn)
-class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
-    def make_mocks(self, cloud_node=None, arvados_node=None,
-                   shutdown_open=True, node_broken=False):
-        self.timer = testutil.MockTimer()
-        self.shutdowns = testutil.MockShutdownTimer()
-        self.shutdowns._set_state(shutdown_open, 300)
-        self.cloud_client = mock.MagicMock(name='cloud_client')
-        self.cloud_client.broken.return_value = node_broken
-        self.arvados_client = mock.MagicMock(name='arvados_client')
-        self.updates = mock.MagicMock(name='update_mock')
-        if cloud_node is None:
-            cloud_node = testutil.cloud_node_mock()
-        self.cloud_node = cloud_node
-        self.arvados_node = arvados_node
-    def make_actor(self, cancellable=True, start_time=None):
-        if not hasattr(self, 'timer'):
-            self.make_mocks()
-        if start_time is None:
-            start_time = time.time()
-        monitor_actor = dispatch.ComputeNodeMonitorActor.start(
-            self.cloud_node, start_time, self.shutdowns,
-            self.timer, self.updates, self.cloud_client,
-            self.arvados_node)
-        self.shutdown_actor = self.ACTOR_CLASS.start(
-            self.timer, self.cloud_client, self.arvados_client, monitor_actor,
-            cancellable).proxy()
-        self.monitor_actor = monitor_actor.proxy()
-    def check_success_flag(self, expected, allow_msg_count=1):
-        # allow_msg_count is the number of internal messages that may
-        # need to be handled for shutdown to finish.
-        for _ in range(1 + allow_msg_count):
-            last_flag = self.shutdown_actor.success.get(self.TIMEOUT)
-            if last_flag is expected:
-                break
-        else:
-            self.fail("success flag {} is not {}".format(last_flag, expected))
-    def test_boot_failure_counting(self, *mocks):
-        # A boot failure happens when a node transitions from unpaired to shutdown
-        status.tracker.update({'boot_failures': 0})
-        self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="unpaired"))
-        self.cloud_client.destroy_node.return_value = True
-        self.make_actor(cancellable=False)
-        self.check_success_flag(True, 2)
-        self.assertTrue(self.cloud_client.destroy_node.called)
-        self.assertEqual(1, status.tracker.get('boot_failures'))
-    def test_cancellable_shutdown(self, *mocks):
-        self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy"))
-        self.cloud_client.destroy_node.return_value = True
-        self.make_actor(cancellable=True)
-        self.check_success_flag(False, 2)
-        self.assertFalse(self.cloud_client.destroy_node.called)
-    def test_uncancellable_shutdown(self, *mocks):
-        status.tracker.update({'boot_failures': 0})
-        self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy"))
-        self.cloud_client.destroy_node.return_value = True
-        self.make_actor(cancellable=False)
-        self.check_success_flag(True, 4)
-        self.assertTrue(self.cloud_client.destroy_node.called)
-        # A normal shutdown shouldn't be counted as boot failure
-        self.assertEqual(0, status.tracker.get('boot_failures'))
-    def test_arvados_node_cleaned_after_shutdown(self, *mocks):
-        if len(mocks) == 1:
-            mocks[0].return_value = "drain\n"
-        cloud_node = testutil.cloud_node_mock(62)
-        arv_node = testutil.arvados_node_mock(62)
-        self.make_mocks(cloud_node, arv_node)
-        self.make_actor()
-        self.check_success_flag(True, 3)
-        update_mock = self.arvados_client.nodes().update
-        self.assertTrue(update_mock.called)
-        update_kwargs = update_mock.call_args_list[0][1]
-        self.assertEqual(arv_node['uuid'], update_kwargs.get('uuid'))
-        self.assertIn('body', update_kwargs)
-        for clear_key in ['slot_number', 'hostname', 'ip_address',
-                          'first_ping_at', 'last_ping_at']:
-            self.assertIn(clear_key, update_kwargs['body'])
-            self.assertIsNone(update_kwargs['body'][clear_key])
-        self.assertTrue(update_mock().execute.called)
-    def test_arvados_node_not_cleaned_after_shutdown_cancelled(self, *mocks):
-        if len(mocks) == 1:
-            mocks[0].return_value = "idle\n"
-        cloud_node = testutil.cloud_node_mock(61)
-        arv_node = testutil.arvados_node_mock(61)
-        self.make_mocks(cloud_node, arv_node, shutdown_open=False)
-        self.cloud_client.destroy_node.return_value = False
-        self.make_actor(cancellable=True)
-        self.shutdown_actor.cancel_shutdown("test")
-        self.shutdown_actor.ping().get(self.TIMEOUT)
-        self.check_success_flag(False, 2)
-        self.assertFalse(self.arvados_client.nodes().update.called)
-class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
-                                       unittest.TestCase):
-    ACTOR_CLASS = dispatch.ComputeNodeShutdownActor
-    def test_easy_shutdown(self):
-        self.make_actor(start_time=0)
-        self.check_success_flag(True)
-        self.assertTrue(self.cloud_client.destroy_node.called)
-    def test_shutdown_cancelled_when_destroy_node_fails(self):
-        self.make_mocks(node_broken=True)
-        self.cloud_client.destroy_node.return_value = False
-        self.make_actor(start_time=0)
-        self.check_success_flag(False, 2)
-        self.assertEqual(1, self.cloud_client.destroy_node.call_count)
-        self.assertEqual(self.ACTOR_CLASS.DESTROY_FAILED,
-                         self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
-    def test_late_subscribe(self):
-        self.make_actor()
-        subscriber = mock.Mock(name='subscriber_mock')
-        self.shutdown_actor.subscribe(subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.shutdown_actor)
-        self.assertTrue(subscriber.called)
-        self.assertEqual(self.shutdown_actor.actor_ref.actor_urn,
-                         subscriber.call_args[0][0].actor_ref.actor_urn)
-class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
-                                     unittest.TestCase):
-    ACTOR_CLASS = dispatch.ComputeNodeUpdateActor
-    def make_actor(self):
-        self.driver = mock.MagicMock(name='driver_mock')
-        self.timer = mock.MagicMock(name='timer_mock')
-        self.updater = self.ACTOR_CLASS.start(self.driver, self.timer).proxy()
-    def test_node_sync(self, *args):
-        self.make_actor()
-        cloud_node = testutil.cloud_node_mock()
-        arv_node = testutil.arvados_node_mock()
-        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
-        self.driver().sync_node.assert_called_with(cloud_node, arv_node)
-    @testutil.no_sleep
-    def test_node_sync_error(self, *args):
-        self.make_actor()
-        cloud_node = testutil.cloud_node_mock()
-        arv_node = testutil.arvados_node_mock()
-        self.driver().sync_node.side_effect = (IOError, Exception, True)
-        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
-        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
-        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
-        self.driver().sync_node.assert_called_with(cloud_node, arv_node)
-class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
-                                      unittest.TestCase):
-    def make_mocks(self, node_num):
-        self.shutdowns = testutil.MockShutdownTimer()
-        self.shutdowns._set_state(False, 300)
-        self.timer = mock.MagicMock(name='timer_mock')
-        self.updates = mock.MagicMock(name='update_mock')
-        self.cloud_mock = testutil.cloud_node_mock(node_num)
-        self.subscriber = mock.Mock(name='subscriber_mock')
-        self.cloud_client = mock.MagicMock(name='cloud_client')
-        self.cloud_client.broken.return_value = False
-    def make_actor(self, node_num=1, arv_node=None, start_time=None):
-        if not hasattr(self, 'cloud_mock'):
-            self.make_mocks(node_num)
-        if start_time is None:
-            start_time = time.time()
-        self.node_actor = dispatch.ComputeNodeMonitorActor.start(
-            self.cloud_mock, start_time, self.shutdowns,
-            self.timer, self.updates, self.cloud_client,
-            arv_node, boot_fail_after=300).proxy()
-        self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
-    def node_state(self, *states):
-        return self.node_actor.in_state(*states).get(self.TIMEOUT)
-    def test_in_state_when_unpaired(self):
-        self.make_actor()
-        self.assertTrue(self.node_state('unpaired'))
-    def test_in_state_when_pairing_stale(self):
-        self.make_actor(arv_node=testutil.arvados_node_mock(
-                job_uuid=None, age=90000))
-        self.assertTrue(self.node_state('down'))
-    def test_in_state_when_no_state_available(self):
-        self.make_actor(arv_node=testutil.arvados_node_mock(
-                crunch_worker_state=None))
-        self.assertTrue(self.node_state('idle'))
-    def test_in_state_when_no_state_available_old(self):
-        self.make_actor(arv_node=testutil.arvados_node_mock(
-                crunch_worker_state=None, age=90000))
-        self.assertTrue(self.node_state('down'))
-    def test_in_idle_state(self):
-        idle_nodes_before = status.tracker._idle_nodes.keys()
-        self.make_actor(2, arv_node=testutil.arvados_node_mock(job_uuid=None))
-        self.assertTrue(self.node_state('idle'))
-        self.assertFalse(self.node_state('busy'))
-        self.assertTrue(self.node_state('idle', 'busy'))
-        idle_nodes_after = status.tracker._idle_nodes.keys()
-        new_idle_nodes = [n for n in idle_nodes_after if n not in idle_nodes_before]
-        # There should be 1 additional idle node
-        self.assertEqual(1, len(new_idle_nodes))
-    def test_in_busy_state(self):
-        idle_nodes_before = status.tracker._idle_nodes.keys()
-        self.make_actor(3, arv_node=testutil.arvados_node_mock(job_uuid=True))
-        self.assertFalse(self.node_state('idle'))
-        self.assertTrue(self.node_state('busy'))
-        self.assertTrue(self.node_state('idle', 'busy'))
-        idle_nodes_after = status.tracker._idle_nodes.keys()
-        new_idle_nodes = [n for n in idle_nodes_after if n not in idle_nodes_before]
-        # There shouldn't be any additional idle node
-        self.assertEqual(0, len(new_idle_nodes))
-    def test_init_shutdown_scheduling(self):
-        self.make_actor()
-        self.assertTrue(self.timer.schedule.called)
-        self.assertEqual(300, self.timer.schedule.call_args[0][0])
-    def test_shutdown_window_close_scheduling(self):
-        self.make_actor()
-        self.shutdowns._set_state(False, 600)
-        self.timer.schedule.reset_mock()
-        self.node_actor.consider_shutdown().get(self.TIMEOUT)
-        self.stop_proxy(self.node_actor)
-        self.assertTrue(self.timer.schedule.called)
-        self.assertEqual(600, self.timer.schedule.call_args[0][0])
-        self.assertFalse(self.subscriber.called)
-    def test_shutdown_subscription(self):
-        self.make_actor(start_time=0)
-        self.shutdowns._set_state(True, 600)
-        self.node_actor.consider_shutdown().get(self.TIMEOUT)
-        self.assertTrue(self.subscriber.called)
-        self.assertEqual(self.node_actor.actor_ref.actor_urn,
-                         self.subscriber.call_args[0][0].actor_ref.actor_urn)
-    def test_no_shutdown_booting(self):
-        self.make_actor()
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
-                          (False, "node state is ('unpaired', 'open', 'boot wait', 'not idle')"))
-    def test_shutdown_when_invalid_cloud_node_size(self):
-        self.make_mocks(1)
-        self.cloud_mock.size.id = 'invalid'
-        self.cloud_mock.extra['arvados_node_size'] = 'stale.type'
-        self.make_actor()
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node's size tag 'stale.type' not recognizable"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-    def test_shutdown_without_arvados_node(self):
-        self.make_actor(start_time=0)
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'not idle')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-    def test_shutdown_missing(self):
-        arv_node = testutil.arvados_node_mock(10, job_uuid=None,
-                                              crunch_worker_state="down",
-                                              last_ping_at='1970-01-01T01:02:03.04050607Z')
-        self.make_actor(10, arv_node)
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-    def test_shutdown_running_broken(self):
-        arv_node = testutil.arvados_node_mock(12, job_uuid=None,
-                                              crunch_worker_state="down")
-        self.make_actor(12, arv_node)
-        self.shutdowns._set_state(True, 600)
-        self.cloud_client.broken.return_value = True
-        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-    def test_shutdown_missing_broken(self):
-        arv_node = testutil.arvados_node_mock(11, job_uuid=None,
-                                              crunch_worker_state="down",
-                                              last_ping_at='1970-01-01T01:02:03.04050607Z')
-        self.make_actor(11, arv_node)
-        self.shutdowns._set_state(True, 600)
-        self.cloud_client.broken.return_value = True
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'not idle')"))
-    def test_no_shutdown_when_window_closed(self):
-        self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
-        self.assertEquals((False, "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-    def test_no_shutdown_when_node_running_job(self):
-        self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'not idle')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-    def test_shutdown_when_node_state_unknown(self):
-        self.make_actor(5, testutil.arvados_node_mock(
-            5, crunch_worker_state=None))
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('idle', 'open', 'boot wait', 'idle exceeded')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-    def test_shutdown_when_node_state_fail(self):
-        self.make_actor(5, testutil.arvados_node_mock(
-            5, crunch_worker_state='fail'))
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'not idle')"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-    def test_no_shutdown_when_node_state_stale(self):
-        self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
-        self.shutdowns._set_state(True, 600)
-        self.assertEquals((False, "node state is stale"),
-                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-    def test_arvados_node_match(self):
-        self.make_actor(2)
-        arv_node = testutil.arvados_node_mock(
-            2, hostname='compute-two.zzzzz.arvadosapi.com')
-        self.cloud_client.node_id.return_value = '2'
-        pair_id = self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT)
-        self.assertEqual(self.cloud_mock.id, pair_id)
-        self.stop_proxy(self.node_actor)
-        self.updates.sync_node.assert_called_with(self.cloud_mock, arv_node)
-    def test_arvados_node_mismatch(self):
-        self.make_actor(3)
-        arv_node = testutil.arvados_node_mock(1)
-        self.assertIsNone(
-            self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
-    def test_arvados_node_mismatch_first_ping_too_early(self):
-        self.make_actor(4)
-        arv_node = testutil.arvados_node_mock(
-            4, first_ping_at='1971-03-02T14:15:16.1717282Z')
-        self.assertIsNone(
-            self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
-    def test_update_cloud_node(self):
-        self.make_actor(1)
-        self.make_mocks(2)
-        self.cloud_mock.id = '1'
-        self.node_actor.update_cloud_node(self.cloud_mock)
-        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
-        self.assertEqual([testutil.ip_address_mock(2)],
-                         current_cloud.private_ips)
-    def test_missing_cloud_node_update(self):
-        self.make_actor(1)
-        self.node_actor.update_cloud_node(None)
-        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
-        self.assertEqual([testutil.ip_address_mock(1)],
-                         current_cloud.private_ips)
-    def test_update_arvados_node(self):
-        self.make_actor(3)
-        job_uuid = 'zzzzz-jjjjj-updatejobnode00'
-        new_arvados = testutil.arvados_node_mock(3, job_uuid)
-        self.node_actor.update_arvados_node(new_arvados)
-        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
-        self.assertEqual(job_uuid, current_arvados['job_uuid'])
-    def test_missing_arvados_node_update(self):
-        self.make_actor(4, testutil.arvados_node_mock(4))
-        self.node_actor.update_arvados_node(None)
-        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
-        self.assertEqual(testutil.ip_address_mock(4),
-                         current_arvados['ip_address'])
-    def test_update_arvados_node_calls_sync_node(self):
-        self.make_mocks(5)
-        self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
-        self.make_actor()
-        arv_node = testutil.arvados_node_mock(5)
-        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
-        self.assertEqual(1, self.updates.sync_node.call_count)
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
deleted file mode 100644 (file)
index 02d8fb6..0000000
+++ /dev/null
@@ -1,155 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import subprocess32 as subprocess
-import time
-import unittest
-import mock
-import arvnodeman.computenode.dispatch.slurm as slurm_dispatch
-from . import testutil
-from .test_computenode_dispatch import \
-    ComputeNodeShutdownActorMixin, \
-    ComputeNodeSetupActorTestCase, \
-    ComputeNodeUpdateActorTestCase
-class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
-                                            unittest.TestCase):
-    ACTOR_CLASS = slurm_dispatch.ComputeNodeShutdownActor
-    def check_slurm_got_args(self, proc_mock, *args):
-        self.assertTrue(proc_mock.called)
-        slurm_cmd = proc_mock.call_args[0][0]
-        for s in args:
-            self.assertIn(s, slurm_cmd)
-    def check_success_after_reset(self, proc_mock, end_state='drain\n', timer=False):
-        self.make_mocks(arvados_node=testutil.arvados_node_mock(63))
-        if not timer:
-            self.timer = testutil.MockTimer(False)
-        self.make_actor()
-        self.check_success_flag(None, 0)
-        # At this point, 1st try should have happened.
-        self.timer.deliver()
-        self.check_success_flag(None, 0)
-        # At this point, 2nd try should have happened.
-        # Order is critical here: if the mock gets called when no return value
-        # or side effect is set, we may invoke a real subprocess.
-        proc_mock.return_value = end_state
-        proc_mock.side_effect = None
-        # 3rd try
-        self.timer.deliver()
-        self.check_success_flag(True, 3)
-        self.check_slurm_got_args(proc_mock, 'NodeName=compute63')
-    def make_wait_state_test(start_state='drng\n', end_state='drain\n'):
-        def test(self, proc_mock):
-            proc_mock.return_value = start_state
-            self.check_success_after_reset(proc_mock, end_state)
-        return test
-    for wait_state in ['alloc\n', 'drng\n']:
-        locals()['test_wait_while_' + wait_state.strip()
-                 ] = make_wait_state_test(start_state=wait_state)
-    for end_state in ['idle*\n', 'down\n', 'down*\n', 'drain\n', 'fail\n']:
-        locals()['test_wait_until_' + end_state.strip()
-                 ] = make_wait_state_test(end_state=end_state)
-    def test_retry_failed_slurm_calls(self, proc_mock):
-        proc_mock.side_effect = subprocess.CalledProcessError(1, ["mock"])
-        self.check_success_after_reset(proc_mock)
-    def test_slurm_bypassed_when_no_arvados_node(self, proc_mock):
-        # Test we correctly handle a node that failed to bootstrap.
-        proc_mock.return_value = 'down\n'
-        self.make_actor(start_time=0)
-        self.check_success_flag(True)
-        self.assertFalse(proc_mock.called)
-    def test_node_resumed_when_shutdown_cancelled(self, proc_mock):
-        try:
-            proc_mock.side_effect = iter(['', 'drng\n', 'drng\n', ''])
-            self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
-            self.timer = testutil.MockTimer(False)
-            self.make_actor()
-            self.busywait(lambda: proc_mock.call_args is not None)
-            self.shutdown_actor.cancel_shutdown("test")
-            self.check_success_flag(False, 2)
-            self.assertEqual(proc_mock.call_args_list[0], mock.call(['scontrol', 'update', 'NodeName=compute99', 'State=DRAIN', 'Reason=Node Manager shutdown']))
-            self.assertEqual(proc_mock.call_args_list[-1], mock.call(['scontrol', 'update', 'NodeName=compute99', 'State=RESUME']))
-        finally:
-            self.shutdown_actor.actor_ref.stop()
-    def test_cancel_shutdown_retry(self, proc_mock):
-        proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n', 'idle\n'])
-        self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
-        self.make_actor()
-        self.check_success_flag(False, 5)
-    def test_issue_slurm_drain_retry(self, proc_mock):
-        proc_mock.side_effect = iter([OSError, OSError, 'drng\n', 'drain\n'])
-        self.check_success_after_reset(proc_mock, timer=False)
-    def test_arvados_node_cleaned_after_shutdown(self, proc_mock):
-        proc_mock.return_value = 'drain\n'
-        super(SLURMComputeNodeShutdownActorTestCase,
-              self).test_arvados_node_cleaned_after_shutdown()
-    def test_cancellable_shutdown(self, proc_mock):
-        proc_mock.return_value = 'other\n'
-        super(SLURMComputeNodeShutdownActorTestCase,
-              self).test_cancellable_shutdown()
-    def test_uncancellable_shutdown(self, proc_mock):
-        proc_mock.return_value = 'other\n'
-        super(SLURMComputeNodeShutdownActorTestCase,
-              self).test_uncancellable_shutdown()
-class SLURMComputeNodeUpdateActorTestCase(ComputeNodeUpdateActorTestCase):
-    ACTOR_CLASS = slurm_dispatch.ComputeNodeUpdateActor
-    def test_update_node_weight(self, check_output):
-        self.make_actor()
-        cloud_node = testutil.cloud_node_mock()
-        arv_node = testutil.arvados_node_mock()
-        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
-        check_output.assert_called_with(['scontrol', 'update', 'NodeName=compute99', 'Weight=99000', 'Features=instancetype=z99.test'])
-class SLURMComputeNodeSetupActorTestCase(ComputeNodeSetupActorTestCase):
-    ACTOR_CLASS = slurm_dispatch.ComputeNodeSetupActor
-    @mock.patch('subprocess32.check_output')
-    def test_update_node_features(self, check_output):
-        # `scontrol update` happens only if the Arvados node record
-        # has a hostname. ComputeNodeSetupActorTestCase.make_mocks
-        # uses mocks with scrubbed hostnames, so we override with the
-        # default testutil.arvados_node_mock.
-        self.make_mocks(arvados_effect=[testutil.arvados_node_mock()])
-        self.make_actor()
-        self.wait_for_assignment(self.setup_actor, 'cloud_node')
-        check_output.assert_called_with(['scontrol', 'update', 'NodeName=compute99', 'Weight=1000', 'Features=instancetype=z1.test'])
-    @mock.patch('subprocess32.check_output')
-    def test_failed_arvados_calls_retried(self, check_output):
-        super(SLURMComputeNodeSetupActorTestCase, self).test_failed_arvados_calls_retried()
-    @mock.patch('subprocess32.check_output')
-    def test_subscribe(self, check_output):
-        super(SLURMComputeNodeSetupActorTestCase, self).test_subscribe()
-    @mock.patch('subprocess32.check_output')
-    def test_creation_with_arvados_node(self, check_output):
-        super(SLURMComputeNodeSetupActorTestCase, self).test_creation_with_arvados_node()
diff --git a/services/nodemanager/tests/test_computenode_driver.py b/services/nodemanager/tests/test_computenode_driver.py
deleted file mode 100644 (file)
index 4bf4c39..0000000
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import unittest
-import libcloud.common.types as cloud_types
-import mock
-import arvnodeman.computenode.driver as driver_base
-import arvnodeman.status as status
-import arvnodeman.config as config
-from . import testutil
-class ComputeNodeDriverTestCase(unittest.TestCase):
-    def setUp(self):
-        self.driver_mock = mock.MagicMock(name='driver_mock')
-        driver_base.BaseComputeNodeDriver.SEARCH_CACHE = {}
-    def test_search_for_now_uses_public_method(self):
-        image = testutil.cloud_object_mock(1)
-        self.driver_mock().list_images.return_value = [image]
-        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        self.assertIs(image, driver.search_for_now('id_1', 'list_images'))
-        self.assertEqual(1, self.driver_mock().list_images.call_count)
-    def test_search_for_now_uses_private_method(self):
-        net = testutil.cloud_object_mock(1)
-        self.driver_mock().ex_list_networks.return_value = [net]
-        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        self.assertIs(net, driver.search_for_now('id_1', 'ex_list_networks'))
-        self.assertEqual(1, self.driver_mock().ex_list_networks.call_count)
-    def test_search_for_now_raises_ValueError_on_zero_results(self):
-        self.driver_mock().list_images.return_value = []
-        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        with self.assertRaises(ValueError) as test:
-            driver.search_for_now('id_1', 'list_images')
-    def test_search_for_now_raises_ValueError_on_extra_results(self):
-        image = testutil.cloud_object_mock(1)
-        self.driver_mock().list_images.return_value = [image, image]
-        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        with self.assertRaises(ValueError) as test:
-            driver.search_for_now('id_1', 'list_images')
-    def test_search_for_now_does_not_cache_results(self):
-        image1 = testutil.cloud_object_mock(1)
-        image2 = testutil.cloud_object_mock(1)
-        self.driver_mock().list_images.side_effect = [[image1], [image2]]
-        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        self.assertIsNot(driver.search_for_now('id_1', 'list_images'),
-                         driver.search_for_now('id_1', 'list_images'))
-        self.assertEqual(2, self.driver_mock().list_images.call_count)
-    def test_search_for_returns_cached_results(self):
-        image1 = testutil.cloud_object_mock(1)
-        image2 = testutil.cloud_object_mock(1)
-        self.driver_mock().list_images.side_effect = [[image1], [image2]]
-        driver = driver_base.BaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        self.assertIs(driver.search_for('id_1', 'list_images'),
-                      driver.search_for('id_1', 'list_images'))
-        self.assertEqual(1, self.driver_mock().list_images.call_count)
-    class TestBaseComputeNodeDriver(driver_base.BaseComputeNodeDriver):
-        def arvados_create_kwargs(self, size, arvados_node):
-            return {'name': arvados_node}
-    def test_create_node_only_cloud_errors_are_counted(self):
-        status.tracker.update({'create_node_errors': 0})
-        errors = [(config.CLOUD_ERRORS[0], True), (KeyError, False)]
-        self.driver_mock().list_images.return_value = []
-        driver = self.TestBaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        error_count = 0
-        for an_error, is_cloud_error in errors:
-            self.driver_mock().create_node.side_effect = an_error
-            with self.assertRaises(an_error):
-                driver.create_node(testutil.MockSize(1), 'id_1')
-            if is_cloud_error:
-                error_count += 1
-            self.assertEqual(error_count, status.tracker.get('create_node_errors'))
-    def test_list_nodes_only_cloud_errors_are_counted(self):
-        status.tracker.update({'list_nodes_errors': 0})
-        errors = [(config.CLOUD_ERRORS[0], True), (KeyError, False)]
-        driver = self.TestBaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        error_count = 0
-        for an_error, is_cloud_error in errors:
-            self.driver_mock().list_nodes.side_effect = an_error
-            with self.assertRaises(an_error):
-                driver.list_nodes()
-            if is_cloud_error:
-                error_count += 1
-            self.assertEqual(error_count, status.tracker.get('list_nodes_errors'))
-    def test_destroy_node_only_cloud_errors_are_counted(self):
-        status.tracker.update({'destroy_node_errors': 0})
-        errors = [(config.CLOUD_ERRORS[0], True), (KeyError, False)]
-        self.driver_mock().list_nodes.return_value = [testutil.MockSize(1)]
-        driver = self.TestBaseComputeNodeDriver({}, {}, {}, self.driver_mock)
-        error_count = 0
-        for an_error, is_cloud_error in errors:
-            self.driver_mock().destroy_node.side_effect = an_error
-            with self.assertRaises(an_error):
-                driver.destroy_node(testutil.MockSize(1))
-            if is_cloud_error:
-                error_count += 1
-            self.assertEqual(error_count, status.tracker.get('destroy_node_errors'))
diff --git a/services/nodemanager/tests/test_computenode_driver_azure.py b/services/nodemanager/tests/test_computenode_driver_azure.py
deleted file mode 100644 (file)
index ea7a033..0000000
+++ /dev/null
@@ -1,145 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import ssl
-import time
-import unittest
-import libcloud.common.types as cloud_types
-import mock
-import arvnodeman.computenode.driver.azure as azure
-from . import testutil
-class AzureComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
-    TEST_CLASS = azure.ComputeNodeDriver
-    def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}):
-        list_kwargs.setdefault("ex_resource_group", "TestResourceGroup")
-        return super(AzureComputeNodeDriverTestCase, self).new_driver(auth_kwargs, list_kwargs, create_kwargs)
-    def test_driver_instantiation(self):
-        kwargs = {'key': 'testkey'}
-        driver = self.new_driver(auth_kwargs=kwargs)
-        self.assertTrue(self.driver_mock.called)
-        self.assertEqual(kwargs, self.driver_mock.call_args[1])
-    def test_create_image_loaded_at_initialization(self):
-        get_method = self.driver_mock().get_image
-        get_method.return_value = testutil.cloud_object_mock('id_b')
-        driver = self.new_driver(create_kwargs={'image': 'id_b'})
-        self.assertEqual(1, get_method.call_count)
-    def test_create_includes_ping(self):
-        arv_node = testutil.arvados_node_mock(info={'ping_secret': 'ssshh'})
-        arv_node["hostname"] = None
-        driver = self.new_driver()
-        driver.create_node(testutil.MockSize(1), arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertIn('ping_secret=ssshh',
-                      create_method.call_args[1].get('ex_tags', {}).get('arv-ping-url', ""))
-    def test_create_includes_arvados_node_size(self):
-        arv_node = testutil.arvados_node_mock()
-        arv_node["hostname"] = None
-        size = testutil.MockSize(1)
-        driver = self.new_driver()
-        driver.create_node(size, arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertIn(
-            ('arvados_node_size', size.id),
-            create_method.call_args[1].get('ex_tags', {'tags': 'missing'}).items()
-        )
-    def test_name_from_new_arvados_node(self):
-        arv_node = testutil.arvados_node_mock(hostname=None)
-        driver = self.new_driver()
-        self.assertEqual('compute-000000000000063-zzzzz',
-                         driver.arvados_create_kwargs(testutil.MockSize(1), arv_node)['name'])
-    def check_node_tagged(self, cloud_node, expected_tags):
-        tag_mock = self.driver_mock().ex_create_tags
-        self.assertTrue(tag_mock.called)
-        self.assertIs(cloud_node, tag_mock.call_args[0][0])
-        self.assertEqual(expected_tags, tag_mock.call_args[0][1])
-    def test_node_create_time(self):
-        refsecs = int(time.time())
-        reftuple = time.gmtime(refsecs)
-        node = testutil.cloud_node_mock()
-        node.extra = {'tags': {'booted_at': time.strftime('%Y-%m-%dT%H:%M:%S.000Z',
-                                                   reftuple)}}
-        self.assertEqual(refsecs, azure.ComputeNodeDriver.node_start_time(node))
-    def test_node_fqdn(self):
-        name = 'fqdntest.zzzzz.arvadosapi.com'
-        node = testutil.cloud_node_mock()
-        node.extra = {'tags': {"hostname": name}}
-        self.assertEqual(name, azure.ComputeNodeDriver.node_fqdn(node))
-    def test_sync_node(self):
-        arv_node = testutil.arvados_node_mock(1)
-        cloud_node = testutil.cloud_node_mock(2)
-        driver = self.new_driver()
-        driver.sync_node(cloud_node, arv_node)
-        self.check_node_tagged(cloud_node,
-                               {'hostname': 'compute1.zzzzz.arvadosapi.com'})
-    def test_custom_data(self):
-        arv_node = testutil.arvados_node_mock(hostname=None)
-        driver = self.new_driver()
-        self.assertEqual("""#!/bin/sh
-mkdir -p    /var/tmp/arv-node-data/meta-data
-echo 'https://100::/arvados/v1/nodes/zzzzz-yyyyy-000000000000063/ping?ping_secret=defaulttestsecret' > /var/tmp/arv-node-data/arv-ping-url
-echo compute-000000000000063-zzzzz > /var/tmp/arv-node-data/meta-data/instance-id
-echo z1.test > /var/tmp/arv-node-data/meta-data/instance-type
-                         driver.arvados_create_kwargs(testutil.MockSize(1), arv_node)['ex_customdata'])
-    def test_list_nodes_ignores_nodes_without_tags(self):
-        driver = self.new_driver(create_kwargs={"tag_arvados-class": "dynamic-compute"})
-        # Mock cloud node without tags
-        nodelist = [testutil.cloud_node_mock(1)]
-        self.driver_mock().list_nodes.return_value = nodelist
-        n = driver.list_nodes()
-        self.assertEqual([], n)
-    def test_create_raises_but_actually_succeeded(self):
-        arv_node = testutil.arvados_node_mock(1, hostname=None)
-        driver = self.new_driver(create_kwargs={"tag_arvados-class": "dynamic-compute"})
-        nodelist = [testutil.cloud_node_mock(1, tags={"arvados-class": "dynamic-compute"})]
-        nodelist[0].name = 'compute-000000000000001-zzzzz'
-        self.driver_mock().list_nodes.return_value = nodelist
-        self.driver_mock().create_node.side_effect = IOError
-        n = driver.create_node(testutil.MockSize(1), arv_node)
-        self.assertEqual('compute-000000000000001-zzzzz', n.name)
-    def test_ex_fetch_nic_false(self):
-        arv_node = testutil.arvados_node_mock(1, hostname=None)
-        driver = self.new_driver(create_kwargs={"tag_arvados-class": "dynamic-compute"})
-        nodelist = [testutil.cloud_node_mock(1, tags={"arvados-class": "dynamic-compute"})]
-        nodelist[0].name = 'compute-000000000000001-zzzzz'
-        self.driver_mock().list_nodes.return_value = nodelist
-        n = driver.list_nodes()
-        self.assertEqual(nodelist, n)
-        self.driver_mock().list_nodes.assert_called_with(ex_fetch_nic=False, ex_fetch_power_state=False, ex_resource_group='TestResourceGroup')
-    def test_create_can_find_node_after_timeout(self):
-        super(AzureComputeNodeDriverTestCase,
-              self).test_create_can_find_node_after_timeout(
-                  create_kwargs={'tag_arvados-class': 'test'},
-                  node_extra={'tags': {'arvados-class': 'test'}})
-    def test_node_found_after_timeout_has_fixed_size(self):
-        size = testutil.MockSize(4)
-        node_props = {'hardwareProfile': {'vmSize': size.id}}
-        cloud_node = testutil.cloud_node_mock(tags={'arvados-class': 'test'}, properties=node_props)
-        cloud_node.size = None
-        self.check_node_found_after_timeout_has_fixed_size(
-            size, cloud_node, {'tag_arvados-class': 'test'})
diff --git a/services/nodemanager/tests/test_computenode_driver_ec2.py b/services/nodemanager/tests/test_computenode_driver_ec2.py
deleted file mode 100644 (file)
index 520c0dc..0000000
+++ /dev/null
@@ -1,175 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import ssl
-import time
-import unittest
-import libcloud.common.types as cloud_types
-import mock
-import arvnodeman.computenode.driver.ec2 as ec2
-from . import testutil
-class EC2ComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
-    TEST_CLASS = ec2.ComputeNodeDriver
-    def test_driver_instantiation(self):
-        kwargs = {'key': 'testkey'}
-        driver = self.new_driver(auth_kwargs=kwargs)
-        self.assertTrue(self.driver_mock.called)
-        self.assertEqual(kwargs, self.driver_mock.call_args[1])
-    def test_list_kwargs_become_filters(self):
-        # We're also testing tag name translation.
-        driver = self.new_driver(list_kwargs={'tag_test': 'true'})
-        driver.list_nodes()
-        list_method = self.driver_mock().list_nodes
-        self.assertTrue(list_method.called)
-        self.assertEqual({'tag:test': 'true'},
-                          list_method.call_args[1].get('ex_filters'))
-    def test_create_image_loaded_at_initialization(self):
-        list_method = self.driver_mock().list_images
-        list_method.return_value = [testutil.cloud_object_mock(c)
-                                    for c in 'abc']
-        driver = self.new_driver(create_kwargs={'image_id': 'id_b'})
-        self.assertEqual(1, list_method.call_count)
-    def test_create_includes_ping_secret(self):
-        arv_node = testutil.arvados_node_mock(info={'ping_secret': 'ssshh'})
-        driver = self.new_driver()
-        driver.create_node(testutil.MockSize(1), arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertIn('ping_secret=ssshh',
-                      create_method.call_args[1].get('ex_userdata',
-                                                     'arg missing'))
-    def test_create_includes_metadata(self):
-        arv_node = testutil.arvados_node_mock()
-        driver = self.new_driver(list_kwargs={'tag_test': 'testvalue'})
-        driver.create_node(testutil.MockSize(1), arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertIn(
-            ('test', 'testvalue'),
-            create_method.call_args[1].get('ex_metadata', {'arg': 'missing'}).items()
-        )
-    def test_create_includes_arvados_node_size(self):
-        arv_node = testutil.arvados_node_mock()
-        size = testutil.MockSize(1)
-        driver = self.new_driver()
-        driver.create_node(size, arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertIn(
-            ('arvados_node_size', size.id),
-            create_method.call_args[1].get('ex_metadata', {'arg': 'missing'}).items()
-        )
-    def test_create_preemptible_instance(self):
-        arv_node = testutil.arvados_node_mock()
-        driver = self.new_driver()
-        driver.create_node(testutil.MockSize(1, preemptible=True), arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertEqual(
-            True,
-            create_method.call_args[1].get('ex_spot_market', 'arg missing')
-        )
-    def test_hostname_from_arvados_node(self):
-        arv_node = testutil.arvados_node_mock(8)
-        driver = self.new_driver()
-        self.assertEqual('compute8.zzzzz.arvadosapi.com',
-                         driver.arvados_create_kwargs(testutil.MockSize(1), arv_node)['name'])
-    def test_default_hostname_from_new_arvados_node(self):
-        arv_node = testutil.arvados_node_mock(hostname=None)
-        driver = self.new_driver()
-        self.assertEqual('dynamic.compute.zzzzz.arvadosapi.com',
-                         driver.arvados_create_kwargs(testutil.MockSize(1), arv_node)['name'])
-    def check_node_tagged(self, cloud_node, expected_tags):
-        tag_mock = self.driver_mock().ex_create_tags
-        self.assertTrue(tag_mock.called)
-        self.assertIs(cloud_node, tag_mock.call_args[0][0])
-        self.assertEqual(expected_tags, tag_mock.call_args[0][1])
-    def test_sync_node(self):
-        arv_node = testutil.arvados_node_mock(1)
-        cloud_node = testutil.cloud_node_mock(2)
-        driver = self.new_driver()
-        driver.sync_node(cloud_node, arv_node)
-        self.check_node_tagged(cloud_node,
-                               {'Name': 'compute1.zzzzz.arvadosapi.com'})
-    def test_node_create_time(self):
-        refsecs = int(time.time())
-        reftuple = time.gmtime(refsecs)
-        node = testutil.cloud_node_mock()
-        node.extra = {'launch_time': time.strftime('%Y-%m-%dT%H:%M:%S.000Z',
-                                                   reftuple)}
-        self.assertEqual(refsecs, ec2.ComputeNodeDriver.node_start_time(node))
-    def test_node_fqdn(self):
-        name = 'fqdntest.zzzzz.arvadosapi.com'
-        node = testutil.cloud_node_mock()
-        node.name = name
-        self.assertEqual(name, ec2.ComputeNodeDriver.node_fqdn(node))
-    def test_create_ebs_volume(self):
-        arv_node = testutil.arvados_node_mock()
-        driver = self.new_driver()
-        # libcloud/ec2 "disk" sizes are in GB, Arvados/SLURM "scratch" value is in MB
-        size = testutil.MockSize(1)
-        size.disk=5
-        size.scratch=20000
-        driver.create_node(size, arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertEqual([{
-            "DeviceName": "/dev/xvdt",
-            "Ebs": {
-                "DeleteOnTermination": True,
-                "VolumeSize": 16,
-                "VolumeType": "gp2"
-            }}],
-                         create_method.call_args[1].get('ex_blockdevicemappings'))
-    def test_ebs_volume_not_needed(self):
-        arv_node = testutil.arvados_node_mock()
-        driver = self.new_driver()
-        # libcloud/ec2 "disk" sizes are in GB, Arvados/SLURM "scratch" value is in MB
-        size = testutil.MockSize(1)
-        size.disk=80
-        size.scratch=20000
-        driver.create_node(size, arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertIsNone(create_method.call_args[1].get('ex_blockdevicemappings'))
-    def test_ebs_volume_too_big(self):
-        arv_node = testutil.arvados_node_mock()
-        driver = self.new_driver()
-        # libcloud/ec2 "disk" sizes are in GB, Arvados/SLURM "scratch" value is in MB
-        size = testutil.MockSize(1)
-        size.disk=80
-        size.scratch=20000000
-        driver.create_node(size, arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertTrue(create_method.called)
-        self.assertEqual([{
-            "DeviceName": "/dev/xvdt",
-            "Ebs": {
-                "DeleteOnTermination": True,
-                "VolumeSize": 16384,
-                "VolumeType": "gp2"
-            }}],
-                         create_method.call_args[1].get('ex_blockdevicemappings'))
diff --git a/services/nodemanager/tests/test_computenode_driver_gce.py b/services/nodemanager/tests/test_computenode_driver_gce.py
deleted file mode 100644 (file)
index 1446cd2..0000000
+++ /dev/null
@@ -1,252 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import json
-import time
-import unittest
-import mock
-import arvnodeman.computenode.driver.gce as gce
-from . import testutil
-class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase):
-    TEST_CLASS = gce.ComputeNodeDriver
-    def setUp(self):
-        super(GCEComputeNodeDriverTestCase, self).setUp()
-        self.driver_mock().list_images.return_value = [
-            testutil.cloud_object_mock('testimage', selfLink='image-link')]
-        self.driver_mock().ex_list_disktypes.return_value = [
-            testutil.cloud_object_mock(name, selfLink=name + '-link')
-            for name in ['pd-standard', 'pd-ssd', 'local-ssd']]
-        self.driver_mock.reset_mock()
-    def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}):
-        create_kwargs.setdefault('image', 'testimage')
-        return super(GCEComputeNodeDriverTestCase, self).new_driver(
-            auth_kwargs, list_kwargs, create_kwargs)
-    def test_driver_instantiation(self):
-        kwargs = {'user_id': 'foo'}
-        driver = self.new_driver(auth_kwargs=kwargs)
-        self.assertTrue(self.driver_mock.called)
-        self.assertEqual(kwargs, self.driver_mock.call_args[1])
-    def test_create_image_loaded_at_initialization_by_name(self):
-        image_mocks = [testutil.cloud_object_mock(c) for c in 'abc']
-        list_method = self.driver_mock().list_images
-        list_method.return_value = image_mocks
-        driver = self.new_driver(create_kwargs={'image': 'b'})
-        self.assertEqual(1, list_method.call_count)
-    def test_create_includes_ping_secret(self):
-        arv_node = testutil.arvados_node_mock(info={'ping_secret': 'ssshh'})
-        driver = self.new_driver()
-        driver.create_node(testutil.MockSize(1), arv_node)
-        metadata = self.driver_mock().create_node.call_args[1]['ex_metadata']
-        self.assertIn('ping_secret=ssshh', metadata.get('arv-ping-url'))
-    def test_create_includes_arvados_node_size(self):
-        arv_node = testutil.arvados_node_mock()
-        size = testutil.MockSize(1)
-        driver = self.new_driver()
-        driver.create_node(size, arv_node)
-        create_method = self.driver_mock().create_node
-        self.assertIn(
-            ('arvados_node_size', size.id),
-            create_method.call_args[1].get('ex_metadata', {'metadata':'missing'}).items()
-        )
-    def test_create_raises_but_actually_succeeded(self):
-        arv_node = testutil.arvados_node_mock(1, hostname=None)
-        driver = self.new_driver()
-        nodelist = [testutil.cloud_node_mock(1)]
-        nodelist[0].name = 'compute-000000000000001-zzzzz'
-        self.driver_mock().list_nodes.return_value = nodelist
-        self.driver_mock().create_node.side_effect = IOError
-        n = driver.create_node(testutil.MockSize(1), arv_node)
-        self.assertEqual('compute-000000000000001-zzzzz', n.name)
-    def test_create_sets_default_hostname(self):
-        driver = self.new_driver()
-        driver.create_node(testutil.MockSize(1),
-                           testutil.arvados_node_mock(254, hostname=None))
-        create_kwargs = self.driver_mock().create_node.call_args[1]
-        self.assertEqual('compute-0000000000000fe-zzzzz',
-                         create_kwargs.get('name'))
-        self.assertEqual('dynamic.compute.zzzzz.arvadosapi.com',
-                         create_kwargs.get('ex_metadata', {}).get('hostname'))
-    def test_create_tags_from_list_tags(self):
-        driver = self.new_driver(list_kwargs={'tags': 'testA, testB'})
-        driver.create_node(testutil.MockSize(1), testutil.arvados_node_mock())
-        self.assertEqual(['testA', 'testB'],
-                         self.driver_mock().create_node.call_args[1]['ex_tags'])
-    def test_create_with_two_disks_attached(self):
-        driver = self.new_driver(create_kwargs={'image': 'testimage'})
-        driver.create_node(testutil.MockSize(1), testutil.arvados_node_mock())
-        create_disks = self.driver_mock().create_node.call_args[1].get(
-            'ex_disks_gce_struct', [])
-        self.assertEqual(2, len(create_disks))
-        self.assertTrue(create_disks[0].get('autoDelete'))
-        self.assertTrue(create_disks[0].get('boot'))
-        self.assertEqual('PERSISTENT', create_disks[0].get('type'))
-        init_params = create_disks[0].get('initializeParams', {})
-        self.assertEqual('pd-standard-link', init_params.get('diskType'))
-        self.assertEqual('image-link', init_params.get('sourceImage'))
-        # Our node images expect the SSD to be named `tmp` to find and mount it.
-        self.assertEqual('tmp', create_disks[1].get('deviceName'))
-        self.assertTrue(create_disks[1].get('autoDelete'))
-        self.assertFalse(create_disks[1].get('boot', 'unset'))
-        self.assertEqual('SCRATCH', create_disks[1].get('type'))
-        init_params = create_disks[1].get('initializeParams', {})
-        self.assertEqual('local-ssd-link', init_params.get('diskType'))
-    def test_list_nodes_requires_tags_match(self):
-        # A node matches if our list tags are a subset of the node's tags.
-        # Test behavior with no tags, no match, partial matches, different
-        # order, and strict supersets.
-        cloud_mocks = [
-            testutil.cloud_node_mock(node_num, tags=tag_set)
-            for node_num, tag_set in enumerate(
-                [[], ['bad'], ['good'], ['great'], ['great', 'ok'],
-                 ['great', 'good'], ['good', 'fantastic', 'great']])]
-        cloud_mocks.append(testutil.cloud_node_mock())
-        self.driver_mock().list_nodes.return_value = cloud_mocks
-        driver = self.new_driver(list_kwargs={'tags': 'good, great'})
-        self.assertItemsEqual(['5', '6'], [n.id for n in driver.list_nodes()])
-    def build_gce_metadata(self, metadata_dict):
-        # Convert a plain metadata dictionary to the GCE data structure.
-        return {
-            'kind': 'compute#metadata',
-            'fingerprint': 'testprint',
-            'items': [{'key': key, 'value': metadata_dict[key]}
-                      for key in metadata_dict],
-            }
-    def check_sync_node_updates_hostname_tag(self, plain_metadata):
-        start_metadata = self.build_gce_metadata(plain_metadata)
-        arv_node = testutil.arvados_node_mock(1)
-        cloud_node = testutil.cloud_node_mock(
-            2, metadata=start_metadata.copy(),
-            zone=testutil.cloud_object_mock('testzone'))
-        self.driver_mock().ex_get_node.return_value = cloud_node
-        driver = self.new_driver()
-        driver.sync_node(cloud_node, arv_node)
-        args, kwargs = self.driver_mock().ex_set_node_metadata.call_args
-        self.assertEqual(cloud_node, args[0])
-        plain_metadata['hostname'] = 'compute1.zzzzz.arvadosapi.com'
-        self.assertEqual(
-            plain_metadata,
-            {item['key']: item['value'] for item in args[1]})
-    def test_sync_node_updates_hostname_tag(self):
-        self.check_sync_node_updates_hostname_tag(
-            {'testkey': 'testvalue', 'hostname': 'startvalue'})
-    def test_sync_node_adds_hostname_tag(self):
-        self.check_sync_node_updates_hostname_tag({'testkey': 'testval'})
-    def test_sync_node_raises_exception_on_failure(self):
-        arv_node = testutil.arvados_node_mock(8)
-        cloud_node = testutil.cloud_node_mock(
-            9, metadata={}, zone=testutil.cloud_object_mock('failzone'))
-        mock_response = self.driver_mock().ex_set_node_metadata.side_effect = (Exception('sync error test'),)
-        driver = self.new_driver()
-        with self.assertRaises(Exception) as err_check:
-            driver.sync_node(cloud_node, arv_node)
-        self.assertIs(err_check.exception.__class__, Exception)
-        self.assertIn('sync error test', str(err_check.exception))
-    def test_node_create_time_zero_for_unknown_nodes(self):
-        node = testutil.cloud_node_mock()
-        self.assertEqual(0, gce.ComputeNodeDriver.node_start_time(node))
-    def test_node_create_time_for_known_node(self):
-        node = testutil.cloud_node_mock(metadata=self.build_gce_metadata(
-                {'booted_at': '1970-01-01T00:01:05Z'}))
-        self.assertEqual(65, gce.ComputeNodeDriver.node_start_time(node))
-    def test_node_create_time_recorded_when_node_boots(self):
-        start_time = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
-        arv_node = testutil.arvados_node_mock()
-        driver = self.new_driver()
-        driver.create_node(testutil.MockSize(1), arv_node)
-        metadata = self.driver_mock().create_node.call_args[1]['ex_metadata']
-        self.assertLessEqual(start_time, metadata.get('booted_at'))
-    def test_known_node_fqdn(self):
-        name = 'fqdntest.zzzzz.arvadosapi.com'
-        node = testutil.cloud_node_mock(metadata=self.build_gce_metadata(
-                {'hostname': name}))
-        self.assertEqual(name, gce.ComputeNodeDriver.node_fqdn(node))
-    def test_unknown_node_fqdn(self):
-        # Return an empty string.  This lets fqdn be safely compared
-        # against an expected value, and ComputeNodeMonitorActor
-        # should try to update it.
-        node = testutil.cloud_node_mock(metadata=self.build_gce_metadata({}))
-        self.assertEqual('', gce.ComputeNodeDriver.node_fqdn(node))
-    def test_deliver_ssh_key_in_metadata(self):
-        test_ssh_key = 'ssh-rsa-foo'
-        arv_node = testutil.arvados_node_mock(1)
-        with mock.patch('__builtin__.open',
-                        mock.mock_open(read_data=test_ssh_key)) as mock_file:
-            driver = self.new_driver(create_kwargs={'ssh_key': 'ssh-key-file'})
-        mock_file.assert_called_once_with('ssh-key-file')
-        driver.create_node(testutil.MockSize(1), arv_node)
-        metadata = self.driver_mock().create_node.call_args[1]['ex_metadata']
-        self.assertEqual('root:ssh-rsa-foo', metadata.get('sshKeys'))
-    def test_create_driver_with_service_accounts(self):
-        service_accounts = {'email': 'foo@bar', 'scopes': ['storage-full']}
-        srv_acct_config = {'service_accounts': json.dumps(service_accounts)}
-        arv_node = testutil.arvados_node_mock(1)
-        driver = self.new_driver(create_kwargs=srv_acct_config)
-        driver.create_node(testutil.MockSize(1), arv_node)
-        self.assertEqual(
-            service_accounts,
-            self.driver_mock().create_node.call_args[1]['ex_service_accounts'])
-    def test_fix_string_size(self):
-        # As of 0.18, the libcloud GCE driver sets node.size to the size's name.
-        # It's supposed to be the actual size object.  Make sure our driver
-        # patches that up in listings.
-        size = testutil.MockSize(2)
-        node = testutil.cloud_node_mock(size=size)
-        node.size = size.id
-        self.driver_mock().list_sizes.return_value = [size]
-        self.driver_mock().list_nodes.return_value = [node]
-        driver = self.new_driver()
-        nodelist = driver.list_nodes()
-        self.assertEqual(1, len(nodelist))
-        self.assertIs(node, nodelist[0])
-        self.assertIs(size, nodelist[0].size)
-    def test_skip_fix_when_size_not_string(self):
-        # Ensure we don't monkeypatch node sizes unless we need to.
-        size = testutil.MockSize(3)
-        node = testutil.cloud_node_mock(size=size)
-        self.driver_mock().list_nodes.return_value = [node]
-        driver = self.new_driver()
-        nodelist = driver.list_nodes()
-        self.assertEqual(1, len(nodelist))
-        self.assertIs(node, nodelist[0])
-        self.assertIs(size, nodelist[0].size)
-    def test_node_found_after_timeout_has_fixed_size(self):
-        size = testutil.MockSize(4)
-        cloud_node = testutil.cloud_node_mock(size=size.id)
-        self.check_node_found_after_timeout_has_fixed_size(size, cloud_node)
-    def test_list_empty_nodes(self):
-        self.driver_mock().list_nodes.return_value = []
-        self.assertEqual([], self.new_driver().list_nodes())
diff --git a/services/nodemanager/tests/test_config.py b/services/nodemanager/tests/test_config.py
deleted file mode 100644 (file)
index 8002b3b..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import io
-import logging
-import unittest
-import arvnodeman.computenode.dispatch as dispatch
-import arvnodeman.computenode.dispatch.slurm as slurm_dispatch
-import arvnodeman.config as nmconfig
-class NodeManagerConfigTestCase(unittest.TestCase):
-    TEST_CONFIG = u"""
-provider = dummy
-shutdown_windows = 52, 6, 2
-[Cloud Credentials]
-creds = dummy_creds
-[Cloud List]
-[Cloud Create]
-[Size 1]
-cores = 1
-price = 0.8
-[Size 1.preemptible]
-instance_type = 1
-preemptible = true
-cores = 1
-price = 0.8
-file = /dev/null
-level = DEBUG
-testlogger = INFO
-    def load_config(self, config=None, config_str=None):
-        if config is None:
-            config = nmconfig.NodeManagerConfig()
-        if config_str is None:
-            config_str = self.TEST_CONFIG
-        with io.StringIO(config_str) as config_fp:
-            config.readfp(config_fp)
-        return config
-    def test_seeded_defaults(self):
-        config = nmconfig.NodeManagerConfig()
-        sec_names = set(config.sections())
-        self.assertIn('Arvados', sec_names)
-        self.assertIn('Daemon', sec_names)
-        self.assertFalse(any(name.startswith('Size ') for name in sec_names))
-    def test_list_sizes(self):
-        config = self.load_config()
-        sizes = config.node_sizes()
-        self.assertEqual(2, len(sizes))
-        size, kwargs = sizes[0]
-        self.assertEqual('Small', size.name)
-        self.assertEqual(1, kwargs['cores'])
-        self.assertEqual(0.8, kwargs['price'])
-        # preemptible is False by default
-        self.assertEqual(False, kwargs['preemptible'])
-        # instance_type == arvados node size id by default
-        self.assertEqual(kwargs['id'], kwargs['instance_type'])
-        # Now retrieve the preemptible version
-        size, kwargs = sizes[1]
-        self.assertEqual('Small', size.name)
-        self.assertEqual('1.preemptible', kwargs['id'])
-        self.assertEqual(1, kwargs['cores'])
-        self.assertEqual(0.8, kwargs['price'])
-        self.assertEqual(True, kwargs['preemptible'])
-        self.assertEqual('1', kwargs['instance_type'])
-    def test_default_node_mem_scaling(self):
-        config = self.load_config()
-        self.assertEqual(0.95, config.getfloat('Daemon', 'node_mem_scaling'))
-    def test_shutdown_windows(self):
-        config = self.load_config()
-        self.assertEqual([52, 6, 2], config.shutdown_windows())
-    def test_log_levels(self):
-        config = self.load_config()
-        self.assertEqual({'level': logging.DEBUG,
-                          'testlogger': logging.INFO},
-                         config.log_levels())
-    def check_dispatch_classes(self, config, module):
-        setup, shutdown, update, monitor = config.dispatch_classes()
-        self.assertIs(setup, module.ComputeNodeSetupActor)
-        self.assertIs(shutdown, module.ComputeNodeShutdownActor)
-        self.assertIs(update, module.ComputeNodeUpdateActor)
-        self.assertIs(monitor, module.ComputeNodeMonitorActor)
-    def test_default_dispatch(self):
-        config = self.load_config()
-        self.check_dispatch_classes(config, dispatch)
-    def test_custom_dispatch(self):
-        config = self.load_config(
-            config_str=self.TEST_CONFIG + "[Daemon]\ndispatcher=slurm\n")
-        self.check_dispatch_classes(config, slurm_dispatch)
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
deleted file mode 100644 (file)
index 1b6e4ca..0000000
+++ /dev/null
@@ -1,858 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import time
-import unittest
-import mock
-import pykka
-import arvnodeman.daemon as nmdaemon
-import arvnodeman.status as status
-from arvnodeman.jobqueue import ServerCalculator
-from arvnodeman.computenode.dispatch import ComputeNodeMonitorActor
-from . import testutil
-from . import test_status
-from . import pykka_timeout
-import logging
-class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
-                                     unittest.TestCase):
-    def assertwait(self, f, timeout=pykka_timeout*2):
-        deadline = time.time() + timeout
-        while True:
-            try:
-                return f()
-            except AssertionError:
-                if time.time() > deadline:
-                    raise
-                pass
-            time.sleep(.1)
-            self.daemon.ping().get(self.TIMEOUT)
-    def busywait(self, f):
-        for n in xrange(200):
-            ok = f()
-            if ok:
-                return
-            time.sleep(.1)
-            self.daemon.ping().get(self.TIMEOUT)
-        self.assertTrue(ok) # always falsy, but not necessarily False
-    def mock_node_start(self, **kwargs):
-        # Make sure that every time the daemon starts a setup actor,
-        # it gets a new mock object back.
-        get_cloud_size = mock.MagicMock()
-        get_cloud_size.get.return_value = kwargs["cloud_size"]
-        mock_actor = mock.MagicMock()
-        mock_proxy = mock.NonCallableMock(name='setup_mock_proxy',
-                                          cloud_size=get_cloud_size,
-                                          actor_ref=mock_actor)
-        mock_actor.proxy.return_value = mock_proxy
-        mock_actor.tell_proxy.return_value = mock_proxy
-        self.last_setup = mock_proxy
-        return mock_actor
-    def mock_node_shutdown(self, **kwargs):
-        # Make sure that every time the daemon starts a shutdown actor,
-        # it gets a new mock object back.
-        get_cloud_node = mock.MagicMock()
-        if "node_monitor" in kwargs:
-            get_cloud_node.get.return_value = kwargs["node_monitor"].proxy().cloud_node.get()
-        mock_actor = mock.MagicMock()
-        mock_proxy = mock.NonCallableMock(name='shutdown_mock_proxy',
-                                          cloud_node=get_cloud_node,
-                                          actor_ref=mock_actor)
-        mock_actor.proxy.return_value = mock_proxy
-        self.last_shutdown = mock_proxy
-        return mock_actor
-    def make_daemon(self, cloud_nodes=[], arvados_nodes=[], want_sizes=[],
-                    avail_sizes=None,
-                    min_nodes=0, max_nodes=8,
-                    shutdown_windows=[54, 5, 1],
-                    max_total_price=None):
-        for name in ['cloud_nodes', 'arvados_nodes', 'server_wishlist']:
-            setattr(self, name + '_poller', mock.MagicMock(name=name + '_mock'))
-        if not avail_sizes:
-            if cloud_nodes or want_sizes:
-                avail_sizes=[(c.size, {"cores": int(c.id)}) for c in cloud_nodes] + [(s, {"cores": 1}) for s in want_sizes]
-            else:
-                avail_sizes=[(testutil.MockSize(1), {"cores": 1})]
-        self.arv_factory = mock.MagicMock(name='arvados_mock')
-        api_client = mock.MagicMock(name='api_client')
-        api_client.nodes().create().execute.side_effect = \
-            [testutil.arvados_node_mock(1),
-             testutil.arvados_node_mock(2)]
-        self.arv_factory.return_value = api_client
-        self.cloud_factory = mock.MagicMock(name='cloud_mock')
-        self.cloud_factory().node_start_time.return_value = time.time()
-        self.cloud_updates = mock.MagicMock(name='updates_mock')
-        self.timer = testutil.MockTimer(deliver_immediately=False)
-        self.cloud_factory().node_id.side_effect = lambda node: node.id
-        self.cloud_factory().broken.return_value = False
-        self.node_setup = mock.MagicMock(name='setup_mock')
-        self.node_setup.start.side_effect = self.mock_node_start
-        self.node_setup.reset_mock()
-        self.node_shutdown = mock.MagicMock(name='shutdown_mock')
-        self.node_shutdown.start.side_effect = self.mock_node_shutdown
-        self.daemon = nmdaemon.NodeManagerDaemonActor.start(
-            self.server_wishlist_poller, self.arvados_nodes_poller,
-            self.cloud_nodes_poller, self.cloud_updates, self.timer,
-            self.arv_factory, self.cloud_factory,
-            shutdown_windows, ServerCalculator(avail_sizes),
-            min_nodes, max_nodes, 600, 1800, 3600,
-            self.node_setup, self.node_shutdown,
-            max_total_price=max_total_price).proxy()
-        if arvados_nodes is not None:
-            self.daemon.update_arvados_nodes(arvados_nodes).get(self.TIMEOUT)
-        if cloud_nodes is not None:
-            self.daemon.update_cloud_nodes(cloud_nodes).get(self.TIMEOUT)
-        if want_sizes is not None:
-            self.daemon.update_server_wishlist(want_sizes).get(self.TIMEOUT)
-    def monitor_list(self):
-        return [c.actor.actor_ref for c in self.daemon.cloud_nodes.get(self.TIMEOUT).nodes.values() if c.actor]
-    def monitored_arvados_nodes(self, include_unpaired=True):
-        pairings = []
-        for future in [actor.proxy().arvados_node
-                       for actor in self.monitor_list()]:
-            try:
-                g = future.get(self.TIMEOUT)
-                if g or include_unpaired:
-                    pairings.append(g)
-            except pykka.ActorDeadError:
-                pass
-        return pairings
-    def alive_monitor_count(self):
-        return len(self.monitored_arvados_nodes())
-    def paired_monitor_count(self):
-        return len(self.monitored_arvados_nodes(False))
-    def assertShutdownCancellable(self, expected=True):
-        self.assertTrue(self.node_shutdown.start.called)
-        self.assertIs(expected,
-                      self.node_shutdown.start.call_args[1]['cancellable'],
-                      "ComputeNodeShutdownActor incorrectly cancellable")
-    def test_easy_node_creation(self):
-        size = testutil.MockSize(1)
-        self.make_daemon(want_sizes=[size])
-        self.busywait(lambda: self.node_setup.start.called)
-        self.assertIn('node_quota', status.tracker._latest)
-    def check_monitors_arvados_nodes(self, *arv_nodes):
-        self.assertwait(lambda: self.assertItemsEqual(arv_nodes, self.monitored_arvados_nodes()))
-    def test_node_pairing(self):
-        cloud_node = testutil.cloud_node_mock(1)
-        arv_node = testutil.arvados_node_mock(1)
-        self.make_daemon([cloud_node], [arv_node])
-        self.check_monitors_arvados_nodes(arv_node)
-    def test_node_pairing_after_arvados_update(self):
-        cloud_node = testutil.cloud_node_mock(2)
-        self.make_daemon([cloud_node],
-                         [testutil.arvados_node_mock(1, ip_address=None)])
-        arv_node = testutil.arvados_node_mock(2)
-        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
-        self.check_monitors_arvados_nodes(arv_node)
-    def test_arvados_node_un_and_re_paired(self):
-        # We need to create the Arvados node mock after spinning up the daemon
-        # to make sure it's new enough to pair with the cloud node.
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(3)],
-                         arvados_nodes=None)
-        arv_node = testutil.arvados_node_mock(3)
-        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
-        self.check_monitors_arvados_nodes(arv_node)
-        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
-        self.busywait(lambda: 0 == self.alive_monitor_count())
-        self.daemon.update_cloud_nodes([testutil.cloud_node_mock(3)])
-        self.check_monitors_arvados_nodes(arv_node)
-    def test_old_arvados_node_not_double_assigned(self):
-        arv_node = testutil.arvados_node_mock(3, age=9000)
-        size = testutil.MockSize(3)
-        self.make_daemon(arvados_nodes=[arv_node],
-                         avail_sizes=[(size, {"cores":1})])
-        self.daemon.update_server_wishlist([size]).get(self.TIMEOUT)
-        self.daemon.update_server_wishlist([size, size]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        used_nodes = [call[1].get('arvados_node')
-                      for call in self.node_setup.start.call_args_list]
-        self.assertEqual(2, len(used_nodes))
-        self.assertIn(arv_node, used_nodes)
-        self.assertIn(None, used_nodes)
-    def test_node_count_satisfied(self):
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1)],
-                         want_sizes=[testutil.MockSize(1)])
-        self.busywait(lambda: not self.node_setup.start.called)
-    def test_select_stale_node_records_with_slot_numbers_first(self):
-        """
-        Stale node records with slot_number assigned can exist when
-        clean_arvados_node() isn't executed after a node shutdown, for
-        various reasons.
-        NodeManagerDaemonActor should use these stale node records first, so
-        that they don't accumulate unused, reducing the slots available.
-        """
-        size = testutil.MockSize(1)
-        a_long_time_ago = '1970-01-01T01:02:03.04050607Z'
-        arvados_nodes = []
-        for n in range(9):
-            # Add several stale node records without slot_number assigned
-            arvados_nodes.append(
-                testutil.arvados_node_mock(
-                    n+1,
-                    slot_number=None,
-                    modified_at=a_long_time_ago))
-        # Add one record with stale_node assigned, it should be the
-        # first one selected
-        arv_node = testutil.arvados_node_mock(
-            123,
-            modified_at=a_long_time_ago)
-        arvados_nodes.append(arv_node)
-        cloud_node = testutil.cloud_node_mock(125, size=size)
-        self.make_daemon(cloud_nodes=[cloud_node],
-                         arvados_nodes=arvados_nodes)
-        arvados_nodes_tracker = self.daemon.arvados_nodes.get()
-        # Here, find_stale_node() should return the node record with
-        # the slot_number assigned.
-        self.assertEqual(arv_node,
-                         arvados_nodes_tracker.find_stale_node(3601))
-    def test_dont_count_missing_as_busy(self):
-        size = testutil.MockSize(1)
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1, size=size),
-                                      testutil.cloud_node_mock(2, size=size)],
-                         arvados_nodes=[testutil.arvados_node_mock(1),
-                                        testutil.arvados_node_mock(
-                                            2,
-                                            last_ping_at='1970-01-01T01:02:03.04050607Z')],
-                         want_sizes=[size, size])
-        self.busywait(lambda: 2 == self.alive_monitor_count())
-        self.busywait(lambda: self.node_setup.start.called)
-    def test_missing_counts_towards_max(self):
-        size = testutil.MockSize(1)
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1, size=size),
-                                      testutil.cloud_node_mock(2, size=size)],
-                         arvados_nodes=[testutil.arvados_node_mock(1),
-                                        testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
-                         want_sizes=[size, size],
-                         max_nodes=2)
-        self.busywait(lambda: not self.node_setup.start.called)
-    def test_excess_counts_missing(self):
-        size = testutil.MockSize(1)
-        cloud_nodes = [testutil.cloud_node_mock(1, size=size), testutil.cloud_node_mock(2, size=size)]
-        self.make_daemon(cloud_nodes=cloud_nodes,
-                         arvados_nodes=[testutil.arvados_node_mock(1),
-                                        testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
-                         want_sizes=[size])
-        self.assertwait(lambda: self.assertEqual(2, self.paired_monitor_count()))
-        for mon_ref in self.monitor_list():
-            self.daemon.node_can_shutdown(mon_ref.proxy()).get(self.TIMEOUT)
-        self.assertEqual(1, self.node_shutdown.start.call_count)
-    def test_missing_shutdown_not_excess(self):
-        size = testutil.MockSize(1)
-        cloud_nodes = [testutil.cloud_node_mock(1, size=size), testutil.cloud_node_mock(2, size=size)]
-        self.make_daemon(cloud_nodes=cloud_nodes,
-                         arvados_nodes=[testutil.arvados_node_mock(1),
-                                        testutil.arvados_node_mock(2, last_ping_at='1970-01-01T01:02:03.04050607Z')],
-                         want_sizes=[size])
-        self.assertwait(lambda: self.assertEqual(2, self.paired_monitor_count()))
-        get_cloud_node = mock.MagicMock(name="get_cloud_node")
-        get_cloud_node.get.return_value = cloud_nodes[1]
-        mock_node_monitor = mock.MagicMock()
-        mock_node_monitor.proxy.return_value = mock.NonCallableMock(cloud_node=get_cloud_node)
-        mock_shutdown = self.node_shutdown.start(node_monitor=mock_node_monitor)
-        self.daemon.cloud_nodes.get()[cloud_nodes[1].id].shutdown_actor = mock_shutdown.proxy()
-        self.assertwait(lambda: self.assertEqual(2, self.alive_monitor_count()))
-        for mon_ref in self.monitor_list():
-            self.daemon.node_can_shutdown(mon_ref.proxy()).get(self.TIMEOUT)
-        self.busywait(lambda: 1 == self.node_shutdown.start.call_count)
-    def test_booting_nodes_counted(self):
-        cloud_node = testutil.cloud_node_mock(1)
-        arv_node = testutil.arvados_node_mock(1)
-        server_wishlist = [testutil.MockSize(1)] * 2
-        self.make_daemon([cloud_node], [arv_node], server_wishlist)
-        self.daemon.max_nodes.get(self.TIMEOUT)
-        self.assertTrue(self.node_setup.start.called)
-        self.daemon.update_server_wishlist(server_wishlist).get(self.TIMEOUT)
-        self.busywait(lambda: 1 == self.node_setup.start.call_count)
-    def test_boot_new_node_when_all_nodes_busy(self):
-        size = testutil.MockSize(2)
-        arv_node = testutil.arvados_node_mock(2, job_uuid=True)
-        self.make_daemon([testutil.cloud_node_mock(2, size=size)], [arv_node],
-                         [size], avail_sizes=[(size, {"cores":1})])
-        self.assertwait(lambda: self.assertEqual(1, self.paired_monitor_count()))
-        self.assertwait(lambda: self.assertEqual(1, self.node_setup.start.called))
-    def test_boot_new_node_below_min_nodes(self):
-        min_size = testutil.MockSize(1)
-        wish_size = testutil.MockSize(3)
-        avail_sizes = [(min_size, {"cores": 1}),
-                       (wish_size, {"cores": 3})]
-        self.make_daemon([], [], None, avail_sizes=avail_sizes, min_nodes=2)
-        self.daemon.update_server_wishlist([wish_size]).get(self.TIMEOUT)
-        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
-        self.daemon.update_server_wishlist([wish_size]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertEqual([wish_size, min_size],
-                         [call[1].get('cloud_size')
-                          for call in self.node_setup.start.call_args_list])
-    def test_no_new_node_when_ge_min_nodes_busy(self):
-        size = testutil.MockSize(2)
-        cloud_nodes = [testutil.cloud_node_mock(n, size=size) for n in range(1, 4)]
-        arv_nodes = [testutil.arvados_node_mock(n, job_uuid=True)
-                     for n in range(1, 4)]
-        self.make_daemon(cloud_nodes, arv_nodes, [], min_nodes=2)
-        self.stop_proxy(self.daemon)
-        self.assertEqual(0, self.node_setup.start.call_count)
-    def test_no_new_node_when_max_nodes_busy(self):
-        size = testutil.MockSize(3)
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(3)],
-                         arvados_nodes=[testutil.arvados_node_mock(3, job_uuid=True)],
-                         want_sizes=[size],
-                         max_nodes=1)
-        self.stop_proxy(self.daemon)
-        self.assertFalse(self.node_setup.start.called)
-    def start_node_boot(self, cloud_node=None, arv_node=None, id_num=1):
-        if cloud_node is None:
-            cloud_node = testutil.cloud_node_mock(id_num)
-        id_num = int(cloud_node.id)
-        if arv_node is None:
-            arv_node = testutil.arvados_node_mock(id_num)
-        self.make_daemon(want_sizes=[testutil.MockSize(id_num)],
-                         avail_sizes=[(testutil.MockSize(id_num), {"cores":1})])
-        self.daemon.max_nodes.get(self.TIMEOUT)
-        self.assertEqual(1, self.node_setup.start.call_count)
-        self.last_setup.cloud_node.get.return_value = cloud_node
-        self.last_setup.arvados_node.get.return_value = arv_node
-        return self.last_setup
-    def test_new_node_when_booted_node_not_usable(self):
-        cloud_node = testutil.cloud_node_mock(4)
-        arv_node = testutil.arvados_node_mock(4, crunch_worker_state='down')
-        setup = self.start_node_boot(cloud_node, arv_node)
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.daemon.update_arvados_nodes([arv_node])
-        self.daemon.update_cloud_nodes([cloud_node])
-        self.monitor_list()[0].proxy().cloud_node_start_time = time.time()-1801
-        self.daemon.update_server_wishlist(
-            [testutil.MockSize(4)]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertEqual(2, self.node_setup.start.call_count)
-    def test_no_duplication_when_booting_node_listed_fast(self):
-        # Test that we don't start two ComputeNodeMonitorActors when
-        # we learn about a booting node through a listing before we
-        # get the "node up" message from CloudNodeSetupActor.
-        cloud_node = testutil.cloud_node_mock(1)
-        setup = self.start_node_boot(cloud_node)
-        self.daemon.update_cloud_nodes([cloud_node])
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-    def test_no_duplication_when_booted_node_listed(self):
-        cloud_node = testutil.cloud_node_mock(2)
-        setup = self.start_node_boot(cloud_node, id_num=2)
-        self.daemon.node_setup_finished(setup)
-        self.daemon.update_cloud_nodes([cloud_node]).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-    def test_node_counted_after_boot_with_slow_listing(self):
-        # Test that, after we boot a compute node, we assume it exists
-        # even it doesn't appear in the listing (e.g., because of delays
-        # propagating tags).
-        setup = self.start_node_boot()
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-    def test_booted_unlisted_node_counted(self):
-        setup = self.start_node_boot(id_num=1)
-        self.daemon.node_setup_finished(setup)
-        self.daemon.update_server_wishlist(
-            [testutil.MockSize(1)]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertEqual(1, self.node_setup.start.call_count)
-    def test_booted_node_can_shutdown(self):
-        setup = self.start_node_boot()
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.update_server_wishlist([])
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertTrue(self.node_shutdown.start.called,
-                        "daemon did not shut down booted node on offer")
-        with test_status.TestServer() as srv:
-            self.assertEqual(0, srv.get_status().get('nodes_unpaired', None))
-            self.assertEqual(1, srv.get_status().get('nodes_shutdown', None))
-            self.assertEqual(0, srv.get_status().get('nodes_wish', None))
-    def test_booted_node_lifecycle(self):
-        cloud_node = testutil.cloud_node_mock(6)
-        setup = self.start_node_boot(cloud_node, id_num=6)
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.update_server_wishlist([])
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.assertShutdownCancellable(True)
-        shutdown = self.node_shutdown.start().proxy()
-        shutdown.cloud_node.get.return_value = cloud_node
-        self.daemon.node_finished_shutdown(shutdown).get(self.TIMEOUT)
-        self.daemon.update_cloud_nodes([])
-        self.assertTrue(shutdown.stop.called,
-                        "shutdown actor not stopped after finishing")
-        self.assertTrue(monitor.actor_ref.actor_stopped.wait(self.TIMEOUT),
-                        "monitor for booted node not stopped after shutdown")
-        self.daemon.update_server_wishlist(
-            [testutil.MockSize(2)]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertTrue(self.node_setup.start.called,
-                        "second node not started after booted node stopped")
-    def test_node_disappearing_during_shutdown(self):
-        cloud_node = testutil.cloud_node_mock(6)
-        setup = self.start_node_boot(cloud_node, id_num=6)
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.update_server_wishlist([])
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.assertShutdownCancellable(True)
-        shutdown = self.node_shutdown.start().proxy()
-        shutdown.cloud_node.get.return_value = cloud_node
-        # Simulate a successful but slow node destroy call: the cloud node
-        # list gets updated before the ShutdownActor finishes.
-        record = self.daemon.cloud_nodes.get().nodes.values()[0]
-        self.assertTrue(record.shutdown_actor is not None)
-        self.daemon.cloud_nodes.get().nodes.clear()
-        self.daemon.node_finished_shutdown(shutdown).get(self.TIMEOUT)
-        self.assertTrue(
-            record.shutdown_actor is not None,
-            "test was ineffective -- failed to simulate the race condition")
-    def test_booted_node_shut_down_when_never_listed(self):
-        setup = self.start_node_boot()
-        self.cloud_factory().node_start_time.return_value = time.time() - 3601
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.assertFalse(self.node_shutdown.start.called)
-        now = time.time()
-        self.monitor_list()[0].tell_proxy().consider_shutdown()
-        self.busywait(lambda: self.node_shutdown.start.called)
-        self.assertShutdownCancellable(False)
-    def test_booted_node_shut_down_when_never_paired(self):
-        cloud_node = testutil.cloud_node_mock(2)
-        setup = self.start_node_boot(cloud_node)
-        self.cloud_factory().node_start_time.return_value = time.time() - 3601
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.daemon.update_cloud_nodes([cloud_node])
-        self.monitor_list()[0].tell_proxy().consider_shutdown()
-        self.busywait(lambda: self.node_shutdown.start.called)
-        self.assertShutdownCancellable(False)
-    def test_booted_node_shut_down_when_never_working(self):
-        cloud_node = testutil.cloud_node_mock(4)
-        arv_node = testutil.arvados_node_mock(4, crunch_worker_state='down')
-        setup = self.start_node_boot(cloud_node, arv_node)
-        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.monitor_list()[0].proxy().cloud_node_start_time = time.time()-3601
-        self.daemon.update_cloud_nodes([cloud_node])
-        self.busywait(lambda: self.node_shutdown.start.called)
-        self.assertShutdownCancellable(False)
-    def test_node_that_pairs_not_considered_failed_boot(self):
-        cloud_node = testutil.cloud_node_mock(3)
-        arv_node = testutil.arvados_node_mock(3)
-        setup = self.start_node_boot(cloud_node, arv_node)
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.daemon.update_cloud_nodes([cloud_node])
-        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
-        self.timer.deliver()
-        self.stop_proxy(self.daemon)
-        self.assertFalse(self.node_shutdown.start.called)
-    def test_node_that_pairs_busy_not_considered_failed_boot(self):
-        cloud_node = testutil.cloud_node_mock(5)
-        arv_node = testutil.arvados_node_mock(5, job_uuid=True)
-        setup = self.start_node_boot(cloud_node, arv_node)
-        self.daemon.node_setup_finished(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.daemon.update_cloud_nodes([cloud_node])
-        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
-        self.timer.deliver()
-        self.stop_proxy(self.daemon)
-        self.assertFalse(self.node_shutdown.start.called)
-    def test_booting_nodes_shut_down(self):
-        self.make_daemon(want_sizes=[testutil.MockSize(1)])
-        self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
-        self.busywait(lambda: self.last_setup.stop_if_no_cloud_node.called)
-    def test_all_booting_nodes_tried_to_shut_down(self):
-        size = testutil.MockSize(2)
-        self.make_daemon(want_sizes=[size], avail_sizes=[(size, {"cores":1})])
-        self.daemon.max_nodes.get(self.TIMEOUT)
-        setup1 = self.last_setup
-        setup1.stop_if_no_cloud_node().get.return_value = False
-        setup1.stop_if_no_cloud_node.reset_mock()
-        self.daemon.update_server_wishlist([size, size]).get(self.TIMEOUT)
-        self.daemon.max_nodes.get(self.TIMEOUT)
-        self.assertIsNot(setup1, self.last_setup)
-        self.last_setup.stop_if_no_cloud_node().get.return_value = True
-        self.last_setup.stop_if_no_cloud_node.reset_mock()
-        self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
-        self.daemon.max_nodes.get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertEqual(1, self.last_setup.stop_if_no_cloud_node.call_count)
-        self.assertTrue(setup1.stop_if_no_cloud_node.called)
-    def test_shutdown_declined_at_wishlist_capacity(self):
-        cloud_node = testutil.cloud_node_mock(1)
-        arv_node = testutil.arvados_node_mock(1)
-        size = testutil.MockSize(1)
-        self.make_daemon(cloud_nodes=[cloud_node], arvados_nodes=[arv_node], want_sizes=[size])
-        self.assertwait(lambda: self.assertEqual(1, self.paired_monitor_count()))
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertFalse(self.node_shutdown.start.called)
-    def test_shutdown_declined_below_min_nodes(self):
-        cloud_node = testutil.cloud_node_mock(1)
-        arv_node = testutil.arvados_node_mock(1)
-        self.make_daemon(cloud_nodes=[cloud_node], arvados_nodes=[arv_node], min_nodes=1)
-        self.assertwait(lambda: self.assertEqual(1, self.paired_monitor_count()))
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertFalse(self.node_shutdown.start.called)
-    def test_shutdown_accepted_below_capacity(self):
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock()])
-        self.busywait(lambda: 1 == self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.busywait(lambda: self.node_shutdown.start.called)
-    def test_shutdown_declined_when_idle_and_job_queued(self):
-        size = testutil.MockSize(1)
-        cloud_nodes = [testutil.cloud_node_mock(n, size=size) for n in [3, 4]]
-        arv_nodes = [testutil.arvados_node_mock(3, job_uuid=True),
-                     testutil.arvados_node_mock(4, job_uuid=None)]
-        self.make_daemon(cloud_nodes, arv_nodes, [size])
-        self.assertwait(lambda: self.assertEqual(2, self.paired_monitor_count()))
-        for mon_ref in self.monitor_list():
-            monitor = mon_ref.proxy()
-            if monitor.cloud_node.get(self.TIMEOUT) is cloud_nodes[-1]:
-                break
-        else:
-            self.fail("monitor for idle node not found")
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        self.assertFalse(self.node_shutdown.start.called)
-    def test_node_shutdown_after_cancelled_shutdown(self):
-        cloud_node = testutil.cloud_node_mock(5)
-        self.make_daemon([cloud_node], [testutil.arvados_node_mock(5)])
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.last_shutdown.success.get.return_value = False
-        self.daemon.node_finished_shutdown(self.last_shutdown).get(self.TIMEOUT)
-        self.assertwait(lambda: self.assertEqual(1, self.paired_monitor_count()))
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.last_shutdown.success.get.return_value = True
-        self.last_shutdown.stop.side_effect = lambda: monitor.stop()
-        self.daemon.node_finished_shutdown(self.last_shutdown).get(self.TIMEOUT)
-        self.assertwait(lambda: self.assertEqual(0, self.paired_monitor_count()))
-    def test_nodes_shutting_down_replaced_below_max_nodes(self):
-        size = testutil.MockSize(6)
-        cloud_node = testutil.cloud_node_mock(6, size=size)
-        self.make_daemon([cloud_node], [testutil.arvados_node_mock(6, crunch_worker_state='down')],
-                         avail_sizes=[(size, {"cores":1})])
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.assertTrue(self.node_shutdown.start.called)
-        getmock = mock.MagicMock()
-        getmock.get.return_value = False
-        self.last_shutdown.cancel_shutdown.return_value = getmock
-        self.daemon.update_server_wishlist(
-            [testutil.MockSize(6)]).get(self.TIMEOUT)
-        self.busywait(lambda: self.node_setup.start.called)
-    def test_nodes_shutting_down_cancelled(self):
-        size = testutil.MockSize(6)
-        cloud_node = testutil.cloud_node_mock(6, size=size)
-        self.make_daemon([cloud_node], [testutil.arvados_node_mock(6, crunch_worker_state='down')],
-                         avail_sizes=[(size, {"cores":1})])
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.assertTrue(self.node_shutdown.start.called)
-        self.daemon.update_server_wishlist(
-            [testutil.MockSize(6)]).get(self.TIMEOUT)
-        self.busywait(lambda: self.last_shutdown.cancel_shutdown.called)
-    def test_nodes_shutting_down_not_replaced_at_max_nodes(self):
-        cloud_node = testutil.cloud_node_mock(7)
-        self.make_daemon([cloud_node], [testutil.arvados_node_mock(7)],
-                         max_nodes=1)
-        self.assertwait(lambda: self.assertEqual(1, self.paired_monitor_count()))
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.assertTrue(self.node_shutdown.start.called)
-        self.daemon.update_server_wishlist(
-            [testutil.MockSize(7)]).get(self.TIMEOUT)
-        self.busywait(lambda: not self.node_setup.start.called)
-    def test_nodes_shutting_down_count_against_excess(self):
-        size = testutil.MockSize(8)
-        cloud_nodes = [testutil.cloud_node_mock(n, size=size) for n in [8, 9]]
-        arv_nodes = [testutil.arvados_node_mock(n, size=size) for n in [8, 9]]
-        self.make_daemon(cloud_nodes, arv_nodes, [size],
-                         avail_sizes=[(size, {"cores":1})])
-        self.assertwait(lambda: self.assertEqual(2, self.paired_monitor_count()))
-        for mon_ref in self.monitor_list():
-            self.daemon.node_can_shutdown(mon_ref.proxy()).get(self.TIMEOUT)
-        self.assertEqual(1, self.node_shutdown.start.call_count)
-    def test_clean_shutdown_waits_for_node_setup_finish(self):
-        new_node = self.start_node_boot()
-        new_node.stop_if_no_cloud_node().get.return_value = False
-        new_node.stop_if_no_cloud_node.reset_mock()
-        self.daemon.shutdown().get(self.TIMEOUT)
-        self.assertTrue(new_node.stop_if_no_cloud_node.called)
-        self.daemon.node_setup_finished(new_node).get(self.TIMEOUT)
-        self.assertTrue(new_node.stop.called)
-        self.timer.deliver()
-        self.assertTrue(
-            self.daemon.actor_ref.actor_stopped.wait(self.TIMEOUT))
-    def test_wishlist_ignored_after_shutdown(self):
-        new_node = self.start_node_boot()
-        new_node.stop_if_no_cloud_node().get.return_value = False
-        new_node.stop_if_no_cloud_node.reset_mock()
-        self.daemon.shutdown().get(self.TIMEOUT)
-        size = testutil.MockSize(2)
-        self.daemon.update_server_wishlist([size] * 2).get(self.TIMEOUT)
-        self.timer.deliver()
-        self.busywait(lambda: 1 == self.node_setup.start.call_count)
-    def test_shutdown_actor_stopped_when_cloud_node_delisted(self):
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock()])
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
-        self.busywait(lambda: 1 == self.last_shutdown.stop.call_count)
-    def test_idle_node_disappearing_clears_status_idle_time_counter(self):
-        size = testutil.MockSize(1)
-        status.tracker._idle_nodes = {}
-        cloud_nodes = [testutil.cloud_node_mock(1, size=size)]
-        arv_nodes = [testutil.arvados_node_mock(1, job_uuid=None)]
-        self.make_daemon(cloud_nodes, arv_nodes, [size])
-        self.assertwait(lambda: self.assertEqual(1, self.paired_monitor_count()))
-        for mon_ref in self.monitor_list():
-            monitor = mon_ref.proxy()
-            if monitor.cloud_node.get(self.TIMEOUT) is cloud_nodes[-1]:
-                break
-        else:
-            self.fail("monitor for idle node not found")
-        self.assertEqual(1, status.tracker.get('nodes_idle'))
-        hostname = monitor.arvados_node.get()['hostname']
-        self.assertIn(hostname, status.tracker._idle_nodes)
-        # Simulate the node disappearing from the cloud node list
-        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
-        self.busywait(lambda: 0 == self.alive_monitor_count())
-        self.assertNotIn(hostname, status.tracker._idle_nodes)
-    def test_shutdown_actor_cleanup_copes_with_dead_actors(self):
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock()])
-        self.assertEqual(1, self.alive_monitor_count())
-        monitor = self.monitor_list()[0].proxy()
-        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
-        # We're mainly testing that update_cloud_nodes catches and handles
-        # the ActorDeadError.
-        self.last_shutdown.stop.side_effect = pykka.ActorDeadError
-        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
-        self.busywait(lambda: 1 == self.last_shutdown.stop.call_count)
-    def test_node_create_two_sizes(self):
-        small = testutil.MockSize(1)
-        big = testutil.MockSize(2)
-        avail_sizes = [(testutil.MockSize(1), {"cores":1}),
-                        (testutil.MockSize(2), {"cores":2})]
-        self.make_daemon(want_sizes=[small, small, small, big],
-                         avail_sizes=avail_sizes, max_nodes=4)
-        # the daemon runs in another thread, so we need to wait and see
-        # if it does all the work we're expecting it to do before stopping it.
-        self.busywait(lambda: self.node_setup.start.call_count == 4)
-        booting = self.daemon.booting.get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        sizecounts = {a[0].id: 0 for a in avail_sizes}
-        for b in booting.itervalues():
-            sizecounts[b.cloud_size.get().id] += 1
-        logging.info(sizecounts)
-        self.assertEqual(3, sizecounts[small.id])
-        self.assertEqual(1, sizecounts[big.id])
-    def test_node_max_nodes_two_sizes(self):
-        small = testutil.MockSize(1)
-        big = testutil.MockSize(2)
-        avail_sizes = [(testutil.MockSize(1), {"cores":1}),
-                        (testutil.MockSize(2), {"cores":2})]
-        self.make_daemon(want_sizes=[small, small, big, small],
-                         avail_sizes=avail_sizes, max_nodes=3)
-        # the daemon runs in another thread, so we need to wait and see
-        # if it does all the work we're expecting it to do before stopping it.
-        self.busywait(lambda: self.node_setup.start.call_count == 3)
-        booting = self.daemon.booting.get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        sizecounts = {a[0].id: 0 for a in avail_sizes}
-        for b in booting.itervalues():
-            sizecounts[b.cloud_size.get().id] += 1
-        self.assertEqual(2, sizecounts[small.id])
-        self.assertEqual(1, sizecounts[big.id])
-    def test_wishlist_ordering(self):
-        # Check that big nodes aren't prioritized; since #12199 containers are
-        # scheduled on specific node sizes.
-        small = testutil.MockSize(1)
-        big = testutil.MockSize(2)
-        avail_sizes = [(testutil.MockSize(1), {"cores":1}),
-                        (testutil.MockSize(2), {"cores":2})]
-        self.make_daemon(want_sizes=[small, small, small, big],
-                         avail_sizes=avail_sizes, max_nodes=3)
-        # the daemon runs in another thread, so we need to wait and see
-        # if it does all the work we're expecting it to do before stopping it.
-        self.busywait(lambda: self.node_setup.start.call_count == 3)
-        booting = self.daemon.booting.get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        sizecounts = {a[0].id: 0 for a in avail_sizes}
-        for b in booting.itervalues():
-            sizecounts[b.cloud_size.get().id] += 1
-        self.assertEqual(3, sizecounts[small.id])
-        self.assertEqual(0, sizecounts[big.id])
-    def test_wishlist_reconfigure(self):
-        small = testutil.MockSize(1)
-        big = testutil.MockSize(2)
-        avail_sizes = [(small, {"cores":1}), (big, {"cores":2})]
-        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock(1, small),
-                                      testutil.cloud_node_mock(2, small),
-                                      testutil.cloud_node_mock(3, big)],
-                         arvados_nodes=[testutil.arvados_node_mock(1),
-                                        testutil.arvados_node_mock(2),
-                                        testutil.arvados_node_mock(3)],
-                         want_sizes=[small, small, big],
-                         avail_sizes=avail_sizes)
-        self.assertwait(lambda: self.assertEqual(3, self.paired_monitor_count()))
-        self.daemon.update_server_wishlist([small, big, big]).get(self.TIMEOUT)
-        self.assertEqual(0, self.node_shutdown.start.call_count)
-        for c in self.daemon.cloud_nodes.get().nodes.itervalues():
-            self.daemon.node_can_shutdown(c.actor)
-        booting = self.daemon.booting.get()
-        cloud_nodes = self.daemon.cloud_nodes.get()
-        self.busywait(lambda: 1 == self.node_setup.start.call_count)
-        self.busywait(lambda: 1 == self.node_shutdown.start.call_count)
-        self.stop_proxy(self.daemon)
-        # booting a new big node
-        sizecounts = {a[0].id: 0 for a in avail_sizes}
-        for b in booting.itervalues():
-            sizecounts[b.cloud_size.get().id] += 1
-        self.assertEqual(0, sizecounts[small.id])
-        self.assertEqual(1, sizecounts[big.id])
-        # shutting down a small node
-        sizecounts = {a[0].id: 0 for a in avail_sizes}
-        for b in cloud_nodes.nodes.itervalues():
-            if b.shutdown_actor is not None:
-                sizecounts[b.cloud_node.size.id] += 1
-        self.assertEqual(1, sizecounts[small.id])
-        self.assertEqual(0, sizecounts[big.id])
-    def test_node_max_price(self):
-        small = testutil.MockSize(1)
-        big = testutil.MockSize(2)
-        avail_sizes = [(testutil.MockSize(1), {"cores":1, "price":1}),
-                        (testutil.MockSize(2), {"cores":2, "price":2})]
-        self.make_daemon(want_sizes=[small, small, small, big],
-                         avail_sizes=avail_sizes,
-                         max_nodes=4,
-                         max_total_price=4)
-        # the daemon runs in another thread, so we need to wait and see
-        # if it does all the work we're expecting it to do before stopping it.
-        self.busywait(lambda: self.node_setup.start.call_count == 3)
-        booting = self.daemon.booting.get()
-        self.stop_proxy(self.daemon)
-        sizecounts = {a[0].id: 0 for a in avail_sizes}
-        for b in booting.itervalues():
-            sizecounts[b.cloud_size.get().id] += 1
-        logging.info(sizecounts)
-        # Booting 3 small nodes and not booting a big node would also partially
-        # satisfy the wishlist and come in under the price cap, however the way
-        # the update_server_wishlist() currently works effectively results in a
-        # round-robin creation of one node of each size in the wishlist, so
-        # test for that.
-        self.assertEqual(2, sizecounts[small.id])
-        self.assertEqual(1, sizecounts[big.id])
diff --git a/services/nodemanager/tests/test_failure.py b/services/nodemanager/tests/test_failure.py
deleted file mode 100644 (file)
index 8bf3ea8..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import errno
-import logging
-import time
-import threading
-import unittest
-import mock
-import pykka
-from . import testutil
-import arvnodeman.baseactor
-import arvnodeman.status as status
-class BogusActor(arvnodeman.baseactor.BaseNodeManagerActor):
-    def __init__(self, e, killfunc=None):
-        super(BogusActor, self).__init__(killfunc=killfunc)
-        self.exp = e
-    def doStuff(self):
-        raise self.exp
-    def ping(self):
-        # Called by WatchdogActorTest, this delay is longer than the test timeout
-        # of 1 second, which should cause the watchdog ping to fail.
-        time.sleep(2)
-        return True
-class ActorUnhandledExceptionTest(testutil.ActorTestMixin, unittest.TestCase):
-    def test_fatal_error(self):
-        for e in (MemoryError(), threading.ThreadError(), OSError(errno.ENOMEM, "")):
-            kill_mock = mock.Mock('os.kill')
-            bgact = BogusActor.start(e, killfunc=kill_mock)
-            act_thread = bgact.proxy().get_thread().get()
-            act = bgact.tell_proxy()
-            act.doStuff()
-            act.actor_ref.stop(block=True)
-            act_thread.join()
-            self.assertTrue(kill_mock.called)
-    def test_nonfatal_error(self):
-        status.tracker.update({'actor_exceptions': 0})
-        kill_mock = mock.Mock('os.kill')
-        bgact = BogusActor.start(OSError(errno.ENOENT, ""), killfunc=kill_mock)
-        act_thread = bgact.proxy().get_thread().get()
-        act = bgact.tell_proxy()
-        act.doStuff()
-        act.actor_ref.stop(block=True)
-        act_thread.join()
-        self.assertFalse(kill_mock.called)
-        self.assertEqual(1, status.tracker.get('actor_exceptions'))
-class WatchdogActorTest(testutil.ActorTestMixin, unittest.TestCase):
-    def test_time_timout(self):
-        kill_mock = mock.Mock('os.kill')
-        act = BogusActor.start(OSError(errno.ENOENT, ""))
-        watch = arvnodeman.baseactor.WatchdogActor.start(1, act, killfunc=kill_mock)
-        time.sleep(1)
-        watch.stop(block=True)
-        act.stop(block=True)
-        self.assertTrue(kill_mock.called)
diff --git a/services/nodemanager/tests/test_jobqueue.py b/services/nodemanager/tests/test_jobqueue.py
deleted file mode 100644 (file)
index de83b68..0000000
+++ /dev/null
@@ -1,239 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import unittest
-import mock
-import arvnodeman.jobqueue as jobqueue
-from . import testutil
-class ServerCalculatorTestCase(unittest.TestCase):
-    def make_calculator(self, factors, **kwargs):
-        return jobqueue.ServerCalculator(
-            [(testutil.MockSize(n), {'cores': n}) for n in factors], **kwargs)
-    def calculate(self, servcalc, *constraints):
-        return servcalc.servers_for_queue(
-            [{'uuid': 'zzzzz-jjjjj-{:015x}'.format(index),
-              'runtime_constraints': cdict}
-             for index, cdict in enumerate(constraints)])
-    def test_empty_queue_needs_no_servers(self):
-        servcalc = self.make_calculator([1])
-        self.assertEqual(([], {}), servcalc.servers_for_queue([]))
-    def test_easy_server_count(self):
-        servcalc = self.make_calculator([1])
-        servlist, _ = self.calculate(servcalc, {'min_nodes': 3})
-        self.assertEqual(3, len(servlist))
-    def test_default_5pct_ram_value_decrease(self):
-        servcalc = self.make_calculator([1])
-        servlist, _ = self.calculate(servcalc, {'min_ram_mb_per_node': 128})
-        self.assertEqual(0, len(servlist))
-        servlist, _ = self.calculate(servcalc, {'min_ram_mb_per_node': 121})
-        self.assertEqual(1, len(servlist))
-    def test_custom_node_mem_scaling_factor(self):
-        # Simulate a custom 'node_mem_scaling' config parameter by passing
-        # the value to ServerCalculator
-        servcalc = self.make_calculator([1], node_mem_scaling=0.5)
-        servlist, _ = self.calculate(servcalc, {'min_ram_mb_per_node': 128})
-        self.assertEqual(0, len(servlist))
-        servlist, _ = self.calculate(servcalc, {'min_ram_mb_per_node': 64})
-        self.assertEqual(1, len(servlist))
-    def test_implicit_server_count(self):
-        servcalc = self.make_calculator([1])
-        servlist, _ = self.calculate(servcalc, {}, {'min_nodes': 3})
-        self.assertEqual(4, len(servlist))
-    def test_bad_min_nodes_override(self):
-        servcalc = self.make_calculator([1])
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_nodes': -2}, {'min_nodes': 'foo'})
-        self.assertEqual(2, len(servlist))
-    def test_ignore_and_return_unsatisfiable_jobs(self):
-        servcalc = self.make_calculator([1], max_nodes=9)
-        servlist, u_jobs = self.calculate(servcalc,
-                                          {'min_cores_per_node': 2},
-                                          {'min_ram_mb_per_node': 256},
-                                          {'min_nodes': 6},
-                                          {'min_nodes': 12},
-                                          {'min_scratch_mb_per_node': 300000})
-        self.assertEqual(6, len(servlist))
-        # Only unsatisfiable jobs are returned on u_jobs
-        self.assertIn('zzzzz-jjjjj-000000000000000', u_jobs.keys())
-        self.assertIn('zzzzz-jjjjj-000000000000001', u_jobs.keys())
-        self.assertNotIn('zzzzz-jjjjj-000000000000002', u_jobs.keys())
-        self.assertIn('zzzzz-jjjjj-000000000000003', u_jobs.keys())
-        self.assertIn('zzzzz-jjjjj-000000000000004', u_jobs.keys())
-    def test_ignore_too_expensive_jobs(self):
-        servcalc = self.make_calculator([1, 2], max_nodes=12, max_price=6)
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_cores_per_node': 1, 'min_nodes': 6})
-        self.assertEqual(6, len(servlist))
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_cores_per_node': 2, 'min_nodes': 6})
-        self.assertEqual(0, len(servlist))
-    def test_job_requesting_max_nodes_accepted(self):
-        servcalc = self.make_calculator([1], max_nodes=4)
-        servlist, _ = self.calculate(servcalc, {'min_nodes': 4})
-        self.assertEqual(4, len(servlist))
-    def test_cheapest_size(self):
-        servcalc = self.make_calculator([2, 4, 1, 3])
-        self.assertEqual(testutil.MockSize(1), servcalc.cheapest_size())
-    def test_next_biggest(self):
-        servcalc = self.make_calculator([1, 2, 4, 8])
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_cores_per_node': 3},
-                                     {'min_cores_per_node': 6})
-        self.assertEqual([servcalc.cloud_sizes[2].id,
-                          servcalc.cloud_sizes[3].id],
-                         [s.id for s in servlist])
-    def test_multiple_sizes(self):
-        servcalc = self.make_calculator([1, 2])
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_cores_per_node': 2},
-                                     {'min_cores_per_node': 1},
-                                     {'min_cores_per_node': 1})
-        self.assertEqual([servcalc.cloud_sizes[1].id,
-                          servcalc.cloud_sizes[0].id,
-                          servcalc.cloud_sizes[0].id],
-                         [s.id for s in servlist])
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_cores_per_node': 1},
-                                     {'min_cores_per_node': 2},
-                                     {'min_cores_per_node': 1})
-        self.assertEqual([servcalc.cloud_sizes[0].id,
-                          servcalc.cloud_sizes[1].id,
-                          servcalc.cloud_sizes[0].id],
-                         [s.id for s in servlist])
-        servlist, _ = self.calculate(servcalc,
-                                     {'min_cores_per_node': 1},
-                                     {'min_cores_per_node': 1},
-                                     {'min_cores_per_node': 2})
-        self.assertEqual([servcalc.cloud_sizes[0].id,
-                          servcalc.cloud_sizes[0].id,
-                          servcalc.cloud_sizes[1].id],
-                         [s.id for s in servlist])
-class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
-                                   unittest.TestCase):
-    TEST_CLASS = jobqueue.JobQueueMonitorActor
-    class MockCalculator(object):
-        @staticmethod
-        def servers_for_queue(queue):
-            return ([testutil.MockSize(n) for n in queue], {})
-    class MockCalculatorUnsatisfiableJobs(object):
-        @staticmethod
-        def servers_for_queue(queue):
-            return ([], {k["uuid"]: "Unsatisfiable job mock" for k in queue})
-    def build_monitor(self, side_effect, *args, **kwargs):
-        super(JobQueueMonitorActorTestCase, self).build_monitor(*args, **kwargs)
-        self.client.jobs().queue().execute.side_effect = side_effect
-    @mock.patch("subprocess32.check_call")
-    @mock.patch("subprocess32.check_output")
-    def test_unsatisfiable_jobs(self, mock_squeue, mock_scancel):
-        job_uuid = 'zzzzz-8i9sb-zzzzzzzzzzzzzzz'
-        container_uuid = 'yyyyy-dz642-yyyyyyyyyyyyyyy'
-        mock_squeue.return_value = "1|1024|0|(Resources)|" + container_uuid + "||1234567890\n"
-        self.build_monitor([{'items': [{'uuid': job_uuid}]}],
-                           self.MockCalculatorUnsatisfiableJobs(), True, True)
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.monitor.ping().get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.client.jobs().cancel.assert_called_with(uuid=job_uuid)
-        mock_scancel.assert_called_with(['scancel', '--name='+container_uuid])
-    @mock.patch("subprocess32.check_output")
-    def test_subscribers_get_server_lists(self, mock_squeue):
-        mock_squeue.return_value = ""
-        self.build_monitor([{'items': [1, 2]}], self.MockCalculator(), True, True)
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with([testutil.MockSize(1),
-                                            testutil.MockSize(2)])
-    @mock.patch("subprocess32.check_output")
-    def test_squeue_server_list(self, mock_squeue):
-        mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)|1234567890
-        super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator(
-            [(testutil.MockSize(n), {'cores': n, 'ram': n*1024, 'scratch': n}) for n in range(1, 3)]),
-                                                                True, True)
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with([testutil.MockSize(1),
-                                            testutil.MockSize(2)])
-    @mock.patch("subprocess32.check_output")
-    def test_squeue_server_list_suffix(self, mock_squeue):
-        mock_squeue.return_value = """1|1024M|0|(ReqNodeNotAvail, UnavailableNodes:compute123)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)|1234567890
-        super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator(
-            [(testutil.MockSize(n), {'cores': n, 'ram': n*1024, 'scratch': n}) for n in range(1, 3)]),
-                                                                True, True)
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with([testutil.MockSize(1),
-                                            testutil.MockSize(2)])
-    @mock.patch("subprocess32.check_output")
-    def test_squeue_server_list_instancetype_constraint(self, mock_squeue):
-        mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|instancetype=z2.test|1234567890\n"""
-        super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator(
-            [(testutil.MockSize(n), {'cores': n, 'ram': n*1024, 'scratch': n}) for n in range(1, 3)]),
-                                                                True, True)
-        self.monitor.subscribe(self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with([testutil.MockSize(2)])
-    def test_coerce_to_mb(self):
-        self.assertEqual(1, jobqueue.JobQueueMonitorActor.coerce_to_mb("1"))
-        self.assertEqual(512, jobqueue.JobQueueMonitorActor.coerce_to_mb("512"))
-        self.assertEqual(512, jobqueue.JobQueueMonitorActor.coerce_to_mb("512M"))
-        self.assertEqual(1024, jobqueue.JobQueueMonitorActor.coerce_to_mb("1024M"))
-        self.assertEqual(1024, jobqueue.JobQueueMonitorActor.coerce_to_mb("1G"))
-        self.assertEqual(1536, jobqueue.JobQueueMonitorActor.coerce_to_mb("1.5G"))
-        self.assertEqual(2048, jobqueue.JobQueueMonitorActor.coerce_to_mb("2G"))
-        self.assertEqual(1025, jobqueue.JobQueueMonitorActor.coerce_to_mb("1025M"))
-        self.assertEqual(1048576, jobqueue.JobQueueMonitorActor.coerce_to_mb("1T"))
-        self.assertEqual(1572864, jobqueue.JobQueueMonitorActor.coerce_to_mb("1.5T"))
-        self.assertEqual(1073741824, jobqueue.JobQueueMonitorActor.coerce_to_mb("1P"))
-        self.assertEqual(1610612736, jobqueue.JobQueueMonitorActor.coerce_to_mb("1.5P"))
-        self.assertEqual(0, jobqueue.JobQueueMonitorActor.coerce_to_mb("0"))
-        self.assertEqual(0, jobqueue.JobQueueMonitorActor.coerce_to_mb("0M"))
-        self.assertEqual(0, jobqueue.JobQueueMonitorActor.coerce_to_mb("0G"))
-if __name__ == '__main__':
-    unittest.main()
diff --git a/services/nodemanager/tests/test_nodelist.py b/services/nodemanager/tests/test_nodelist.py
deleted file mode 100644 (file)
index df31a12..0000000
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import unittest
-import mock
-import arvnodeman.nodelist as nodelist
-from libcloud.compute.base import NodeSize
-from . import testutil
-class ArvadosNodeListMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
-                                          unittest.TestCase):
-    TEST_CLASS = nodelist.ArvadosNodeListMonitorActor
-    def build_monitor(self, side_effect, *args, **kwargs):
-        super(ArvadosNodeListMonitorActorTestCase, self).build_monitor(
-            *args, **kwargs)
-        self.client.nodes().list().execute.side_effect = side_effect
-    @mock.patch("subprocess32.check_output")
-    def test_uuid_is_subscription_key(self, sinfo_mock):
-        sinfo_mock.return_value = ""
-        node = testutil.arvados_node_mock()
-        self.build_monitor([{
-            'items': [node],
-            'items_available': 1,
-            'offset': 0
-        }, {
-            'items': [],
-            'items_available': 1,
-            'offset': 1
-        }])
-        self.monitor.subscribe_to(node['uuid'],
-                                  self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with(node)
-        self.assertEqual("down", node["crunch_worker_state"])
-    @mock.patch("subprocess32.check_output")
-    def test_update_from_sinfo(self, sinfo_mock):
-        sinfo_mock.return_value = """compute1|idle|instancetype=a1.test
-        nodeIdle = testutil.arvados_node_mock(node_num=1)
-        nodeBusy = testutil.arvados_node_mock(node_num=2)
-        nodeMissing = testutil.arvados_node_mock(node_num=99)
-        self.build_monitor([{
-            'items': [nodeIdle, nodeBusy, nodeMissing],
-            'items_available': 1,
-            'offset': 0
-        }, {
-            'items': [],
-            'items_available': 1,
-            'offset': 1
-        }])
-        self.monitor.subscribe_to(nodeMissing['uuid'],
-                                  self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with(nodeMissing)
-        self.assertEqual("idle", nodeIdle["crunch_worker_state"])
-        self.assertEqual("busy", nodeBusy["crunch_worker_state"])
-        self.assertEqual("down", nodeMissing["crunch_worker_state"])
-        self.assertEqual("instancetype=a1.test", nodeIdle["slurm_node_features"])
-        self.assertEqual("", nodeBusy["slurm_node_features"])
-        self.assertEqual("", nodeMissing["slurm_node_features"])
-class CloudNodeListMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
-                                        unittest.TestCase):
-    TEST_CLASS = nodelist.CloudNodeListMonitorActor
-    class MockNode(object):
-        def __init__(self, count):
-            self.id = str(count)
-            self.name = 'test{}.example.com'.format(count)
-            self.private_ips = ['10.0.0.{}'.format(count)]
-            self.public_ips = []
-            self.size = testutil.MockSize(1)
-            self.state = 0
-            self.extra = {'arvados_node_size': self.size.id}
-    def build_monitor(self, side_effect, *args, **kwargs):
-        super(CloudNodeListMonitorActorTestCase, self).build_monitor(
-            *args, **kwargs)
-        self.client.list_nodes.side_effect = side_effect
-    def test_id_is_subscription_key(self):
-        node = self.MockNode(1)
-        mock_calc = mock.MagicMock()
-        mock_calc.find_size.return_value = testutil.MockSize(2)
-        self.build_monitor([[node]], mock_calc)
-        self.monitor.subscribe_to('1', self.subscriber).get(self.TIMEOUT)
-        self.stop_proxy(self.monitor)
-        self.subscriber.assert_called_with(node)
-        self.assertEqual(testutil.MockSize(2), node.size)
-if __name__ == '__main__':
-    unittest.main()
diff --git a/services/nodemanager/tests/test_status.py b/services/nodemanager/tests/test_status.py
deleted file mode 100644 (file)
index 2a1c0fc..0000000
+++ /dev/null
@@ -1,139 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-from future import standard_library
-import json
-import mock
-import random
-import requests
-import unittest
-import arvnodeman.status as status
-import arvnodeman.config as config
-class TestServer(object):
-    def __init__(self, management_token=None):
-        self.mgmt_token = management_token
-    def __enter__(self):
-        cfg = config.NodeManagerConfig()
-        cfg.set('Manage', 'port', '0')
-        cfg.set('Manage', 'address', '')
-        if self.mgmt_token != None:
-            cfg.set('Manage', 'ManagementToken', self.mgmt_token)
-        self.srv = status.Server(cfg)
-        self.srv.start()
-        addr, port = self.srv.server_address
-        self.srv_base = ''+str(port)
-        return self
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.srv.shutdown()
-    def get_status_response(self):
-        return requests.get(self.srv_base+'/status.json')
-    def get_status(self):
-        return self.get_status_response().json()
-    def get_healthcheck_ping(self, auth_header=None):
-        headers = {}
-        if auth_header != None:
-            headers['Authorization'] = auth_header
-        return requests.get(self.srv_base+'/_health/ping', headers=headers)
-class StatusServerUpdates(unittest.TestCase):
-    def test_updates(self):
-        with TestServer() as srv:
-            for n in [1, 2, 3]:
-                status.tracker.update({'nodes_'+str(n): n})
-                r = srv.get_status_response()
-                self.assertEqual(200, r.status_code)
-                self.assertEqual('application/json', r.headers['content-type'])
-                resp = r.json()
-                self.assertEqual(n, resp['nodes_'+str(n)])
-            self.assertEqual(1, resp['nodes_1'])
-            self.assertIn('Version', resp)
-            self.assertIn('config_max_nodes', resp)
-    def test_counters(self):
-        with TestServer() as srv:
-            resp = srv.get_status()
-            # Test counters existance
-            for counter in ['list_nodes_errors', 'create_node_errors',
-                'destroy_node_errors', 'boot_failures', 'actor_exceptions']:
-                self.assertIn(counter, resp)
-            # Test counter increment
-            for count in range(1, 3):
-                status.tracker.counter_add('a_counter')
-                resp = srv.get_status()
-                self.assertEqual(count, resp['a_counter'])
-    @mock.patch('time.time')
-    def test_idle_times(self, time_mock):
-        with TestServer() as srv:
-            resp = srv.get_status()
-            node_name = 'idle_compute{}'.format(random.randint(1, 1024))
-            self.assertIn('idle_times', resp)
-            # Test add an idle node
-            time_mock.return_value = 10
-            status.tracker.idle_in(node_name)
-            time_mock.return_value += 10
-            resp = srv.get_status()
-            self.assertEqual(10, resp['idle_times'][node_name])
-            # Test adding the same idle node a 2nd time
-            time_mock.return_value += 10
-            status.tracker.idle_in(node_name)
-            time_mock.return_value += 10
-            resp = srv.get_status()
-            # Idle timestamp doesn't get reset if already exists
-            self.assertEqual(30, resp['idle_times'][node_name])
-            # Test remove idle node
-            status.tracker.idle_out(node_name)
-            resp = srv.get_status()
-            self.assertNotIn(node_name, resp['idle_times'])
-class StatusServerDisabled(unittest.TestCase):
-    def test_config_disabled(self):
-        cfg = config.NodeManagerConfig()
-        cfg.set('Manage', 'port', '-1')
-        cfg.set('Manage', 'address', '')
-        self.srv = status.Server(cfg)
-        self.srv.start()
-        self.assertFalse(self.srv.enabled)
-        self.assertFalse(getattr(self.srv, '_thread', False))
-class HealthcheckPing(unittest.TestCase):
-    def test_ping_disabled(self):
-        with TestServer() as srv:
-            r = srv.get_healthcheck_ping()
-            self.assertEqual(404, r.status_code)
-    def test_ping_no_auth(self):
-        with TestServer('configuredmanagementtoken') as srv:
-            r = srv.get_healthcheck_ping()
-            self.assertEqual(401, r.status_code)
-    def test_ping_bad_auth_format(self):
-        with TestServer('configuredmanagementtoken') as srv:
-            r = srv.get_healthcheck_ping('noBearer')
-            self.assertEqual(403, r.status_code)
-    def test_ping_bad_auth_token(self):
-        with TestServer('configuredmanagementtoken') as srv:
-            r = srv.get_healthcheck_ping('Bearer badtoken')
-            self.assertEqual(403, r.status_code)
-    def test_ping_success(self):
-        with TestServer('configuredmanagementtoken') as srv:
-            r = srv.get_healthcheck_ping('Bearer configuredmanagementtoken')
-            self.assertEqual(200, r.status_code)
-            self.assertEqual('application/json', r.headers['content-type'])
-            resp = r.json()
-            self.assertEqual('{"health": "OK"}', json.dumps(resp))
diff --git a/services/nodemanager/tests/test_timedcallback.py b/services/nodemanager/tests/test_timedcallback.py
deleted file mode 100644 (file)
index 21a9b5a..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import time
-import unittest
-import mock
-import pykka
-import arvnodeman.timedcallback as timedcallback
-from . import testutil
-class TimedCallBackActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
-    def test_immediate_turnaround(self):
-        receiver = mock.Mock()
-        deliverer = timedcallback.TimedCallBackActor.start().proxy()
-        deliverer.schedule(time.time() - 1, receiver,
-                           'immediate').get(self.TIMEOUT)
-        self.stop_proxy(deliverer)
-        receiver.assert_called_with('immediate')
-    def test_delayed_turnaround(self):
-        receiver = mock.Mock()
-        mock_now = mock.Mock()
-        mock_now.return_value = 0
-        deliverer = timedcallback.TimedCallBackActor.start(timefunc=mock_now).proxy()
-        deliverer.schedule(1, receiver, 'delayed')
-        deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
-        self.assertFalse(receiver.called)
-        mock_now.return_value = 2
-        deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
-        self.stop_proxy(deliverer)
-        receiver.assert_called_with('delayed')
-    def test_out_of_order_scheduling(self):
-        receiver = mock.Mock()
-        mock_now = mock.Mock()
-        mock_now.return_value = 1.5
-        deliverer = timedcallback.TimedCallBackActor.start(timefunc=mock_now).proxy()
-        deliverer.schedule(2, receiver, 'second')
-        deliverer.schedule(1, receiver, 'first')
-        deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
-        receiver.assert_called_with('first')
-        mock_now.return_value = 2.5
-        deliverer.schedule(3, receiver, 'failure').get(self.TIMEOUT)
-        self.stop_proxy(deliverer)
-        receiver.assert_called_with('second')
-    def test_dead_actors_ignored(self):
-        receiver = mock.Mock(name='dead_actor', spec=pykka.ActorRef)
-        receiver.tell.side_effect = pykka.ActorDeadError
-        deliverer = timedcallback.TimedCallBackActor.start().proxy()
-        deliverer.schedule(time.time() - 1, receiver.tell,
-                           'error').get(self.TIMEOUT)
-        self.assertTrue(self.stop_proxy(deliverer), "deliverer died")
-        receiver.tell.assert_called_with('error')
-if __name__ == '__main__':
-    unittest.main()
diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py
deleted file mode 100644 (file)
index ee475ef..0000000
+++ /dev/null
@@ -1,236 +0,0 @@
-#!/usr/bin/env python
-# Copyright (C) The Arvados Authors. All rights reserved.
-# SPDX-License-Identifier: AGPL-3.0
-from __future__ import absolute_import, print_function
-import contextlib
-import datetime
-import mock
-import pykka
-import sys
-import threading
-import time
-import libcloud.common.types as cloud_types
-from . import pykka_timeout
-no_sleep = mock.patch('time.sleep', lambda n: None)
-def arvados_node_mock(node_num=99, job_uuid=None, age=-1, **kwargs):
-    mod_time = datetime.datetime.utcnow() - datetime.timedelta(seconds=age)
-    mod_time_s = mod_time.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
-    if job_uuid is True:
-        job_uuid = 'zzzzz-jjjjj-jobjobjobjobjob'
-    crunch_worker_state = 'idle' if (job_uuid is None) else 'busy'
-    node = {'uuid': 'zzzzz-yyyyy-{:015x}'.format(node_num),
-            'created_at': '2014-01-01T01:02:03.04050607Z',
-            'modified_at': mod_time_s,
-            'first_ping_at': kwargs.pop('first_ping_at', mod_time_s),
-            'last_ping_at': mod_time_s,
-            'slot_number': node_num,
-            'hostname': 'compute{}'.format(node_num),
-            'domain': 'zzzzz.arvadosapi.com',
-            'ip_address': ip_address_mock(node_num),
-            'job_uuid': job_uuid,
-            'crunch_worker_state': crunch_worker_state,
-            'properties': {},
-            'info': {'ping_secret': 'defaulttestsecret', 'ec2_instance_id': str(node_num)}}
-    node.update(kwargs)
-    return node
-def cloud_object_mock(name_id, **extra):
-    # A very generic mock, useful for stubbing libcloud objects we
-    # only search for and pass around, like locations, subnets, etc.
-    cloud_object = mock.NonCallableMagicMock(['id', 'name'],
-                                             name='cloud_object')
-    cloud_object.name = str(name_id)
-    cloud_object.id = 'id_' + cloud_object.name
-    cloud_object.extra = extra
-    return cloud_object
-def cloud_node_fqdn(node):
-    # We intentionally put the FQDN somewhere goofy to make sure tested code is
-    # using this function for lookups.
-    return node.extra.get('testname', node.name+'.NoTestName.invalid')
-def ip_address_mock(last_octet):
-    return '10.20.30.{}'.format(last_octet)
-def redirected_streams(stdout=None, stderr=None):
-    orig_stdout, sys.stdout = sys.stdout, stdout or sys.stdout
-    orig_stderr, sys.stderr = sys.stderr, stderr or sys.stderr
-    try:
-        yield
-    finally:
-        sys.stdout = orig_stdout
-        sys.stderr = orig_stderr
-class MockShutdownTimer(object):
-    def _set_state(self, is_open, next_opening):
-        self.window_open = lambda: is_open
-        self.next_opening = lambda: next_opening
-class MockSize(object):
-    def __init__(self, factor, preemptible=False):
-        self.id = 'z{}.test'.format(factor)
-        self.name = 'test size '+self.id
-        self.ram = 128 * factor
-        self.disk = factor   # GB
-        self.scratch = 1000 * factor # MB
-        self.bandwidth = 16 * factor
-        self.price = float(factor)
-        self.extra = {}
-        self.real = self
-        self.preemptible = preemptible
-    def __eq__(self, other):
-        return self.id == other.id
-class MockTimer(object):
-    def __init__(self, deliver_immediately=True):
-        self.deliver_immediately = deliver_immediately
-        self.messages = []
-        self.lock = threading.Lock()
-    def deliver(self):
-        with self.lock:
-            to_deliver = self.messages
-            self.messages = []
-        for callback, args, kwargs in to_deliver:
-            try:
-                callback(*args, **kwargs)
-            except pykka.ActorDeadError:
-                pass
-    def schedule(self, want_time, callback, *args, **kwargs):
-        with self.lock:
-            self.messages.append((callback, args, kwargs))
-        if self.deliver_immediately:
-            self.deliver()
-class ActorTestMixin(object):
-    FUTURE_CLASS = pykka.ThreadingFuture
-    TIMEOUT = pykka_timeout
-    def tearDown(self):
-        pykka.ActorRegistry.stop_all()
-    def stop_proxy(self, proxy):
-        th = proxy.get_thread().get()
-        t = proxy.actor_ref.stop(timeout=self.TIMEOUT)
-        th.join()
-        return t
-    def wait_for_assignment(self, proxy, attr_name, unassigned=None,
-                            timeout=TIMEOUT):
-        deadline = time.time() + timeout
-        while True:
-            loop_timeout = deadline - time.time()
-            if loop_timeout <= 0:
-                self.fail("actor did not assign {} in time".format(attr_name))
-            result = getattr(proxy, attr_name).get(loop_timeout)
-            if result is not unassigned:
-                return result
-    def busywait(self, f, finalize=None):
-        n = 0
-        while not f() and n < 20:
-            time.sleep(.1)
-            n += 1
-        if finalize is not None:
-            finalize()
-        self.assertTrue(f())
-class DriverTestMixin(object):
-    def setUp(self):
-        self.driver_mock = mock.MagicMock(name='driver_mock')
-        super(DriverTestMixin, self).setUp()
-    def new_driver(self, auth_kwargs={}, list_kwargs={}, create_kwargs={}):
-        create_kwargs.setdefault('ping_host', '100::')
-        return self.TEST_CLASS(
-            auth_kwargs, list_kwargs, create_kwargs,
-            driver_class=self.driver_mock)
-    def driver_method_args(self, method_name):
-        return getattr(self.driver_mock(), method_name).call_args
-    def test_driver_create_retry(self):
-        with mock.patch('time.sleep'):
-            driver_mock2 = mock.MagicMock(name='driver_mock2')
-            self.driver_mock.side_effect = (Exception("oops"), driver_mock2)
-            kwargs = {'user_id': 'foo'}
-            driver = self.new_driver(auth_kwargs=kwargs)
-            self.assertTrue(self.driver_mock.called)
-            self.assertIs(driver.real, driver_mock2)
-    def test_create_can_find_node_after_timeout(self, create_kwargs={}, node_extra={}):
-        driver = self.new_driver(create_kwargs=create_kwargs)
-        arv_node = arvados_node_mock()
-        cloud_node = cloud_node_mock(**node_extra)
-        cloud_node.name = driver.create_cloud_name(arv_node)
-        create_method = self.driver_mock().create_node
-        create_method.side_effect = cloud_types.LibcloudError("fake timeout")
-        list_method = self.driver_mock().list_nodes
-        list_method.return_value = [cloud_node]
-        actual = driver.create_node(MockSize(1), arv_node)
-        self.assertIs(cloud_node, actual)
-    def test_create_can_raise_exception_after_timeout(self):
-        driver = self.new_driver()
-        arv_node = arvados_node_mock()
-        create_method = self.driver_mock().create_node
-        create_method.side_effect = cloud_types.LibcloudError("fake timeout")
-        list_method = self.driver_mock().list_nodes
-        list_method.return_value = []
-        with self.assertRaises(cloud_types.LibcloudError) as exc_test:
-            driver.create_node(MockSize(1), arv_node)
-        self.assertIs(create_method.side_effect, exc_test.exception)
-    def check_node_found_after_timeout_has_fixed_size(self, size, cloud_node,
-                                                      create_kwargs={}):
-        # This method needs to be called explicitly by driver test suites
-        # that need it.
-        self.driver_mock().list_sizes.return_value = [size]
-        driver = self.new_driver(create_kwargs=create_kwargs)
-        arv_node = arvados_node_mock()
-        cloud_node.name = driver.create_cloud_name(arv_node)
-        create_method = self.driver_mock().create_node
-        create_method.side_effect = cloud_types.LibcloudError("fake timeout")
-        self.driver_mock().list_nodes.return_value = [cloud_node]
-        actual = driver.create_node(size, arv_node)
-        self.assertIs(size, actual.size)
-class RemotePollLoopActorTestMixin(ActorTestMixin):
-    def build_monitor(self, *args, **kwargs):
-        self.timer = mock.MagicMock(name='timer_mock')
-        self.client = mock.MagicMock(name='client_mock')
-        self.subscriber = mock.Mock(name='subscriber_mock')
-        self.monitor = self.TEST_CLASS.start(
-            self.client, self.timer, *args, **kwargs).proxy()
-def cloud_node_mock(node_num=99, size=None, **extra):
-    if size is None:
-        size = MockSize(node_num)
-    node = mock.NonCallableMagicMock(
-        ['id', 'name', 'state', 'public_ips', 'private_ips', 'driver', 'size',
-         'image', 'extra'],
-        name='cloud_node')
-    node.id = str(node_num)
-    node.name = node.id
-    node.size = size
-    node.public_ips = []
-    node.private_ips = [ip_address_mock(node_num)]
-    node.extra = extra
-    return node
index 2c49dcae62f1a30be179735e15d42a6b1e148934..463c552c4f1eb5caf0868337858197a747bc8fa8 100644 (file)
@@ -364,7 +364,7 @@ class Summarizer(object):
-    # FIXME: This needs to be updated to account for current nodemanager algorithms
+    # FIXME: This needs to be updated to account for current a-d-c algorithms
     def _recommend_ram(self):
         """Recommend an economical RAM constraint for this job.