18772: add support for the AWS EBS autoscale script to the compute node
author Ward Vandewege <ward@curii.com>
Sat, 19 Feb 2022 00:27:11 +0000 (19:27 -0500)
committer Ward Vandewege <ward@curii.com>
Mon, 21 Feb 2022 17:06:10 +0000 (12:06 -0500)
       image builder.

Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>

doc/install/crunch2-cloud/install-dispatch-cloud.html.textile.liquid
lib/config/config.default.yml
tools/compute-images/arvados-images-aws.json
tools/compute-images/build.sh
tools/compute-images/scripts/base.sh
tools/compute-images/scripts/create-ebs-volume-nvme.patch [new file with mode: 0644]
tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh [new file with mode: 0644]

index 06a918dd37bcfab0df9a050dccbc1e4e0de68170..ee71d7a3f61b6b96577b34ff9d060509f0ee0b93 100644 (file)
@@ -120,6 +120,32 @@ The <span class="userinput">ImageID</span> value is the compute node image that
 </code></pre>
 </notextile>
 
+Example policy for the IAM role used by the cloud dispatcher:
+
+<notextile>
+<pre>
+{
+    "Version": "2012-10-17",
+    "Id": "arvados-dispatch-cloud policy",
+    "Statement": [
+        {
+            "Effect": "Allow",
+            "Action": [
+                "iam:PassRole",
+                "ec2:DescribeKeyPairs",
+                "ec2:ImportKeyPair",
+                "ec2:RunInstances",
+                "ec2:DescribeInstances",
+                "ec2:CreateTags",
+                "ec2:TerminateInstances"
+            ],
+            "Resource": "*"
+        }
+    ]
+}
+</pre>
+</notextile>
+
 h4. Minimal configuration example for Azure
 
 Using managed disks:
index 07ff772e21af2a4e39ad57040f51823dbc88f358..7f191eb118525d8c7637243d0841e9c48b1ae0cb 100644 (file)
@@ -1269,6 +1269,8 @@ Clusters:
           Region: ""
           EBSVolumeType: gp2
           AdminUsername: debian
+          # (ec2) name of the IamInstanceProfile for instances started by
+          # the cloud dispatcher. Leave blank when not needed.
           IamInstanceProfile: ""
 
           # (azure) Credentials.
index 23b7832fcba455b4e9ed3fe13a504eb4644fd4d1..131aa8a8786375543202294df52f8f80a52cfcdd 100644 (file)
@@ -6,6 +6,7 @@
     "aws_profile": "",
     "aws_secret_key": "",
     "aws_source_ami": "ami-031283ff8a43b021c",
+    "aws_ebs_autoscale": "",
     "build_environment": "aws",
     "public_key_file": "",
     "mksquashfs_mem": "",
     "type": "file",
     "source": "scripts/usr-local-bin-ensure-encrypted-partitions.sh",
     "destination": "/tmp/usr-local-bin-ensure-encrypted-partitions.sh"
+  },{
+    "type": "file",
+    "source": "scripts/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh",
+    "destination": "/tmp/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh"
+  },{
+    "type": "file",
+    "source": "scripts/create-ebs-volume-nvme.patch",
+    "destination": "/tmp/create-ebs-volume-nvme.patch"
   },{
     "type": "file",
     "source": "{{user `public_key_file`}}",
@@ -84,6 +93,6 @@
     "type": "shell",
     "execute_command": "sudo -S env {{ .Vars }} /bin/bash '{{ .Path }}'",
     "script": "scripts/base.sh",
-    "environment_vars": ["RESOLVER={{user `resolver`}}","REPOSUFFIX={{user `reposuffix`}}","MKSQUASHFS_MEM={{user `mksquashfs_mem`}}","NVIDIA_GPU_SUPPORT={{user `nvidia_gpu_support`}}","CLOUD=aws"]
+    "environment_vars": ["RESOLVER={{user `resolver`}}","REPOSUFFIX={{user `reposuffix`}}","MKSQUASHFS_MEM={{user `mksquashfs_mem`}}","NVIDIA_GPU_SUPPORT={{user `nvidia_gpu_support`}}","CLOUD=aws","AWS_EBS_AUTOSCALE={{user `aws_ebs_autoscale`}}"]
   }]
 }
index fce8b1918b0c7927dee58724c97cca2142fd8bf1..ea7676db1dc53a94271334abac573a7b864dede0 100755 (executable)
@@ -33,6 +33,8 @@ Options:
       VPC id for AWS, otherwise packer will pick the default one
   --aws-subnet-id
       Subnet id for AWS otherwise packer will pick the default one for the VPC
+  --aws-ebs-autoscale (default: false)
+      Install the AWS EBS autoscaler daemon.
   --gcp-project-id (default: false, required if building for GCP)
       GCP project id
   --gcp-account-file (default: false, required if building for GCP)
@@ -62,6 +64,8 @@ Options:
   --debug (default: false)
       Output debug information
 
+For more information, see the Arvados documentation at https://doc.arvados.org/install/crunch2-cloud/install-compute-node.html
+
 EOF
 
 JSON_FILE=
@@ -71,6 +75,7 @@ AWS_SECRETS_FILE=
 AWS_SOURCE_AMI=
 AWS_VPC_ID=
 AWS_SUBNET_ID=
+AWS_EBS_AUTOSCALE=
 GCP_PROJECT_ID=
 GCP_ACCOUNT_FILE=
 GCP_ZONE=
@@ -86,7 +91,7 @@ MKSQUASHFS_MEM=256M
 NVIDIA_GPU_SUPPORT=
 
 PARSEDOPTS=$(getopt --name "$0" --longoptions \
-    help,json-file:,arvados-cluster-id:,aws-source-ami:,aws-profile:,aws-secrets-file:,aws-region:,aws-vpc-id:,aws-subnet-id:,gcp-project-id:,gcp-account-file:,gcp-zone:,azure-secrets-file:,azure-resource-group:,azure-location:,azure-sku:,azure-cloud-environment:,ssh_user:,resolver:,reposuffix:,public-key-file:,mksquashfs-mem:,nvidia-gpu-support,debug \
+    help,json-file:,arvados-cluster-id:,aws-source-ami:,aws-profile:,aws-secrets-file:,aws-region:,aws-vpc-id:,aws-subnet-id:,aws-ebs-autoscale,gcp-project-id:,gcp-account-file:,gcp-zone:,azure-secrets-file:,azure-resource-group:,azure-location:,azure-sku:,azure-cloud-environment:,ssh_user:,resolver:,reposuffix:,public-key-file:,mksquashfs-mem:,nvidia-gpu-support,debug \
     -- "" "$@")
 if [ $? -ne 0 ]; then
     exit 1
@@ -124,6 +129,9 @@ while [ $# -gt 0 ]; do
         --aws-subnet-id)
             AWS_SUBNET_ID="$2"; shift
             ;;
+        --aws-ebs-autoscale)
+            AWS_EBS_AUTOSCALE=1
+            ;;
         --gcp-project-id)
             GCP_PROJECT_ID="$2"; shift
             ;;
@@ -235,6 +243,9 @@ fi
 if [[ "$AWS_DEFAULT_REGION" != "" ]]; then
   EXTRA2+=" -var aws_default_region=$AWS_DEFAULT_REGION"
 fi
+if [[ "$AWS_EBS_AUTOSCALE" != "" ]]; then
+  EXTRA2+=" -var aws_ebs_autoscale=$AWS_EBS_AUTOSCALE"
+fi
 if [[ "$GCP_PROJECT_ID" != "" ]]; then
   EXTRA2+=" -var project_id=$GCP_PROJECT_ID"
 fi
index 450a8b3c549bd124950931a644967526a147eb27..f02c94eda0037a4f3615a4ec81dd45aa1d431eae 100644 (file)
@@ -142,8 +142,34 @@ $SUDO chmod 700 /home/crunch/.ssh/
 if [ "x$RESOLVER" != "x" ]; then
   $SUDO sed -i "s/#prepend domain-name-servers 127.0.0.1;/prepend domain-name-servers ${RESOLVER};/" /etc/dhcp/dhclient.conf
 fi
-# Set up the cloud-init script that will ensure encrypted disks
-$SUDO mv /tmp/usr-local-bin-ensure-encrypted-partitions.sh /usr/local/bin/ensure-encrypted-partitions.sh
+
+# Install exactly one variant of ensure-encrypted-partitions.sh, selected by
+# AWS_EBS_AUTOSCALE (set to "1" when build.sh is run with --aws-ebs-autoscale).
+if [ "$AWS_EBS_AUTOSCALE" != "1" ]; then
+  # Set up the cloud-init script that will ensure encrypted disks
+  $SUDO mv /tmp/usr-local-bin-ensure-encrypted-partitions.sh /usr/local/bin/ensure-encrypted-partitions.sh
+else
+  wait_for_apt_locks && $SUDO DEBIAN_FRONTEND=noninteractive apt-get -qq --yes install jq unzip
+
+  # -f: fail on an HTTP error instead of saving the error page as the zip
+  # file; -S: still report the error even though -s silences progress output.
+  curl -fsS "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "/tmp/awscliv2.zip"
+  unzip -q /tmp/awscliv2.zip -d /tmp && $SUDO /tmp/aws/install
+  # Pinned to v2.4.5 because we apply a patch below
+  #export EBS_AUTOSCALE_VERSION=$(curl --silent "https://api.github.com/repos/awslabs/amazon-ebs-autoscale/releases/latest" | jq -r .tag_name)
+  export EBS_AUTOSCALE_VERSION="v2.4.5"
+  cd /opt && $SUDO git clone https://github.com/awslabs/amazon-ebs-autoscale.git
+  cd /opt/amazon-ebs-autoscale && $SUDO git checkout "$EBS_AUTOSCALE_VERSION"
+  cd bin
+  $SUDO patch -p1 < /tmp/create-ebs-volume-nvme.patch
+
+  # This script really requires bash and the shebang line is wrong
+  $SUDO sed -i 's|^#!/bin/sh|#!/bin/bash|' /opt/amazon-ebs-autoscale/bin/ebs-autoscale
+
+  # Set up the cloud-init script that makes use of the AWS EBS autoscaler
+  $SUDO mv /tmp/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh /usr/local/bin/ensure-encrypted-partitions.sh
+fi
+
 $SUDO chmod 755 /usr/local/bin/ensure-encrypted-partitions.sh
 $SUDO chown root:root /usr/local/bin/ensure-encrypted-partitions.sh
 $SUDO mv /tmp/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg /etc/cloud/cloud.cfg.d/07_compute_arvados_dispatch_cloud.cfg
diff --git a/tools/compute-images/scripts/create-ebs-volume-nvme.patch b/tools/compute-images/scripts/create-ebs-volume-nvme.patch
new file mode 100644 (file)
index 0000000..1448ae1
--- /dev/null
@@ -0,0 +1,63 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+Make the create-ebs-volume script work with nvme devices.
+
+--- a/create-ebs-volume        2022-02-18 15:24:19.866607848 -0500
++++ b/create-ebs-volume        2022-02-18 16:23:17.931870970 -0500
+@@ -149,9 +152,20 @@
+     for letter in ${alphabet[@]}; do
+         # use /dev/xvdb* device names to avoid contention for /dev/sd* and /dev/xvda names
+         # only supported by HVM instances
+-        if [ ! -b "/dev/xvdb${letter}" ]; then
++        if [[ $created_volumes =~ .*/dev/xvdb${letter}.* ]]; then
++            continue
++        fi
+             echo "/dev/xvdb${letter}"
+             break
++    done
++}
++
++numbers=( {1..255} )
++function get_next_logical_nvme_device() {
++    for num in ${numbers[@]}; do
++        if [ ! -b "/dev/nvme${num}n1" ]; then
++            echo "/dev/nvme${num}"
++            break
+         fi
+     done
+ }
+@@ -243,10 +257,12 @@
+     
+     # check if there are available device names
+     local device=$(get_next_logical_device)
++    local nvme_device=$(get_next_logical_nvme_device)
+     if [ -z "$device" ]; then
+         error "no device names available for volume"
+     fi
+     logthis "next available device: $device"
++    logthis "next available nvme device: $nvme_device"
+     # create the volume
+     local tmpfile=$(mktemp /tmp/ebs-autoscale.create-volume.XXXXXXXXXX)
+@@ -323,8 +339,8 @@
+     logthis "waiting for volume $volume_id on filesystem"
+     while true; do
+-        if [ -e "$device" ]; then
+-            logthis "volume $volume_id on filesystem as $device"
++        if [ -e "$nvme_device" ]; then
++            logthis "volume $volume_id on filesystem as $nvme_device (aws device $device)"
+             break
+         fi
+         sleep 1
+@@ -338,7 +354,7 @@
+     > /dev/null
+     logthis "volume $volume_id DeleteOnTermination ENABLED"
+-    echo $device
++    echo "$nvme_device"n1
+ }
+ create_and_attach_volume
diff --git a/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh b/tools/compute-images/scripts/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh
new file mode 100644 (file)
index 0000000..4b73c8b
--- /dev/null
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Prepares scratch space on a compute node that uses the AWS EBS
+# autoscaler: stops docker, runs the amazon-ebs-autoscale installer
+# (filesystem type lvm.ext4) against $MOUNTPATH, points docker's
+# data-root at $MOUNTPATH/docker-data, then starts docker again.
+#
+# Exits 0 as soon as "docker ps" succeeds, or 1 if docker does not
+# respond within 60 seconds.
+
+set -e
+set -x
+
+MOUNTPATH=/tmp
+
+# Quiet wrapper around findmnt: only the exit status is of interest.
+findmntq() {
+    findmnt "$@" >/dev/null
+}
+
+# Unmount $1, but only when it is currently a mount point.
+ensure_umount() {
+    if findmntq "$1"; then
+        umount "$1"
+    fi
+}
+
+# First make sure docker is not using /tmp, then unmount everything under it.
+if [ -d /etc/sv/docker.io ]
+then
+  # "service" takes the service name first and the action second.
+  sv stop docker.io || service docker.io stop || true
+else
+  service docker stop || true
+fi
+
+ensure_umount "$MOUNTPATH/docker/aufs"
+
+# Send stdout to the log first and then point stderr at it; the reverse
+# order ("2>&1 > file") would leave stderr out of the log file.
+/bin/bash /opt/amazon-ebs-autoscale/install.sh -f lvm.ext4 -m "$MOUNTPATH" > /var/log/ebs-autoscale-install.log 2>&1
+
+# Make sure docker uses the big partition
+cat <<EOF > /etc/docker/daemon.json
+{
+    "data-root": "$MOUNTPATH/docker-data"
+}
+EOF
+
+# restart docker
+if [ -d /etc/sv/docker.io ]
+then
+  ## runit
+  sv up docker.io
+else
+  service docker start
+fi
+
+# Poll once per second, for up to 60 seconds, until the docker daemon answers.
+end=$((SECONDS+60))
+
+while [ $SECONDS -lt $end ]; do
+  if /usr/bin/docker ps -q >/dev/null; then
+    exit 0
+  fi
+  sleep 1
+done
+
+# Docker didn't start within a minute, abort
+exit 1