Merge branch '18682-use-keyrings-instead-of-key-ids'
[arvados.git] / tools / compute-images / scripts / base.sh
index 8ea25087826fe2d2cee9596d60621363de81ca99..90b845f1ac8a30c0a0b30edb6d5361557085a374 100644 (file)
@@ -4,6 +4,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+set -eu -o pipefail
+
 SUDO=sudo
 
 wait_for_apt_locks() {
@@ -142,8 +144,29 @@ $SUDO chmod 700 /home/crunch/.ssh/
 if [ "x$RESOLVER" != "x" ]; then
   $SUDO sed -i "s/#prepend domain-name-servers 127.0.0.1;/prepend domain-name-servers ${RESOLVER};/" /etc/dhcp/dhclient.conf
 fi
-# Set up the cloud-init script that will ensure encrypted disks
-$SUDO mv /tmp/usr-local-bin-ensure-encrypted-partitions.sh /usr/local/bin/ensure-encrypted-partitions.sh
+
+if [ "$AWS_EBS_AUTOSCALE" != "1" ]; then
+  # Set up the cloud-init script that will ensure encrypted disks
+  $SUDO mv /tmp/usr-local-bin-ensure-encrypted-partitions.sh /usr/local/bin/ensure-encrypted-partitions.sh
+else
+  wait_for_apt_locks && $SUDO DEBIAN_FRONTEND=noninteractive apt-get -qq --yes install jq unzip
+  # curl -f fails on HTTP errors; steps are separate statements since `set -e` ignores the left side of `&&`
+  curl -fsS "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "/tmp/awscliv2.zip"
+  unzip -q /tmp/awscliv2.zip -d /tmp
+  $SUDO /tmp/aws/install
+  # Pinned to v2.4.5 because we apply a patch below
+  #export EBS_AUTOSCALE_VERSION=$(curl --silent "https://api.github.com/repos/awslabs/amazon-ebs-autoscale/releases/latest" | jq -r .tag_name)
+  export EBS_AUTOSCALE_VERSION="v2.4.5"
+  $SUDO git clone https://github.com/awslabs/amazon-ebs-autoscale.git /opt/amazon-ebs-autoscale
+  cd /opt/amazon-ebs-autoscale
+  $SUDO git checkout "$EBS_AUTOSCALE_VERSION"
+  $SUDO patch -p1 < /tmp/create-ebs-volume-nvme.patch
+  # This script really requires bash and the shebang line is wrong
+  $SUDO sed -i 's|^#!/bin/sh|#!/bin/bash|' /opt/amazon-ebs-autoscale/bin/ebs-autoscale
+  # Set up the cloud-init script that makes use of the AWS EBS autoscaler
+  $SUDO mv /tmp/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh /usr/local/bin/ensure-encrypted-partitions.sh
+fi
+
 $SUDO chmod 755 /usr/local/bin/ensure-encrypted-partitions.sh
 $SUDO chown root:root /usr/local/bin/ensure-encrypted-partitions.sh
 $SUDO mv /tmp/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg /etc/cloud/cloud.cfg.d/07_compute_arvados_dispatch_cloud.cfg
@@ -207,6 +230,12 @@ if [ "$NVIDIA_GPU_SUPPORT" == "1" ]; then
   fi
   $SUDO apt-get update
   $SUDO apt-get -y install libnvidia-container1 libnvidia-container-tools nvidia-container-toolkit
+  # The nvidia-persistenced service fails to start when the image is booted without Nvidia
+  # GPUs present, which makes `systemctl is-system-running` report "degraded". Because that
+  # command is our default BootProbeCommand, compute nodes would never finish booting from
+  # Arvados' perspective. Disable the service to avoid this. This should be fine because
+  # crunch-run does its own basic CUDA initialization.
+  $SUDO systemctl disable nvidia-persistenced.service
 fi
 
 $SUDO apt-get clean