X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/1cc3a7c037077e3d64fda794e239c0096b3f033c..HEAD:/tools/compute-images/scripts/base.sh diff --git a/tools/compute-images/scripts/base.sh b/tools/compute-images/scripts/base.sh index a9323214ce..3daf5cadf9 100644 --- a/tools/compute-images/scripts/base.sh +++ b/tools/compute-images/scripts/base.sh @@ -15,8 +15,8 @@ wait_for_apt_locks() { done } -# $DIST should not have a dot if there is one in /etc/os-release (e.g. 18.04) -DIST=$(. /etc/os-release; echo $ID$VERSION_ID | tr -d '.') +. /etc/os-release +DISTRO_ID="$ID" # Run apt-get update $SUDO DEBIAN_FRONTEND=noninteractive apt-get --yes update @@ -36,20 +36,19 @@ if [[ ! -d /var/lib/cloud/scripts/per-boot ]]; then mkdir -p /var/lib/cloud/scripts/per-boot fi -TMP_LSB=`/usr/bin/lsb_release -c -s` -LSB_RELEASE_CODENAME=${TMP_LSB//[$'\t\r\n ']} - SET_RESOLVER= if [ -n "$RESOLVER" ]; then SET_RESOLVER="--dns ${RESOLVER}" fi +echo "Working directory is '${WORKDIR}'" + # Add the arvados apt repository echo "# apt.arvados.org" |$SUDO tee --append /etc/apt/sources.list.d/apt.arvados.org.list -echo "deb http://apt.arvados.org/$LSB_RELEASE_CODENAME $LSB_RELEASE_CODENAME${REPOSUFFIX} main" |$SUDO tee --append /etc/apt/sources.list.d/apt.arvados.org.list +echo "deb http://apt.arvados.org/$VERSION_CODENAME $VERSION_CODENAME${REPOSUFFIX} main" |$SUDO tee --append /etc/apt/sources.list.d/apt.arvados.org.list # Add the arvados signing key -cat /tmp/1078ECD7.asc | $SUDO apt-key add - +cat ${WORKDIR}/1078ECD7.asc | $SUDO apt-key add - # Add the debian keys (but don't abort if we can't find them, e.g. on Ubuntu where we don't need them) wait_for_apt_locks && $SUDO DEBIAN_FRONTEND=noninteractive apt-get install --yes debian-keyring debian-archive-keyring 2>/dev/null || true @@ -57,6 +56,13 @@ wait_for_apt_locks && $SUDO DEBIAN_FRONTEND=noninteractive apt-get install --yes $SUDO /bin/sed -ri 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen $SUDO /usr/sbin/locale-gen +if [[ "${PIN_PACKAGES:-true}" != false ]]; then + $SUDO install -d /etc/apt/preferences.d + $SUDO install -m 0644 \ + "$WORKDIR/etc-apt-preferences.d-arvados.pref" \ + /etc/apt/preferences.d/arvados.pref +fi + # Install some packages we always need wait_for_apt_locks && $SUDO DEBIAN_FRONTEND=noninteractive apt-get --yes update wait_for_apt_locks && $SUDO DEBIAN_FRONTEND=noninteractive apt-get -qq --yes install \ @@ -75,32 +81,12 @@ wait_for_apt_locks && $SUDO DEBIAN_FRONTEND=noninteractive apt-get -qq --yes ins python3-arvados-fuse \ arvados-docker-cleaner -# We want Docker 20.10 or later so that we support glibc 2.33 and up in the container, cf. -# https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1005906 -dockerversion=5:20.10.13~3-0 -if [[ "$DIST" =~ ^debian ]]; then - family="debian" - if [ "$DIST" == "debian10" ]; then - distro="buster" - elif [ "$DIST" == "debian11" ]; then - distro="bullseye" - fi -elif [[ "$DIST" =~ ^ubuntu ]]; then - family="ubuntu" - if [ "$DIST" == "ubuntu1804" ]; then - distro="bionic" - elif [ "$DIST" == "ubuntu2004" ]; then - distro="focal" - fi -else - echo "Unsupported distribution $DIST" - exit 1 -fi -curl -fsSL https://download.docker.com/linux/$family/gpg | $SUDO gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg -echo deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/$family/ $distro stable | \ +DOCKER_URL="https://download.docker.com/linux/$DISTRO_ID" +curl -fsSL "$DOCKER_URL/gpg" | $SUDO gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg +echo "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] $DOCKER_URL/ $VERSION_CODENAME stable" | \ $SUDO tee /etc/apt/sources.list.d/docker.list $SUDO apt-get update -$SUDO apt-get -yq --no-install-recommends install docker-ce=${dockerversion}~${family}-${distro} +$SUDO apt-get -yq --no-install-recommends install docker-ce # Set a higher ulimit and the resolver (if set) for docker $SUDO sed "s/ExecStart=\(.*\)/ExecStart=\1 --default-ulimit nofile=10000:10000 ${SET_RESOLVER}/g" \ @@ -134,7 +120,7 @@ echo -e "# for the crunch user\ncrunch ALL=(ALL) NOPASSWD:ALL" | $SUDO tee /etc/ # Set up the ssh public key for the crunch user $SUDO mkdir /home/crunch/.ssh -$SUDO mv /tmp/crunch-authorized_keys /home/crunch/.ssh/authorized_keys +$SUDO mv ${WORKDIR}/crunch-authorized_keys /home/crunch/.ssh/authorized_keys $SUDO chown -R crunch:crunch /home/crunch/.ssh $SUDO chmod 600 /home/crunch/.ssh/authorized_keys $SUDO chmod 700 /home/crunch/.ssh/ @@ -150,30 +136,30 @@ EBS_AUTOSCALE=${AWS_EBS_AUTOSCALE:-} if [ "$EBS_AUTOSCALE" != "1" ]; then # Set up the cloud-init script that will ensure encrypted disks - $SUDO mv /tmp/usr-local-bin-ensure-encrypted-partitions.sh /usr/local/bin/ensure-encrypted-partitions.sh + $SUDO mv ${WORKDIR}/usr-local-bin-ensure-encrypted-partitions.sh /usr/local/bin/ensure-encrypted-partitions.sh else wait_for_apt_locks && $SUDO DEBIAN_FRONTEND=noninteractive apt-get -qq --yes install jq unzip - curl -s "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "/tmp/awscliv2.zip" - unzip -q /tmp/awscliv2.zip -d /tmp && $SUDO /tmp/aws/install + curl -s "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "${WORKDIR}/awscliv2.zip" + unzip -q ${WORKDIR}/awscliv2.zip -d ${WORKDIR} && $SUDO ${WORKDIR}/aws/install # Pinned to v2.4.5 because we apply a patch below #export EBS_AUTOSCALE_VERSION=$(curl --silent "https://api.github.com/repos/awslabs/amazon-ebs-autoscale/releases/latest" | jq -r .tag_name) - export EBS_AUTOSCALE_VERSION="5ca6e24e05787b8ae1184c2a10db80053ddd3038" + export EBS_AUTOSCALE_VERSION="ee323f0751c2b6f733692e805b51b9bf3c251bac" cd /opt && $SUDO git clone https://github.com/arvados/amazon-ebs-autoscale.git cd /opt/amazon-ebs-autoscale && $SUDO git checkout $EBS_AUTOSCALE_VERSION # Set up the cloud-init script that makes use of the AWS EBS autoscaler - $SUDO mv /tmp/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh /usr/local/bin/ensure-encrypted-partitions.sh + $SUDO mv ${WORKDIR}/usr-local-bin-ensure-encrypted-partitions-aws-ebs-autoscale.sh /usr/local/bin/ensure-encrypted-partitions.sh fi $SUDO chmod 755 /usr/local/bin/ensure-encrypted-partitions.sh $SUDO chown root:root /usr/local/bin/ensure-encrypted-partitions.sh -$SUDO mv /tmp/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg /etc/cloud/cloud.cfg.d/07_compute_arvados_dispatch_cloud.cfg +$SUDO mv ${WORKDIR}/etc-cloud-cloud.cfg.d-07_compute_arvados_dispatch_cloud.cfg /etc/cloud/cloud.cfg.d/07_compute_arvados_dispatch_cloud.cfg $SUDO chown root:root /etc/cloud/cloud.cfg.d/07_compute_arvados_dispatch_cloud.cfg if [ "$NVIDIA_GPU_SUPPORT" == "1" ]; then # We need a kernel and matching headers - if [[ "$DIST" =~ ^debian ]]; then + if [[ "$DISTRO_ID" == debian ]]; then $SUDO apt-get -y install linux-image-cloud-amd64 linux-headers-cloud-amd64 elif [ "$CLOUD" == "azure" ]; then $SUDO apt-get -y install linux-image-azure linux-headers-azure @@ -182,40 +168,41 @@ if [ "$NVIDIA_GPU_SUPPORT" == "1" ]; then fi # Install CUDA - $SUDO apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/$DIST/x86_64/7fa2af80.pub - $SUDO apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/$DIST/x86_64/3bf863cc.pub + NVIDIA_URL="https://developer.download.nvidia.com/compute/cuda/repos/$(echo "$DISTRO_ID$VERSION_ID" | tr -d .)/x86_64" + $SUDO apt-key adv --fetch-keys "$NVIDIA_URL/7fa2af80.pub" + $SUDO apt-key adv --fetch-keys "$NVIDIA_URL/3bf863cc.pub" $SUDO apt-get -y install software-properties-common - $SUDO add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/$DIST/x86_64/ /" - # Ubuntu 18.04's add-apt-repository does not understand 'contrib' - $SUDO add-apt-repository contrib || true + $SUDO add-apt-repository "deb $NVIDIA_URL/ /" + $SUDO add-apt-repository contrib $SUDO apt-get update $SUDO apt-get -y install cuda # Install libnvidia-container, the tooling for Docker/Singularity curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | \ $SUDO apt-key add - - if [ "$DIST" == "debian11" ]; then - # As of 2021-12-16 libnvidia-container and friends are only available for - # Debian 10, not yet Debian 11. Install experimental rc1 package as per this - # workaround: - # https://github.com/NVIDIA/nvidia-docker/issues/1549#issuecomment-989670662 - curl -s -L https://nvidia.github.io/libnvidia-container/debian10/libnvidia-container.list | \ - $SUDO tee /etc/apt/sources.list.d/libnvidia-container.list - $SUDO sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/libnvidia-container.list - else - # here, $DIST should have a dot if there is one in /etc/os-release (e.g. 18.04)... - DIST=$(. /etc/os-release; echo $ID$VERSION_ID) - curl -s -L https://nvidia.github.io/libnvidia-container/$DIST/libnvidia-container.list | \ - $SUDO tee /etc/apt/sources.list.d/libnvidia-container.list - fi + curl -fsSL "https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list" | + $SUDO tee /etc/apt/sources.list.d/nvidia-container-toolkit.list >/dev/null $SUDO apt-get update $SUDO apt-get -y install libnvidia-container1 libnvidia-container-tools nvidia-container-toolkit - # This service fails to start when the image is booted without Nvidia GPUs present, which makes - # `systemctl is-system-running` respond with "degraded" and since that command is our default - # BootProbeCommand, compute nodes never finish booting from Arvados' perspective. - # Disable the service to avoid this. This should be fine because crunch-run does its own basic - # CUDA initialization. + + # Various components fail to start, and cause systemd to boot in degraded + # state, if the system does not actually have an NVIDIA GPU. Configure the + # image to adapt at boot time. + + # Don't load modules unconditionally. + # Instead load them if hardware is detected. + if [[ -f /etc/modules-load.d/nvidia.conf ]]; then + $SUDO mv /etc/modules-load.d/nvidia.conf /etc/modules-load.d/nvidia.avail + fi + $SUDO install "$WORKDIR/usr-local-bin-detect-gpu.sh" /usr/local/bin/detect-gpu.sh + $SUDO install -d /etc/systemd/system/systemd-modules-load.service.d + $SUDO install -m 0644 \ + "$WORKDIR/etc-systemd-system-systemd-modules-load.service.d-detect-gpu.conf" \ + /etc/systemd/system/systemd-modules-load.service.d/detect-gpu.conf + + # Don't start the persistence daemon. + # Instead rely on crunch-run's CUDA initialization. $SUDO systemctl disable nvidia-persistenced.service fi